Qwen3-1.7-regexp-GRPO-s9-1k-steps / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 250,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 110.15,
"epoch": 0.1,
"grad_norm": 0.0,
"kl": 0.001616668701171875,
"learning_rate": 9.97e-07,
"loss": -0.0,
"reward": 0.0008232486434280872,
"reward_std": 0.0021940818056464194,
"rewards/DCR_reward": 0.0008232486434280872,
"step": 10
},
{
"completion_length": 98.4125,
"epoch": 0.2,
"grad_norm": 0.0,
"kl": 0.0034271240234375,
"learning_rate": 9.936666666666667e-07,
"loss": 0.0,
"reward": 0.00022143989917822182,
"reward_std": 0.000626326643396169,
"rewards/DCR_reward": 0.00022143989917822182,
"step": 20
},
{
"completion_length": 84.2,
"epoch": 0.3,
"grad_norm": 0.0,
"kl": 0.003081512451171875,
"learning_rate": 9.903333333333333e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/DCR_reward": 0.0,
"step": 30
},
{
"completion_length": 105.125,
"epoch": 0.4,
"grad_norm": 0.0,
"kl": 0.004283905029296875,
"learning_rate": 9.87e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/DCR_reward": 0.0,
"step": 40
},
{
"completion_length": 81.4,
"epoch": 0.5,
"grad_norm": 26.5,
"kl": 0.0041351318359375,
"learning_rate": 9.836666666666666e-07,
"loss": -0.0,
"reward": 0.018291093641892076,
"reward_std": 0.05173502548132092,
"rewards/DCR_reward": 0.018291093641892076,
"step": 50
},
{
"completion_length": 104.425,
"epoch": 0.6,
"grad_norm": 0.0,
"kl": 0.006622314453125,
"learning_rate": 9.803333333333332e-07,
"loss": -0.0,
"reward": 0.009192863292992116,
"reward_std": 0.025964342057704926,
"rewards/DCR_reward": 0.009192863292992116,
"step": 60
},
{
"completion_length": 109.35,
"epoch": 0.7,
"grad_norm": 39.25,
"kl": 0.003626251220703125,
"learning_rate": 9.77e-07,
"loss": 0.0,
"reward": 0.013767135608941317,
"reward_std": 0.03893933929502964,
"rewards/DCR_reward": 0.013767135608941317,
"step": 70
},
{
"completion_length": 89.1,
"epoch": 0.8,
"grad_norm": 0.0,
"kl": 0.013692855834960938,
"learning_rate": 9.736666666666667e-07,
"loss": 0.0,
"reward": 0.011479373268957715,
"reward_std": 0.032468570384662596,
"rewards/DCR_reward": 0.011479373268957715,
"step": 80
},
{
"completion_length": 97.575,
"epoch": 0.9,
"grad_norm": 0.0,
"kl": 0.0066986083984375,
"learning_rate": 9.703333333333332e-07,
"loss": -0.0,
"reward": 0.009072506427764892,
"reward_std": 0.025660922378301622,
"rewards/DCR_reward": 0.009072506427764892,
"step": 90
},
{
"completion_length": 114.3625,
"epoch": 1.0,
"grad_norm": 0.0,
"kl": 0.00914459228515625,
"learning_rate": 9.67e-07,
"loss": 0.0,
"reward": 0.011166714504361153,
"reward_std": 0.03158423751592636,
"rewards/DCR_reward": 0.011166714504361153,
"step": 100
},
{
"completion_length": 88.5375,
"epoch": 1.1,
"grad_norm": 0.0,
"kl": 0.011919403076171875,
"learning_rate": 9.636666666666666e-07,
"loss": 0.0,
"reward": 2.8343783924356103e-05,
"reward_std": 8.016832871362566e-05,
"rewards/DCR_reward": 2.8343783924356103e-05,
"step": 110
},
{
"completion_length": 103.875,
"epoch": 1.2,
"grad_norm": 0.0,
"kl": 0.010530471801757812,
"learning_rate": 9.603333333333333e-07,
"loss": 0.0,
"reward": 0.005421069198928308,
"reward_std": 0.015333099680719896,
"rewards/DCR_reward": 0.005421069198928308,
"step": 120
},
{
"completion_length": 121.75,
"epoch": 1.3,
"grad_norm": 0.0,
"kl": 0.00739288330078125,
"learning_rate": 9.57e-07,
"loss": -0.0,
"reward": 3.4938057069666684e-05,
"reward_std": 9.881975129246711e-05,
"rewards/DCR_reward": 3.4938057069666684e-05,
"step": 130
},
{
"completion_length": 98.975,
"epoch": 1.4,
"grad_norm": 0.0,
"kl": 0.009772491455078126,
"learning_rate": 9.536666666666667e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/DCR_reward": 0.0,
"step": 140
},
{
"completion_length": 112.7875,
"epoch": 1.5,
"grad_norm": 0.0,
"kl": 0.01072845458984375,
"learning_rate": 9.503333333333333e-07,
"loss": 0.0,
"reward": 4.633456701412797e-05,
"reward_std": 8.579618879593909e-05,
"rewards/DCR_reward": 4.633456701412797e-05,
"step": 150
},
{
"completion_length": 116.125,
"epoch": 1.6,
"grad_norm": 16.25,
"kl": 0.0097686767578125,
"learning_rate": 9.469999999999999e-07,
"loss": -0.0,
"reward": 0.011431791516952217,
"reward_std": 0.03233398855663836,
"rewards/DCR_reward": 0.011431791516952217,
"step": 160
},
{
"completion_length": 100.25,
"epoch": 1.7,
"grad_norm": 25.875,
"kl": 0.0120208740234375,
"learning_rate": 9.436666666666667e-07,
"loss": 0.0,
"reward": 0.006299582323117647,
"reward_std": 0.017817909209406936,
"rewards/DCR_reward": 0.006299582323117647,
"step": 170
},
{
"completion_length": 106.425,
"epoch": 1.8,
"grad_norm": 0.0,
"kl": 0.0339019775390625,
"learning_rate": 9.403333333333333e-07,
"loss": 0.0,
"reward": 0.014636733755469322,
"reward_std": 0.04139893501996994,
"rewards/DCR_reward": 0.014636733755469322,
"step": 180
},
{
"completion_length": 82.725,
"epoch": 1.9,
"grad_norm": 33.75,
"kl": 0.008979415893554688,
"learning_rate": 9.37e-07,
"loss": 0.0,
"reward": 0.005478436907287687,
"reward_std": 0.015495359338819981,
"rewards/DCR_reward": 0.005478436907287687,
"step": 190
},
{
"completion_length": 71.5875,
"epoch": 2.0,
"grad_norm": 0.0,
"kl": 0.01548919677734375,
"learning_rate": 9.336666666666666e-07,
"loss": -0.0,
"reward": 0.025108913704752923,
"reward_std": 0.046553592692362145,
"rewards/DCR_reward": 0.025108913704752923,
"step": 200
},
{
"completion_length": 128.8125,
"epoch": 2.1,
"grad_norm": 10.25,
"kl": 0.016112518310546876,
"learning_rate": 9.303333333333333e-07,
"loss": 0.0,
"reward": 0.006037246529012918,
"reward_std": 0.017075913585722448,
"rewards/DCR_reward": 0.006037246529012918,
"step": 210
},
{
"completion_length": 91.65,
"epoch": 2.2,
"grad_norm": 0.0,
"kl": 0.0196868896484375,
"learning_rate": 9.27e-07,
"loss": 0.0,
"reward": 0.018143273887835674,
"reward_std": 0.05131692748691421,
"rewards/DCR_reward": 0.018143273887835674,
"step": 220
},
{
"completion_length": 102.8,
"epoch": 2.3,
"grad_norm": 23.5,
"kl": 0.02627716064453125,
"learning_rate": 9.236666666666666e-07,
"loss": 0.0,
"reward": 0.006400418069824809,
"reward_std": 0.018103116168640555,
"rewards/DCR_reward": 0.006400418069824809,
"step": 230
},
{
"completion_length": 94.325,
"epoch": 2.4,
"grad_norm": 0.0,
"kl": 0.01078948974609375,
"learning_rate": 9.203333333333333e-07,
"loss": 0.0,
"reward": 0.0037326388992369175,
"reward_std": 0.010557496920228004,
"rewards/DCR_reward": 0.0037326388992369175,
"step": 240
},
{
"completion_length": 108.1125,
"epoch": 2.5,
"grad_norm": 0.0,
"kl": 0.0536102294921875,
"learning_rate": 9.17e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/DCR_reward": 0.0,
"step": 250
},
{
"epoch": 2.5,
"eval_completion_length": 109.77125,
"eval_kl": 0.026533355712890627,
"eval_loss": 1.797125004365796e-09,
"eval_reward": 0.014695163480864722,
"eval_reward_std": 0.03626910645980388,
"eval_rewards/DCR_reward": 0.014695163480864722,
"eval_runtime": 851.0362,
"eval_samples_per_second": 0.118,
"eval_steps_per_second": 0.015,
"step": 250
},
{
"completion_length": 112.5,
"epoch": 2.6,
"grad_norm": 0.0,
"kl": 0.0502593994140625,
"learning_rate": 9.136666666666666e-07,
"loss": -0.0,
"reward": 6.693824325338937e-05,
"reward_std": 0.00018932993698399513,
"rewards/DCR_reward": 6.693824325338937e-05,
"step": 260
},
{
"completion_length": 102.9875,
"epoch": 2.7,
"grad_norm": 20.875,
"kl": 0.0537139892578125,
"learning_rate": 9.103333333333333e-07,
"loss": 0.0,
"reward": 0.005209430219838396,
"reward_std": 0.014680334192235023,
"rewards/DCR_reward": 0.005209430219838396,
"step": 270
},
{
"completion_length": 104.2875,
"epoch": 2.8,
"grad_norm": 0.0,
"kl": 0.0519927978515625,
"learning_rate": 9.07e-07,
"loss": 0.0,
"reward": 3.591954009607434e-05,
"reward_std": 0.00010159580269828439,
"rewards/DCR_reward": 3.591954009607434e-05,
"step": 280
},
{
"completion_length": 132.2,
"epoch": 2.9,
"grad_norm": 0.0,
"kl": 0.0234283447265625,
"learning_rate": 9.036666666666666e-07,
"loss": 0.0,
"reward": 0.01878409832715988,
"reward_std": 0.05312945321202278,
"rewards/DCR_reward": 0.01878409832715988,
"step": 290
},
{
"completion_length": 97.65,
"epoch": 3.0,
"grad_norm": 16.25,
"kl": 0.01286468505859375,
"learning_rate": 9.003333333333333e-07,
"loss": -0.0,
"reward": 0.023834712347888854,
"reward_std": 0.033087541203713045,
"rewards/DCR_reward": 0.023834712347888854,
"step": 300
},
{
"completion_length": 113.275,
"epoch": 3.1,
"grad_norm": 15.75,
"kl": 0.085699462890625,
"learning_rate": 8.969999999999999e-07,
"loss": -0.0,
"reward": 0.054491185687948016,
"reward_std": 0.0751757369551342,
"rewards/DCR_reward": 0.054491185687948016,
"step": 310
},
{
"completion_length": 96.375,
"epoch": 3.2,
"grad_norm": 0.0,
"kl": 0.051617431640625,
"learning_rate": 8.936666666666667e-07,
"loss": -0.0,
"reward": 0.02024147715419531,
"reward_std": 0.05725154206156731,
"rewards/DCR_reward": 0.02024147715419531,
"step": 320
},
{
"completion_length": 118.075,
"epoch": 3.3,
"grad_norm": 0.0,
"kl": 0.020208740234375,
"learning_rate": 8.903333333333333e-07,
"loss": -0.0,
"reward": 0.03521969501452986,
"reward_std": 0.08483822367852553,
"rewards/DCR_reward": 0.03521969501452986,
"step": 330
},
{
"completion_length": 117.7625,
"epoch": 3.4,
"grad_norm": 0.0,
"kl": 0.026739501953125,
"learning_rate": 8.869999999999999e-07,
"loss": -0.0,
"reward": 0.021311641685315408,
"reward_std": 0.03974629848089535,
"rewards/DCR_reward": 0.021311641685315408,
"step": 340
},
{
"completion_length": 126.9375,
"epoch": 3.5,
"grad_norm": 0.0,
"kl": 0.041534423828125,
"learning_rate": 8.836666666666667e-07,
"loss": 0.0,
"reward": 0.018433896452188493,
"reward_std": 0.04472574144601822,
"rewards/DCR_reward": 0.018433896452188493,
"step": 350
},
{
"completion_length": 123.275,
"epoch": 3.6,
"grad_norm": 0.0,
"kl": 0.029638671875,
"learning_rate": 8.803333333333333e-07,
"loss": 0.0,
"reward": 0.008903130898397648,
"reward_std": 0.023928308754693716,
"rewards/DCR_reward": 0.008903130898397648,
"step": 360
},
{
"completion_length": 141.2875,
"epoch": 3.7,
"grad_norm": 0.0,
"kl": 0.021136474609375,
"learning_rate": 8.769999999999999e-07,
"loss": -0.0,
"reward": 0.0002970047564303968,
"reward_std": 0.000840056300512515,
"rewards/DCR_reward": 0.0002970047564303968,
"step": 370
},
{
"completion_length": 143.6125,
"epoch": 3.8,
"grad_norm": 28.625,
"kl": 0.071319580078125,
"learning_rate": 8.736666666666667e-07,
"loss": 0.0,
"reward": 0.025264605600386857,
"reward_std": 0.062286792299710216,
"rewards/DCR_reward": 0.025264605600386857,
"step": 380
},
{
"completion_length": 123.4125,
"epoch": 3.9,
"grad_norm": 0.0,
"kl": 0.0508056640625,
"learning_rate": 8.703333333333333e-07,
"loss": 0.0,
"reward": 0.011455658166960347,
"reward_std": 0.032401493715588,
"rewards/DCR_reward": 0.011455658166960347,
"step": 390
},
{
"completion_length": 99.8875,
"epoch": 4.0,
"grad_norm": 64.5,
"kl": 0.0475677490234375,
"learning_rate": 8.669999999999999e-07,
"loss": -0.0,
"reward": 0.020158628193894402,
"reward_std": 0.04393248584237881,
"rewards/DCR_reward": 0.020158628193894402,
"step": 400
},
{
"completion_length": 122.4,
"epoch": 4.1,
"grad_norm": 0.0,
"kl": 0.0520751953125,
"learning_rate": 8.636666666666667e-07,
"loss": 0.0,
"reward": 0.027829134710191283,
"reward_std": 0.07858068596397061,
"rewards/DCR_reward": 0.027829134710191283,
"step": 410
},
{
"completion_length": 161.3375,
"epoch": 4.2,
"grad_norm": 63.75,
"kl": 0.0515869140625,
"learning_rate": 8.603333333333332e-07,
"loss": 0.0,
"reward": 0.021562288980931044,
"reward_std": 0.06093565103947185,
"rewards/DCR_reward": 0.021562288980931044,
"step": 420
},
{
"completion_length": 132.5625,
"epoch": 4.3,
"grad_norm": 0.0,
"kl": 0.06763916015625,
"learning_rate": 8.569999999999999e-07,
"loss": 0.0,
"reward": 0.01166691112157423,
"reward_std": 0.0329432392900344,
"rewards/DCR_reward": 0.01166691112157423,
"step": 430
},
{
"completion_length": 116.35,
"epoch": 4.4,
"grad_norm": 0.0,
"kl": 0.05010986328125,
"learning_rate": 8.536666666666667e-07,
"loss": -0.0,
"reward": 0.001521215244429186,
"reward_std": 0.004162183034350164,
"rewards/DCR_reward": 0.001521215244429186,
"step": 440
},
{
"completion_length": 151.25,
"epoch": 4.5,
"grad_norm": 0.0,
"kl": 0.087615966796875,
"learning_rate": 8.503333333333333e-07,
"loss": -0.0,
"reward": 0.017801515758037567,
"reward_std": 0.05035028904676438,
"rewards/DCR_reward": 0.017801515758037567,
"step": 450
},
{
"completion_length": 104.8125,
"epoch": 4.6,
"grad_norm": 16.875,
"kl": 0.05291748046875,
"learning_rate": 8.469999999999999e-07,
"loss": -0.0,
"reward": 0.007641428161878139,
"reward_std": 0.021491285803494974,
"rewards/DCR_reward": 0.007641428161878139,
"step": 460
},
{
"completion_length": 126.9,
"epoch": 4.7,
"grad_norm": 19.75,
"kl": 0.10247802734375,
"learning_rate": 8.436666666666667e-07,
"loss": 0.0,
"reward": 0.011310203482571524,
"reward_std": 0.03199008805677295,
"rewards/DCR_reward": 0.011310203482571524,
"step": 470
},
{
"completion_length": 141.6625,
"epoch": 4.8,
"grad_norm": 0.0,
"kl": 0.0375762939453125,
"learning_rate": 8.403333333333333e-07,
"loss": -0.0,
"reward": 0.0044936020654859025,
"reward_std": 0.012646565132308751,
"rewards/DCR_reward": 0.0044936020654859025,
"step": 480
},
{
"completion_length": 121.0375,
"epoch": 4.9,
"grad_norm": 0.0,
"kl": 0.0927490234375,
"learning_rate": 8.369999999999999e-07,
"loss": 0.0,
"reward": 0.0428094768547453,
"reward_std": 0.096446827147156,
"rewards/DCR_reward": 0.0428094768547453,
"step": 490
},
{
"completion_length": 159.7625,
"epoch": 5.0,
"grad_norm": 21.5,
"kl": 0.05220947265625,
"learning_rate": 8.336666666666667e-07,
"loss": -0.0,
"reward": 0.028198828249878717,
"reward_std": 0.05388287528476212,
"rewards/DCR_reward": 0.028198828249878717,
"step": 500
},
{
"epoch": 5.0,
"eval_completion_length": 119.94125,
"eval_kl": 0.0939080810546875,
"eval_loss": -1.4601639897193763e-09,
"eval_reward": 0.02450084381052875,
"eval_reward_std": 0.053787164441891945,
"eval_rewards/DCR_reward": 0.02450084381052875,
"eval_runtime": 1081.7831,
"eval_samples_per_second": 0.092,
"eval_steps_per_second": 0.012,
"step": 500
},
{
"completion_length": 114.225,
"epoch": 5.1,
"grad_norm": 14.875,
"kl": 0.0663818359375,
"learning_rate": 8.303333333333333e-07,
"loss": 0.0,
"reward": 0.04443554246754502,
"reward_std": 0.07462939244578592,
"rewards/DCR_reward": 0.04443554246754502,
"step": 510
},
{
"completion_length": 111.1875,
"epoch": 5.2,
"grad_norm": 0.0,
"kl": 0.0774383544921875,
"learning_rate": 8.269999999999999e-07,
"loss": 0.0,
"reward": 0.0301513435493689,
"reward_std": 0.06074108343455009,
"rewards/DCR_reward": 0.0301513435493689,
"step": 520
},
{
"completion_length": 118.0,
"epoch": 5.3,
"grad_norm": 0.0,
"kl": 0.07442626953125,
"learning_rate": 8.236666666666666e-07,
"loss": -0.0,
"reward": 0.022635633018944647,
"reward_std": 0.060236827132757756,
"rewards/DCR_reward": 0.022635633018944647,
"step": 530
},
{
"completion_length": 146.4625,
"epoch": 5.4,
"grad_norm": 25.375,
"kl": 0.0943115234375,
"learning_rate": 8.203333333333333e-07,
"loss": -0.0,
"reward": 0.027474326699302765,
"reward_std": 0.04014584248652682,
"rewards/DCR_reward": 0.027474326699302765,
"step": 540
},
{
"completion_length": 135.9125,
"epoch": 5.5,
"grad_norm": 0.0,
"kl": 0.10826416015625,
"learning_rate": 8.169999999999999e-07,
"loss": -0.0,
"reward": 0.03092897320893826,
"reward_std": 0.06343855138984508,
"rewards/DCR_reward": 0.03092897320893826,
"step": 550
},
{
"completion_length": 113.6125,
"epoch": 5.6,
"grad_norm": 60.75,
"kl": 0.07705078125,
"learning_rate": 8.136666666666666e-07,
"loss": -0.0,
"reward": 0.028095364570617676,
"reward_std": 0.05137596220884007,
"rewards/DCR_reward": 0.028095364570617676,
"step": 560
},
{
"completion_length": 120.625,
"epoch": 5.7,
"grad_norm": 0.0,
"kl": 0.212255859375,
"learning_rate": 8.103333333333333e-07,
"loss": -0.0,
"reward": 0.0055281239823671054,
"reward_std": 0.015635895385639743,
"rewards/DCR_reward": 0.0055281239823671054,
"step": 570
},
{
"completion_length": 112.575,
"epoch": 5.8,
"grad_norm": 0.0,
"kl": 0.0977294921875,
"learning_rate": 8.070000000000001e-07,
"loss": -0.0,
"reward": 0.027797411862411536,
"reward_std": 0.05734402615926228,
"rewards/DCR_reward": 0.027797411862411536,
"step": 580
},
{
"completion_length": 146.475,
"epoch": 5.9,
"grad_norm": 101.5,
"kl": 0.1387939453125,
"learning_rate": 8.036666666666666e-07,
"loss": -0.0,
"reward": 0.0692019445807091,
"reward_std": 0.09863622895500157,
"rewards/DCR_reward": 0.0692019445807091,
"step": 590
},
{
"completion_length": 135.4875,
"epoch": 6.0,
"grad_norm": 0.0,
"kl": 0.07392578125,
"learning_rate": 8.003333333333333e-07,
"loss": 0.0,
"reward": 0.042109476366749735,
"reward_std": 0.07917119265184738,
"rewards/DCR_reward": 0.042109476366749735,
"step": 600
},
{
"completion_length": 151.15,
"epoch": 6.1,
"grad_norm": 66.0,
"kl": 0.112451171875,
"learning_rate": 7.970000000000001e-07,
"loss": 0.0,
"reward": 0.022725265215558465,
"reward_std": 0.03912785020947922,
"rewards/DCR_reward": 0.022725265215558465,
"step": 610
},
{
"completion_length": 114.7375,
"epoch": 6.2,
"grad_norm": 41.0,
"kl": 0.0866943359375,
"learning_rate": 7.936666666666666e-07,
"loss": -0.0,
"reward": 0.06757222047017422,
"reward_std": 0.10912240504694637,
"rewards/DCR_reward": 0.06757222047017422,
"step": 620
},
{
"completion_length": 175.425,
"epoch": 6.3,
"grad_norm": 18.0,
"kl": 0.0874755859375,
"learning_rate": 7.903333333333333e-07,
"loss": 0.0,
"reward": 0.017062357207760215,
"reward_std": 0.03701434804825112,
"rewards/DCR_reward": 0.017062357207760215,
"step": 630
},
{
"completion_length": 101.1,
"epoch": 6.4,
"grad_norm": 32.5,
"kl": 0.1383544921875,
"learning_rate": 7.87e-07,
"loss": 0.0,
"reward": 0.031146167902625164,
"reward_std": 0.06654989629168995,
"rewards/DCR_reward": 0.031146167902625164,
"step": 640
},
{
"completion_length": 147.6875,
"epoch": 6.5,
"grad_norm": 22.375,
"kl": 0.1053955078125,
"learning_rate": 7.836666666666666e-07,
"loss": 0.0,
"reward": 0.03336541847675108,
"reward_std": 0.0738158110238146,
"rewards/DCR_reward": 0.03336541847675108,
"step": 650
},
{
"completion_length": 167.375,
"epoch": 6.6,
"grad_norm": 37.75,
"kl": 0.09075927734375,
"learning_rate": 7.803333333333333e-07,
"loss": 0.0,
"reward": 0.054513926352956335,
"reward_std": 0.11440533803834115,
"rewards/DCR_reward": 0.054513926352956335,
"step": 660
},
{
"completion_length": 113.5,
"epoch": 6.7,
"grad_norm": 24.375,
"kl": 0.13349609375,
"learning_rate": 7.77e-07,
"loss": 0.0,
"reward": 0.022631679168262052,
"reward_std": 0.04243781621917151,
"rewards/DCR_reward": 0.022631679168262052,
"step": 670
},
{
"completion_length": 128.1875,
"epoch": 6.8,
"grad_norm": 10.8125,
"kl": 0.098681640625,
"learning_rate": 7.736666666666666e-07,
"loss": -0.0,
"reward": 0.011361529098212485,
"reward_std": 0.03208850735099986,
"rewards/DCR_reward": 0.011361529098212485,
"step": 680
},
{
"completion_length": 122.525,
"epoch": 6.9,
"grad_norm": 43.75,
"kl": 0.1169921875,
"learning_rate": 7.703333333333333e-07,
"loss": -0.0,
"reward": 0.017346063705190318,
"reward_std": 0.04881023944763001,
"rewards/DCR_reward": 0.017346063705190318,
"step": 690
},
{
"completion_length": 150.05,
"epoch": 7.0,
"grad_norm": 22.875,
"kl": 0.112109375,
"learning_rate": 7.67e-07,
"loss": 0.0,
"reward": 0.04065933156089159,
"reward_std": 0.10374580940115266,
"rewards/DCR_reward": 0.04065933156089159,
"step": 700
},
{
"completion_length": 116.2125,
"epoch": 7.1,
"grad_norm": 132.0,
"kl": 0.1211669921875,
"learning_rate": 7.636666666666667e-07,
"loss": -0.0,
"reward": 0.06279535398061853,
"reward_std": 0.11337692766683176,
"rewards/DCR_reward": 0.06279535398061853,
"step": 710
},
{
"completion_length": 159.625,
"epoch": 7.2,
"grad_norm": 0.0,
"kl": 0.1438232421875,
"learning_rate": 7.603333333333332e-07,
"loss": -0.0,
"reward": 0.020437985911848956,
"reward_std": 0.048951211775420236,
"rewards/DCR_reward": 0.020437985911848956,
"step": 720
},
{
"completion_length": 165.2625,
"epoch": 7.3,
"grad_norm": 0.0,
"kl": 0.209228515625,
"learning_rate": 7.57e-07,
"loss": -0.0,
"reward": 0.02226241144235246,
"reward_std": 0.05247671899269335,
"rewards/DCR_reward": 0.02226241144235246,
"step": 730
},
{
"completion_length": 115.8,
"epoch": 7.4,
"grad_norm": 32.5,
"kl": 0.1295166015625,
"learning_rate": 7.536666666666667e-07,
"loss": 0.0,
"reward": 0.014459636958054033,
"reward_std": 0.04082847375248093,
"rewards/DCR_reward": 0.014459636958054033,
"step": 740
},
{
"completion_length": 158.675,
"epoch": 7.5,
"grad_norm": 0.0,
"kl": 0.190478515625,
"learning_rate": 7.503333333333332e-07,
"loss": -0.0,
"reward": 0.010101895526895532,
"reward_std": 0.02844048692204524,
"rewards/DCR_reward": 0.010101895526895532,
"step": 750
},
{
"epoch": 7.5,
"eval_completion_length": 154.69125,
"eval_kl": 0.22284423828125,
"eval_loss": 3.744010435013934e-09,
"eval_reward": 0.04045595111856528,
"eval_reward_std": 0.0814886652509449,
"eval_rewards/DCR_reward": 0.04045595111856528,
"eval_runtime": 1402.6056,
"eval_samples_per_second": 0.071,
"eval_steps_per_second": 0.009,
"step": 750
},
{
"completion_length": 166.85,
"epoch": 7.6,
"grad_norm": 20.75,
"kl": 0.117431640625,
"learning_rate": 7.47e-07,
"loss": 0.0,
"reward": 0.005159654482849873,
"reward_std": 0.01448775691096671,
"rewards/DCR_reward": 0.005159654482849873,
"step": 760
},
{
"completion_length": 116.125,
"epoch": 7.7,
"grad_norm": 0.0,
"kl": 0.1875244140625,
"learning_rate": 7.436666666666667e-07,
"loss": -0.0,
"reward": 0.028679777635261416,
"reward_std": 0.059860612216289154,
"rewards/DCR_reward": 0.028679777635261416,
"step": 770
},
{
"completion_length": 139.5,
"epoch": 7.8,
"grad_norm": 30.125,
"kl": 0.116748046875,
"learning_rate": 7.403333333333332e-07,
"loss": -0.0,
"reward": 0.05339058640311123,
"reward_std": 0.09481030252063646,
"rewards/DCR_reward": 0.05339058640311123,
"step": 780
},
{
"completion_length": 145.7,
"epoch": 7.9,
"grad_norm": 26.875,
"kl": 0.118505859375,
"learning_rate": 7.37e-07,
"loss": -0.0,
"reward": 0.07039015240879962,
"reward_std": 0.11058784131892026,
"rewards/DCR_reward": 0.07039015240879962,
"step": 790
},
{
"completion_length": 133.6875,
"epoch": 8.0,
"grad_norm": 62.75,
"kl": 0.1423095703125,
"learning_rate": 7.336666666666667e-07,
"loss": -0.0,
"reward": 0.04605545370723121,
"reward_std": 0.07735684312647209,
"rewards/DCR_reward": 0.04605545370723121,
"step": 800
},
{
"completion_length": 149.9625,
"epoch": 8.1,
"grad_norm": 0.0,
"kl": 0.10341796875,
"learning_rate": 7.303333333333332e-07,
"loss": -0.0,
"reward": 0.04631754137226381,
"reward_std": 0.0893157648271881,
"rewards/DCR_reward": 0.04631754137226381,
"step": 810
},
{
"completion_length": 128.975,
"epoch": 8.2,
"grad_norm": 0.0,
"kl": 0.152685546875,
"learning_rate": 7.27e-07,
"loss": 0.0,
"reward": 0.02897152792502311,
"reward_std": 0.058125091606052594,
"rewards/DCR_reward": 0.02897152792502311,
"step": 820
},
{
"completion_length": 200.3,
"epoch": 8.3,
"grad_norm": 0.0,
"kl": 0.18502197265625,
"learning_rate": 7.236666666666666e-07,
"loss": 0.0,
"reward": 0.017227291383460398,
"reward_std": 0.03782382531207986,
"rewards/DCR_reward": 0.017227291383460398,
"step": 830
},
{
"completion_length": 153.6,
"epoch": 8.4,
"grad_norm": 44.0,
"kl": 0.122509765625,
"learning_rate": 7.203333333333333e-07,
"loss": -0.0,
"reward": 0.05631122866761871,
"reward_std": 0.09404923066613265,
"rewards/DCR_reward": 0.05631122866761871,
"step": 840
},
{
"completion_length": 118.475,
"epoch": 8.5,
"grad_norm": 37.25,
"kl": 0.15625,
"learning_rate": 7.17e-07,
"loss": 0.0,
"reward": 0.057633067340066194,
"reward_std": 0.09432423976832069,
"rewards/DCR_reward": 0.057633067340066194,
"step": 850
},
{
"completion_length": 144.9375,
"epoch": 8.6,
"grad_norm": 20.375,
"kl": 0.2320068359375,
"learning_rate": 7.136666666666666e-07,
"loss": -0.0,
"reward": 0.05644137697381666,
"reward_std": 0.09782474825915415,
"rewards/DCR_reward": 0.05644137697381666,
"step": 860
},
{
"completion_length": 122.225,
"epoch": 8.7,
"grad_norm": 17.125,
"kl": 0.144970703125,
"learning_rate": 7.103333333333333e-07,
"loss": 0.0,
"reward": 0.026275344849273095,
"reward_std": 0.06332521875447128,
"rewards/DCR_reward": 0.026275344849273095,
"step": 870
},
{
"completion_length": 162.525,
"epoch": 8.8,
"grad_norm": 0.0,
"kl": 0.161083984375,
"learning_rate": 7.07e-07,
"loss": 0.0,
"reward": 0.07339370545232668,
"reward_std": 0.13417805570643396,
"rewards/DCR_reward": 0.07339370545232668,
"step": 880
},
{
"completion_length": 140.4875,
"epoch": 8.9,
"grad_norm": 0.0,
"kl": 0.202880859375,
"learning_rate": 7.036666666666666e-07,
"loss": 0.0,
"reward": 0.03404433502000757,
"reward_std": 0.06663689499837347,
"rewards/DCR_reward": 0.03404433502000757,
"step": 890
},
{
"completion_length": 166.25,
"epoch": 9.0,
"grad_norm": 50.75,
"kl": 0.18447265625,
"learning_rate": 7.003333333333333e-07,
"loss": 0.0,
"reward": 0.07551841043459717,
"reward_std": 0.10837250614131336,
"rewards/DCR_reward": 0.07551841043459717,
"step": 900
},
{
"completion_length": 171.9,
"epoch": 9.1,
"grad_norm": 70.5,
"kl": 0.1385986328125,
"learning_rate": 6.97e-07,
"loss": 0.0,
"reward": 0.06921597410691901,
"reward_std": 0.09217902371892706,
"rewards/DCR_reward": 0.06921597410691901,
"step": 910
},
{
"completion_length": 134.325,
"epoch": 9.2,
"grad_norm": 0.0,
"kl": 0.2784912109375,
"learning_rate": 6.936666666666666e-07,
"loss": 0.0,
"reward": 0.022432816278160315,
"reward_std": 0.04160968857759144,
"rewards/DCR_reward": 0.022432816278160315,
"step": 920
},
{
"completion_length": 149.1875,
"epoch": 9.3,
"grad_norm": 0.0,
"kl": 0.164208984375,
"learning_rate": 6.903333333333333e-07,
"loss": -0.0,
"reward": 0.04467101704794914,
"reward_std": 0.09531959185260348,
"rewards/DCR_reward": 0.04467101704794914,
"step": 930
},
{
"completion_length": 147.125,
"epoch": 9.4,
"grad_norm": 13.5625,
"kl": 0.1576171875,
"learning_rate": 6.87e-07,
"loss": -0.0,
"reward": 0.053139488815213555,
"reward_std": 0.11639446567569393,
"rewards/DCR_reward": 0.053139488815213555,
"step": 940
},
{
"completion_length": 171.2875,
"epoch": 9.5,
"grad_norm": 0.0,
"kl": 0.1521484375,
"learning_rate": 6.836666666666666e-07,
"loss": 0.0,
"reward": 0.014011116385518108,
"reward_std": 0.01976964412315283,
"rewards/DCR_reward": 0.014011116385518108,
"step": 950
},
{
"completion_length": 130.7625,
"epoch": 9.6,
"grad_norm": 48.5,
"kl": 0.184765625,
"learning_rate": 6.803333333333333e-07,
"loss": -0.0,
"reward": 0.05880497039906914,
"reward_std": 0.08855972705059685,
"rewards/DCR_reward": 0.05880497039906914,
"step": 960
},
{
"completion_length": 139.825,
"epoch": 9.7,
"grad_norm": 25.0,
"kl": 0.162353515625,
"learning_rate": 6.77e-07,
"loss": -0.0,
"reward": 0.07512311937389313,
"reward_std": 0.12237471891276072,
"rewards/DCR_reward": 0.07512311937389313,
"step": 970
},
{
"completion_length": 151.1125,
"epoch": 9.8,
"grad_norm": 25.75,
"kl": 0.1543701171875,
"learning_rate": 6.736666666666666e-07,
"loss": -0.0,
"reward": 0.020507018158969003,
"reward_std": 0.03885748497850727,
"rewards/DCR_reward": 0.020507018158969003,
"step": 980
},
{
"completion_length": 119.05,
"epoch": 9.9,
"grad_norm": 68.0,
"kl": 0.168115234375,
"learning_rate": 6.703333333333333e-07,
"loss": 0.0,
"reward": 0.024185732538899173,
"reward_std": 0.05010633007332217,
"rewards/DCR_reward": 0.024185732538899173,
"step": 990
},
{
"completion_length": 128.65,
"epoch": 10.0,
"grad_norm": 0.0,
"kl": 0.177001953125,
"learning_rate": 6.67e-07,
"loss": -0.0,
"reward": 0.059286916424025546,
"reward_std": 0.09370468626730144,
"rewards/DCR_reward": 0.059286916424025546,
"step": 1000
},
{
"epoch": 10.0,
"eval_completion_length": 154.835,
"eval_kl": 0.180048828125,
"eval_loss": 3.1449687298845674e-09,
"eval_reward": 0.0444011578221398,
"eval_reward_std": 0.07596371626394102,
"eval_rewards/DCR_reward": 0.0444011578221398,
"eval_runtime": 1449.1033,
"eval_samples_per_second": 0.069,
"eval_steps_per_second": 0.009,
"step": 1000
}
],
"logging_steps": 10,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}