{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 250, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 110.15, "epoch": 0.1, "grad_norm": 0.0, "kl": 0.001616668701171875, "learning_rate": 9.97e-07, "loss": -0.0, "reward": 0.0008232486434280872, "reward_std": 0.0021940818056464194, "rewards/DCR_reward": 0.0008232486434280872, "step": 10 }, { "completion_length": 98.4125, "epoch": 0.2, "grad_norm": 0.0, "kl": 0.0034271240234375, "learning_rate": 9.936666666666667e-07, "loss": 0.0, "reward": 0.00022143989917822182, "reward_std": 0.000626326643396169, "rewards/DCR_reward": 0.00022143989917822182, "step": 20 }, { "completion_length": 84.2, "epoch": 0.3, "grad_norm": 0.0, "kl": 0.003081512451171875, "learning_rate": 9.903333333333333e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/DCR_reward": 0.0, "step": 30 }, { "completion_length": 105.125, "epoch": 0.4, "grad_norm": 0.0, "kl": 0.004283905029296875, "learning_rate": 9.87e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/DCR_reward": 0.0, "step": 40 }, { "completion_length": 81.4, "epoch": 0.5, "grad_norm": 26.5, "kl": 0.0041351318359375, "learning_rate": 9.836666666666666e-07, "loss": -0.0, "reward": 0.018291093641892076, "reward_std": 0.05173502548132092, "rewards/DCR_reward": 0.018291093641892076, "step": 50 }, { "completion_length": 104.425, "epoch": 0.6, "grad_norm": 0.0, "kl": 0.006622314453125, "learning_rate": 9.803333333333332e-07, "loss": -0.0, "reward": 0.009192863292992116, "reward_std": 0.025964342057704926, "rewards/DCR_reward": 0.009192863292992116, "step": 60 }, { "completion_length": 109.35, "epoch": 0.7, "grad_norm": 39.25, "kl": 0.003626251220703125, "learning_rate": 9.77e-07, "loss": 0.0, "reward": 0.013767135608941317, "reward_std": 0.03893933929502964, "rewards/DCR_reward": 0.013767135608941317, "step": 70 }, { "completion_length": 89.1, "epoch": 0.8, "grad_norm": 0.0, "kl": 0.013692855834960938, "learning_rate": 9.736666666666667e-07, "loss": 0.0, "reward": 0.011479373268957715, "reward_std": 0.032468570384662596, "rewards/DCR_reward": 0.011479373268957715, "step": 80 }, { "completion_length": 97.575, "epoch": 0.9, "grad_norm": 0.0, "kl": 0.0066986083984375, "learning_rate": 9.703333333333332e-07, "loss": -0.0, "reward": 0.009072506427764892, "reward_std": 0.025660922378301622, "rewards/DCR_reward": 0.009072506427764892, "step": 90 }, { "completion_length": 114.3625, "epoch": 1.0, "grad_norm": 0.0, "kl": 0.00914459228515625, "learning_rate": 9.67e-07, "loss": 0.0, "reward": 0.011166714504361153, "reward_std": 0.03158423751592636, "rewards/DCR_reward": 0.011166714504361153, "step": 100 }, { "completion_length": 88.5375, "epoch": 1.1, "grad_norm": 0.0, "kl": 0.011919403076171875, "learning_rate": 9.636666666666666e-07, "loss": 0.0, "reward": 2.8343783924356103e-05, "reward_std": 8.016832871362566e-05, "rewards/DCR_reward": 2.8343783924356103e-05, "step": 110 }, { "completion_length": 103.875, "epoch": 1.2, "grad_norm": 0.0, "kl": 0.010530471801757812, "learning_rate": 9.603333333333333e-07, "loss": 0.0, "reward": 0.005421069198928308, "reward_std": 0.015333099680719896, "rewards/DCR_reward": 0.005421069198928308, "step": 120 }, { "completion_length": 121.75, "epoch": 1.3, "grad_norm": 0.0, "kl": 0.00739288330078125, "learning_rate": 9.57e-07, "loss": -0.0, "reward": 3.4938057069666684e-05, "reward_std": 9.881975129246711e-05, "rewards/DCR_reward": 3.4938057069666684e-05, "step": 130 }, { "completion_length": 98.975, "epoch": 1.4, "grad_norm": 0.0, "kl": 0.009772491455078126, "learning_rate": 9.536666666666667e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/DCR_reward": 0.0, "step": 140 }, { "completion_length": 112.7875, "epoch": 1.5, "grad_norm": 0.0, "kl": 0.01072845458984375, "learning_rate": 9.503333333333333e-07, "loss": 0.0, "reward": 4.633456701412797e-05, "reward_std": 8.579618879593909e-05, "rewards/DCR_reward": 4.633456701412797e-05, "step": 150 }, { "completion_length": 116.125, "epoch": 1.6, "grad_norm": 16.25, "kl": 0.0097686767578125, "learning_rate": 9.469999999999999e-07, "loss": -0.0, "reward": 0.011431791516952217, "reward_std": 0.03233398855663836, "rewards/DCR_reward": 0.011431791516952217, "step": 160 }, { "completion_length": 100.25, "epoch": 1.7, "grad_norm": 25.875, "kl": 0.0120208740234375, "learning_rate": 9.436666666666667e-07, "loss": 0.0, "reward": 0.006299582323117647, "reward_std": 0.017817909209406936, "rewards/DCR_reward": 0.006299582323117647, "step": 170 }, { "completion_length": 106.425, "epoch": 1.8, "grad_norm": 0.0, "kl": 0.0339019775390625, "learning_rate": 9.403333333333333e-07, "loss": 0.0, "reward": 0.014636733755469322, "reward_std": 0.04139893501996994, "rewards/DCR_reward": 0.014636733755469322, "step": 180 }, { "completion_length": 82.725, "epoch": 1.9, "grad_norm": 33.75, "kl": 0.008979415893554688, "learning_rate": 9.37e-07, "loss": 0.0, "reward": 0.005478436907287687, "reward_std": 0.015495359338819981, "rewards/DCR_reward": 0.005478436907287687, "step": 190 }, { "completion_length": 71.5875, "epoch": 2.0, "grad_norm": 0.0, "kl": 0.01548919677734375, "learning_rate": 9.336666666666666e-07, "loss": -0.0, "reward": 0.025108913704752923, "reward_std": 0.046553592692362145, "rewards/DCR_reward": 0.025108913704752923, "step": 200 }, { "completion_length": 128.8125, "epoch": 2.1, "grad_norm": 10.25, "kl": 0.016112518310546876, "learning_rate": 9.303333333333333e-07, "loss": 0.0, "reward": 0.006037246529012918, "reward_std": 0.017075913585722448, "rewards/DCR_reward": 0.006037246529012918, "step": 210 }, { "completion_length": 91.65, "epoch": 2.2, "grad_norm": 0.0, "kl": 0.0196868896484375, "learning_rate": 9.27e-07, "loss": 0.0, "reward": 0.018143273887835674, "reward_std": 0.05131692748691421, "rewards/DCR_reward": 0.018143273887835674, "step": 220 }, { "completion_length": 102.8, "epoch": 2.3, "grad_norm": 23.5, "kl": 0.02627716064453125, "learning_rate": 9.236666666666666e-07, "loss": 0.0, "reward": 0.006400418069824809, "reward_std": 0.018103116168640555, "rewards/DCR_reward": 0.006400418069824809, "step": 230 }, { "completion_length": 94.325, "epoch": 2.4, "grad_norm": 0.0, "kl": 0.01078948974609375, "learning_rate": 9.203333333333333e-07, "loss": 0.0, "reward": 0.0037326388992369175, "reward_std": 0.010557496920228004, "rewards/DCR_reward": 0.0037326388992369175, "step": 240 }, { "completion_length": 108.1125, "epoch": 2.5, "grad_norm": 0.0, "kl": 0.0536102294921875, "learning_rate": 9.17e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/DCR_reward": 0.0, "step": 250 }, { "epoch": 2.5, "eval_completion_length": 109.77125, "eval_kl": 0.026533355712890627, "eval_loss": 1.797125004365796e-09, "eval_reward": 0.014695163480864722, "eval_reward_std": 0.03626910645980388, "eval_rewards/DCR_reward": 0.014695163480864722, "eval_runtime": 851.0362, "eval_samples_per_second": 0.118, "eval_steps_per_second": 0.015, "step": 250 }, { "completion_length": 112.5, "epoch": 2.6, "grad_norm": 0.0, "kl": 0.0502593994140625, "learning_rate": 9.136666666666666e-07, "loss": -0.0, "reward": 6.693824325338937e-05, "reward_std": 0.00018932993698399513, "rewards/DCR_reward": 6.693824325338937e-05, "step": 260 }, { "completion_length": 102.9875, "epoch": 2.7, "grad_norm": 20.875, "kl": 0.0537139892578125, "learning_rate": 9.103333333333333e-07, "loss": 0.0, "reward": 0.005209430219838396, "reward_std": 0.014680334192235023, "rewards/DCR_reward": 0.005209430219838396, "step": 270 }, { "completion_length": 104.2875, "epoch": 2.8, "grad_norm": 0.0, "kl": 0.0519927978515625, "learning_rate": 9.07e-07, "loss": 0.0, "reward": 3.591954009607434e-05, "reward_std": 0.00010159580269828439, "rewards/DCR_reward": 3.591954009607434e-05, "step": 280 }, { "completion_length": 132.2, "epoch": 2.9, "grad_norm": 0.0, "kl": 0.0234283447265625, "learning_rate": 9.036666666666666e-07, "loss": 0.0, "reward": 0.01878409832715988, "reward_std": 0.05312945321202278, "rewards/DCR_reward": 0.01878409832715988, "step": 290 }, { "completion_length": 97.65, "epoch": 3.0, "grad_norm": 16.25, "kl": 0.01286468505859375, "learning_rate": 9.003333333333333e-07, "loss": -0.0, "reward": 0.023834712347888854, "reward_std": 0.033087541203713045, "rewards/DCR_reward": 0.023834712347888854, "step": 300 }, { "completion_length": 113.275, "epoch": 3.1, "grad_norm": 15.75, "kl": 0.085699462890625, "learning_rate": 8.969999999999999e-07, "loss": -0.0, "reward": 0.054491185687948016, "reward_std": 0.0751757369551342, "rewards/DCR_reward": 0.054491185687948016, "step": 310 }, { "completion_length": 96.375, "epoch": 3.2, "grad_norm": 0.0, "kl": 0.051617431640625, "learning_rate": 8.936666666666667e-07, "loss": -0.0, "reward": 0.02024147715419531, "reward_std": 0.05725154206156731, "rewards/DCR_reward": 0.02024147715419531, "step": 320 }, { "completion_length": 118.075, "epoch": 3.3, "grad_norm": 0.0, "kl": 0.020208740234375, "learning_rate": 8.903333333333333e-07, "loss": -0.0, "reward": 0.03521969501452986, "reward_std": 0.08483822367852553, "rewards/DCR_reward": 0.03521969501452986, "step": 330 }, { "completion_length": 117.7625, "epoch": 3.4, "grad_norm": 0.0, "kl": 0.026739501953125, "learning_rate": 8.869999999999999e-07, "loss": -0.0, "reward": 0.021311641685315408, "reward_std": 0.03974629848089535, "rewards/DCR_reward": 0.021311641685315408, "step": 340 }, { "completion_length": 126.9375, "epoch": 3.5, "grad_norm": 0.0, "kl": 0.041534423828125, "learning_rate": 8.836666666666667e-07, "loss": 0.0, "reward": 0.018433896452188493, "reward_std": 0.04472574144601822, "rewards/DCR_reward": 0.018433896452188493, "step": 350 }, { "completion_length": 123.275, "epoch": 3.6, "grad_norm": 0.0, "kl": 0.029638671875, "learning_rate": 8.803333333333333e-07, "loss": 0.0, "reward": 0.008903130898397648, "reward_std": 0.023928308754693716, "rewards/DCR_reward": 0.008903130898397648, "step": 360 }, { "completion_length": 141.2875, "epoch": 3.7, "grad_norm": 0.0, "kl": 0.021136474609375, "learning_rate": 8.769999999999999e-07, "loss": -0.0, "reward": 0.0002970047564303968, "reward_std": 0.000840056300512515, "rewards/DCR_reward": 0.0002970047564303968, "step": 370 }, { "completion_length": 143.6125, "epoch": 3.8, "grad_norm": 28.625, "kl": 0.071319580078125, "learning_rate": 8.736666666666667e-07, "loss": 0.0, "reward": 0.025264605600386857, "reward_std": 0.062286792299710216, "rewards/DCR_reward": 0.025264605600386857, "step": 380 }, { "completion_length": 123.4125, "epoch": 3.9, "grad_norm": 0.0, "kl": 0.0508056640625, "learning_rate": 8.703333333333333e-07, "loss": 0.0, "reward": 0.011455658166960347, "reward_std": 0.032401493715588, "rewards/DCR_reward": 0.011455658166960347, "step": 390 }, { "completion_length": 99.8875, "epoch": 4.0, "grad_norm": 64.5, "kl": 0.0475677490234375, "learning_rate": 8.669999999999999e-07, "loss": -0.0, "reward": 0.020158628193894402, "reward_std": 0.04393248584237881, "rewards/DCR_reward": 0.020158628193894402, "step": 400 }, { "completion_length": 122.4, "epoch": 4.1, "grad_norm": 0.0, "kl": 0.0520751953125, "learning_rate": 8.636666666666667e-07, "loss": 0.0, "reward": 0.027829134710191283, "reward_std": 0.07858068596397061, "rewards/DCR_reward": 0.027829134710191283, "step": 410 }, { "completion_length": 161.3375, "epoch": 4.2, "grad_norm": 63.75, "kl": 0.0515869140625, "learning_rate": 8.603333333333332e-07, "loss": 0.0, "reward": 0.021562288980931044, "reward_std": 0.06093565103947185, "rewards/DCR_reward": 0.021562288980931044, "step": 420 }, { "completion_length": 132.5625, "epoch": 4.3, "grad_norm": 0.0, "kl": 0.06763916015625, "learning_rate": 8.569999999999999e-07, "loss": 0.0, "reward": 0.01166691112157423, "reward_std": 0.0329432392900344, "rewards/DCR_reward": 0.01166691112157423, "step": 430 }, { "completion_length": 116.35, "epoch": 4.4, "grad_norm": 0.0, "kl": 0.05010986328125, "learning_rate": 8.536666666666667e-07, "loss": -0.0, "reward": 0.001521215244429186, "reward_std": 0.004162183034350164, "rewards/DCR_reward": 0.001521215244429186, "step": 440 }, { "completion_length": 151.25, "epoch": 4.5, "grad_norm": 0.0, "kl": 0.087615966796875, "learning_rate": 8.503333333333333e-07, "loss": -0.0, "reward": 0.017801515758037567, "reward_std": 0.05035028904676438, "rewards/DCR_reward": 0.017801515758037567, "step": 450 }, { "completion_length": 104.8125, "epoch": 4.6, "grad_norm": 16.875, "kl": 0.05291748046875, "learning_rate": 8.469999999999999e-07, "loss": -0.0, "reward": 0.007641428161878139, "reward_std": 0.021491285803494974, "rewards/DCR_reward": 0.007641428161878139, "step": 460 }, { "completion_length": 126.9, "epoch": 4.7, "grad_norm": 19.75, "kl": 0.10247802734375, "learning_rate": 8.436666666666667e-07, "loss": 0.0, "reward": 0.011310203482571524, "reward_std": 0.03199008805677295, "rewards/DCR_reward": 0.011310203482571524, "step": 470 }, { "completion_length": 141.6625, "epoch": 4.8, "grad_norm": 0.0, "kl": 0.0375762939453125, "learning_rate": 8.403333333333333e-07, "loss": -0.0, "reward": 0.0044936020654859025, "reward_std": 0.012646565132308751, "rewards/DCR_reward": 0.0044936020654859025, "step": 480 }, { "completion_length": 121.0375, "epoch": 4.9, "grad_norm": 0.0, "kl": 0.0927490234375, "learning_rate": 8.369999999999999e-07, "loss": 0.0, "reward": 0.0428094768547453, "reward_std": 0.096446827147156, "rewards/DCR_reward": 0.0428094768547453, "step": 490 }, { "completion_length": 159.7625, "epoch": 5.0, "grad_norm": 21.5, "kl": 0.05220947265625, "learning_rate": 8.336666666666667e-07, "loss": -0.0, "reward": 0.028198828249878717, "reward_std": 0.05388287528476212, "rewards/DCR_reward": 0.028198828249878717, "step": 500 }, { "epoch": 5.0, "eval_completion_length": 119.94125, "eval_kl": 0.0939080810546875, "eval_loss": -1.4601639897193763e-09, "eval_reward": 0.02450084381052875, "eval_reward_std": 0.053787164441891945, "eval_rewards/DCR_reward": 0.02450084381052875, "eval_runtime": 1081.7831, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.012, "step": 500 }, { "completion_length": 114.225, "epoch": 5.1, "grad_norm": 14.875, "kl": 0.0663818359375, "learning_rate": 8.303333333333333e-07, "loss": 0.0, "reward": 0.04443554246754502, "reward_std": 0.07462939244578592, "rewards/DCR_reward": 0.04443554246754502, "step": 510 }, { "completion_length": 111.1875, "epoch": 5.2, "grad_norm": 0.0, "kl": 0.0774383544921875, "learning_rate": 8.269999999999999e-07, "loss": 0.0, "reward": 0.0301513435493689, "reward_std": 0.06074108343455009, "rewards/DCR_reward": 0.0301513435493689, "step": 520 }, { "completion_length": 118.0, "epoch": 5.3, "grad_norm": 0.0, "kl": 0.07442626953125, "learning_rate": 8.236666666666666e-07, "loss": -0.0, "reward": 0.022635633018944647, "reward_std": 0.060236827132757756, "rewards/DCR_reward": 0.022635633018944647, "step": 530 }, { "completion_length": 146.4625, "epoch": 5.4, "grad_norm": 25.375, "kl": 0.0943115234375, "learning_rate": 8.203333333333333e-07, "loss": -0.0, "reward": 0.027474326699302765, "reward_std": 0.04014584248652682, "rewards/DCR_reward": 0.027474326699302765, "step": 540 }, { "completion_length": 135.9125, "epoch": 5.5, "grad_norm": 0.0, "kl": 0.10826416015625, "learning_rate": 8.169999999999999e-07, "loss": -0.0, "reward": 0.03092897320893826, "reward_std": 0.06343855138984508, "rewards/DCR_reward": 0.03092897320893826, "step": 550 }, { "completion_length": 113.6125, "epoch": 5.6, "grad_norm": 60.75, "kl": 0.07705078125, "learning_rate": 8.136666666666666e-07, "loss": -0.0, "reward": 0.028095364570617676, "reward_std": 0.05137596220884007, "rewards/DCR_reward": 0.028095364570617676, "step": 560 }, { "completion_length": 120.625, "epoch": 5.7, "grad_norm": 0.0, "kl": 0.212255859375, "learning_rate": 8.103333333333333e-07, "loss": -0.0, "reward": 0.0055281239823671054, "reward_std": 0.015635895385639743, "rewards/DCR_reward": 0.0055281239823671054, "step": 570 }, { "completion_length": 112.575, "epoch": 5.8, "grad_norm": 0.0, "kl": 0.0977294921875, "learning_rate": 8.070000000000001e-07, "loss": -0.0, "reward": 0.027797411862411536, "reward_std": 0.05734402615926228, "rewards/DCR_reward": 0.027797411862411536, "step": 580 }, { "completion_length": 146.475, "epoch": 5.9, "grad_norm": 101.5, "kl": 0.1387939453125, "learning_rate": 8.036666666666666e-07, "loss": -0.0, "reward": 0.0692019445807091, "reward_std": 0.09863622895500157, "rewards/DCR_reward": 0.0692019445807091, "step": 590 }, { "completion_length": 135.4875, "epoch": 6.0, "grad_norm": 0.0, "kl": 0.07392578125, "learning_rate": 8.003333333333333e-07, "loss": 0.0, "reward": 0.042109476366749735, "reward_std": 0.07917119265184738, "rewards/DCR_reward": 0.042109476366749735, "step": 600 }, { "completion_length": 151.15, "epoch": 6.1, "grad_norm": 66.0, "kl": 0.112451171875, "learning_rate": 7.970000000000001e-07, "loss": 0.0, "reward": 0.022725265215558465, "reward_std": 0.03912785020947922, "rewards/DCR_reward": 0.022725265215558465, "step": 610 }, { "completion_length": 114.7375, "epoch": 6.2, "grad_norm": 41.0, "kl": 0.0866943359375, "learning_rate": 7.936666666666666e-07, "loss": -0.0, "reward": 0.06757222047017422, "reward_std": 0.10912240504694637, "rewards/DCR_reward": 0.06757222047017422, "step": 620 }, { "completion_length": 175.425, "epoch": 6.3, "grad_norm": 18.0, "kl": 0.0874755859375, "learning_rate": 7.903333333333333e-07, "loss": 0.0, "reward": 0.017062357207760215, "reward_std": 0.03701434804825112, "rewards/DCR_reward": 0.017062357207760215, "step": 630 }, { "completion_length": 101.1, "epoch": 6.4, "grad_norm": 32.5, "kl": 0.1383544921875, "learning_rate": 7.87e-07, "loss": 0.0, "reward": 0.031146167902625164, "reward_std": 0.06654989629168995, "rewards/DCR_reward": 0.031146167902625164, "step": 640 }, { "completion_length": 147.6875, "epoch": 6.5, "grad_norm": 22.375, "kl": 0.1053955078125, "learning_rate": 7.836666666666666e-07, "loss": 0.0, "reward": 0.03336541847675108, "reward_std": 0.0738158110238146, "rewards/DCR_reward": 0.03336541847675108, "step": 650 }, { "completion_length": 167.375, "epoch": 6.6, "grad_norm": 37.75, "kl": 0.09075927734375, "learning_rate": 7.803333333333333e-07, "loss": 0.0, "reward": 0.054513926352956335, "reward_std": 0.11440533803834115, "rewards/DCR_reward": 0.054513926352956335, "step": 660 }, { "completion_length": 113.5, "epoch": 6.7, "grad_norm": 24.375, "kl": 0.13349609375, "learning_rate": 7.77e-07, "loss": 0.0, "reward": 0.022631679168262052, "reward_std": 0.04243781621917151, "rewards/DCR_reward": 0.022631679168262052, "step": 670 }, { "completion_length": 128.1875, "epoch": 6.8, "grad_norm": 10.8125, "kl": 0.098681640625, "learning_rate": 7.736666666666666e-07, "loss": -0.0, "reward": 0.011361529098212485, "reward_std": 0.03208850735099986, "rewards/DCR_reward": 0.011361529098212485, "step": 680 }, { "completion_length": 122.525, "epoch": 6.9, "grad_norm": 43.75, "kl": 0.1169921875, "learning_rate": 7.703333333333333e-07, "loss": -0.0, "reward": 0.017346063705190318, "reward_std": 0.04881023944763001, "rewards/DCR_reward": 0.017346063705190318, "step": 690 }, { "completion_length": 150.05, "epoch": 7.0, "grad_norm": 22.875, "kl": 0.112109375, "learning_rate": 7.67e-07, "loss": 0.0, "reward": 0.04065933156089159, "reward_std": 0.10374580940115266, "rewards/DCR_reward": 0.04065933156089159, "step": 700 }, { "completion_length": 116.2125, "epoch": 7.1, "grad_norm": 132.0, "kl": 0.1211669921875, "learning_rate": 7.636666666666667e-07, "loss": -0.0, "reward": 0.06279535398061853, "reward_std": 0.11337692766683176, "rewards/DCR_reward": 0.06279535398061853, "step": 710 }, { "completion_length": 159.625, "epoch": 7.2, "grad_norm": 0.0, "kl": 0.1438232421875, "learning_rate": 7.603333333333332e-07, "loss": -0.0, "reward": 0.020437985911848956, "reward_std": 0.048951211775420236, "rewards/DCR_reward": 0.020437985911848956, "step": 720 }, { "completion_length": 165.2625, "epoch": 7.3, "grad_norm": 0.0, "kl": 0.209228515625, "learning_rate": 7.57e-07, "loss": -0.0, "reward": 0.02226241144235246, "reward_std": 0.05247671899269335, "rewards/DCR_reward": 0.02226241144235246, "step": 730 }, { "completion_length": 115.8, "epoch": 7.4, "grad_norm": 32.5, "kl": 0.1295166015625, "learning_rate": 7.536666666666667e-07, "loss": 0.0, "reward": 0.014459636958054033, "reward_std": 0.04082847375248093, "rewards/DCR_reward": 0.014459636958054033, "step": 740 }, { "completion_length": 158.675, "epoch": 7.5, "grad_norm": 0.0, "kl": 0.190478515625, "learning_rate": 7.503333333333332e-07, "loss": -0.0, "reward": 0.010101895526895532, "reward_std": 0.02844048692204524, "rewards/DCR_reward": 0.010101895526895532, "step": 750 }, { "epoch": 7.5, "eval_completion_length": 154.69125, "eval_kl": 0.22284423828125, "eval_loss": 3.744010435013934e-09, "eval_reward": 0.04045595111856528, "eval_reward_std": 0.0814886652509449, "eval_rewards/DCR_reward": 0.04045595111856528, "eval_runtime": 1402.6056, "eval_samples_per_second": 0.071, "eval_steps_per_second": 0.009, "step": 750 }, { "completion_length": 166.85, "epoch": 7.6, "grad_norm": 20.75, "kl": 0.117431640625, "learning_rate": 7.47e-07, "loss": 0.0, "reward": 0.005159654482849873, "reward_std": 0.01448775691096671, "rewards/DCR_reward": 0.005159654482849873, "step": 760 }, { "completion_length": 116.125, "epoch": 7.7, "grad_norm": 0.0, "kl": 0.1875244140625, "learning_rate": 7.436666666666667e-07, "loss": -0.0, "reward": 0.028679777635261416, "reward_std": 0.059860612216289154, "rewards/DCR_reward": 0.028679777635261416, "step": 770 }, { "completion_length": 139.5, "epoch": 7.8, "grad_norm": 30.125, "kl": 0.116748046875, "learning_rate": 7.403333333333332e-07, "loss": -0.0, "reward": 0.05339058640311123, "reward_std": 0.09481030252063646, "rewards/DCR_reward": 0.05339058640311123, "step": 780 }, { "completion_length": 145.7, "epoch": 7.9, "grad_norm": 26.875, "kl": 0.118505859375, "learning_rate": 7.37e-07, "loss": -0.0, "reward": 0.07039015240879962, "reward_std": 0.11058784131892026, "rewards/DCR_reward": 0.07039015240879962, "step": 790 }, { "completion_length": 133.6875, "epoch": 8.0, "grad_norm": 62.75, "kl": 0.1423095703125, "learning_rate": 7.336666666666667e-07, "loss": -0.0, "reward": 0.04605545370723121, "reward_std": 0.07735684312647209, "rewards/DCR_reward": 0.04605545370723121, "step": 800 }, { "completion_length": 149.9625, "epoch": 8.1, "grad_norm": 0.0, "kl": 0.10341796875, "learning_rate": 7.303333333333332e-07, "loss": -0.0, "reward": 0.04631754137226381, "reward_std": 0.0893157648271881, "rewards/DCR_reward": 0.04631754137226381, "step": 810 }, { "completion_length": 128.975, "epoch": 8.2, "grad_norm": 0.0, "kl": 0.152685546875, "learning_rate": 7.27e-07, "loss": 0.0, "reward": 0.02897152792502311, "reward_std": 0.058125091606052594, "rewards/DCR_reward": 0.02897152792502311, "step": 820 }, { "completion_length": 200.3, "epoch": 8.3, "grad_norm": 0.0, "kl": 0.18502197265625, "learning_rate": 7.236666666666666e-07, "loss": 0.0, "reward": 0.017227291383460398, "reward_std": 0.03782382531207986, "rewards/DCR_reward": 0.017227291383460398, "step": 830 }, { "completion_length": 153.6, "epoch": 8.4, "grad_norm": 44.0, "kl": 0.122509765625, "learning_rate": 7.203333333333333e-07, "loss": -0.0, "reward": 0.05631122866761871, "reward_std": 0.09404923066613265, "rewards/DCR_reward": 0.05631122866761871, "step": 840 }, { "completion_length": 118.475, "epoch": 8.5, "grad_norm": 37.25, "kl": 0.15625, "learning_rate": 7.17e-07, "loss": 0.0, "reward": 0.057633067340066194, "reward_std": 0.09432423976832069, "rewards/DCR_reward": 0.057633067340066194, "step": 850 }, { "completion_length": 144.9375, "epoch": 8.6, "grad_norm": 20.375, "kl": 0.2320068359375, "learning_rate": 7.136666666666666e-07, "loss": -0.0, "reward": 0.05644137697381666, "reward_std": 0.09782474825915415, "rewards/DCR_reward": 0.05644137697381666, "step": 860 }, { "completion_length": 122.225, "epoch": 8.7, "grad_norm": 17.125, "kl": 0.144970703125, "learning_rate": 7.103333333333333e-07, "loss": 0.0, "reward": 0.026275344849273095, "reward_std": 0.06332521875447128, "rewards/DCR_reward": 0.026275344849273095, "step": 870 }, { "completion_length": 162.525, "epoch": 8.8, "grad_norm": 0.0, "kl": 0.161083984375, "learning_rate": 7.07e-07, "loss": 0.0, "reward": 0.07339370545232668, "reward_std": 0.13417805570643396, "rewards/DCR_reward": 0.07339370545232668, "step": 880 }, { "completion_length": 140.4875, "epoch": 8.9, "grad_norm": 0.0, "kl": 0.202880859375, "learning_rate": 7.036666666666666e-07, "loss": 0.0, "reward": 0.03404433502000757, "reward_std": 0.06663689499837347, "rewards/DCR_reward": 0.03404433502000757, "step": 890 }, { "completion_length": 166.25, "epoch": 9.0, "grad_norm": 50.75, "kl": 0.18447265625, "learning_rate": 7.003333333333333e-07, "loss": 0.0, "reward": 0.07551841043459717, "reward_std": 0.10837250614131336, "rewards/DCR_reward": 0.07551841043459717, "step": 900 }, { "completion_length": 171.9, "epoch": 9.1, "grad_norm": 70.5, "kl": 0.1385986328125, "learning_rate": 6.97e-07, "loss": 0.0, "reward": 0.06921597410691901, "reward_std": 0.09217902371892706, "rewards/DCR_reward": 0.06921597410691901, "step": 910 }, { "completion_length": 134.325, "epoch": 9.2, "grad_norm": 0.0, "kl": 0.2784912109375, "learning_rate": 6.936666666666666e-07, "loss": 0.0, "reward": 0.022432816278160315, "reward_std": 0.04160968857759144, "rewards/DCR_reward": 0.022432816278160315, "step": 920 }, { "completion_length": 149.1875, "epoch": 9.3, "grad_norm": 0.0, "kl": 0.164208984375, "learning_rate": 6.903333333333333e-07, "loss": -0.0, "reward": 0.04467101704794914, "reward_std": 0.09531959185260348, "rewards/DCR_reward": 0.04467101704794914, "step": 930 }, { "completion_length": 147.125, "epoch": 9.4, "grad_norm": 13.5625, "kl": 0.1576171875, "learning_rate": 6.87e-07, "loss": -0.0, "reward": 0.053139488815213555, "reward_std": 0.11639446567569393, "rewards/DCR_reward": 0.053139488815213555, "step": 940 }, { "completion_length": 171.2875, "epoch": 9.5, "grad_norm": 0.0, "kl": 0.1521484375, "learning_rate": 6.836666666666666e-07, "loss": 0.0, "reward": 0.014011116385518108, "reward_std": 0.01976964412315283, "rewards/DCR_reward": 0.014011116385518108, "step": 950 }, { "completion_length": 130.7625, "epoch": 9.6, "grad_norm": 48.5, "kl": 0.184765625, "learning_rate": 6.803333333333333e-07, "loss": -0.0, "reward": 0.05880497039906914, "reward_std": 0.08855972705059685, "rewards/DCR_reward": 0.05880497039906914, "step": 960 }, { "completion_length": 139.825, "epoch": 9.7, "grad_norm": 25.0, "kl": 0.162353515625, "learning_rate": 6.77e-07, "loss": -0.0, "reward": 0.07512311937389313, "reward_std": 0.12237471891276072, "rewards/DCR_reward": 0.07512311937389313, "step": 970 }, { "completion_length": 151.1125, "epoch": 9.8, "grad_norm": 25.75, "kl": 0.1543701171875, "learning_rate": 6.736666666666666e-07, "loss": -0.0, "reward": 0.020507018158969003, "reward_std": 0.03885748497850727, "rewards/DCR_reward": 0.020507018158969003, "step": 980 }, { "completion_length": 119.05, "epoch": 9.9, "grad_norm": 68.0, "kl": 0.168115234375, "learning_rate": 6.703333333333333e-07, "loss": 0.0, "reward": 0.024185732538899173, "reward_std": 0.05010633007332217, "rewards/DCR_reward": 0.024185732538899173, "step": 990 }, { "completion_length": 128.65, "epoch": 10.0, "grad_norm": 0.0, "kl": 0.177001953125, "learning_rate": 6.67e-07, "loss": -0.0, "reward": 0.059286916424025546, "reward_std": 0.09370468626730144, "rewards/DCR_reward": 0.059286916424025546, "step": 1000 }, { "epoch": 10.0, "eval_completion_length": 154.835, "eval_kl": 0.180048828125, "eval_loss": 3.1449687298845674e-09, "eval_reward": 0.0444011578221398, "eval_reward_std": 0.07596371626394102, "eval_rewards/DCR_reward": 0.0444011578221398, "eval_runtime": 1449.1033, "eval_samples_per_second": 0.069, "eval_steps_per_second": 0.009, "step": 1000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }