|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 250, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 67.9625, |
|
"epoch": 0.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 9.97e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 67.125, |
|
"epoch": 0.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 9.936666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 58.3125, |
|
"epoch": 0.3, |
|
"grad_norm": 0.0, |
|
"kl": 1.6021728515625e-05, |
|
"learning_rate": 9.903333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.0007361111231148243, |
|
"reward_std": 0.0019508397206664085, |
|
"rewards/DCR_reward": 0.0007361111231148243, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 81.925, |
|
"epoch": 0.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.0004387378692626953, |
|
"learning_rate": 9.87e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 69.9625, |
|
"epoch": 0.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.001148056983947754, |
|
"learning_rate": 9.836666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 103.45, |
|
"epoch": 0.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.02473886013031006, |
|
"learning_rate": 9.803333333333332e-07, |
|
"loss": -0.0, |
|
"reward": 0.004910160228610039, |
|
"reward_std": 0.013888029754161835, |
|
"rewards/DCR_reward": 0.004910160228610039, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 74.0125, |
|
"epoch": 0.7, |
|
"grad_norm": 24.0, |
|
"kl": 0.000667726993560791, |
|
"learning_rate": 9.77e-07, |
|
"loss": -0.0, |
|
"reward": 0.052958965534344316, |
|
"reward_std": 0.022137212846428157, |
|
"rewards/DCR_reward": 0.052958965534344316, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 64.8625, |
|
"epoch": 0.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.000375521183013916, |
|
"learning_rate": 9.736666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 65.25, |
|
"epoch": 0.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.0009482383728027343, |
|
"learning_rate": 9.703333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 67.1875, |
|
"epoch": 1.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.0008351325988769532, |
|
"learning_rate": 9.67e-07, |
|
"loss": 0.0, |
|
"reward": 0.02388598620891571, |
|
"reward_std": 0.026214474439620973, |
|
"rewards/DCR_reward": 0.02388598620891571, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 58.2125, |
|
"epoch": 1.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.0007820606231689453, |
|
"learning_rate": 9.636666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 63.0375, |
|
"epoch": 1.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.002427983283996582, |
|
"learning_rate": 9.603333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.009990863502025604, |
|
"reward_std": 0.01850293278694153, |
|
"rewards/DCR_reward": 0.009990863502025604, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 98.425, |
|
"epoch": 1.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.0005758762359619141, |
|
"learning_rate": 9.57e-07, |
|
"loss": 0.0, |
|
"reward": 0.00625, |
|
"reward_std": 0.01767766922712326, |
|
"rewards/DCR_reward": 0.00625, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 74.8375, |
|
"epoch": 1.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.0006227374076843261, |
|
"learning_rate": 9.536666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 74.525, |
|
"epoch": 1.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.0026155948638916016, |
|
"learning_rate": 9.503333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.05994144082069397, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.05994144082069397, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 107.5625, |
|
"epoch": 1.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.0027020096778869627, |
|
"learning_rate": 9.469999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.028785842657089233, |
|
"reward_std": 0.024412646889686584, |
|
"rewards/DCR_reward": 0.028785842657089233, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 92.0, |
|
"epoch": 1.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.001540231704711914, |
|
"learning_rate": 9.436666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.005954481801018119, |
|
"reward_std": 0.01657787673175335, |
|
"rewards/DCR_reward": 0.005954481801018119, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 89.55, |
|
"epoch": 1.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.00039608478546142577, |
|
"learning_rate": 9.403333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 66.075, |
|
"epoch": 1.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.0017781257629394531, |
|
"learning_rate": 9.37e-07, |
|
"loss": 0.0, |
|
"reward": 0.0013888888992369176, |
|
"reward_std": 0.003928371146321297, |
|
"rewards/DCR_reward": 0.0013888888992369176, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 57.3625, |
|
"epoch": 2.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.0024953842163085937, |
|
"learning_rate": 9.336666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.00016590238083153964, |
|
"reward_std": 6.703471299260855e-05, |
|
"rewards/DCR_reward": 0.00016590238083153964, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 112.0625, |
|
"epoch": 2.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.004532432556152344, |
|
"learning_rate": 9.303333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.005078960955142975, |
|
"reward_std": 0.014365470409393311, |
|
"rewards/DCR_reward": 0.005078960955142975, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 87.725, |
|
"epoch": 2.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.009652423858642577, |
|
"learning_rate": 9.27e-07, |
|
"loss": -0.0, |
|
"reward": 0.03418266177177429, |
|
"reward_std": 0.021894259750843047, |
|
"rewards/DCR_reward": 0.03418266177177429, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 99.2375, |
|
"epoch": 2.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.001990985870361328, |
|
"learning_rate": 9.236666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.006875000149011612, |
|
"reward_std": 0.01944543719291687, |
|
"rewards/DCR_reward": 0.006875000149011612, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 87.775, |
|
"epoch": 2.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.0026398658752441405, |
|
"learning_rate": 9.203333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.002777777798473835, |
|
"reward_std": 0.005143444985151291, |
|
"rewards/DCR_reward": 0.002777777798473835, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 74.575, |
|
"epoch": 2.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.01164557933807373, |
|
"learning_rate": 9.17e-07, |
|
"loss": 0.0, |
|
"reward": 0.00416666679084301, |
|
"reward_std": 0.0025717224925756454, |
|
"rewards/DCR_reward": 0.00416666679084301, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_completion_length": 84.36375, |
|
"eval_kl": 0.006733891963958741, |
|
"eval_loss": -2.638111595842929e-07, |
|
"eval_reward": 0.016272501772618853, |
|
"eval_reward_std": 0.014607391188983741, |
|
"eval_rewards/DCR_reward": 0.016272501772618853, |
|
"eval_runtime": 478.3232, |
|
"eval_samples_per_second": 0.209, |
|
"eval_steps_per_second": 0.027, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 89.6375, |
|
"epoch": 2.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.009255027770996094, |
|
"learning_rate": 9.136666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.018863770365715026, |
|
"reward_std": 0.025783935189247133, |
|
"rewards/DCR_reward": 0.018863770365715026, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 60.0, |
|
"epoch": 2.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.005229568481445313, |
|
"learning_rate": 9.103333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 89.1875, |
|
"epoch": 2.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.005951988697052002, |
|
"learning_rate": 9.07e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 76.9125, |
|
"epoch": 2.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.008041763305664062, |
|
"learning_rate": 9.036666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 64.475, |
|
"epoch": 3.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.005487966537475586, |
|
"learning_rate": 9.003333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.06013102883007378, |
|
"reward_std": 3.2332184218830664e-08, |
|
"rewards/DCR_reward": 0.06013102883007378, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 97.425, |
|
"epoch": 3.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.00860748291015625, |
|
"learning_rate": 8.969999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.041901327669620514, |
|
"reward_std": 0.027376230992376804, |
|
"rewards/DCR_reward": 0.041901327669620514, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 96.8125, |
|
"epoch": 3.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.01550760269165039, |
|
"learning_rate": 8.936666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.00040142778307199477, |
|
"reward_std": 0.001058001583442092, |
|
"rewards/DCR_reward": 0.00040142778307199477, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 90.7875, |
|
"epoch": 3.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.0132171630859375, |
|
"learning_rate": 8.903333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.05994144082069397, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.05994144082069397, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 68.4875, |
|
"epoch": 3.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.01154632568359375, |
|
"learning_rate": 8.869999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.005439520999789238, |
|
"reward_std": 0.015385289490222932, |
|
"rewards/DCR_reward": 0.005439520999789238, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 98.8875, |
|
"epoch": 3.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.00932455062866211, |
|
"learning_rate": 8.836666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/DCR_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 89.1875, |
|
"epoch": 3.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.009013175964355469, |
|
"learning_rate": 8.803333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.025011462631664472, |
|
"reward_std": 0.04632342683034949, |
|
"rewards/DCR_reward": 0.025011462631664472, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 72.875, |
|
"epoch": 3.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.01702561378479004, |
|
"learning_rate": 8.769999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.009234594414010644, |
|
"reward_std": 0.0059265575400786474, |
|
"rewards/DCR_reward": 0.009234594414010644, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 113.2875, |
|
"epoch": 3.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.02369537353515625, |
|
"learning_rate": 8.736666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.021757069602608682, |
|
"reward_std": 0.045767249166965486, |
|
"rewards/DCR_reward": 0.021757069602608682, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 93.05, |
|
"epoch": 3.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.017023229598999025, |
|
"learning_rate": 8.703333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.024103129375725986, |
|
"reward_std": 0.0580611415207386, |
|
"rewards/DCR_reward": 0.024103129375725986, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 92.2875, |
|
"epoch": 4.0, |
|
"grad_norm": 3.109375, |
|
"kl": 0.0402587890625, |
|
"learning_rate": 8.669999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.06285573422210292, |
|
"reward_std": 0.07154790845233946, |
|
"rewards/DCR_reward": 0.06285573422210292, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 66.1, |
|
"epoch": 4.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.022429752349853515, |
|
"learning_rate": 8.636666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.0001454122830182314, |
|
"reward_std": 0.0003706513671204448, |
|
"rewards/DCR_reward": 0.0001454122830182314, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 137.125, |
|
"epoch": 4.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.024456501007080078, |
|
"learning_rate": 8.603333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.05007606785511598, |
|
"reward_std": 0.06857064368668944, |
|
"rewards/DCR_reward": 0.05007606785511598, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 92.9625, |
|
"epoch": 4.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.023699188232421876, |
|
"learning_rate": 8.569999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.0036682719714008273, |
|
"reward_std": 0.0028008831664919852, |
|
"rewards/DCR_reward": 0.0036682719714008273, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 95.85, |
|
"epoch": 4.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.03151016235351563, |
|
"learning_rate": 8.536666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.05042804731056094, |
|
"reward_std": 0.05373804932460189, |
|
"rewards/DCR_reward": 0.05042804731056094, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 118.325, |
|
"epoch": 4.5, |
|
"grad_norm": 7.3125, |
|
"kl": 0.03622512817382813, |
|
"learning_rate": 8.503333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.011024426942458376, |
|
"reward_std": 0.02974587368662469, |
|
"rewards/DCR_reward": 0.011024426942458376, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 73.175, |
|
"epoch": 4.6, |
|
"grad_norm": 1.015625, |
|
"kl": 0.03716583251953125, |
|
"learning_rate": 8.469999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.07448566257953644, |
|
"reward_std": 0.023564168593065916, |
|
"rewards/DCR_reward": 0.07448566257953644, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 81.0875, |
|
"epoch": 4.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.052947998046875, |
|
"learning_rate": 8.436666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.09821241516910958, |
|
"reward_std": 0.0942218255950138, |
|
"rewards/DCR_reward": 0.09821241516910958, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 99.0375, |
|
"epoch": 4.8, |
|
"grad_norm": 9.8125, |
|
"kl": 0.03128204345703125, |
|
"learning_rate": 8.403333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.04295953951077536, |
|
"reward_std": 0.01739810509607196, |
|
"rewards/DCR_reward": 0.04295953951077536, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 98.1, |
|
"epoch": 4.9, |
|
"grad_norm": 28.5, |
|
"kl": 0.051171875, |
|
"learning_rate": 8.369999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.08375828897114843, |
|
"reward_std": 0.1000140183372423, |
|
"rewards/DCR_reward": 0.08375828897114843, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 117.9375, |
|
"epoch": 5.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.039456844329833984, |
|
"learning_rate": 8.336666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.006845223042182625, |
|
"reward_std": 0.017963543720543384, |
|
"rewards/DCR_reward": 0.006845223042182625, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_completion_length": 102.48375, |
|
"eval_kl": 0.06146286010742188, |
|
"eval_loss": 1.9775862369897368e-07, |
|
"eval_reward": 0.08125734120461857, |
|
"eval_reward_std": 0.05342855209446043, |
|
"eval_rewards/DCR_reward": 0.08125734120461857, |
|
"eval_runtime": 1460.9247, |
|
"eval_samples_per_second": 0.068, |
|
"eval_steps_per_second": 0.009, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 105.3125, |
|
"epoch": 5.1, |
|
"grad_norm": 14.875, |
|
"kl": 0.073431396484375, |
|
"learning_rate": 8.303333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.09785518775461241, |
|
"reward_std": 0.08415243490599096, |
|
"rewards/DCR_reward": 0.09785518775461241, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 92.6375, |
|
"epoch": 5.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.05964393615722656, |
|
"learning_rate": 8.269999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.0921564630290959, |
|
"reward_std": 0.04181258587050252, |
|
"rewards/DCR_reward": 0.0921564630290959, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 91.2875, |
|
"epoch": 5.3, |
|
"grad_norm": 19.5, |
|
"kl": 0.08701934814453124, |
|
"learning_rate": 8.236666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.16571124200709164, |
|
"reward_std": 0.04422773125115782, |
|
"rewards/DCR_reward": 0.16571124200709164, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 118.0875, |
|
"epoch": 5.4, |
|
"grad_norm": 4.90625, |
|
"kl": 0.09522705078125, |
|
"learning_rate": 8.203333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.12790518356487154, |
|
"reward_std": 0.04112411521346075, |
|
"rewards/DCR_reward": 0.12790518356487154, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 92.725, |
|
"epoch": 5.5, |
|
"grad_norm": 29.25, |
|
"kl": 0.08489990234375, |
|
"learning_rate": 8.169999999999999e-07, |
|
"loss": -0.0, |
|
"reward": 0.09454966578632593, |
|
"reward_std": 0.04570461367693497, |
|
"rewards/DCR_reward": 0.09454966578632593, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 83.2375, |
|
"epoch": 5.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.0607513427734375, |
|
"learning_rate": 8.136666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.07220683824270964, |
|
"reward_std": 0.050523467175662515, |
|
"rewards/DCR_reward": 0.07220683824270964, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 99.6875, |
|
"epoch": 5.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.068798828125, |
|
"learning_rate": 8.103333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.096117812365992, |
|
"reward_std": 0.05572115568793379, |
|
"rewards/DCR_reward": 0.096117812365992, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 61.5, |
|
"epoch": 5.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.07020721435546876, |
|
"learning_rate": 8.070000000000001e-07, |
|
"loss": -0.0, |
|
"reward": 0.047906511649489406, |
|
"reward_std": 4.001859270204022e-05, |
|
"rewards/DCR_reward": 0.047906511649489406, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 99.3125, |
|
"epoch": 5.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.123101806640625, |
|
"learning_rate": 8.036666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.13380602395627647, |
|
"reward_std": 0.10035560713149608, |
|
"rewards/DCR_reward": 0.13380602395627647, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 103.025, |
|
"epoch": 6.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.081585693359375, |
|
"learning_rate": 8.003333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.061570275388658044, |
|
"reward_std": 0.04306753019336611, |
|
"rewards/DCR_reward": 0.061570275388658044, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 97.9125, |
|
"epoch": 6.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.11351318359375, |
|
"learning_rate": 7.970000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.09666887713829056, |
|
"reward_std": 0.059037036258087025, |
|
"rewards/DCR_reward": 0.09666887713829056, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 91.7, |
|
"epoch": 6.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.055133056640625, |
|
"learning_rate": 7.936666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.13362326713686343, |
|
"reward_std": 0.07874106459098584, |
|
"rewards/DCR_reward": 0.13362326713686343, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 107.4125, |
|
"epoch": 6.3, |
|
"grad_norm": 9.25, |
|
"kl": 0.1246337890625, |
|
"learning_rate": 7.903333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.08847815722692758, |
|
"reward_std": 0.06451865802846442, |
|
"rewards/DCR_reward": 0.08847815722692758, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 77.0875, |
|
"epoch": 6.4, |
|
"grad_norm": 25.0, |
|
"kl": 0.15296096801757814, |
|
"learning_rate": 7.87e-07, |
|
"loss": -0.0, |
|
"reward": 0.1423336612060666, |
|
"reward_std": 0.055696507578250024, |
|
"rewards/DCR_reward": 0.1423336612060666, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 99.6375, |
|
"epoch": 6.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.08688135147094726, |
|
"learning_rate": 7.836666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.173234991542995, |
|
"reward_std": 0.06990011496527586, |
|
"rewards/DCR_reward": 0.173234991542995, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 95.975, |
|
"epoch": 6.6, |
|
"grad_norm": 12.9375, |
|
"kl": 0.14229736328125, |
|
"learning_rate": 7.803333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.19796406209934503, |
|
"reward_std": 0.08495658059261757, |
|
"rewards/DCR_reward": 0.19796406209934503, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 51.9875, |
|
"epoch": 6.7, |
|
"grad_norm": 26.0, |
|
"kl": 0.19248046875, |
|
"learning_rate": 7.77e-07, |
|
"loss": 0.0, |
|
"reward": 0.12773155540926381, |
|
"reward_std": 0.018970384920248762, |
|
"rewards/DCR_reward": 0.12773155540926381, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 81.6875, |
|
"epoch": 6.8, |
|
"grad_norm": 10.0625, |
|
"kl": 0.09774169921875, |
|
"learning_rate": 7.736666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.09495227632578462, |
|
"reward_std": 0.05938452887904759, |
|
"rewards/DCR_reward": 0.09495227632578462, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 100.675, |
|
"epoch": 6.9, |
|
"grad_norm": 19.75, |
|
"kl": 0.1118896484375, |
|
"learning_rate": 7.703333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.09238526365661529, |
|
"reward_std": 0.04735179884301033, |
|
"rewards/DCR_reward": 0.09238526365661529, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 63.1625, |
|
"epoch": 7.0, |
|
"grad_norm": 17.625, |
|
"kl": 0.0974609375, |
|
"learning_rate": 7.67e-07, |
|
"loss": 0.0, |
|
"reward": 0.15320108719170095, |
|
"reward_std": 0.04531068232899997, |
|
"rewards/DCR_reward": 0.15320108719170095, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 93.4125, |
|
"epoch": 7.1, |
|
"grad_norm": 18.5, |
|
"kl": 0.101190185546875, |
|
"learning_rate": 7.636666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.02544752674875781, |
|
"reward_std": 0.05906593499239534, |
|
"rewards/DCR_reward": 0.02544752674875781, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 98.0875, |
|
"epoch": 7.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.12484130859375, |
|
"learning_rate": 7.603333333333332e-07, |
|
"loss": -0.0, |
|
"reward": 0.17555389162153007, |
|
"reward_std": 0.06005739986721892, |
|
"rewards/DCR_reward": 0.17555389162153007, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 85.3, |
|
"epoch": 7.3, |
|
"grad_norm": 0.03076171875, |
|
"kl": 0.06852807998657226, |
|
"learning_rate": 7.57e-07, |
|
"loss": 0.0, |
|
"reward": 0.09980509513407014, |
|
"reward_std": 0.022045876948665465, |
|
"rewards/DCR_reward": 0.09980509513407014, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 60.525, |
|
"epoch": 7.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.146484375, |
|
"learning_rate": 7.536666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.17459992747753858, |
|
"reward_std": 0.04686348429240752, |
|
"rewards/DCR_reward": 0.17459992747753858, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 146.6375, |
|
"epoch": 7.5, |
|
"grad_norm": 5.125, |
|
"kl": 0.13475341796875, |
|
"learning_rate": 7.503333333333332e-07, |
|
"loss": -0.0, |
|
"reward": 0.12549885590560733, |
|
"reward_std": 0.003405572484291497, |
|
"rewards/DCR_reward": 0.12549885590560733, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_completion_length": 85.11125, |
|
"eval_kl": 0.133615665435791, |
|
"eval_loss": -8.860533853294328e-07, |
|
"eval_reward": 0.13431876484770328, |
|
"eval_reward_std": 0.05950616509797925, |
|
"eval_rewards/DCR_reward": 0.13431876484770328, |
|
"eval_runtime": 2234.9394, |
|
"eval_samples_per_second": 0.045, |
|
"eval_steps_per_second": 0.006, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 79.35, |
|
"epoch": 7.6, |
|
"grad_norm": 17.875, |
|
"kl": 0.166937255859375, |
|
"learning_rate": 7.47e-07, |
|
"loss": -0.0, |
|
"reward": 0.04600536972284317, |
|
"reward_std": 0.030051297834233992, |
|
"rewards/DCR_reward": 0.04600536972284317, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 59.425, |
|
"epoch": 7.7, |
|
"grad_norm": 18.875, |
|
"kl": 0.144036865234375, |
|
"learning_rate": 7.436666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.25486029861494897, |
|
"reward_std": 0.06726857685171125, |
|
"rewards/DCR_reward": 0.25486029861494897, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 59.25, |
|
"epoch": 7.8, |
|
"grad_norm": 13.875, |
|
"kl": 0.1705810546875, |
|
"learning_rate": 7.403333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.18612688397988678, |
|
"reward_std": 0.060024100821465254, |
|
"rewards/DCR_reward": 0.18612688397988678, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 91.675, |
|
"epoch": 7.9, |
|
"grad_norm": 8.3125, |
|
"kl": 0.1150146484375, |
|
"learning_rate": 7.37e-07, |
|
"loss": -0.0, |
|
"reward": 0.22636549319140614, |
|
"reward_std": 0.09487768108156161, |
|
"rewards/DCR_reward": 0.22636549319140614, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 89.375, |
|
"epoch": 8.0, |
|
"grad_norm": 13.25, |
|
"kl": 0.1668212890625, |
|
"learning_rate": 7.336666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.11243776695337146, |
|
"reward_std": 0.0434065388621093, |
|
"rewards/DCR_reward": 0.11243776695337146, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 60.225, |
|
"epoch": 8.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.144354248046875, |
|
"learning_rate": 7.303333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.17428544777212665, |
|
"reward_std": 0.03378924725689103, |
|
"rewards/DCR_reward": 0.17428544777212665, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 67.4375, |
|
"epoch": 8.2, |
|
"grad_norm": 29.625, |
|
"kl": 0.2034912109375, |
|
"learning_rate": 7.27e-07, |
|
"loss": -0.0, |
|
"reward": 0.09552836455404759, |
|
"reward_std": 0.0525470721883039, |
|
"rewards/DCR_reward": 0.09552836455404759, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 78.75, |
|
"epoch": 8.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.185302734375, |
|
"learning_rate": 7.236666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.10288139216136187, |
|
"reward_std": 0.07548805264668772, |
|
"rewards/DCR_reward": 0.10288139216136187, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 104.2875, |
|
"epoch": 8.4, |
|
"grad_norm": 0.0, |
|
"kl": 0.1417724609375, |
|
"learning_rate": 7.203333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.12569777632597834, |
|
"reward_std": 0.0003777303310926072, |
|
"rewards/DCR_reward": 0.12569777632597834, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 89.9625, |
|
"epoch": 8.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.124444580078125, |
|
"learning_rate": 7.17e-07, |
|
"loss": 0.0, |
|
"reward": 0.08354408431332558, |
|
"reward_std": 0.016342163346146778, |
|
"rewards/DCR_reward": 0.08354408431332558, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 126.1625, |
|
"epoch": 8.6, |
|
"grad_norm": 5.75, |
|
"kl": 0.15697021484375, |
|
"learning_rate": 7.136666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.18000225534196942, |
|
"reward_std": 0.11962818971369416, |
|
"rewards/DCR_reward": 0.18000225534196942, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 66.125, |
|
"epoch": 8.7, |
|
"grad_norm": 15.8125, |
|
"kl": 0.131787109375, |
|
"learning_rate": 7.103333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.14025002100970596, |
|
"reward_std": 0.07934097726297296, |
|
"rewards/DCR_reward": 0.14025002100970596, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 85.55, |
|
"epoch": 8.8, |
|
"grad_norm": 30.875, |
|
"kl": 0.14227294921875, |
|
"learning_rate": 7.07e-07, |
|
"loss": 0.0, |
|
"reward": 0.32092891409993174, |
|
"reward_std": 0.12094202971202321, |
|
"rewards/DCR_reward": 0.32092891409993174, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 80.625, |
|
"epoch": 8.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.11814746856689454, |
|
"learning_rate": 7.036666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.09874425530433655, |
|
"reward_std": 0.020111887441453292, |
|
"rewards/DCR_reward": 0.09874425530433655, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 74.525, |
|
"epoch": 9.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.107318115234375, |
|
"learning_rate": 7.003333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.17968311300501227, |
|
"reward_std": 0.03413876986596733, |
|
"rewards/DCR_reward": 0.17968311300501227, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 57.0625, |
|
"epoch": 9.1, |
|
"grad_norm": 17.625, |
|
"kl": 0.158935546875, |
|
"learning_rate": 6.97e-07, |
|
"loss": 0.0, |
|
"reward": 0.1742305759107694, |
|
"reward_std": 0.07847042060523109, |
|
"rewards/DCR_reward": 0.1742305759107694, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 64.8625, |
|
"epoch": 9.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.17060546875, |
|
"learning_rate": 6.936666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.18635233133099974, |
|
"reward_std": 0.07983016533326008, |
|
"rewards/DCR_reward": 0.18635233133099974, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 77.625, |
|
"epoch": 9.3, |
|
"grad_norm": 23.25, |
|
"kl": 0.218511962890625, |
|
"learning_rate": 6.903333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.19767590372357519, |
|
"reward_std": 0.043497299042064695, |
|
"rewards/DCR_reward": 0.19767590372357519, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 47.325, |
|
"epoch": 9.4, |
|
"grad_norm": 21.375, |
|
"kl": 0.189459228515625, |
|
"learning_rate": 6.87e-07, |
|
"loss": 0.0, |
|
"reward": 0.22449640462873505, |
|
"reward_std": 0.0027420094997694378, |
|
"rewards/DCR_reward": 0.22449640462873505, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 74.6, |
|
"epoch": 9.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.14364013671875, |
|
"learning_rate": 6.836666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.1431873946392443, |
|
"reward_std": 0.001265111715017042, |
|
"rewards/DCR_reward": 0.1431873946392443, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 95.7625, |
|
"epoch": 9.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.1601806640625, |
|
"learning_rate": 6.803333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.1265127849765122, |
|
"reward_std": 0.06392315039120149, |
|
"rewards/DCR_reward": 0.1265127849765122, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 79.325, |
|
"epoch": 9.7, |
|
"grad_norm": 0.019287109375, |
|
"kl": 0.1714111328125, |
|
"learning_rate": 6.77e-07, |
|
"loss": 0.0, |
|
"reward": 0.1620770814595744, |
|
"reward_std": 0.07664856179035269, |
|
"rewards/DCR_reward": 0.1620770814595744, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 113.475, |
|
"epoch": 9.8, |
|
"grad_norm": 40.5, |
|
"kl": 0.09014892578125, |
|
"learning_rate": 6.736666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.12962240994675084, |
|
"reward_std": 0.08673616686003242, |
|
"rewards/DCR_reward": 0.12962240994675084, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 132.1875, |
|
"epoch": 9.9, |
|
"grad_norm": 3.171875, |
|
"kl": 0.1203857421875, |
|
"learning_rate": 6.703333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.0844914206303656, |
|
"reward_std": 0.04573890994070098, |
|
"rewards/DCR_reward": 0.0844914206303656, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 91.7, |
|
"epoch": 10.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.14872217178344727, |
|
"learning_rate": 6.67e-07, |
|
"loss": -0.0, |
|
"reward": 0.22451465255580844, |
|
"reward_std": 0.03193944031372666, |
|
"rewards/DCR_reward": 0.22451465255580844, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_completion_length": 88.34125, |
|
"eval_kl": 0.16021126747131348, |
|
"eval_loss": -2.1401815786248335e-07, |
|
"eval_reward": 0.170399680956034, |
|
"eval_reward_std": 0.052541274107022674, |
|
"eval_rewards/DCR_reward": 0.170399680956034, |
|
"eval_runtime": 2514.5153, |
|
"eval_samples_per_second": 0.04, |
|
"eval_steps_per_second": 0.005, |
|
"step": 1000 |
|
}, |
|
{ |
|
"completion_length": 69.625, |
|
"epoch": 10.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.16669921875, |
|
"learning_rate": 6.636666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.18533597313798963, |
|
"reward_std": 0.06167281661574009, |
|
"rewards/DCR_reward": 0.18533597313798963, |
|
"step": 1010 |
|
}, |
|
{ |
|
"completion_length": 60.6875, |
|
"epoch": 10.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.1216064453125, |
|
"learning_rate": 6.603333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.12192644038586878, |
|
"reward_std": 0.01787745998954051, |
|
"rewards/DCR_reward": 0.12192644038586878, |
|
"step": 1020 |
|
}, |
|
{ |
|
"completion_length": 76.7625, |
|
"epoch": 10.3, |
|
"grad_norm": 7.5, |
|
"kl": 0.17220458984375, |
|
"learning_rate": 6.57e-07, |
|
"loss": 0.0, |
|
"reward": 0.11648766156286001, |
|
"reward_std": 0.0692232246074127, |
|
"rewards/DCR_reward": 0.11648766156286001, |
|
"step": 1030 |
|
}, |
|
{ |
|
"completion_length": 88.8375, |
|
"epoch": 10.4, |
|
"grad_norm": 38.5, |
|
"kl": 0.130328369140625, |
|
"learning_rate": 6.536666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.21984734574798495, |
|
"reward_std": 0.008847105817403644, |
|
"rewards/DCR_reward": 0.21984734574798495, |
|
"step": 1040 |
|
}, |
|
{ |
|
"completion_length": 68.9125, |
|
"epoch": 10.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.168310546875, |
|
"learning_rate": 6.503333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.0662006882019341, |
|
"reward_std": 0.089736894213479, |
|
"rewards/DCR_reward": 0.0662006882019341, |
|
"step": 1050 |
|
}, |
|
{ |
|
"completion_length": 92.8875, |
|
"epoch": 10.6, |
|
"grad_norm": 21.625, |
|
"kl": 0.1709716796875, |
|
"learning_rate": 6.47e-07, |
|
"loss": 0.0, |
|
"reward": 0.3175446430454031, |
|
"reward_std": 0.04590535781462677, |
|
"rewards/DCR_reward": 0.3175446430454031, |
|
"step": 1060 |
|
}, |
|
{ |
|
"completion_length": 104.0875, |
|
"epoch": 10.7, |
|
"grad_norm": 0.0013275146484375, |
|
"kl": 0.1356201171875, |
|
"learning_rate": 6.436666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.17272857704083436, |
|
"reward_std": 0.04932637963789972, |
|
"rewards/DCR_reward": 0.17272857704083436, |
|
"step": 1070 |
|
}, |
|
{ |
|
"completion_length": 73.5875, |
|
"epoch": 10.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.17826080322265625, |
|
"learning_rate": 6.403333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.19967932105064393, |
|
"reward_std": 0.012677951477235183, |
|
"rewards/DCR_reward": 0.19967932105064393, |
|
"step": 1080 |
|
}, |
|
{ |
|
"completion_length": 61.4375, |
|
"epoch": 10.9, |
|
"grad_norm": 14.9375, |
|
"kl": 0.2052734375, |
|
"learning_rate": 6.37e-07, |
|
"loss": 0.0, |
|
"reward": 0.21544204794627148, |
|
"reward_std": 0.016153086804820305, |
|
"rewards/DCR_reward": 0.21544204794627148, |
|
"step": 1090 |
|
}, |
|
{ |
|
"completion_length": 89.9125, |
|
"epoch": 11.0, |
|
"grad_norm": 18.875, |
|
"kl": 0.18170166015625, |
|
"learning_rate": 6.336666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.09034715148736723, |
|
"reward_std": 0.07943847334618112, |
|
"rewards/DCR_reward": 0.09034715148736723, |
|
"step": 1100 |
|
}, |
|
{ |
|
"completion_length": 64.05, |
|
"epoch": 11.1, |
|
"grad_norm": 15.0625, |
|
"kl": 0.154913330078125, |
|
"learning_rate": 6.303333333333332e-07, |
|
"loss": 0.0, |
|
"reward": 0.24353665355592966, |
|
"reward_std": 0.08834277796122478, |
|
"rewards/DCR_reward": 0.24353665355592966, |
|
"step": 1110 |
|
}, |
|
{ |
|
"completion_length": 61.9, |
|
"epoch": 11.2, |
|
"grad_norm": 0.1220703125, |
|
"kl": 0.15966796875, |
|
"learning_rate": 6.27e-07, |
|
"loss": -0.0, |
|
"reward": 0.08767885738052428, |
|
"reward_std": 0.05409979920323167, |
|
"rewards/DCR_reward": 0.08767885738052428, |
|
"step": 1120 |
|
}, |
|
{ |
|
"completion_length": 54.05, |
|
"epoch": 11.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.251708984375, |
|
"learning_rate": 6.236666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.1877214941661805, |
|
"reward_std": 0.014956248462726762, |
|
"rewards/DCR_reward": 0.1877214941661805, |
|
"step": 1130 |
|
}, |
|
{ |
|
"completion_length": 79.475, |
|
"epoch": 11.4, |
|
"grad_norm": 16.5, |
|
"kl": 0.163287353515625, |
|
"learning_rate": 6.203333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.18192651600111276, |
|
"reward_std": 0.04434406632919945, |
|
"rewards/DCR_reward": 0.18192651600111276, |
|
"step": 1140 |
|
}, |
|
{ |
|
"completion_length": 139.775, |
|
"epoch": 11.5, |
|
"grad_norm": 2.921875, |
|
"kl": 0.231005859375, |
|
"learning_rate": 6.17e-07, |
|
"loss": -0.0, |
|
"reward": 0.22696539172902702, |
|
"reward_std": 0.03848955475841649, |
|
"rewards/DCR_reward": 0.22696539172902702, |
|
"step": 1150 |
|
}, |
|
{ |
|
"completion_length": 95.55, |
|
"epoch": 11.6, |
|
"grad_norm": 23.25, |
|
"kl": 0.170263671875, |
|
"learning_rate": 6.136666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.24207095536403359, |
|
"reward_std": 0.07178211783611914, |
|
"rewards/DCR_reward": 0.24207095536403359, |
|
"step": 1160 |
|
}, |
|
{ |
|
"completion_length": 89.1, |
|
"epoch": 11.7, |
|
"grad_norm": 15.5, |
|
"kl": 0.126556396484375, |
|
"learning_rate": 6.103333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.11935207918286324, |
|
"reward_std": 0.07003376996144653, |
|
"rewards/DCR_reward": 0.11935207918286324, |
|
"step": 1170 |
|
}, |
|
{ |
|
"completion_length": 63.4375, |
|
"epoch": 11.8, |
|
"grad_norm": 28.625, |
|
"kl": 0.18448925018310547, |
|
"learning_rate": 6.07e-07, |
|
"loss": -0.0, |
|
"reward": 0.11633564964868129, |
|
"reward_std": 0.026135455832263687, |
|
"rewards/DCR_reward": 0.11633564964868129, |
|
"step": 1180 |
|
}, |
|
{ |
|
"completion_length": 89.6125, |
|
"epoch": 11.9, |
|
"grad_norm": 0.0299072265625, |
|
"kl": 0.12894287109375, |
|
"learning_rate": 6.036666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.16700884115416556, |
|
"reward_std": 0.036743837507117405, |
|
"rewards/DCR_reward": 0.16700884115416556, |
|
"step": 1190 |
|
}, |
|
{ |
|
"completion_length": 89.675, |
|
"epoch": 12.0, |
|
"grad_norm": 0.0009307861328125, |
|
"kl": 0.133929443359375, |
|
"learning_rate": 6.003333333333334e-07, |
|
"loss": -0.0, |
|
"reward": 0.2129200980300084, |
|
"reward_std": 0.017186377505953487, |
|
"rewards/DCR_reward": 0.2129200980300084, |
|
"step": 1200 |
|
}, |
|
{ |
|
"completion_length": 65.125, |
|
"epoch": 12.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.1867431640625, |
|
"learning_rate": 5.97e-07, |
|
"loss": 0.0, |
|
"reward": 0.2869118741014972, |
|
"reward_std": 0.05356231460464187, |
|
"rewards/DCR_reward": 0.2869118741014972, |
|
"step": 1210 |
|
}, |
|
{ |
|
"completion_length": 61.1875, |
|
"epoch": 12.2, |
|
"grad_norm": 19.125, |
|
"kl": 0.209033203125, |
|
"learning_rate": 5.936666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.2106538300169632, |
|
"reward_std": 0.01354630084197197, |
|
"rewards/DCR_reward": 0.2106538300169632, |
|
"step": 1220 |
|
}, |
|
{ |
|
"completion_length": 122.4125, |
|
"epoch": 12.3, |
|
"grad_norm": 3.703125, |
|
"kl": 0.15078125, |
|
"learning_rate": 5.903333333333334e-07, |
|
"loss": 0.0, |
|
"reward": 0.19904457703232764, |
|
"reward_std": 0.015228879620144653, |
|
"rewards/DCR_reward": 0.19904457703232764, |
|
"step": 1230 |
|
}, |
|
{ |
|
"completion_length": 60.0625, |
|
"epoch": 12.4, |
|
"grad_norm": 0.263671875, |
|
"kl": 0.17626953125, |
|
"learning_rate": 5.87e-07, |
|
"loss": -0.0, |
|
"reward": 0.06604900020174682, |
|
"reward_std": 0.015421837849709163, |
|
"rewards/DCR_reward": 0.06604900020174682, |
|
"step": 1240 |
|
}, |
|
{ |
|
"completion_length": 77.025, |
|
"epoch": 12.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.1548828125, |
|
"learning_rate": 5.836666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.24018247241619975, |
|
"reward_std": 0.11031100240943488, |
|
"rewards/DCR_reward": 0.24018247241619975, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"eval_completion_length": 79.80125, |
|
"eval_kl": 0.1788983154296875, |
|
"eval_loss": -2.466062483108544e-07, |
|
"eval_reward": 0.18051586833898908, |
|
"eval_reward_std": 0.04664378996535504, |
|
"eval_rewards/DCR_reward": 0.18051586833898908, |
|
"eval_runtime": 2637.5023, |
|
"eval_samples_per_second": 0.038, |
|
"eval_steps_per_second": 0.005, |
|
"step": 1250 |
|
}, |
|
{ |
|
"completion_length": 109.5375, |
|
"epoch": 12.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.1450439453125, |
|
"learning_rate": 5.803333333333334e-07, |
|
"loss": 0.0, |
|
"reward": 0.18088525887578727, |
|
"reward_std": 0.07833153888532252, |
|
"rewards/DCR_reward": 0.18088525887578727, |
|
"step": 1260 |
|
}, |
|
{ |
|
"completion_length": 93.2375, |
|
"epoch": 12.7, |
|
"grad_norm": 24.625, |
|
"kl": 0.15348119735717775, |
|
"learning_rate": 5.769999999999999e-07, |
|
"loss": -0.0, |
|
"reward": 0.22176313707605005, |
|
"reward_std": 0.08856261784967501, |
|
"rewards/DCR_reward": 0.22176313707605005, |
|
"step": 1270 |
|
}, |
|
{ |
|
"completion_length": 82.95, |
|
"epoch": 12.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.184228515625, |
|
"learning_rate": 5.736666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.1458639702643268, |
|
"reward_std": 0.0540214991623742, |
|
"rewards/DCR_reward": 0.1458639702643268, |
|
"step": 1280 |
|
}, |
|
{ |
|
"completion_length": 108.1875, |
|
"epoch": 12.9, |
|
"grad_norm": 25.125, |
|
"kl": 0.19346923828125, |
|
"learning_rate": 5.703333333333334e-07, |
|
"loss": -0.0, |
|
"reward": 0.09487569569610059, |
|
"reward_std": 0.09456775116559583, |
|
"rewards/DCR_reward": 0.09487569569610059, |
|
"step": 1290 |
|
}, |
|
{ |
|
"completion_length": 60.0375, |
|
"epoch": 13.0, |
|
"grad_norm": 12.1875, |
|
"kl": 0.14573974609375, |
|
"learning_rate": 5.669999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.09252699612407014, |
|
"reward_std": 0.0609970541823742, |
|
"rewards/DCR_reward": 0.09252699612407014, |
|
"step": 1300 |
|
}, |
|
{ |
|
"completion_length": 87.975, |
|
"epoch": 13.1, |
|
"grad_norm": 8.375, |
|
"kl": 0.24951171875, |
|
"learning_rate": 5.636666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.18639756126794965, |
|
"reward_std": 0.059623961660099666, |
|
"rewards/DCR_reward": 0.18639756126794965, |
|
"step": 1310 |
|
}, |
|
{ |
|
"completion_length": 46.525, |
|
"epoch": 13.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.183843994140625, |
|
"learning_rate": 5.603333333333334e-07, |
|
"loss": 0.0, |
|
"reward": 0.27727809102507306, |
|
"reward_std": 0.012234579344567464, |
|
"rewards/DCR_reward": 0.27727809102507306, |
|
"step": 1320 |
|
}, |
|
{ |
|
"completion_length": 81.5375, |
|
"epoch": 13.3, |
|
"grad_norm": 15.25, |
|
"kl": 0.1978515625, |
|
"learning_rate": 5.57e-07, |
|
"loss": -0.0, |
|
"reward": 0.34632972672116014, |
|
"reward_std": 0.02853981898369966, |
|
"rewards/DCR_reward": 0.34632972672116014, |
|
"step": 1330 |
|
}, |
|
{ |
|
"completion_length": 116.1875, |
|
"epoch": 13.4, |
|
"grad_norm": 7.46875, |
|
"kl": 0.14244384765625, |
|
"learning_rate": 5.536666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.1364185765822185, |
|
"reward_std": 0.03053054096526466, |
|
"rewards/DCR_reward": 0.1364185765822185, |
|
"step": 1340 |
|
}, |
|
{ |
|
"completion_length": 96.25, |
|
"epoch": 13.5, |
|
"grad_norm": 16.875, |
|
"kl": 0.1332763671875, |
|
"learning_rate": 5.503333333333334e-07, |
|
"loss": -0.0, |
|
"reward": 0.174199710926041, |
|
"reward_std": 0.06987538231981034, |
|
"rewards/DCR_reward": 0.174199710926041, |
|
"step": 1350 |
|
}, |
|
{ |
|
"completion_length": 73.625, |
|
"epoch": 13.6, |
|
"grad_norm": 0.01190185546875, |
|
"kl": 0.145501708984375, |
|
"learning_rate": 5.47e-07, |
|
"loss": 0.0, |
|
"reward": 0.1467902946518734, |
|
"reward_std": 0.021008528914126145, |
|
"rewards/DCR_reward": 0.1467902946518734, |
|
"step": 1360 |
|
}, |
|
{ |
|
"completion_length": 102.9, |
|
"epoch": 13.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.150146484375, |
|
"learning_rate": 5.436666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.14278438028413803, |
|
"reward_std": 0.06553829507544151, |
|
"rewards/DCR_reward": 0.14278438028413803, |
|
"step": 1370 |
|
}, |
|
{ |
|
"completion_length": 67.125, |
|
"epoch": 13.8, |
|
"grad_norm": 34.75, |
|
"kl": 0.18953857421875, |
|
"learning_rate": 5.403333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.04754302315413952, |
|
"reward_std": 0.0033664418617263435, |
|
"rewards/DCR_reward": 0.04754302315413952, |
|
"step": 1380 |
|
}, |
|
{ |
|
"completion_length": 80.1875, |
|
"epoch": 13.9, |
|
"grad_norm": 27.875, |
|
"kl": 0.1549560546875, |
|
"learning_rate": 5.37e-07, |
|
"loss": 0.0, |
|
"reward": 0.06470744522521273, |
|
"reward_std": 0.06195358677759941, |
|
"rewards/DCR_reward": 0.06470744522521273, |
|
"step": 1390 |
|
}, |
|
{ |
|
"completion_length": 76.525, |
|
"epoch": 14.0, |
|
"grad_norm": 20.5, |
|
"kl": 0.18204002380371093, |
|
"learning_rate": 5.336666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.25884356582537293, |
|
"reward_std": 0.07922300189136422, |
|
"rewards/DCR_reward": 0.25884356582537293, |
|
"step": 1400 |
|
}, |
|
{ |
|
"completion_length": 61.475, |
|
"epoch": 14.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.1508544921875, |
|
"learning_rate": 5.303333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.20719661605544387, |
|
"reward_std": 0.0746672638963446, |
|
"rewards/DCR_reward": 0.20719661605544387, |
|
"step": 1410 |
|
}, |
|
{ |
|
"completion_length": 79.1875, |
|
"epoch": 14.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.1884521484375, |
|
"learning_rate": 5.27e-07, |
|
"loss": 0.0, |
|
"reward": 0.1095911561860703, |
|
"reward_std": 0.02852085893282492, |
|
"rewards/DCR_reward": 0.1095911561860703, |
|
"step": 1420 |
|
}, |
|
{ |
|
"completion_length": 99.4875, |
|
"epoch": 14.3, |
|
"grad_norm": 8.5625, |
|
"kl": 0.183203125, |
|
"learning_rate": 5.236666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.26417584040900693, |
|
"reward_std": 0.03152382288160993, |
|
"rewards/DCR_reward": 0.26417584040900693, |
|
"step": 1430 |
|
}, |
|
{ |
|
"completion_length": 81.4375, |
|
"epoch": 14.4, |
|
"grad_norm": 11.8125, |
|
"kl": 0.147418212890625, |
|
"learning_rate": 5.203333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.27334287738776764, |
|
"reward_std": 0.059623092689435, |
|
"rewards/DCR_reward": 0.27334287738776764, |
|
"step": 1440 |
|
}, |
|
{ |
|
"completion_length": 64.7375, |
|
"epoch": 14.5, |
|
"grad_norm": 22.0, |
|
"kl": 0.223583984375, |
|
"learning_rate": 5.17e-07, |
|
"loss": -0.0, |
|
"reward": 0.10534097602358088, |
|
"reward_std": 0.01799371653714843, |
|
"rewards/DCR_reward": 0.10534097602358088, |
|
"step": 1450 |
|
}, |
|
{ |
|
"completion_length": 69.275, |
|
"epoch": 14.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.224200439453125, |
|
"learning_rate": 5.136666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.2516032636165619, |
|
"reward_std": 0.07302998011000454, |
|
"rewards/DCR_reward": 0.2516032636165619, |
|
"step": 1460 |
|
}, |
|
{ |
|
"completion_length": 99.375, |
|
"epoch": 14.7, |
|
"grad_norm": 19.5, |
|
"kl": 0.16717529296875, |
|
"learning_rate": 5.103333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.10548254007007926, |
|
"reward_std": 0.02692217687581433, |
|
"rewards/DCR_reward": 0.10548254007007926, |
|
"step": 1470 |
|
}, |
|
{ |
|
"completion_length": 79.325, |
|
"epoch": 14.8, |
|
"grad_norm": 26.375, |
|
"kl": 0.181884765625, |
|
"learning_rate": 5.07e-07, |
|
"loss": 0.0, |
|
"reward": 0.07880783905275165, |
|
"reward_std": 0.05647614029903707, |
|
"rewards/DCR_reward": 0.07880783905275165, |
|
"step": 1480 |
|
}, |
|
{ |
|
"completion_length": 99.9125, |
|
"epoch": 14.9, |
|
"grad_norm": 18.25, |
|
"kl": 0.09996337890625, |
|
"learning_rate": 5.036666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.15730613842606544, |
|
"reward_std": 0.0505793450953206, |
|
"rewards/DCR_reward": 0.15730613842606544, |
|
"step": 1490 |
|
}, |
|
{ |
|
"completion_length": 74.3375, |
|
"epoch": 15.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.167626953125, |
|
"learning_rate": 5.003333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.20553053587209433, |
|
"reward_std": 0.06437594342569355, |
|
"rewards/DCR_reward": 0.20553053587209433, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_completion_length": 83.67125, |
|
"eval_kl": 0.21793760299682619, |
|
"eval_loss": -1.1107649697805755e-06, |
|
"eval_reward": 0.1789407900039805, |
|
"eval_reward_std": 0.04753444435028598, |
|
"eval_rewards/DCR_reward": 0.1789407900039805, |
|
"eval_runtime": 2622.8316, |
|
"eval_samples_per_second": 0.038, |
|
"eval_steps_per_second": 0.005, |
|
"step": 1500 |
|
}, |
|
{ |
|
"completion_length": 77.8125, |
|
"epoch": 15.1, |
|
"grad_norm": 8.75, |
|
"kl": 0.12496776580810547, |
|
"learning_rate": 4.97e-07, |
|
"loss": 0.0, |
|
"reward": 0.2007469806820154, |
|
"reward_std": 0.11496121380478144, |
|
"rewards/DCR_reward": 0.2007469806820154, |
|
"step": 1510 |
|
}, |
|
{ |
|
"completion_length": 112.4375, |
|
"epoch": 15.2, |
|
"grad_norm": 17.375, |
|
"kl": 0.1972900390625, |
|
"learning_rate": 4.936666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.1523078629281372, |
|
"reward_std": 0.0487046109745279, |
|
"rewards/DCR_reward": 0.1523078629281372, |
|
"step": 1520 |
|
}, |
|
{ |
|
"completion_length": 106.325, |
|
"epoch": 15.3, |
|
"grad_norm": 33.75, |
|
"kl": 0.1178955078125, |
|
"learning_rate": 4.903333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.14267266684328206, |
|
"reward_std": 0.13161113108944847, |
|
"rewards/DCR_reward": 0.14267266684328206, |
|
"step": 1530 |
|
}, |
|
{ |
|
"completion_length": 70.2875, |
|
"epoch": 15.4, |
|
"grad_norm": 14.8125, |
|
"kl": 0.2324951171875, |
|
"learning_rate": 4.87e-07, |
|
"loss": 0.0, |
|
"reward": 0.2017348323017359, |
|
"reward_std": 0.04265181252852699, |
|
"rewards/DCR_reward": 0.2017348323017359, |
|
"step": 1540 |
|
}, |
|
{ |
|
"completion_length": 45.4625, |
|
"epoch": 15.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.15888671875, |
|
"learning_rate": 4.836666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.2259118565125391, |
|
"reward_std": 0.026894852996608164, |
|
"rewards/DCR_reward": 0.2259118565125391, |
|
"step": 1550 |
|
}, |
|
{ |
|
"completion_length": 88.6625, |
|
"epoch": 15.6, |
|
"grad_norm": 23.5, |
|
"kl": 0.164312744140625, |
|
"learning_rate": 4.803333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.15437748426338657, |
|
"reward_std": 0.032754498023996347, |
|
"rewards/DCR_reward": 0.15437748426338657, |
|
"step": 1560 |
|
}, |
|
{ |
|
"completion_length": 78.4, |
|
"epoch": 15.7, |
|
"grad_norm": 0.0, |
|
"kl": 0.164697265625, |
|
"learning_rate": 4.769999999999999e-07, |
|
"loss": -0.0, |
|
"reward": 0.09430048232898117, |
|
"reward_std": 0.03383164100494014, |
|
"rewards/DCR_reward": 0.09430048232898117, |
|
"step": 1570 |
|
}, |
|
{ |
|
"completion_length": 79.2875, |
|
"epoch": 15.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.176513671875, |
|
"learning_rate": 4.7366666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.23642000226536766, |
|
"reward_std": 0.04199942027457837, |
|
"rewards/DCR_reward": 0.23642000226536766, |
|
"step": 1580 |
|
}, |
|
{ |
|
"completion_length": 86.6875, |
|
"epoch": 15.9, |
|
"grad_norm": 28.875, |
|
"kl": 0.234124755859375, |
|
"learning_rate": 4.703333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.20127527262666262, |
|
"reward_std": 0.026311779970637873, |
|
"rewards/DCR_reward": 0.20127527262666262, |
|
"step": 1590 |
|
}, |
|
{ |
|
"completion_length": 70.875, |
|
"epoch": 16.0, |
|
"grad_norm": 0.0, |
|
"kl": 0.175665283203125, |
|
"learning_rate": 4.67e-07, |
|
"loss": -0.0, |
|
"reward": 0.14691403629258276, |
|
"reward_std": 0.04396011229930537, |
|
"rewards/DCR_reward": 0.14691403629258276, |
|
"step": 1600 |
|
}, |
|
{ |
|
"completion_length": 57.625, |
|
"epoch": 16.1, |
|
"grad_norm": 17.0, |
|
"kl": 0.20440673828125, |
|
"learning_rate": 4.6366666666666665e-07, |
|
"loss": -0.0, |
|
"reward": 0.13289597362745553, |
|
"reward_std": 0.0672353014729822, |
|
"rewards/DCR_reward": 0.13289597362745553, |
|
"step": 1610 |
|
}, |
|
{ |
|
"completion_length": 99.6375, |
|
"epoch": 16.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.117718505859375, |
|
"learning_rate": 4.603333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.26933112973347306, |
|
"reward_std": 0.06138738352313169, |
|
"rewards/DCR_reward": 0.26933112973347306, |
|
"step": 1620 |
|
}, |
|
{ |
|
"completion_length": 85.85, |
|
"epoch": 16.3, |
|
"grad_norm": 0.0, |
|
"kl": 0.206988525390625, |
|
"learning_rate": 4.57e-07, |
|
"loss": -0.0, |
|
"reward": 0.1808759123814525, |
|
"reward_std": 0.023974006343632937, |
|
"rewards/DCR_reward": 0.1808759123814525, |
|
"step": 1630 |
|
}, |
|
{ |
|
"completion_length": 74.9625, |
|
"epoch": 16.4, |
|
"grad_norm": 19.125, |
|
"kl": 0.205908203125, |
|
"learning_rate": 4.5366666666666664e-07, |
|
"loss": -0.0, |
|
"reward": 0.2561442313250154, |
|
"reward_std": 0.05999695781356422, |
|
"rewards/DCR_reward": 0.2561442313250154, |
|
"step": 1640 |
|
}, |
|
{ |
|
"completion_length": 51.425, |
|
"epoch": 16.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.19951171875, |
|
"learning_rate": 4.503333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.23342568413354456, |
|
"reward_std": 0.030879499143338762, |
|
"rewards/DCR_reward": 0.23342568413354456, |
|
"step": 1650 |
|
}, |
|
{ |
|
"completion_length": 83.3625, |
|
"epoch": 16.6, |
|
"grad_norm": 0.0, |
|
"kl": 0.13411979675292968, |
|
"learning_rate": 4.4699999999999997e-07, |
|
"loss": 0.0, |
|
"reward": 0.10717827337794006, |
|
"reward_std": 0.055167136660065806, |
|
"rewards/DCR_reward": 0.10717827337794006, |
|
"step": 1660 |
|
}, |
|
{ |
|
"completion_length": 80.0875, |
|
"epoch": 16.7, |
|
"grad_norm": 14.75, |
|
"kl": 0.2007080078125, |
|
"learning_rate": 4.4366666666666663e-07, |
|
"loss": 0.0, |
|
"reward": 0.21203166521154343, |
|
"reward_std": 0.05453803092241287, |
|
"rewards/DCR_reward": 0.21203166521154343, |
|
"step": 1670 |
|
}, |
|
{ |
|
"completion_length": 106.975, |
|
"epoch": 16.8, |
|
"grad_norm": 16.625, |
|
"kl": 0.1846435546875, |
|
"learning_rate": 4.4033333333333335e-07, |
|
"loss": 0.0, |
|
"reward": 0.08938784400233998, |
|
"reward_std": 0.05009205757160089, |
|
"rewards/DCR_reward": 0.08938784400233998, |
|
"step": 1680 |
|
}, |
|
{ |
|
"completion_length": 105.3875, |
|
"epoch": 16.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.1232177734375, |
|
"learning_rate": 4.3699999999999996e-07, |
|
"loss": -0.0, |
|
"reward": 0.18415257817832753, |
|
"reward_std": 0.007105620090851516, |
|
"rewards/DCR_reward": 0.18415257817832753, |
|
"step": 1690 |
|
}, |
|
{ |
|
"completion_length": 89.2375, |
|
"epoch": 17.0, |
|
"grad_norm": 17.625, |
|
"kl": 0.1628662109375, |
|
"learning_rate": 4.336666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.10958491688361391, |
|
"reward_std": 0.041172702021503936, |
|
"rewards/DCR_reward": 0.10958491688361391, |
|
"step": 1700 |
|
}, |
|
{ |
|
"completion_length": 90.1875, |
|
"epoch": 17.1, |
|
"grad_norm": 0.0, |
|
"kl": 0.15859375, |
|
"learning_rate": 4.3033333333333334e-07, |
|
"loss": 0.0, |
|
"reward": 0.09857021539355629, |
|
"reward_std": 0.02386656640956062, |
|
"rewards/DCR_reward": 0.09857021539355629, |
|
"step": 1710 |
|
}, |
|
{ |
|
"completion_length": 63.25, |
|
"epoch": 17.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.16219091415405273, |
|
"learning_rate": 4.2699999999999995e-07, |
|
"loss": -0.0, |
|
"reward": 0.054912311234511436, |
|
"reward_std": 0.021657621535587167, |
|
"rewards/DCR_reward": 0.054912311234511436, |
|
"step": 1720 |
|
}, |
|
{ |
|
"completion_length": 75.025, |
|
"epoch": 17.3, |
|
"grad_norm": 23.875, |
|
"kl": 0.208575439453125, |
|
"learning_rate": 4.2366666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.3105363720096648, |
|
"reward_std": 0.03136685772915371, |
|
"rewards/DCR_reward": 0.3105363720096648, |
|
"step": 1730 |
|
}, |
|
{ |
|
"completion_length": 76.5875, |
|
"epoch": 17.4, |
|
"grad_norm": 11.875, |
|
"kl": 0.22567138671875, |
|
"learning_rate": 4.203333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.1657306909793988, |
|
"reward_std": 0.06016184531727049, |
|
"rewards/DCR_reward": 0.1657306909793988, |
|
"step": 1740 |
|
}, |
|
{ |
|
"completion_length": 76.5625, |
|
"epoch": 17.5, |
|
"grad_norm": 0.10302734375, |
|
"kl": 0.21397705078125, |
|
"learning_rate": 4.17e-07, |
|
"loss": 0.0, |
|
"reward": 0.3156662947498262, |
|
"reward_std": 0.017444778925437276, |
|
"rewards/DCR_reward": 0.3156662947498262, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"eval_completion_length": 83.015, |
|
"eval_kl": 0.1820037841796875, |
|
"eval_loss": 1.8720327261689818e-06, |
|
"eval_reward": 0.1803470617614221, |
|
"eval_reward_std": 0.047306431781344714, |
|
"eval_rewards/DCR_reward": 0.1803470617614221, |
|
"eval_runtime": 2666.1674, |
|
"eval_samples_per_second": 0.038, |
|
"eval_steps_per_second": 0.005, |
|
"step": 1750 |
|
}, |
|
{ |
|
"completion_length": 71.3125, |
|
"epoch": 17.6, |
|
"grad_norm": 18.25, |
|
"kl": 0.187255859375, |
|
"learning_rate": 4.1366666666666665e-07, |
|
"loss": 0.0, |
|
"reward": 0.20512793064117432, |
|
"reward_std": 0.047461129271408706, |
|
"rewards/DCR_reward": 0.20512793064117432, |
|
"step": 1760 |
|
}, |
|
{ |
|
"completion_length": 59.45, |
|
"epoch": 17.7, |
|
"grad_norm": 17.0, |
|
"kl": 0.1356689453125, |
|
"learning_rate": 4.103333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.14152073024306447, |
|
"reward_std": 0.09260614971240103, |
|
"rewards/DCR_reward": 0.14152073024306447, |
|
"step": 1770 |
|
}, |
|
{ |
|
"completion_length": 66.0625, |
|
"epoch": 17.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.1925048828125, |
|
"learning_rate": 4.07e-07, |
|
"loss": -0.0, |
|
"reward": 0.25652417142409834, |
|
"reward_std": 0.04226240784919355, |
|
"rewards/DCR_reward": 0.25652417142409834, |
|
"step": 1780 |
|
}, |
|
{ |
|
"completion_length": 109.575, |
|
"epoch": 17.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.165313720703125, |
|
"learning_rate": 4.0366666666666664e-07, |
|
"loss": 0.0, |
|
"reward": 0.21949415714479983, |
|
"reward_std": 0.08642580564569471, |
|
"rewards/DCR_reward": 0.21949415714479983, |
|
"step": 1790 |
|
}, |
|
{ |
|
"completion_length": 64.35, |
|
"epoch": 18.0, |
|
"grad_norm": 20.875, |
|
"kl": 0.1558349609375, |
|
"learning_rate": 4.003333333333333e-07, |
|
"loss": -0.0, |
|
"reward": 0.16233385398518294, |
|
"reward_std": 0.04793143280548975, |
|
"rewards/DCR_reward": 0.16233385398518294, |
|
"step": 1800 |
|
}, |
|
{ |
|
"completion_length": 47.7375, |
|
"epoch": 18.1, |
|
"grad_norm": 13.125, |
|
"kl": 0.280615234375, |
|
"learning_rate": 3.97e-07, |
|
"loss": -0.0, |
|
"reward": 0.1926513019599952, |
|
"reward_std": 0.022014413893526808, |
|
"rewards/DCR_reward": 0.1926513019599952, |
|
"step": 1810 |
|
}, |
|
{ |
|
"completion_length": 86.2875, |
|
"epoch": 18.2, |
|
"grad_norm": 0.0, |
|
"kl": 0.19091796875, |
|
"learning_rate": 3.9366666666666663e-07, |
|
"loss": 0.0, |
|
"reward": 0.18852764302864672, |
|
"reward_std": 0.033248070168701814, |
|
"rewards/DCR_reward": 0.18852764302864672, |
|
"step": 1820 |
|
}, |
|
{ |
|
"completion_length": 80.175, |
|
"epoch": 18.3, |
|
"grad_norm": 13.0, |
|
"kl": 0.12606201171875, |
|
"learning_rate": 3.903333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.17205136871198193, |
|
"reward_std": 0.019140477599285076, |
|
"rewards/DCR_reward": 0.17205136871198193, |
|
"step": 1830 |
|
}, |
|
{ |
|
"completion_length": 79.5125, |
|
"epoch": 18.4, |
|
"grad_norm": 0.96484375, |
|
"kl": 0.17064599990844725, |
|
"learning_rate": 3.87e-07, |
|
"loss": 0.0, |
|
"reward": 0.07664455490885302, |
|
"reward_std": 0.04274343762583612, |
|
"rewards/DCR_reward": 0.07664455490885302, |
|
"step": 1840 |
|
}, |
|
{ |
|
"completion_length": 81.4375, |
|
"epoch": 18.5, |
|
"grad_norm": 0.0, |
|
"kl": 0.1732177734375, |
|
"learning_rate": 3.836666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.23127210177481175, |
|
"reward_std": 0.04357012182008475, |
|
"rewards/DCR_reward": 0.23127210177481175, |
|
"step": 1850 |
|
}, |
|
{ |
|
"completion_length": 75.625, |
|
"epoch": 18.6, |
|
"grad_norm": 3.03125, |
|
"kl": 0.159375, |
|
"learning_rate": 3.8033333333333334e-07, |
|
"loss": 0.0, |
|
"reward": 0.27262264720629903, |
|
"reward_std": 0.05521087486195313, |
|
"rewards/DCR_reward": 0.27262264720629903, |
|
"step": 1860 |
|
}, |
|
{ |
|
"completion_length": 134.3125, |
|
"epoch": 18.7, |
|
"grad_norm": 0.36328125, |
|
"kl": 0.1446624755859375, |
|
"learning_rate": 3.77e-07, |
|
"loss": 0.0, |
|
"reward": 0.11476055827224627, |
|
"reward_std": 0.02939585350050038, |
|
"rewards/DCR_reward": 0.11476055827224627, |
|
"step": 1870 |
|
}, |
|
{ |
|
"completion_length": 72.4125, |
|
"epoch": 18.8, |
|
"grad_norm": 23.5, |
|
"kl": 0.1951904296875, |
|
"learning_rate": 3.736666666666666e-07, |
|
"loss": -0.0, |
|
"reward": 0.14638857576064765, |
|
"reward_std": 0.04362176135448124, |
|
"rewards/DCR_reward": 0.14638857576064765, |
|
"step": 1880 |
|
}, |
|
{ |
|
"completion_length": 70.75, |
|
"epoch": 18.9, |
|
"grad_norm": 0.0, |
|
"kl": 0.186572265625, |
|
"learning_rate": 3.7033333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.1708667165134102, |
|
"reward_std": 0.07184823253192008, |
|
"rewards/DCR_reward": 0.1708667165134102, |
|
"step": 1890 |
|
}, |
|
{ |
|
"completion_length": 52.675, |
|
"epoch": 19.0, |
|
"grad_norm": 19.625, |
|
"kl": 0.183404541015625, |
|
"learning_rate": 3.67e-07, |
|
"loss": -0.0, |
|
"reward": 0.2897725820541382, |
|
"reward_std": 0.0576688679928111, |
|
"rewards/DCR_reward": 0.2897725820541382, |
|
"step": 1900 |
|
}, |
|
{ |
|
"completion_length": 81.7625, |
|
"epoch": 19.1, |
|
"grad_norm": 11.5, |
|
"kl": 0.217041015625, |
|
"learning_rate": 3.6366666666666665e-07, |
|
"loss": 0.0, |
|
"reward": 0.156359511311166, |
|
"reward_std": 0.051926983702384175, |
|
"rewards/DCR_reward": 0.156359511311166, |
|
"step": 1910 |
|
}, |
|
{ |
|
"completion_length": 130.8125, |
|
"epoch": 19.2, |
|
"grad_norm": 9.8125, |
|
"kl": 0.10965576171875, |
|
"learning_rate": 3.603333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.10657796601299196, |
|
"reward_std": 0.0012149818532634527, |
|
"rewards/DCR_reward": 0.10657796601299196, |
|
"step": 1920 |
|
}, |
|
{ |
|
"completion_length": 63.2375, |
|
"epoch": 19.3, |
|
"grad_norm": 19.0, |
|
"kl": 0.26494140625, |
|
"learning_rate": 3.57e-07, |
|
"loss": 0.0, |
|
"reward": 0.25602573398500683, |
|
"reward_std": 0.04811685611639405, |
|
"rewards/DCR_reward": 0.25602573398500683, |
|
"step": 1930 |
|
}, |
|
{ |
|
"completion_length": 71.2125, |
|
"epoch": 19.4, |
|
"grad_norm": 0.039794921875, |
|
"kl": 0.2076171875, |
|
"learning_rate": 3.5366666666666664e-07, |
|
"loss": 0.0, |
|
"reward": 0.2313489816733636, |
|
"reward_std": 0.037926042393394255, |
|
"rewards/DCR_reward": 0.2313489816733636, |
|
"step": 1940 |
|
}, |
|
{ |
|
"completion_length": 80.075, |
|
"epoch": 19.5, |
|
"grad_norm": 4.40625, |
|
"kl": 0.194873046875, |
|
"learning_rate": 3.503333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.2062260712031275, |
|
"reward_std": 0.07047836606834608, |
|
"rewards/DCR_reward": 0.2062260712031275, |
|
"step": 1950 |
|
}, |
|
{ |
|
"completion_length": 47.3875, |
|
"epoch": 19.6, |
|
"grad_norm": 9.125, |
|
"kl": 0.159039306640625, |
|
"learning_rate": 3.4699999999999997e-07, |
|
"loss": 0.0, |
|
"reward": 0.3275539556518197, |
|
"reward_std": 0.056122380661634, |
|
"rewards/DCR_reward": 0.3275539556518197, |
|
"step": 1960 |
|
}, |
|
{ |
|
"completion_length": 81.525, |
|
"epoch": 19.7, |
|
"grad_norm": 17.875, |
|
"kl": 0.14603710174560547, |
|
"learning_rate": 3.436666666666667e-07, |
|
"loss": -0.0, |
|
"reward": 0.08158219500910491, |
|
"reward_std": 0.0475987725701998, |
|
"rewards/DCR_reward": 0.08158219500910491, |
|
"step": 1970 |
|
}, |
|
{ |
|
"completion_length": 90.35, |
|
"epoch": 19.8, |
|
"grad_norm": 0.0, |
|
"kl": 0.16241455078125, |
|
"learning_rate": 3.403333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.14252315481426195, |
|
"reward_std": 0.0510443922455579, |
|
"rewards/DCR_reward": 0.14252315481426195, |
|
"step": 1980 |
|
}, |
|
{ |
|
"completion_length": 77.0875, |
|
"epoch": 19.9, |
|
"grad_norm": 0.8359375, |
|
"kl": 0.1235107421875, |
|
"learning_rate": 3.37e-07, |
|
"loss": -0.0, |
|
"reward": 0.11888001729967072, |
|
"reward_std": 0.041191360527292886, |
|
"rewards/DCR_reward": 0.11888001729967072, |
|
"step": 1990 |
|
}, |
|
{ |
|
"completion_length": 100.05, |
|
"epoch": 20.0, |
|
"grad_norm": 8.125, |
|
"kl": 0.209521484375, |
|
"learning_rate": 3.336666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.16669638943858445, |
|
"reward_std": 0.028858381987083702, |
|
"rewards/DCR_reward": 0.16669638943858445, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_completion_length": 84.2325, |
|
"eval_kl": 0.17778059005737304, |
|
"eval_loss": 6.65434640723106e-07, |
|
"eval_reward": 0.18110041963984258, |
|
"eval_reward_std": 0.0566114658890848, |
|
"eval_rewards/DCR_reward": 0.18110041963984258, |
|
"eval_runtime": 2663.7287, |
|
"eval_samples_per_second": 0.038, |
|
"eval_steps_per_second": 0.005, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|