|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.008565310492505354, |
|
"eval_steps": 1, |
|
"global_step": 16, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 327.875, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 422.0, |
|
"completions/max_terminated_length": 422.0, |
|
"completions/mean_length": 327.875, |
|
"completions/mean_terminated_length": 327.875, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 256.0, |
|
"epoch": 0.0005353319057815846, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"num_tokens": 3479.0, |
|
"reward": 1.3822917938232422, |
|
"reward_std": 0.001679160282947123, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.01770833320915699, |
|
"rewards/r_shaping/std": 0.00442595174536109, |
|
"rewards/r_soft/mean": 0.30000001192092896, |
|
"rewards/r_soft/std": 0.0, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.10000000149011612, |
|
"rewards/r_xmlcount/std": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 382.25, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 486.0, |
|
"completions/max_terminated_length": 486.0, |
|
"completions/mean_length": 382.25, |
|
"completions/mean_terminated_length": 382.25, |
|
"completions/min_length": 313.0, |
|
"completions/min_terminated_length": 313.0, |
|
"epoch": 0.0010706638115631692, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0, |
|
"learning_rate": 0.0005, |
|
"loss": 0.0001, |
|
"num_tokens": 7457.0, |
|
"reward": 2.382591724395752, |
|
"reward_std": 0.0023273415863513947, |
|
"rewards/r_correctness/mean": 2.0, |
|
"rewards/r_correctness/std": 0.0, |
|
"rewards/r_shaping/mean": -0.017408333718776703, |
|
"rewards/r_shaping/std": 0.005641527008265257, |
|
"rewards/r_soft/mean": 0.30000001192092896, |
|
"rewards/r_soft/std": 0.0, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.10000000149011612, |
|
"rewards/r_xmlcount/std": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 555.125, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 534.0, |
|
"completions/mean_length": 555.125, |
|
"completions/mean_terminated_length": 410.25, |
|
"completions/min_length": 228.0, |
|
"completions/min_terminated_length": 228.0, |
|
"epoch": 0.0016059957173447537, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0, |
|
"learning_rate": 0.0004998023493068255, |
|
"loss": 0.0, |
|
"num_tokens": 12818.0, |
|
"reward": 1.117954134941101, |
|
"reward_std": 0.009724103845655918, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.028920834884047508, |
|
"rewards/r_shaping/std": 0.008277526125311852, |
|
"rewards/r_soft/mean": 0.15000000596046448, |
|
"rewards/r_soft/std": 0.16035674512386322, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": -0.0031250007450580597, |
|
"rewards/r_xmlcount/std": 0.11054855585098267, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 633.375, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 676.0, |
|
"completions/mean_length": 633.375, |
|
"completions/mean_terminated_length": 623.857177734375, |
|
"completions/min_length": 507.0, |
|
"completions/min_terminated_length": 507.0, |
|
"epoch": 0.0021413276231263384, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0, |
|
"learning_rate": 0.0004992097097536739, |
|
"loss": 0.0, |
|
"num_tokens": 18821.0, |
|
"reward": 1.3096479177474976, |
|
"reward_std": 0.12074954062700272, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.03097708337008953, |
|
"rewards/r_shaping/std": 0.004845558665692806, |
|
"rewards/r_soft/mean": 0.26250001788139343, |
|
"rewards/r_soft/std": 0.1060660257935524, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.078125, |
|
"rewards/r_xmlcount/std": 0.06187184527516365, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 510.875, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 415.0, |
|
"completions/mean_length": 510.875, |
|
"completions/mean_terminated_length": 321.75, |
|
"completions/min_length": 272.0, |
|
"completions/min_terminated_length": 272.0, |
|
"epoch": 0.0026766595289079227, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0, |
|
"learning_rate": 0.0004982230184254933, |
|
"loss": 0.0, |
|
"num_tokens": 23780.0, |
|
"reward": 1.125322937965393, |
|
"reward_std": 0.004006261937320232, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.024677084758877754, |
|
"rewards/r_shaping/std": 0.011852074414491653, |
|
"rewards/r_soft/mean": 0.15000000596046448, |
|
"rewards/r_soft/std": 0.16035674512386322, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.0, |
|
"rewards/r_xmlcount/std": 0.10690450668334961, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 578.0, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 493.0, |
|
"completions/mean_length": 578.0, |
|
"completions/mean_terminated_length": 456.0, |
|
"completions/min_length": 432.0, |
|
"completions/min_terminated_length": 432.0, |
|
"epoch": 0.0032119914346895075, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.11385109275579453, |
|
"kl": 0.0, |
|
"learning_rate": 0.0004968438354840834, |
|
"loss": 0.0, |
|
"num_tokens": 29236.0, |
|
"reward": 1.1172062158584595, |
|
"reward_std": 0.001010744832456112, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.03279374912381172, |
|
"rewards/r_shaping/std": 0.007816643454134464, |
|
"rewards/r_soft/mean": 0.15000000596046448, |
|
"rewards/r_soft/std": 0.16035674512386322, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.0, |
|
"rewards/r_xmlcount/std": 0.10690450668334961, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 565.125, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 448.0, |
|
"completions/mean_length": 565.125, |
|
"completions/mean_terminated_length": 430.25, |
|
"completions/min_length": 417.0, |
|
"completions/min_terminated_length": 417.0, |
|
"epoch": 0.003747323340471092, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": NaN, |
|
"kl": 0.002949223853647709, |
|
"learning_rate": 0.0004950743417011591, |
|
"loss": 0.0001, |
|
"num_tokens": 34769.0, |
|
"reward": 1.1194353103637695, |
|
"reward_std": 0.00033387652365490794, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.030564583837985992, |
|
"rewards/r_shaping/std": 0.010096355341374874, |
|
"rewards/r_soft/mean": 0.15000000596046448, |
|
"rewards/r_soft/std": 0.16035674512386322, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.0, |
|
"rewards/r_xmlcount/std": 0.10690450668334961, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 612.0, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 577.0, |
|
"completions/mean_length": 612.0, |
|
"completions/mean_terminated_length": 524.0, |
|
"completions/min_length": 427.0, |
|
"completions/min_terminated_length": 427.0, |
|
"epoch": 0.004282655246252677, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.005263926927000284, |
|
"learning_rate": 0.0004929173350101025, |
|
"loss": 0.0003, |
|
"num_tokens": 40597.0, |
|
"reward": 1.1142561435699463, |
|
"reward_std": 0.007277170196175575, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.032618746161460876, |
|
"rewards/r_shaping/std": 0.00728320237249136, |
|
"rewards/r_soft/mean": 0.15000000596046448, |
|
"rewards/r_soft/std": 0.16035674512386322, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": -0.0031250007450580597, |
|
"rewards/r_xmlcount/std": 0.11054855585098267, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 540.75, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 666.0, |
|
"completions/mean_length": 540.75, |
|
"completions/mean_terminated_length": 518.0, |
|
"completions/min_length": 298.0, |
|
"completions/min_terminated_length": 298.0, |
|
"epoch": 0.004817987152034261, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.003534165909513831, |
|
"learning_rate": 0.0004903762260818551, |
|
"loss": 0.0002, |
|
"num_tokens": 45663.0, |
|
"reward": 2.0576353073120117, |
|
"reward_std": 0.6352876424789429, |
|
"rewards/r_correctness/mean": 1.75, |
|
"rewards/r_correctness/std": 0.7071067690849304, |
|
"rewards/r_shaping/mean": -0.026739582419395447, |
|
"rewards/r_shaping/std": 0.007363913580775261, |
|
"rewards/r_soft/mean": 0.26250001788139343, |
|
"rewards/r_soft/std": 0.1060660257935524, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.07187500596046448, |
|
"rewards/r_xmlcount/std": 0.07954951375722885, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 471.375, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 593.0, |
|
"completions/max_terminated_length": 593.0, |
|
"completions/mean_length": 471.375, |
|
"completions/mean_terminated_length": 471.375, |
|
"completions/min_length": 353.0, |
|
"completions/min_terminated_length": 353.0, |
|
"epoch": 0.0053533190578158455, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.007448031101375818, |
|
"learning_rate": 0.0004874550329319457, |
|
"loss": 0.0004, |
|
"num_tokens": 50302.0, |
|
"reward": 2.3762893676757812, |
|
"reward_std": 0.002844305010512471, |
|
"rewards/r_correctness/mean": 2.0, |
|
"rewards/r_correctness/std": 0.0, |
|
"rewards/r_shaping/mean": -0.02371041476726532, |
|
"rewards/r_shaping/std": 0.0028945477679371834, |
|
"rewards/r_soft/mean": 0.30000001192092896, |
|
"rewards/r_soft/std": 0.0, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.10000000149011612, |
|
"rewards/r_xmlcount/std": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 486.875, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 586.0, |
|
"completions/max_terminated_length": 586.0, |
|
"completions/mean_length": 486.875, |
|
"completions/mean_terminated_length": 486.875, |
|
"completions/min_length": 404.0, |
|
"completions/min_terminated_length": 404.0, |
|
"epoch": 0.005888650963597431, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0019934047013521194, |
|
"learning_rate": 0.00048415837456718195, |
|
"loss": 0.0001, |
|
"num_tokens": 55077.0, |
|
"reward": 2.3770666122436523, |
|
"reward_std": 0.0032932735048234463, |
|
"rewards/r_correctness/mean": 2.0, |
|
"rewards/r_correctness/std": 0.0, |
|
"rewards/r_shaping/mean": -0.02293333411216736, |
|
"rewards/r_shaping/std": 0.0030542060267180204, |
|
"rewards/r_soft/mean": 0.30000001192092896, |
|
"rewards/r_soft/std": 0.0, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.10000000149011612, |
|
"rewards/r_xmlcount/std": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 520.625, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 604.0, |
|
"completions/mean_length": 520.625, |
|
"completions/mean_terminated_length": 495.0000305175781, |
|
"completions/min_length": 386.0, |
|
"completions/min_terminated_length": 386.0, |
|
"epoch": 0.006423982869379015, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0034809389617294073, |
|
"learning_rate": 0.0004804914636820517, |
|
"loss": 0.0002, |
|
"num_tokens": 60182.0, |
|
"reward": 2.0551228523254395, |
|
"reward_std": 0.6347294449806213, |
|
"rewards/r_correctness/mean": 1.75, |
|
"rewards/r_correctness/std": 0.7071067690849304, |
|
"rewards/r_shaping/mean": -0.029252082109451294, |
|
"rewards/r_shaping/std": 0.005904427729547024, |
|
"rewards/r_soft/mean": 0.26250001788139343, |
|
"rewards/r_soft/std": 0.1060660257935524, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.07187500596046448, |
|
"rewards/r_xmlcount/std": 0.07954951375722885, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 601.25, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 698.0, |
|
"completions/mean_length": 601.25, |
|
"completions/mean_terminated_length": 542.0, |
|
"completions/min_length": 400.0, |
|
"completions/min_terminated_length": 400.0, |
|
"epoch": 0.006959314775160599, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.0022265929728746414, |
|
"learning_rate": 0.00047646009841638084, |
|
"loss": 0.0001, |
|
"num_tokens": 66032.0, |
|
"reward": 1.6835541725158691, |
|
"reward_std": 0.6659948229789734, |
|
"rewards/r_correctness/mean": 1.5, |
|
"rewards/r_correctness/std": 0.9258201122283936, |
|
"rewards/r_shaping/mean": -0.0320708304643631, |
|
"rewards/r_shaping/std": 0.006900239735841751, |
|
"rewards/r_soft/mean": 0.1875, |
|
"rewards/r_soft/std": 0.15526476502418518, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.02812499925494194, |
|
"rewards/r_xmlcount/std": 0.0994965061545372, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 478.375, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 600.0, |
|
"completions/max_terminated_length": 600.0, |
|
"completions/mean_length": 478.375, |
|
"completions/mean_terminated_length": 478.375, |
|
"completions/min_length": 269.0, |
|
"completions/min_terminated_length": 269.0, |
|
"epoch": 0.007494646680942184, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.002636353485286236, |
|
"learning_rate": 0.00047207065318728296, |
|
"loss": 0.0001, |
|
"num_tokens": 70759.0, |
|
"reward": 2.3731250762939453, |
|
"reward_std": 0.0061251213774085045, |
|
"rewards/r_correctness/mean": 2.0, |
|
"rewards/r_correctness/std": 0.0, |
|
"rewards/r_shaping/mean": -0.026875000447034836, |
|
"rewards/r_shaping/std": 0.006678614765405655, |
|
"rewards/r_soft/mean": 0.30000001192092896, |
|
"rewards/r_soft/std": 0.0, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.10000000149011612, |
|
"rewards/r_xmlcount/std": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 367.125, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 524.0, |
|
"completions/max_terminated_length": 524.0, |
|
"completions/mean_length": 367.125, |
|
"completions/mean_terminated_length": 367.125, |
|
"completions/min_length": 214.0, |
|
"completions/min_terminated_length": 214.0, |
|
"epoch": 0.008029978586723769, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": NaN, |
|
"kl": 0.005754380952566862, |
|
"learning_rate": 0.00046733006860989566, |
|
"loss": 0.0003, |
|
"num_tokens": 74608.0, |
|
"reward": 2.3812456130981445, |
|
"reward_std": 0.0021071869414299726, |
|
"rewards/r_correctness/mean": 2.0, |
|
"rewards/r_correctness/std": 0.0, |
|
"rewards/r_shaping/mean": -0.01875416561961174, |
|
"rewards/r_shaping/std": 0.005434602499008179, |
|
"rewards/r_soft/mean": 0.30000001192092896, |
|
"rewards/r_soft/std": 0.0, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.10000000149011612, |
|
"rewards/r_xmlcount/std": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 515.875, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 700.0, |
|
"completions/max_terminated_length": 366.0, |
|
"completions/mean_length": 515.875, |
|
"completions/mean_terminated_length": 331.75, |
|
"completions/min_length": 296.0, |
|
"completions/min_terminated_length": 296.0, |
|
"epoch": 0.008565310492505354, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": NaN, |
|
"kl": 0.0027227874379605055, |
|
"learning_rate": 0.000462245840522841, |
|
"loss": 0.0001, |
|
"num_tokens": 79783.0, |
|
"reward": 1.1220020055770874, |
|
"reward_std": 0.0010650103213265538, |
|
"rewards/r_correctness/mean": 1.0, |
|
"rewards/r_correctness/std": 1.0690449476242065, |
|
"rewards/r_shaping/mean": -0.02799791656434536, |
|
"rewards/r_shaping/std": 0.012906314805150032, |
|
"rewards/r_soft/mean": 0.15000000596046448, |
|
"rewards/r_soft/std": 0.16035674512386322, |
|
"rewards/r_strict/mean": 0.0, |
|
"rewards/r_strict/std": 0.0, |
|
"rewards/r_xmlcount/mean": 0.0, |
|
"rewards/r_xmlcount/std": 0.10690450668334961, |
|
"step": 16 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 80, |
|
"num_input_tokens_seen": 79783, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|