FF_L0.2_H0.2_dr_grpo / trainer_state.json
LLucass's picture
Model save
36853fa verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22857142857142856,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20052470266819,
"learning_rate": 0.0,
"loss": 0.0427,
"num_tokens": 118418.0,
"reward": 0.17899775505065918,
"reward_std": 0.7650213241577148,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19504369795322418,
"learning_rate": 5e-08,
"loss": 0.0561,
"num_tokens": 239748.0,
"reward": 0.3848632574081421,
"reward_std": 0.9111153483390808,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1896.0,
"completions/mean_length": 1948.96875,
"completions/mean_terminated_length": 991.6666870117188,
"completions/min_length": 534.0,
"completions/min_terminated_length": 534.0,
"epoch": 0.0034285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23850594460964203,
"learning_rate": 1e-07,
"loss": 0.0525,
"num_tokens": 374954.0,
"reward": -0.2894650101661682,
"reward_std": 0.40320682525634766,
"rewards/cosine_scaled_reward/mean": -0.1916075050830841,
"rewards/cosine_scaled_reward/std": 0.17467568814754486,
"rewards/format_reward/mean": 0.09375,
"rewards/format_reward/std": 0.29378482699394226,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1653.0,
"completions/mean_length": 1545.390625,
"completions/mean_terminated_length": 975.7667236328125,
"completions/min_length": 564.0,
"completions/min_terminated_length": 564.0,
"epoch": 0.004571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19908685982227325,
"learning_rate": 1.5e-07,
"loss": 0.0836,
"num_tokens": 483667.0,
"reward": 0.1905757486820221,
"reward_std": 0.6709368824958801,
"rewards/cosine_scaled_reward/mean": -0.16252461075782776,
"rewards/cosine_scaled_reward/std": 0.27594515681266785,
"rewards/format_reward/mean": 0.515625,
"rewards/format_reward/std": 0.5037065148353577,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1966.78125,
"completions/mean_terminated_length": 1181.666748046875,
"completions/min_length": 474.0,
"completions/min_terminated_length": 474.0,
"epoch": 0.005714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21755796670913696,
"learning_rate": 2e-07,
"loss": 0.0519,
"num_tokens": 620357.0,
"reward": -0.402042031288147,
"reward_std": 0.399784117937088,
"rewards/cosine_scaled_reward/mean": -0.24789603054523468,
"rewards/cosine_scaled_reward/std": 0.18156999349594116,
"rewards/format_reward/mean": 0.09375,
"rewards/format_reward/std": 0.29378482699394226,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1264.0,
"completions/mean_length": 1897.390625,
"completions/mean_terminated_length": 843.125,
"completions/min_length": 628.0,
"completions/min_terminated_length": 628.0,
"epoch": 0.006857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2295181304216385,
"learning_rate": 2.5e-07,
"loss": 0.0729,
"num_tokens": 753438.0,
"reward": -0.3786737024784088,
"reward_std": 0.4345499277114868,
"rewards/cosine_scaled_reward/mean": -0.2596493363380432,
"rewards/cosine_scaled_reward/std": 0.1708926111459732,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 1933.21875,
"completions/mean_terminated_length": 1231.77783203125,
"completions/min_length": 863.0,
"completions/min_terminated_length": 863.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20217153429985046,
"learning_rate": 3e-07,
"loss": 0.0254,
"num_tokens": 887572.0,
"reward": -0.13325583934783936,
"reward_std": 0.5423575639724731,
"rewards/cosine_scaled_reward/mean": -0.17600291967391968,
"rewards/cosine_scaled_reward/std": 0.35686567425727844,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1776.96875,
"completions/mean_terminated_length": 1180.7000732421875,
"completions/min_length": 342.0,
"completions/min_terminated_length": 342.0,
"epoch": 0.009142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19864660501480103,
"learning_rate": 3.5e-07,
"loss": -0.0092,
"num_tokens": 1011714.0,
"reward": 0.35212597250938416,
"reward_std": 0.7144544720649719,
"rewards/cosine_scaled_reward/mean": -0.003624534234404564,
"rewards/cosine_scaled_reward/std": 0.515006422996521,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1951.0625,
"completions/mean_terminated_length": 1161.71435546875,
"completions/min_length": 636.0,
"completions/min_terminated_length": 636.0,
"epoch": 0.010285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20887432992458344,
"learning_rate": 4e-07,
"loss": 0.0806,
"num_tokens": 1148038.0,
"reward": -0.3706062436103821,
"reward_std": 0.4610140025615692,
"rewards/cosine_scaled_reward/mean": -0.25561562180519104,
"rewards/cosine_scaled_reward/std": 0.1772036999464035,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1271.0,
"completions/mean_length": 1669.9375,
"completions/mean_terminated_length": 774.5263061523438,
"completions/min_length": 303.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.011428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20181182026863098,
"learning_rate": 4.5e-07,
"loss": 0.043,
"num_tokens": 1265746.0,
"reward": 0.0919075608253479,
"reward_std": 0.5226040482521057,
"rewards/cosine_scaled_reward/mean": -0.10248372703790665,
"rewards/cosine_scaled_reward/std": 0.37469154596328735,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 839.0,
"completions/mean_length": 1948.453125,
"completions/mean_terminated_length": 773.7999877929688,
"completions/min_length": 659.0,
"completions/min_terminated_length": 659.0,
"epoch": 0.012571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21668891608715057,
"learning_rate": 5e-07,
"loss": 0.0312,
"num_tokens": 1402119.0,
"reward": -0.4548088014125824,
"reward_std": 0.35335251688957214,
"rewards/cosine_scaled_reward/mean": -0.2664669156074524,
"rewards/cosine_scaled_reward/std": 0.1670963168144226,
"rewards/format_reward/mean": 0.078125,
"rewards/format_reward/std": 0.27048972249031067,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 1666.046875,
"completions/mean_terminated_length": 1142.629638671875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.013714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22070375084877014,
"learning_rate": 5.5e-07,
"loss": 0.0437,
"num_tokens": 1519690.0,
"reward": 0.07585961371660233,
"reward_std": 0.7337090373039246,
"rewards/cosine_scaled_reward/mean": -0.21207019686698914,
"rewards/cosine_scaled_reward/std": 0.32506927847862244,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1837.0,
"completions/mean_length": 1780.578125,
"completions/mean_terminated_length": 1147.2105712890625,
"completions/min_length": 780.0,
"completions/min_terminated_length": 780.0,
"epoch": 0.014857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21096666157245636,
"learning_rate": 6e-07,
"loss": 0.0463,
"num_tokens": 1644687.0,
"reward": 0.10567126423120499,
"reward_std": 0.7079647779464722,
"rewards/cosine_scaled_reward/mean": -0.11122686415910721,
"rewards/cosine_scaled_reward/std": 0.3569961190223694,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1880.0,
"completions/mean_length": 1887.984375,
"completions/mean_terminated_length": 1365.2667236328125,
"completions/min_length": 824.0,
"completions/min_terminated_length": 824.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21131716668605804,
"learning_rate": 6.5e-07,
"loss": 0.0144,
"num_tokens": 1776126.0,
"reward": -0.0225231796503067,
"reward_std": 0.5179126262664795,
"rewards/cosine_scaled_reward/mean": -0.14407408237457275,
"rewards/cosine_scaled_reward/std": 0.33444011211395264,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1351.0,
"completions/mean_length": 1718.78125,
"completions/mean_terminated_length": 731.125,
"completions/min_length": 420.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.017142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1991148591041565,
"learning_rate": 7e-07,
"loss": 0.0049,
"num_tokens": 1897048.0,
"reward": 0.19555333256721497,
"reward_std": 0.40205830335617065,
"rewards/cosine_scaled_reward/mean": -0.04284832626581192,
"rewards/cosine_scaled_reward/std": 0.4670048952102661,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1697.0,
"completions/mean_length": 2027.5,
"completions/mean_terminated_length": 1392.0,
"completions/min_length": 1087.0,
"completions/min_terminated_length": 1087.0,
"epoch": 0.018285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22394295036792755,
"learning_rate": 7.5e-07,
"loss": 0.0187,
"num_tokens": 2037248.0,
"reward": -0.47975414991378784,
"reward_std": 0.3722427487373352,
"rewards/cosine_scaled_reward/mean": -0.2555020749568939,
"rewards/cosine_scaled_reward/std": 0.17358116805553436,
"rewards/format_reward/mean": 0.03125,
"rewards/format_reward/std": 0.17536810040473938,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1789.0,
"completions/mean_length": 1608.859375,
"completions/mean_terminated_length": 826.0435180664062,
"completions/min_length": 325.0,
"completions/min_terminated_length": 325.0,
"epoch": 0.019428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20954757928848267,
"learning_rate": 8e-07,
"loss": 0.0717,
"num_tokens": 2150735.0,
"reward": 0.09985511004924774,
"reward_std": 0.7668930292129517,
"rewards/cosine_scaled_reward/mean": -0.13757243752479553,
"rewards/cosine_scaled_reward/std": 0.3857298791408539,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1656.0,
"completions/mean_length": 1832.9375,
"completions/mean_terminated_length": 1064.857177734375,
"completions/min_length": 616.0,
"completions/min_terminated_length": 616.0,
"epoch": 0.02057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19936956465244293,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0415,
"num_tokens": 2278419.0,
"reward": -0.09606979787349701,
"reward_std": 0.6028552055358887,
"rewards/cosine_scaled_reward/mean": -0.1886598914861679,
"rewards/cosine_scaled_reward/std": 0.2934761047363281,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 1797.421875,
"completions/mean_terminated_length": 1157.0555419921875,
"completions/min_length": 548.0,
"completions/min_terminated_length": 548.0,
"epoch": 0.021714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20787546038627625,
"learning_rate": 9e-07,
"loss": 0.0691,
"num_tokens": 2404710.0,
"reward": 0.3256925344467163,
"reward_std": 0.7026835680007935,
"rewards/cosine_scaled_reward/mean": -0.02465374395251274,
"rewards/cosine_scaled_reward/std": 0.48578760027885437,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1595.921875,
"completions/mean_terminated_length": 890.6799926757812,
"completions/min_length": 357.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.022857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19203181564807892,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0843,
"num_tokens": 2518201.0,
"reward": 0.2115776240825653,
"reward_std": 0.6924929618835449,
"rewards/cosine_scaled_reward/mean": -0.09733618050813675,
"rewards/cosine_scaled_reward/std": 0.4008020758628845,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1892.0,
"completions/mean_length": 1669.71875,
"completions/mean_terminated_length": 947.5454711914062,
"completions/min_length": 333.0,
"completions/min_terminated_length": 333.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19905951619148254,
"learning_rate": 1e-06,
"loss": 0.0554,
"num_tokens": 2635871.0,
"reward": -0.04711771011352539,
"reward_std": 0.6225218772888184,
"rewards/cosine_scaled_reward/mean": -0.2032463699579239,
"rewards/cosine_scaled_reward/std": 0.32066139578819275,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 1381.5625,
"completions/mean_terminated_length": 793.5294189453125,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"epoch": 0.025142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2047095149755478,
"learning_rate": 9.99931462820376e-07,
"loss": 0.0102,
"num_tokens": 2733307.0,
"reward": 0.5420082807540894,
"reward_std": 0.5808548927307129,
"rewards/cosine_scaled_reward/mean": -0.04149584099650383,
"rewards/cosine_scaled_reward/std": 0.45060864090919495,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 1658.0,
"completions/mean_terminated_length": 962.7826538085938,
"completions/min_length": 405.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.026285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19252249598503113,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0556,
"num_tokens": 2850211.0,
"reward": -0.003935225307941437,
"reward_std": 0.5448156595230103,
"rewards/cosine_scaled_reward/mean": -0.21290510892868042,
"rewards/cosine_scaled_reward/std": 0.3244985342025757,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 1739.015625,
"completions/mean_terminated_length": 1149.1363525390625,
"completions/min_length": 512.0,
"completions/min_terminated_length": 512.0,
"epoch": 0.027428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20268025994300842,
"learning_rate": 9.993832906395582e-07,
"loss": 0.0283,
"num_tokens": 2972436.0,
"reward": 0.023234538733959198,
"reward_std": 0.5804120898246765,
"rewards/cosine_scaled_reward/mean": -0.1836952269077301,
"rewards/cosine_scaled_reward/std": 0.3640914857387543,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 1718.3125,
"completions/mean_terminated_length": 875.7777709960938,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21169544756412506,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0649,
"num_tokens": 3092704.0,
"reward": -0.048267342150211334,
"reward_std": 0.6947153210639954,
"rewards/cosine_scaled_reward/mean": -0.17257116734981537,
"rewards/cosine_scaled_reward/std": 0.33179494738578796,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1931.46875,
"completions/mean_terminated_length": 1474.3077392578125,
"completions/min_length": 860.0,
"completions/min_terminated_length": 860.0,
"epoch": 0.029714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21874327957630157,
"learning_rate": 9.982876141412855e-07,
"loss": 0.0248,
"num_tokens": 3226950.0,
"reward": 0.07520664483308792,
"reward_std": 0.5721991658210754,
"rewards/cosine_scaled_reward/mean": -0.09520917385816574,
"rewards/cosine_scaled_reward/std": 0.355131059885025,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1709.0,
"completions/mean_length": 1887.21875,
"completions/mean_terminated_length": 904.6666870117188,
"completions/min_length": 505.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.030857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2260063886642456,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0282,
"num_tokens": 3358020.0,
"reward": -0.12340383231639862,
"reward_std": 0.6229674220085144,
"rewards/cosine_scaled_reward/mean": -0.1788894236087799,
"rewards/cosine_scaled_reward/std": 0.27315112948417664,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1976.0,
"completions/mean_length": 1818.03125,
"completions/mean_terminated_length": 1128.125,
"completions/min_length": 441.0,
"completions/min_terminated_length": 441.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2172878384590149,
"learning_rate": 9.96645768238595e-07,
"loss": 0.0203,
"num_tokens": 3484710.0,
"reward": -0.06130418926477432,
"reward_std": 0.6516651511192322,
"rewards/cosine_scaled_reward/mean": -0.17908960580825806,
"rewards/cosine_scaled_reward/std": 0.3907976746559143,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1187.0,
"completions/mean_length": 1990.765625,
"completions/mean_terminated_length": 827.0,
"completions/min_length": 625.0,
"completions/min_terminated_length": 625.0,
"epoch": 0.03314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21073698997497559,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0469,
"num_tokens": 3622591.0,
"reward": -0.33952879905700684,
"reward_std": 0.447256475687027,
"rewards/cosine_scaled_reward/mean": -0.20882689952850342,
"rewards/cosine_scaled_reward/std": 0.20297211408615112,
"rewards/format_reward/mean": 0.078125,
"rewards/format_reward/std": 0.27048972249031067,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1843.828125,
"completions/mean_terminated_length": 1231.3125,
"completions/min_length": 767.0,
"completions/min_terminated_length": 767.0,
"epoch": 0.03428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21709226071834564,
"learning_rate": 9.944597532678119e-07,
"loss": 0.0171,
"num_tokens": 3751132.0,
"reward": -0.024381320923566818,
"reward_std": 0.6315211057662964,
"rewards/cosine_scaled_reward/mean": -0.16062816977500916,
"rewards/cosine_scaled_reward/std": 0.2835782468318939,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1040.0,
"completions/mean_length": 1853.625,
"completions/mean_terminated_length": 665.7777709960938,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.03542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20489497482776642,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0071,
"num_tokens": 3880260.0,
"reward": -0.22396349906921387,
"reward_std": 0.6550674438476562,
"rewards/cosine_scaled_reward/mean": -0.19791924953460693,
"rewards/cosine_scaled_reward/std": 0.3350917100906372,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1902.109375,
"completions/mean_terminated_length": 1269.916748046875,
"completions/min_length": 772.0,
"completions/min_terminated_length": 772.0,
"epoch": 0.036571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20957782864570618,
"learning_rate": 9.917322325514487e-07,
"loss": 0.0611,
"num_tokens": 4012347.0,
"reward": -0.22782376408576965,
"reward_std": 0.6326622366905212,
"rewards/cosine_scaled_reward/mean": -0.22328688204288483,
"rewards/cosine_scaled_reward/std": 0.3028508722782135,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1698.0,
"completions/mean_length": 1945.34375,
"completions/mean_terminated_length": 1226.75,
"completions/min_length": 887.0,
"completions/min_terminated_length": 887.0,
"epoch": 0.037714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22317089140415192,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0347,
"num_tokens": 4148065.0,
"reward": -0.47040778398513794,
"reward_std": 0.4409722089767456,
"rewards/cosine_scaled_reward/mean": -0.30551639199256897,
"rewards/cosine_scaled_reward/std": 0.22323259711265564,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 1541.515625,
"completions/mean_terminated_length": 1002.3547973632812,
"completions/min_length": 475.0,
"completions/min_terminated_length": 475.0,
"epoch": 0.038857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2360963523387909,
"learning_rate": 9.88466529153356e-07,
"loss": 0.0712,
"num_tokens": 4256274.0,
"reward": 0.5805569291114807,
"reward_std": 0.8525061011314392,
"rewards/cosine_scaled_reward/mean": 0.04027845710515976,
"rewards/cosine_scaled_reward/std": 0.49936607480049133,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1861.0,
"completions/mean_length": 1808.921875,
"completions/mean_terminated_length": 871.0000610351562,
"completions/min_length": 466.0,
"completions/min_terminated_length": 466.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1972445547580719,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0577,
"num_tokens": 4383541.0,
"reward": 0.00036025047302246094,
"reward_std": 0.8111597895622253,
"rewards/cosine_scaled_reward/mean": -0.10919487476348877,
"rewards/cosine_scaled_reward/std": 0.44675883650779724,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 1990.765625,
"completions/mean_terminated_length": 1315.4000244140625,
"completions/min_length": 937.0,
"completions/min_terminated_length": 937.0,
"epoch": 0.04114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2348623126745224,
"learning_rate": 9.846666218300807e-07,
"loss": 0.0216,
"num_tokens": 4522062.0,
"reward": -0.4222595691680908,
"reward_std": 0.4755689203739166,
"rewards/cosine_scaled_reward/mean": -0.2501922845840454,
"rewards/cosine_scaled_reward/std": 0.2129606157541275,
"rewards/format_reward/mean": 0.078125,
"rewards/format_reward/std": 0.27048972249031067,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 1911.296875,
"completions/mean_terminated_length": 1173.0999755859375,
"completions/min_length": 629.0,
"completions/min_terminated_length": 629.0,
"epoch": 0.04228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22154958546161652,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0467,
"num_tokens": 4655409.0,
"reward": -0.2846450209617615,
"reward_std": 0.4525028467178345,
"rewards/cosine_scaled_reward/mean": -0.23607251048088074,
"rewards/cosine_scaled_reward/std": 0.19240929186344147,
"rewards/format_reward/mean": 0.1875,
"rewards/format_reward/std": 0.39339789748191833,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1530.0,
"completions/mean_length": 1906.65625,
"completions/mean_terminated_length": 1143.4000244140625,
"completions/min_length": 530.0,
"completions/min_terminated_length": 530.0,
"epoch": 0.04342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2259596437215805,
"learning_rate": 9.80337140183366e-07,
"loss": 0.0219,
"num_tokens": 4789147.0,
"reward": -0.14314083755016327,
"reward_std": 0.4587753117084503,
"rewards/cosine_scaled_reward/mean": -0.14969542622566223,
"rewards/cosine_scaled_reward/std": 0.30969110131263733,
"rewards/format_reward/mean": 0.15625,
"rewards/format_reward/std": 0.36596253514289856,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1729.984375,
"completions/mean_terminated_length": 691.1333618164062,
"completions/min_length": 312.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.044571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1975395530462265,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0518,
"num_tokens": 4910650.0,
"reward": 0.20782151818275452,
"reward_std": 0.5801891088485718,
"rewards/cosine_scaled_reward/mean": -0.08358924090862274,
"rewards/cosine_scaled_reward/std": 0.3715744912624359,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 1565.40625,
"completions/mean_terminated_length": 982.9655151367188,
"completions/min_length": 393.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.045714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19556699693202972,
"learning_rate": 9.754833590196926e-07,
"loss": 0.0176,
"num_tokens": 5020908.0,
"reward": 0.21666434407234192,
"reward_std": 0.47607892751693726,
"rewards/cosine_scaled_reward/mean": -0.12604281306266785,
"rewards/cosine_scaled_reward/std": 0.4459211230278015,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 1847.96875,
"completions/mean_terminated_length": 1247.875,
"completions/min_length": 799.0,
"completions/min_terminated_length": 799.0,
"epoch": 0.046857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19488316774368286,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0491,
"num_tokens": 5150330.0,
"reward": -0.15268605947494507,
"reward_std": 0.6881446838378906,
"rewards/cosine_scaled_reward/mean": -0.22478052973747253,
"rewards/cosine_scaled_reward/std": 0.3324533700942993,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 1661.296875,
"completions/mean_terminated_length": 673.0555419921875,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21376171708106995,
"learning_rate": 9.701111919237408e-07,
"loss": 0.0433,
"num_tokens": 5267013.0,
"reward": -0.20060807466506958,
"reward_std": 0.34422361850738525,
"rewards/cosine_scaled_reward/mean": -0.24874155223369598,
"rewards/cosine_scaled_reward/std": 0.17742608487606049,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1802.484375,
"completions/mean_terminated_length": 925.6428833007812,
"completions/min_length": 580.0,
"completions/min_terminated_length": 580.0,
"epoch": 0.04914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20949861407279968,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0468,
"num_tokens": 5393988.0,
"reward": 0.1097467839717865,
"reward_std": 0.4439903795719147,
"rewards/cosine_scaled_reward/mean": -0.07012660801410675,
"rewards/cosine_scaled_reward/std": 0.35852304100990295,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1497.0,
"completions/mean_length": 1639.375,
"completions/mean_terminated_length": 740.4000244140625,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.05028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20765061676502228,
"learning_rate": 9.64227184053598e-07,
"loss": 0.0677,
"num_tokens": 5509604.0,
"reward": 0.1744289994239807,
"reward_std": 0.7545564770698547,
"rewards/cosine_scaled_reward/mean": -0.09247300028800964,
"rewards/cosine_scaled_reward/std": 0.486594021320343,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1784.0,
"completions/mean_length": 2015.1875,
"completions/mean_terminated_length": 1628.0,
"completions/min_length": 1485.0,
"completions/min_terminated_length": 1485.0,
"epoch": 0.05142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22293689846992493,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0141,
"num_tokens": 5650232.0,
"reward": -0.28319618105888367,
"reward_std": 0.44461578130722046,
"rewards/cosine_scaled_reward/mean": -0.19628559052944183,
"rewards/cosine_scaled_reward/std": 0.2942677140235901,
"rewards/format_reward/mean": 0.109375,
"rewards/format_reward/std": 0.3145764470100403,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1232.0,
"completions/mean_length": 1769.8125,
"completions/mean_terminated_length": 861.0667114257812,
"completions/min_length": 538.0,
"completions/min_terminated_length": 538.0,
"epoch": 0.052571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21012793481349945,
"learning_rate": 9.578385041664925e-07,
"loss": 0.0845,
"num_tokens": 5774668.0,
"reward": -0.19958055019378662,
"reward_std": 0.37389740347862244,
"rewards/cosine_scaled_reward/mean": -0.2247902750968933,
"rewards/cosine_scaled_reward/std": 0.18379005789756775,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 1761.734375,
"completions/mean_terminated_length": 1131.9500732421875,
"completions/min_length": 370.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.053714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2044854760169983,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0366,
"num_tokens": 5897819.0,
"reward": -0.11128583550453186,
"reward_std": 0.7243642210960388,
"rewards/cosine_scaled_reward/mean": -0.22751793265342712,
"rewards/cosine_scaled_reward/std": 0.341621071100235,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 1720.890625,
"completions/mean_terminated_length": 1051.09521484375,
"completions/min_length": 430.0,
"completions/min_terminated_length": 430.0,
"epoch": 0.054857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19758965075016022,
"learning_rate": 9.509529358847654e-07,
"loss": 0.046,
"num_tokens": 6018500.0,
"reward": 0.026797622442245483,
"reward_std": 0.5594782829284668,
"rewards/cosine_scaled_reward/mean": -0.16628868877887726,
"rewards/cosine_scaled_reward/std": 0.29110410809516907,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1723.0,
"completions/mean_length": 1488.71875,
"completions/mean_terminated_length": 813.72412109375,
"completions/min_length": 402.0,
"completions/min_terminated_length": 402.0,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1846495270729065,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0203,
"num_tokens": 6123842.0,
"reward": 0.3029339909553528,
"reward_std": 0.6658899188041687,
"rewards/cosine_scaled_reward/mean": -0.09853300452232361,
"rewards/cosine_scaled_reward/std": 0.4083656370639801,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 1733.59375,
"completions/mean_terminated_length": 790.375,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19527027010917664,
"learning_rate": 9.43578868212728e-07,
"loss": 0.0136,
"num_tokens": 6245608.0,
"reward": 0.15902790427207947,
"reward_std": 0.46005839109420776,
"rewards/cosine_scaled_reward/mean": -0.06892354786396027,
"rewards/cosine_scaled_reward/std": 0.4567166864871979,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1432.421875,
"completions/mean_terminated_length": 777.1290283203125,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.05828571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21701110899448395,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0239,
"num_tokens": 6347491.0,
"reward": 0.2233203500509262,
"reward_std": 0.6041151285171509,
"rewards/cosine_scaled_reward/mean": -0.1383398175239563,
"rewards/cosine_scaled_reward/std": 0.3747152090072632,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1448.0,
"completions/mean_length": 1720.046875,
"completions/mean_terminated_length": 736.1875,
"completions/min_length": 301.0,
"completions/min_terminated_length": 301.0,
"epoch": 0.05942857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19577208161354065,
"learning_rate": 9.357252853159505e-07,
"loss": 0.0066,
"num_tokens": 6468926.0,
"reward": -0.1786521077156067,
"reward_std": 0.3358575701713562,
"rewards/cosine_scaled_reward/mean": -0.21432605385780334,
"rewards/cosine_scaled_reward/std": 0.3689535856246948,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1931.0,
"completions/mean_length": 1718.9375,
"completions/mean_terminated_length": 878.0,
"completions/min_length": 468.0,
"completions/min_terminated_length": 468.0,
"epoch": 0.060571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21421696245670319,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0423,
"num_tokens": 6589770.0,
"reward": -0.03741084039211273,
"reward_std": 0.7027454376220703,
"rewards/cosine_scaled_reward/mean": -0.17495542764663696,
"rewards/cosine_scaled_reward/std": 0.29642969369888306,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 1664.625,
"completions/mean_terminated_length": 1171.71435546875,
"completions/min_length": 518.0,
"completions/min_terminated_length": 518.0,
"epoch": 0.061714285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19009248912334442,
"learning_rate": 9.274017555754407e-07,
"loss": 0.0958,
"num_tokens": 6707450.0,
"reward": 0.2984742522239685,
"reward_std": 1.0811007022857666,
"rewards/cosine_scaled_reward/mean": -0.08513787388801575,
"rewards/cosine_scaled_reward/std": 0.455229252576828,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1352.0,
"completions/mean_length": 1757.359375,
"completions/mean_terminated_length": 807.933349609375,
"completions/min_length": 517.0,
"completions/min_terminated_length": 517.0,
"epoch": 0.06285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1981392800807953,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0294,
"num_tokens": 6830209.0,
"reward": 0.0005421042442321777,
"reward_std": 0.512083888053894,
"rewards/cosine_scaled_reward/mean": -0.1403539478778839,
"rewards/cosine_scaled_reward/std": 0.37260064482688904,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 1717.890625,
"completions/mean_terminated_length": 1087.681884765625,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21031354367733002,
"learning_rate": 9.186184199300463e-07,
"loss": 0.0425,
"num_tokens": 6951114.0,
"reward": 0.25747445225715637,
"reward_std": 0.5027350187301636,
"rewards/cosine_scaled_reward/mean": -0.08220025897026062,
"rewards/cosine_scaled_reward/std": 0.4609789550304413,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 1946.203125,
"completions/mean_terminated_length": 1455.727294921875,
"completions/min_length": 844.0,
"completions/min_terminated_length": 844.0,
"epoch": 0.06514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1840263158082962,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0406,
"num_tokens": 7087239.0,
"reward": -0.31278592348098755,
"reward_std": 0.5103937387466431,
"rewards/cosine_scaled_reward/mean": -0.2501429617404938,
"rewards/cosine_scaled_reward/std": 0.23870430886745453,
"rewards/format_reward/mean": 0.1875,
"rewards/format_reward/std": 0.39339789748191833,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1514.5625,
"completions/mean_terminated_length": 946.7096557617188,
"completions/min_length": 411.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.06628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18003003299236298,
"learning_rate": 9.093859795212817e-07,
"loss": 0.0669,
"num_tokens": 7194267.0,
"reward": 0.3626611530780792,
"reward_std": 0.6513576507568359,
"rewards/cosine_scaled_reward/mean": -0.09991942346096039,
"rewards/cosine_scaled_reward/std": 0.42993852496147156,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 1704.8125,
"completions/mean_terminated_length": 1132.8333740234375,
"completions/min_length": 524.0,
"completions/min_terminated_length": 524.0,
"epoch": 0.06742857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17114725708961487,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0061,
"num_tokens": 7313839.0,
"reward": 0.15319865942001343,
"reward_std": 0.6165874004364014,
"rewards/cosine_scaled_reward/mean": -0.11871317774057388,
"rewards/cosine_scaled_reward/std": 0.3659735918045044,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1896.0,
"completions/mean_length": 1767.53125,
"completions/mean_terminated_length": 1050.77783203125,
"completions/min_length": 459.0,
"completions/min_terminated_length": 459.0,
"epoch": 0.06857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1782463639974594,
"learning_rate": 8.997156826556369e-07,
"loss": 0.0527,
"num_tokens": 7437849.0,
"reward": -0.09879650175571442,
"reward_std": 0.6538424491882324,
"rewards/cosine_scaled_reward/mean": -0.2212732434272766,
"rewards/cosine_scaled_reward/std": 0.3128809630870819,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 1799.53125,
"completions/mean_terminated_length": 1054.125,
"completions/min_length": 420.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.06971428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19245384633541107,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0533,
"num_tokens": 7564539.0,
"reward": 0.1226256862282753,
"reward_std": 0.7401602268218994,
"rewards/cosine_scaled_reward/mean": -0.11056216061115265,
"rewards/cosine_scaled_reward/std": 0.314616322517395,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.453125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1458.1875,
"completions/mean_terminated_length": 969.4857177734375,
"completions/min_length": 364.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.07085714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17495828866958618,
"learning_rate": 8.896193111002475e-07,
"loss": 0.0785,
"num_tokens": 7668095.0,
"reward": 0.6185990571975708,
"reward_std": 0.6951406598091125,
"rewards/cosine_scaled_reward/mean": 0.020237013697624207,
"rewards/cosine_scaled_reward/std": 0.42793402075767517,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.49776285886764526,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1880.0,
"completions/mean_length": 1369.65625,
"completions/mean_terminated_length": 962.6500244140625,
"completions/min_length": 384.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.072,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17925356328487396,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0802,
"num_tokens": 7766009.0,
"reward": 0.588592529296875,
"reward_std": 0.7614073753356934,
"rewards/cosine_scaled_reward/mean": -0.0260162390768528,
"rewards/cosine_scaled_reward/std": 0.47686251997947693,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1849.0,
"completions/mean_length": 1493.0625,
"completions/mean_terminated_length": 1061.4444580078125,
"completions/min_length": 421.0,
"completions/min_terminated_length": 421.0,
"epoch": 0.07314285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.197045236825943,
"learning_rate": 8.791091657286267e-07,
"loss": 0.1112,
"num_tokens": 7872517.0,
"reward": 0.4587404727935791,
"reward_std": 0.7483726739883423,
"rewards/cosine_scaled_reward/mean": -0.08312976360321045,
"rewards/cosine_scaled_reward/std": 0.3704431354999542,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1561.09375,
"completions/mean_terminated_length": 749.5833740234375,
"completions/min_length": 276.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.07428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17185057699680328,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0652,
"num_tokens": 7983131.0,
"reward": -0.022998124361038208,
"reward_std": 0.5443873405456543,
"rewards/cosine_scaled_reward/mean": -0.2146240472793579,
"rewards/cosine_scaled_reward/std": 0.39696088433265686,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 1160.96875,
"completions/mean_terminated_length": 757.7727661132812,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"epoch": 0.07542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15682660043239594,
"learning_rate": 8.681980515339463e-07,
"loss": 0.0317,
"num_tokens": 8067665.0,
"reward": 0.7723344564437866,
"reward_std": 0.5304180979728699,
"rewards/cosine_scaled_reward/mean": 0.03460472822189331,
"rewards/cosine_scaled_reward/std": 0.47199109196662903,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1529.0,
"completions/mean_length": 1760.03125,
"completions/mean_terminated_length": 1024.111083984375,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.07657142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18002018332481384,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0536,
"num_tokens": 8191043.0,
"reward": -0.27919694781303406,
"reward_std": 0.3664131164550781,
"rewards/cosine_scaled_reward/mean": -0.2724109888076782,
"rewards/cosine_scaled_reward/std": 0.16395430266857147,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1307.0,
"completions/mean_length": 917.34375,
"completions/mean_terminated_length": 600.760009765625,
"completions/min_length": 295.0,
"completions/min_terminated_length": 295.0,
"epoch": 0.07771428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13612917065620422,
"learning_rate": 8.568992620281243e-07,
"loss": 0.0077,
"num_tokens": 8259009.0,
"reward": 0.6957368850708008,
"reward_std": 0.5402743816375732,
"rewards/cosine_scaled_reward/mean": -0.04275655001401901,
"rewards/cosine_scaled_reward/std": 0.434044748544693,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1526.0,
"completions/mean_length": 1233.78125,
"completions/mean_terminated_length": 863.6818237304688,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.07885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19043830037117004,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0558,
"num_tokens": 8348315.0,
"reward": 0.21049074828624725,
"reward_std": 0.5405222177505493,
"rewards/cosine_scaled_reward/mean": -0.24631711840629578,
"rewards/cosine_scaled_reward/std": 0.2778205871582031,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1808.9375,
"completions/mean_terminated_length": 871.0769653320312,
"completions/min_length": 513.0,
"completions/min_terminated_length": 513.0,
"epoch": 0.08,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19704341888427734,
"learning_rate": 8.452265630457282e-07,
"loss": 0.0391,
"num_tokens": 8475543.0,
"reward": -0.18982277810573578,
"reward_std": 0.5247766971588135,
"rewards/cosine_scaled_reward/mean": -0.2355363965034485,
"rewards/cosine_scaled_reward/std": 0.3067134916782379,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1800.0,
"completions/mean_length": 1563.21875,
"completions/mean_terminated_length": 806.9599609375,
"completions/min_length": 315.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.08114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18498718738555908,
"learning_rate": 8.392544243589427e-07,
"loss": 0.016,
"num_tokens": 8586309.0,
"reward": 0.19864726066589355,
"reward_std": 0.576451301574707,
"rewards/cosine_scaled_reward/mean": -0.10380134731531143,
"rewards/cosine_scaled_reward/std": 0.476872056722641,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 1406.0625,
"completions/mean_terminated_length": 906.7777709960938,
"completions/min_length": 353.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.08228571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17368191480636597,
"learning_rate": 8.331941759724268e-07,
"loss": 0.0237,
"num_tokens": 8686649.0,
"reward": 0.22483232617378235,
"reward_std": 0.45926159620285034,
"rewards/cosine_scaled_reward/mean": -0.20789632201194763,
"rewards/cosine_scaled_reward/std": 0.294547975063324,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 1912.875,
"completions/mean_terminated_length": 1327.3333740234375,
"completions/min_length": 878.0,
"completions/min_terminated_length": 878.0,
"epoch": 0.08342857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20081810653209686,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0223,
"num_tokens": 8819801.0,
"reward": -0.18328779935836792,
"reward_std": 0.5305245518684387,
"rewards/cosine_scaled_reward/mean": -0.20883139967918396,
"rewards/cosine_scaled_reward/std": 0.2695733904838562,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1517.875,
"completions/mean_terminated_length": 987.75,
"completions/min_length": 560.0,
"completions/min_terminated_length": 560.0,
"epoch": 0.08457142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1813385784626007,
"learning_rate": 8.208167604184217e-07,
"loss": 0.085,
"num_tokens": 8926873.0,
"reward": 0.46356096863746643,
"reward_std": 0.6926693916320801,
"rewards/cosine_scaled_reward/mean": -0.018219511955976486,
"rewards/cosine_scaled_reward/std": 0.47079169750213623,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1907.0,
"completions/mean_length": 1515.734375,
"completions/mean_terminated_length": 1046.0882568359375,
"completions/min_length": 374.0,
"completions/min_terminated_length": 374.0,
"epoch": 0.08571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18714174628257751,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0989,
"num_tokens": 9034840.0,
"reward": 0.5457433462142944,
"reward_std": 0.6619582176208496,
"rewards/cosine_scaled_reward/mean": -0.00837831199169159,
"rewards/cosine_scaled_reward/std": 0.5059990882873535,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1222.0,
"completions/mean_length": 1340.484375,
"completions/mean_terminated_length": 790.1944580078125,
"completions/min_length": 407.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.08685714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17108581960201263,
"learning_rate": 8.081093963579707e-07,
"loss": 0.0209,
"num_tokens": 9131031.0,
"reward": 0.19882698357105255,
"reward_std": 0.5817238092422485,
"rewards/cosine_scaled_reward/mean": -0.18964898586273193,
"rewards/cosine_scaled_reward/std": 0.3000561594963074,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.49776285886764526,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1755.0,
"completions/mean_length": 1518.765625,
"completions/mean_terminated_length": 1051.7940673828125,
"completions/min_length": 641.0,
"completions/min_terminated_length": 641.0,
"epoch": 0.088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1759587675333023,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0869,
"num_tokens": 9239808.0,
"reward": 0.2113216668367386,
"reward_std": 0.5600536465644836,
"rewards/cosine_scaled_reward/mean": -0.1599641740322113,
"rewards/cosine_scaled_reward/std": 0.33541423082351685,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1659.0,
"completions/mean_length": 1656.15625,
"completions/mean_terminated_length": 957.6522216796875,
"completions/min_length": 530.0,
"completions/min_terminated_length": 530.0,
"epoch": 0.08914285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17662394046783447,
"learning_rate": 7.950875657567621e-07,
"loss": 0.0177,
"num_tokens": 9356522.0,
"reward": 0.25513648986816406,
"reward_std": 0.5462654829025269,
"rewards/cosine_scaled_reward/mean": -0.05993174761533737,
"rewards/cosine_scaled_reward/std": 0.4486319124698639,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1957.0,
"completions/mean_length": 1289.359375,
"completions/mean_terminated_length": 834.1749877929688,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.09028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15610884130001068,
"learning_rate": 7.884636689049422e-07,
"loss": 0.026,
"num_tokens": 9449137.0,
"reward": 0.4372347593307495,
"reward_std": 0.5517712831497192,
"rewards/cosine_scaled_reward/mean": -0.10950762033462524,
"rewards/cosine_scaled_reward/std": 0.3864338994026184,
"rewards/format_reward/mean": 0.65625,
"rewards/format_reward/std": 0.4787135720252991,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1902.0,
"completions/mean_length": 1623.5,
"completions/mean_terminated_length": 1111.17236328125,
"completions/min_length": 538.0,
"completions/min_terminated_length": 538.0,
"epoch": 0.09142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18003858625888824,
"learning_rate": 7.817671337095244e-07,
"loss": 0.017,
"num_tokens": 9563433.0,
"reward": 0.11363417655229568,
"reward_std": 0.5530154705047607,
"rewards/cosine_scaled_reward/mean": -0.16974541544914246,
"rewards/cosine_scaled_reward/std": 0.3006208539009094,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.501733124256134,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1432.125,
"completions/mean_terminated_length": 1010.7368774414062,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.09257142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2004833072423935,
"learning_rate": 7.75e-07,
"loss": 0.0603,
"num_tokens": 9666361.0,
"reward": 0.512394905090332,
"reward_std": 0.7596394419670105,
"rewards/cosine_scaled_reward/mean": -0.05630255863070488,
"rewards/cosine_scaled_reward/std": 0.43662360310554504,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1721.0,
"completions/mean_length": 1341.03125,
"completions/mean_terminated_length": 970.7142944335938,
"completions/min_length": 431.0,
"completions/min_terminated_length": 431.0,
"epoch": 0.09371428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1563584953546524,
"learning_rate": 7.681643291108517e-07,
"loss": 0.0182,
"num_tokens": 9762515.0,
"reward": 0.746865451335907,
"reward_std": 0.571272611618042,
"rewards/cosine_scaled_reward/mean": 0.037495262920856476,
"rewards/cosine_scaled_reward/std": 0.5523709654808044,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1739.0,
"completions/mean_length": 1357.640625,
"completions/mean_terminated_length": 853.8648681640625,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.09485714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17990301549434662,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0313,
"num_tokens": 9860492.0,
"reward": 0.4607480764389038,
"reward_std": 0.4022068381309509,
"rewards/cosine_scaled_reward/mean": -0.0665009543299675,
"rewards/cosine_scaled_reward/std": 0.36611077189445496,
"rewards/format_reward/mean": 0.59375,
"rewards/format_reward/std": 0.49501484632492065,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1392.609375,
"completions/mean_terminated_length": 972.4871826171875,
"completions/min_length": 395.0,
"completions/min_terminated_length": 395.0,
"epoch": 0.096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16545262932777405,
"learning_rate": 7.54295724882796e-07,
"loss": 0.016,
"num_tokens": 9960315.0,
"reward": 0.3932368755340576,
"reward_std": 0.662509024143219,
"rewards/cosine_scaled_reward/mean": -0.11588154733181,
"rewards/cosine_scaled_reward/std": 0.428220272064209,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1427.515625,
"completions/mean_terminated_length": 1220.6875,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.09714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14229631423950195,
"learning_rate": 7.472670160550848e-07,
"loss": -0.0247,
"num_tokens": 10061996.0,
"reward": 0.7478936910629272,
"reward_std": 0.8706425428390503,
"rewards/cosine_scaled_reward/mean": -0.00886566936969757,
"rewards/cosine_scaled_reward/std": 0.4233645796775818,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1459.28125,
"completions/mean_terminated_length": 1081.8974609375,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.09828571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19291459023952484,
"learning_rate": 7.401782177833147e-07,
"loss": 0.0182,
"num_tokens": 10166246.0,
"reward": 0.30948999524116516,
"reward_std": 0.55961012840271,
"rewards/cosine_scaled_reward/mean": -0.1733800172805786,
"rewards/cosine_scaled_reward/std": 0.30220499634742737,
"rewards/format_reward/mean": 0.65625,
"rewards/format_reward/std": 0.4787135720252991,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 1140.828125,
"completions/mean_terminated_length": 972.8333129882812,
"completions/min_length": 353.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.09942857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14790062606334686,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0321,
"num_tokens": 10249379.0,
"reward": 0.429340660572052,
"reward_std": 0.47173961997032166,
"rewards/cosine_scaled_reward/mean": -0.207204669713974,
"rewards/cosine_scaled_reward/std": 0.27721449732780457,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1519.0,
"completions/mean_length": 1258.484375,
"completions/mean_terminated_length": 872.906982421875,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"epoch": 0.10057142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1598745733499527,
"learning_rate": 7.258290078201731e-07,
"loss": 0.0482,
"num_tokens": 10340434.0,
"reward": 0.8419445157051086,
"reward_std": 0.7817317247390747,
"rewards/cosine_scaled_reward/mean": 0.06940975040197372,
"rewards/cosine_scaled_reward/std": 0.4935828149318695,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1919.0,
"completions/mean_length": 1373.1875,
"completions/mean_terminated_length": 1166.6121826171875,
"completions/min_length": 675.0,
"completions/min_terminated_length": 675.0,
"epoch": 0.10171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1521584838628769,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0547,
"num_tokens": 10439318.0,
"reward": 0.648002028465271,
"reward_std": 0.6874127984046936,
"rewards/cosine_scaled_reward/mean": -0.0978739783167839,
"rewards/cosine_scaled_reward/std": 0.41632241010665894,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1239.703125,
"completions/mean_terminated_length": 992.2652587890625,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.10285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1843656599521637,
"learning_rate": 7.11265577295385e-07,
"loss": 0.0371,
"num_tokens": 10528659.0,
"reward": 0.4645897150039673,
"reward_std": 0.6535974740982056,
"rewards/cosine_scaled_reward/mean": -0.15833015739917755,
"rewards/cosine_scaled_reward/std": 0.3457205295562744,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.453125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1961.0,
"completions/mean_length": 1610.09375,
"completions/mean_terminated_length": 1247.2572021484375,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17640981078147888,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0305,
"num_tokens": 10642273.0,
"reward": 0.5222002267837524,
"reward_std": 0.9113218784332275,
"rewards/cosine_scaled_reward/mean": -0.05139988660812378,
"rewards/cosine_scaled_reward/std": 0.4710950553417206,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1320.09375,
"completions/mean_terminated_length": 938.8095703125,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"epoch": 0.10514285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15313342213630676,
"learning_rate": 6.965056695057204e-07,
"loss": 0.0055,
"num_tokens": 10736751.0,
"reward": 0.4166978597640991,
"reward_std": 0.6364502310752869,
"rewards/cosine_scaled_reward/mean": -0.13540107011795044,
"rewards/cosine_scaled_reward/std": 0.3054071068763733,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 1770.671875,
"completions/mean_terminated_length": 1113.8421630859375,
"completions/min_length": 632.0,
"completions/min_terminated_length": 632.0,
"epoch": 0.10628571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21292737126350403,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0675,
"num_tokens": 10861418.0,
"reward": -0.15841422975063324,
"reward_std": 0.4093279242515564,
"rewards/cosine_scaled_reward/mean": -0.24326962232589722,
"rewards/cosine_scaled_reward/std": 0.16840828955173492,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1532.65625,
"completions/mean_terminated_length": 948.6000366210938,
"completions/min_length": 511.0,
"completions/min_terminated_length": 511.0,
"epoch": 0.10742857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20122359693050385,
"learning_rate": 6.815672671252315e-07,
"loss": 0.0623,
"num_tokens": 10969276.0,
"reward": 0.20252148807048798,
"reward_std": 0.345744788646698,
"rewards/cosine_scaled_reward/mean": -0.1409267634153366,
"rewards/cosine_scaled_reward/std": 0.4320366382598877,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.5037065148353577,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.453125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1745.0,
"completions/mean_length": 1530.03125,
"completions/mean_terminated_length": 1100.857177734375,
"completions/min_length": 700.0,
"completions/min_terminated_length": 700.0,
"epoch": 0.10857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16728746891021729,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0592,
"num_tokens": 11077726.0,
"reward": 0.05856095254421234,
"reward_std": 0.5498154163360596,
"rewards/cosine_scaled_reward/mean": -0.25196951627731323,
"rewards/cosine_scaled_reward/std": 0.27556198835372925,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 1279.6875,
"completions/mean_terminated_length": 1044.48974609375,
"completions/min_length": 452.0,
"completions/min_terminated_length": 452.0,
"epoch": 0.10971428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1483285129070282,
"learning_rate": 6.664685702961344e-07,
"loss": 0.0161,
"num_tokens": 11170762.0,
"reward": 0.8373413681983948,
"reward_std": 0.4410895109176636,
"rewards/cosine_scaled_reward/mean": 0.01242067664861679,
"rewards/cosine_scaled_reward/std": 0.46624863147735596,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1905.0,
"completions/mean_length": 1312.640625,
"completions/mean_terminated_length": 1024.891357421875,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.11085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16424083709716797,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0535,
"num_tokens": 11265035.0,
"reward": 0.5586233139038086,
"reward_std": 0.7126098871231079,
"rewards/cosine_scaled_reward/mean": -0.11131332814693451,
"rewards/cosine_scaled_reward/std": 0.3577263653278351,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1940.0,
"completions/mean_length": 1376.53125,
"completions/mean_terminated_length": 1024.8095703125,
"completions/min_length": 372.0,
"completions/min_terminated_length": 372.0,
"epoch": 0.112,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17384155094623566,
"learning_rate": 6.512279744547392e-07,
"loss": 0.0164,
"num_tokens": 11364197.0,
"reward": 0.6794039607048035,
"reward_std": 0.4869590997695923,
"rewards/cosine_scaled_reward/mean": -0.02748553454875946,
"rewards/cosine_scaled_reward/std": 0.45645180344581604,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 1280.09375,
"completions/mean_terminated_length": 955.86669921875,
"completions/min_length": 415.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.11314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17342573404312134,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0816,
"num_tokens": 11457291.0,
"reward": 0.7432724237442017,
"reward_std": 0.6722617745399475,
"rewards/cosine_scaled_reward/mean": -0.003363795578479767,
"rewards/cosine_scaled_reward/std": 0.4415356516838074,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1965.0,
"completions/mean_length": 1247.765625,
"completions/mean_terminated_length": 1063.09619140625,
"completions/min_length": 520.0,
"completions/min_terminated_length": 520.0,
"epoch": 0.11428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15301530063152313,
"learning_rate": 6.358640479194451e-07,
"loss": 0.0125,
"num_tokens": 11546860.0,
"reward": 0.803851306438446,
"reward_std": 0.6947499513626099,
"rewards/cosine_scaled_reward/mean": -0.019949357956647873,
"rewards/cosine_scaled_reward/std": 0.4705973267555237,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1911.0,
"completions/mean_length": 1269.671875,
"completions/mean_terminated_length": 1125.5369873046875,
"completions/min_length": 485.0,
"completions/min_terminated_length": 485.0,
"epoch": 0.11542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1690932661294937,
"learning_rate": 6.281416799501187e-07,
"loss": 0.013,
"num_tokens": 11639551.0,
"reward": 0.6836185455322266,
"reward_std": 0.5046678781509399,
"rewards/cosine_scaled_reward/mean": -0.08787819743156433,
"rewards/cosine_scaled_reward/std": 0.40181559324264526,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1872.0,
"completions/mean_length": 1174.265625,
"completions/mean_terminated_length": 1012.4629516601562,
"completions/min_length": 340.0,
"completions/min_terminated_length": 340.0,
"epoch": 0.11657142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16043449938297272,
"learning_rate": 6.203955092681039e-07,
"loss": 0.032,
"num_tokens": 11724856.0,
"reward": 0.67606520652771,
"reward_std": 0.6234960556030273,
"rewards/cosine_scaled_reward/mean": -0.09165491163730621,
"rewards/cosine_scaled_reward/std": 0.37837859988212585,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1953.0,
"completions/mean_length": 1157.8125,
"completions/mean_terminated_length": 930.9019775390625,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.11771428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1574372500181198,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0249,
"num_tokens": 11809308.0,
"reward": 0.4326379895210266,
"reward_std": 0.5444109439849854,
"rewards/cosine_scaled_reward/mean": -0.1977435052394867,
"rewards/cosine_scaled_reward/std": 0.3261271119117737,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1330.484375,
"completions/mean_terminated_length": 954.6428833007812,
"completions/min_length": 371.0,
"completions/min_terminated_length": 371.0,
"epoch": 0.11885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18211479485034943,
"learning_rate": 6.048412045323164e-07,
"loss": 0.0439,
"num_tokens": 11904923.0,
"reward": 0.4620264172554016,
"reward_std": 0.5293800830841064,
"rewards/cosine_scaled_reward/mean": -0.12054930627346039,
"rewards/cosine_scaled_reward/std": 0.3497216999530792,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 1224.859375,
"completions/mean_terminated_length": 994.3800048828125,
"completions/min_length": 499.0,
"completions/min_terminated_length": 499.0,
"epoch": 0.12,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1528584063053131,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0233,
"num_tokens": 11994602.0,
"reward": 0.7569347620010376,
"reward_std": 0.6899948120117188,
"rewards/cosine_scaled_reward/mean": -0.027782641351222992,
"rewards/cosine_scaled_reward/std": 0.5096075534820557,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 1239.625,
"completions/mean_terminated_length": 898.3111572265625,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"epoch": 0.12114285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1494080275297165,
"learning_rate": 5.892200842364462e-07,
"loss": 0.0226,
"num_tokens": 12084770.0,
"reward": 1.043992519378662,
"reward_std": 0.7194849252700806,
"rewards/cosine_scaled_reward/mean": 0.13918372988700867,
"rewards/cosine_scaled_reward/std": 0.46339961886405945,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 1139.515625,
"completions/mean_terminated_length": 971.2777709960938,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.12228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1769389808177948,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0041,
"num_tokens": 12168851.0,
"reward": 0.46204712986946106,
"reward_std": 0.5935191512107849,
"rewards/cosine_scaled_reward/mean": -0.18303894996643066,
"rewards/cosine_scaled_reward/std": 0.30380427837371826,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 1382.375,
"completions/mean_terminated_length": 983.0,
"completions/min_length": 348.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.12342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16175994277000427,
"learning_rate": 5.735511803093248e-07,
"loss": 0.0651,
"num_tokens": 12267683.0,
"reward": 0.3516117334365845,
"reward_std": 0.7561339735984802,
"rewards/cosine_scaled_reward/mean": -0.17575663328170776,
"rewards/cosine_scaled_reward/std": 0.35719168186187744,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1965.0,
"completions/mean_length": 1392.828125,
"completions/mean_terminated_length": 1049.642822265625,
"completions/min_length": 543.0,
"completions/min_terminated_length": 543.0,
"epoch": 0.12457142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16766144335269928,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0149,
"num_tokens": 12368072.0,
"reward": 0.7171763181686401,
"reward_std": 0.4656876027584076,
"rewards/cosine_scaled_reward/mean": 0.007025681436061859,
"rewards/cosine_scaled_reward/std": 0.4227021336555481,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 1239.796875,
"completions/mean_terminated_length": 923.5435180664062,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.12571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14556895196437836,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0102,
"num_tokens": 12458195.0,
"reward": 0.3774694800376892,
"reward_std": 0.654548704624176,
"rewards/cosine_scaled_reward/mean": -0.1784527748823166,
"rewards/cosine_scaled_reward/std": 0.331076443195343,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1170.40625,
"completions/mean_terminated_length": 1045.0357666015625,
"completions/min_length": 508.0,
"completions/min_terminated_length": 508.0,
"epoch": 0.12685714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1637505292892456,
"learning_rate": 5.5e-07,
"loss": 0.0741,
"num_tokens": 12543221.0,
"reward": 0.6489747762680054,
"reward_std": 0.654654860496521,
"rewards/cosine_scaled_reward/mean": -0.12082511186599731,
"rewards/cosine_scaled_reward/std": 0.34212014079093933,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1831.0,
"completions/mean_length": 1475.09375,
"completions/mean_terminated_length": 1057.027099609375,
"completions/min_length": 351.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.128,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1809089183807373,
"learning_rate": 5.421464171032224e-07,
"loss": 0.0187,
"num_tokens": 12648723.0,
"reward": 0.6672303676605225,
"reward_std": 0.7431913614273071,
"rewards/cosine_scaled_reward/mean": 0.01330268383026123,
"rewards/cosine_scaled_reward/std": 0.4883294403553009,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1961.0,
"completions/mean_length": 1056.4375,
"completions/mean_terminated_length": 914.7857666015625,
"completions/min_length": 340.0,
"completions/min_terminated_length": 340.0,
"epoch": 0.12914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1637895107269287,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0644,
"num_tokens": 12726631.0,
"reward": 0.6515660881996155,
"reward_std": 0.5848349332809448,
"rewards/cosine_scaled_reward/mean": -0.11952944099903107,
"rewards/cosine_scaled_reward/std": 0.4174686074256897,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1711.0,
"completions/mean_length": 1097.859375,
"completions/mean_terminated_length": 962.1250610351562,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.13028571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1319950371980667,
"learning_rate": 5.264488196906752e-07,
"loss": 0.0226,
"num_tokens": 12806742.0,
"reward": 0.6668691635131836,
"reward_std": 0.6580501794815063,
"rewards/cosine_scaled_reward/mean": -0.1431279182434082,
"rewards/cosine_scaled_reward/std": 0.378142774105072,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1380.0625,
"completions/mean_terminated_length": 1076.45458984375,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"epoch": 0.13142857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1882496327161789,
"learning_rate": 5.186095868151436e-07,
"loss": 0.04,
"num_tokens": 12906282.0,
"reward": 0.43996283411979675,
"reward_std": 0.6503387093544006,
"rewards/cosine_scaled_reward/mean": -0.13939358294010162,
"rewards/cosine_scaled_reward/std": 0.3781909942626953,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1852.0,
"completions/mean_length": 1364.125,
"completions/mean_terminated_length": 953.7999877929688,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.13257142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1886526346206665,
"learning_rate": 5.107799157635538e-07,
"loss": 0.1079,
"num_tokens": 13004970.0,
"reward": 0.5331847667694092,
"reward_std": 0.7935209274291992,
"rewards/cosine_scaled_reward/mean": -0.08497010916471481,
"rewards/cosine_scaled_reward/std": 0.4501515328884125,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 1136.4375,
"completions/mean_terminated_length": 1024.4912109375,
"completions/min_length": 505.0,
"completions/min_terminated_length": 505.0,
"epoch": 0.1337142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1523984968662262,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0671,
"num_tokens": 13088726.0,
"reward": 0.7468037009239197,
"reward_std": 0.7615803480148315,
"rewards/cosine_scaled_reward/mean": -0.08753564208745956,
"rewards/cosine_scaled_reward/std": 0.44001707434654236,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 1274.578125,
"completions/mean_terminated_length": 971.934814453125,
"completions/min_length": 474.0,
"completions/min_terminated_length": 474.0,
"epoch": 0.13485714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14875811338424683,
"learning_rate": 4.951587954676837e-07,
"loss": 0.0166,
"num_tokens": 13180835.0,
"reward": 0.6522707939147949,
"reward_std": 0.589940071105957,
"rewards/cosine_scaled_reward/mean": -0.041052110493183136,
"rewards/cosine_scaled_reward/std": 0.5126345157623291,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1962.0,
"completions/mean_length": 1070.359375,
"completions/mean_terminated_length": 844.7500610351562,
"completions/min_length": 333.0,
"completions/min_terminated_length": 333.0,
"epoch": 0.136,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15418609976768494,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0135,
"num_tokens": 13259746.0,
"reward": 0.8924436569213867,
"reward_std": 0.6925675272941589,
"rewards/cosine_scaled_reward/mean": 0.00872182846069336,
"rewards/cosine_scaled_reward/std": 0.49334391951560974,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 868.90625,
"completions/mean_terminated_length": 850.1905517578125,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.13714285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12889909744262695,
"learning_rate": 4.79604490731896e-07,
"loss": -0.0038,
"num_tokens": 13325812.0,
"reward": 0.833016574382782,
"reward_std": 0.6583147048950195,
"rewards/cosine_scaled_reward/mean": -0.08349171280860901,
"rewards/cosine_scaled_reward/std": 0.43434619903564453,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1890.0,
"completions/mean_length": 792.359375,
"completions/mean_terminated_length": 730.6065063476562,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"epoch": 0.1382857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14095140993595123,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0479,
"num_tokens": 13386219.0,
"reward": 1.289149284362793,
"reward_std": 0.6984070539474487,
"rewards/cosine_scaled_reward/mean": 0.16801217198371887,
"rewards/cosine_scaled_reward/std": 0.5607498288154602,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1384.5,
"completions/mean_terminated_length": 1060.465087890625,
"completions/min_length": 334.0,
"completions/min_terminated_length": 334.0,
"epoch": 0.13942857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17925438284873962,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0462,
"num_tokens": 13486387.0,
"reward": 0.4263126254081726,
"reward_std": 0.6481289267539978,
"rewards/cosine_scaled_reward/mean": -0.1462186872959137,
"rewards/cosine_scaled_reward/std": 0.3027765154838562,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1833.0,
"completions/mean_length": 1208.140625,
"completions/mean_terminated_length": 1014.3269653320312,
"completions/min_length": 519.0,
"completions/min_terminated_length": 519.0,
"epoch": 0.14057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14084899425506592,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0087,
"num_tokens": 13573940.0,
"reward": 0.5345523357391357,
"reward_std": 0.35669955611228943,
"rewards/cosine_scaled_reward/mean": -0.16241134703159332,
"rewards/cosine_scaled_reward/std": 0.3877701759338379,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1185.703125,
"completions/mean_terminated_length": 1026.0185546875,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.1417142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13726963102817535,
"learning_rate": 4.4877202554526084e-07,
"loss": 0.0309,
"num_tokens": 13660777.0,
"reward": 0.802190363407135,
"reward_std": 0.6432194709777832,
"rewards/cosine_scaled_reward/mean": -0.044217295944690704,
"rewards/cosine_scaled_reward/std": 0.4381820559501648,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 1272.40625,
"completions/mean_terminated_length": 1074.7059326171875,
"completions/min_length": 451.0,
"completions/min_terminated_length": 451.0,
"epoch": 0.14285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1567695438861847,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0098,
"num_tokens": 13753139.0,
"reward": 0.8288029432296753,
"reward_std": 0.6727226972579956,
"rewards/cosine_scaled_reward/mean": 0.0003389418125152588,
"rewards/cosine_scaled_reward/std": 0.501276433467865,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1738.0,
"completions/mean_length": 1239.6875,
"completions/mean_terminated_length": 992.244873046875,
"completions/min_length": 609.0,
"completions/min_terminated_length": 609.0,
"epoch": 0.144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1638517528772354,
"learning_rate": 4.3353142970386557e-07,
"loss": 0.0357,
"num_tokens": 13843775.0,
"reward": 0.8066681623458862,
"reward_std": 0.8093670010566711,
"rewards/cosine_scaled_reward/mean": -0.0029159002006053925,
"rewards/cosine_scaled_reward/std": 0.40039899945259094,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 1259.078125,
"completions/mean_terminated_length": 1038.179931640625,
"completions/min_length": 430.0,
"completions/min_terminated_length": 430.0,
"epoch": 0.14514285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18749745190143585,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.1248,
"num_tokens": 13935452.0,
"reward": 0.3689166009426117,
"reward_std": 0.5908951759338379,
"rewards/cosine_scaled_reward/mean": -0.22960419952869415,
"rewards/cosine_scaled_reward/std": 0.2868925929069519,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1942.0,
"completions/mean_length": 1131.578125,
"completions/mean_terminated_length": 1070.4833984375,
"completions/min_length": 449.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.1462857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14950093626976013,
"learning_rate": 4.1843273287476854e-07,
"loss": 0.0065,
"num_tokens": 14018225.0,
"reward": 0.9214786291122437,
"reward_std": 0.7154524922370911,
"rewards/cosine_scaled_reward/mean": -0.01582319289445877,
"rewards/cosine_scaled_reward/std": 0.47363659739494324,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 1407.71875,
"completions/mean_terminated_length": 1211.7142333984375,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.14742857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16768500208854675,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0736,
"num_tokens": 14119023.0,
"reward": 0.5007042288780212,
"reward_std": 0.6594030261039734,
"rewards/cosine_scaled_reward/mean": -0.14808538556098938,
"rewards/cosine_scaled_reward/std": 0.3597432076931,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1705.0,
"completions/mean_length": 1331.546875,
"completions/mean_terminated_length": 929.6340942382812,
"completions/min_length": 364.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.14857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1533743441104889,
"learning_rate": 4.034943304942796e-07,
"loss": 0.065,
"num_tokens": 14214746.0,
"reward": 0.18521776795387268,
"reward_std": 0.4527278244495392,
"rewards/cosine_scaled_reward/mean": -0.25895363092422485,
"rewards/cosine_scaled_reward/std": 0.2297503650188446,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 1240.703125,
"completions/mean_terminated_length": 971.6041870117188,
"completions/min_length": 348.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.14971428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16382953524589539,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0716,
"num_tokens": 14303887.0,
"reward": 1.0216246843338013,
"reward_std": 0.8127155303955078,
"rewards/cosine_scaled_reward/mean": 0.10456232726573944,
"rewards/cosine_scaled_reward/std": 0.48323893547058105,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1521.0,
"completions/mean_length": 1340.046875,
"completions/mean_terminated_length": 855.6578979492188,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"epoch": 0.15085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1577305793762207,
"learning_rate": 3.8873442270461485e-07,
"loss": 0.0288,
"num_tokens": 14400714.0,
"reward": 0.4232841432094574,
"reward_std": 0.6519888639450073,
"rewards/cosine_scaled_reward/mean": -0.1008579432964325,
"rewards/cosine_scaled_reward/std": 0.42636433243751526,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 1292.671875,
"completions/mean_terminated_length": 1061.448974609375,
"completions/min_length": 459.0,
"completions/min_terminated_length": 459.0,
"epoch": 0.152,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1743871122598648,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0212,
"num_tokens": 14494669.0,
"reward": 0.6165566444396973,
"reward_std": 0.5660312175750732,
"rewards/cosine_scaled_reward/mean": -0.08234670013189316,
"rewards/cosine_scaled_reward/std": 0.31525060534477234,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1096.953125,
"completions/mean_terminated_length": 998.5689697265625,
"completions/min_length": 364.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.15314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16208074986934662,
"learning_rate": 3.7417099217982686e-07,
"loss": 0.0283,
"num_tokens": 14575442.0,
"reward": 1.0213682651519775,
"reward_std": 0.6743905544281006,
"rewards/cosine_scaled_reward/mean": 0.041934188455343246,
"rewards/cosine_scaled_reward/std": 0.5223273038864136,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1720.0,
"completions/mean_length": 861.328125,
"completions/mean_terminated_length": 738.5689697265625,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.15428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15264089405536652,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0412,
"num_tokens": 14641039.0,
"reward": 1.173776388168335,
"reward_std": 0.741400957107544,
"rewards/cosine_scaled_reward/mean": 0.12595069408416748,
"rewards/cosine_scaled_reward/std": 0.5099307298660278,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1234.375,
"completions/mean_terminated_length": 1118.1429443359375,
"completions/min_length": 429.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.15542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14350080490112305,
"learning_rate": 3.5982178221668533e-07,
"loss": 0.0484,
"num_tokens": 14730711.0,
"reward": 0.7637453675270081,
"reward_std": 0.6790728569030762,
"rewards/cosine_scaled_reward/mean": -0.10250230133533478,
"rewards/cosine_scaled_reward/std": 0.4094173312187195,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 1222.53125,
"completions/mean_terminated_length": 1152.5762939453125,
"completions/min_length": 555.0,
"completions/min_terminated_length": 555.0,
"epoch": 0.15657142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1349533200263977,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0333,
"num_tokens": 14819561.0,
"reward": 0.6314640641212463,
"reward_std": 0.6037685871124268,
"rewards/cosine_scaled_reward/mean": -0.16083045303821564,
"rewards/cosine_scaled_reward/std": 0.3636666238307953,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 1173.65625,
"completions/mean_terminated_length": 1048.75,
"completions/min_length": 320.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.15771428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13684934377670288,
"learning_rate": 3.45704275117204e-07,
"loss": 0.0176,
"num_tokens": 14905987.0,
"reward": 0.8157724142074585,
"reward_std": 0.7757042646408081,
"rewards/cosine_scaled_reward/mean": -0.04523882642388344,
"rewards/cosine_scaled_reward/std": 0.4742158055305481,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 1268.453125,
"completions/mean_terminated_length": 1124.0926513671875,
"completions/min_length": 291.0,
"completions/min_terminated_length": 291.0,
"epoch": 0.15885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14245833456516266,
"learning_rate": 3.387377967463493e-07,
"loss": -0.0152,
"num_tokens": 14997808.0,
"reward": 0.7688822746276855,
"reward_std": 0.5957136750221252,
"rewards/cosine_scaled_reward/mean": -0.09212135523557663,
"rewards/cosine_scaled_reward/std": 0.42672204971313477,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1748.0,
"completions/mean_length": 1121.875,
"completions/mean_terminated_length": 970.3272705078125,
"completions/min_length": 471.0,
"completions/min_terminated_length": 471.0,
"epoch": 0.16,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.14092370867729187,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.0336,
"num_tokens": 15079832.0,
"reward": 0.6852799654006958,
"reward_std": 0.412535697221756,
"rewards/cosine_scaled_reward/mean": -0.0948600098490715,
"rewards/cosine_scaled_reward/std": 0.46610429883003235,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1891.0,
"completions/mean_length": 1039.921875,
"completions/mean_terminated_length": 954.4915161132812,
"completions/min_length": 442.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.16114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1313817948102951,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0563,
"num_tokens": 15156947.0,
"reward": 1.052842140197754,
"reward_std": 0.7119845151901245,
"rewards/cosine_scaled_reward/mean": 0.03423358500003815,
"rewards/cosine_scaled_reward/std": 0.4524931311607361,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 1376.234375,
"completions/mean_terminated_length": 1092.5999755859375,
"completions/min_length": 326.0,
"completions/min_terminated_length": 326.0,
"epoch": 0.16228571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1682240515947342,
"learning_rate": 3.182328662904756e-07,
"loss": 0.0936,
"num_tokens": 15255530.0,
"reward": 0.44548213481903076,
"reward_std": 0.7928640842437744,
"rewards/cosine_scaled_reward/mean": -0.18350891768932343,
"rewards/cosine_scaled_reward/std": 0.36820653080940247,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 1057.03125,
"completions/mean_terminated_length": 1008.2950439453125,
"completions/min_length": 416.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.16342857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15701259672641754,
"learning_rate": 3.115363310950578e-07,
"loss": 0.032,
"num_tokens": 15333996.0,
"reward": 0.681940495967865,
"reward_std": 0.6061316728591919,
"rewards/cosine_scaled_reward/mean": -0.1434047520160675,
"rewards/cosine_scaled_reward/std": 0.31647545099258423,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 1187.875,
"completions/mean_terminated_length": 968.6275024414062,
"completions/min_length": 316.0,
"completions/min_terminated_length": 316.0,
"epoch": 0.16457142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1424182802438736,
"learning_rate": 3.0491243424323783e-07,
"loss": 0.0431,
"num_tokens": 15421508.0,
"reward": 1.0751841068267822,
"reward_std": 0.7788275480270386,
"rewards/cosine_scaled_reward/mean": 0.12352952361106873,
"rewards/cosine_scaled_reward/std": 0.5238592028617859,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1908.0,
"completions/mean_length": 908.546875,
"completions/mean_terminated_length": 852.5081787109375,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"epoch": 0.1657142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1289522349834442,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0264,
"num_tokens": 15489599.0,
"reward": 1.0159393548965454,
"reward_std": 0.6956236958503723,
"rewards/cosine_scaled_reward/mean": 0.023594655096530914,
"rewards/cosine_scaled_reward/std": 0.472563236951828,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1742.0,
"completions/mean_length": 1074.765625,
"completions/mean_terminated_length": 1009.8833618164062,
"completions/min_length": 457.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.16685714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12770003080368042,
"learning_rate": 2.918906036420294e-07,
"loss": 0.0393,
"num_tokens": 15569000.0,
"reward": 0.5655175447463989,
"reward_std": 0.5674481987953186,
"rewards/cosine_scaled_reward/mean": -0.19380369782447815,
"rewards/cosine_scaled_reward/std": 0.32235828042030334,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1272.375,
"completions/mean_terminated_length": 1111.396240234375,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.168,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.160521000623703,
"learning_rate": 2.854966364683872e-07,
"loss": 0.017,
"num_tokens": 15661216.0,
"reward": 0.5459345579147339,
"reward_std": 0.7825783491134644,
"rewards/cosine_scaled_reward/mean": -0.14890772104263306,
"rewards/cosine_scaled_reward/std": 0.4268314838409424,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1729.0,
"completions/mean_length": 1068.40625,
"completions/mean_terminated_length": 948.1052856445312,
"completions/min_length": 388.0,
"completions/min_terminated_length": 388.0,
"epoch": 0.16914285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13231147825717926,
"learning_rate": 2.791832395815782e-07,
"loss": 0.0355,
"num_tokens": 15740778.0,
"reward": 0.8093540668487549,
"reward_std": 0.5906412601470947,
"rewards/cosine_scaled_reward/mean": -0.08751046657562256,
"rewards/cosine_scaled_reward/std": 0.40702494978904724,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 1392.28125,
"completions/mean_terminated_length": 998.8500366210938,
"completions/min_length": 559.0,
"completions/min_terminated_length": 559.0,
"epoch": 0.1702857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1449200063943863,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0033,
"num_tokens": 15841780.0,
"reward": 0.4228026866912842,
"reward_std": 0.745114266872406,
"rewards/cosine_scaled_reward/mean": -0.1323486566543579,
"rewards/cosine_scaled_reward/std": 0.37805312871932983,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1004.125,
"completions/mean_terminated_length": 952.7868041992188,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.17142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13021093606948853,
"learning_rate": 2.6680582402757324e-07,
"loss": 0.0355,
"num_tokens": 15916548.0,
"reward": 0.7599377632141113,
"reward_std": 0.5821801424026489,
"rewards/cosine_scaled_reward/mean": -0.11221860349178314,
"rewards/cosine_scaled_reward/std": 0.3788122236728668,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1859.0,
"completions/mean_length": 1107.015625,
"completions/mean_terminated_length": 911.7169799804688,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.17257142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14420656859874725,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0295,
"num_tokens": 15998077.0,
"reward": 1.2211229801177979,
"reward_std": 0.7430520057678223,
"rewards/cosine_scaled_reward/mean": 0.18087396025657654,
"rewards/cosine_scaled_reward/std": 0.5226595401763916,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 1256.234375,
"completions/mean_terminated_length": 946.4130859375,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.1737142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15784548223018646,
"learning_rate": 2.547734369542718e-07,
"loss": 0.0658,
"num_tokens": 16089140.0,
"reward": 0.6517580151557922,
"reward_std": 0.7057055830955505,
"rewards/cosine_scaled_reward/mean": -0.056933484971523285,
"rewards/cosine_scaled_reward/std": 0.403768390417099,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1169.125,
"completions/mean_terminated_length": 1043.571533203125,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"epoch": 0.17485714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13566994667053223,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0529,
"num_tokens": 16175108.0,
"reward": 0.4462122321128845,
"reward_std": 0.4172056317329407,
"rewards/cosine_scaled_reward/mean": -0.22220639884471893,
"rewards/cosine_scaled_reward/std": 0.19565363228321075,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1909.0,
"completions/mean_length": 1226.546875,
"completions/mean_terminated_length": 975.0816040039062,
"completions/min_length": 443.0,
"completions/min_terminated_length": 443.0,
"epoch": 0.176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15715065598487854,
"learning_rate": 2.4310073797187573e-07,
"loss": 0.0615,
"num_tokens": 16264671.0,
"reward": 0.6308701038360596,
"reward_std": 0.6271623373031616,
"rewards/cosine_scaled_reward/mean": -0.11425244808197021,
"rewards/cosine_scaled_reward/std": 0.37054499983787537,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 995.125,
"completions/mean_terminated_length": 822.8363647460938,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"epoch": 0.17714285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13968248665332794,
"learning_rate": 2.374037332934512e-07,
"loss": -0.0003,
"num_tokens": 16338983.0,
"reward": 0.7562404870986938,
"reward_std": 0.70821213722229,
"rewards/cosine_scaled_reward/mean": -0.08281721919775009,
"rewards/cosine_scaled_reward/std": 0.44696903228759766,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1915.0,
"completions/mean_length": 1327.6875,
"completions/mean_terminated_length": 950.3809814453125,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.1782857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15466168522834778,
"learning_rate": 2.3180194846605364e-07,
"loss": 0.0102,
"num_tokens": 16434059.0,
"reward": 0.6187171936035156,
"reward_std": 0.7333636283874512,
"rewards/cosine_scaled_reward/mean": -0.026578888297080994,
"rewards/cosine_scaled_reward/std": 0.49515098333358765,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 1259.796875,
"completions/mean_terminated_length": 1077.9039306640625,
"completions/min_length": 471.0,
"completions/min_terminated_length": 471.0,
"epoch": 0.17942857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15207421779632568,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.002,
"num_tokens": 16524646.0,
"reward": 0.48799604177474976,
"reward_std": 0.5923628211021423,
"rewards/cosine_scaled_reward/mean": -0.18568949401378632,
"rewards/cosine_scaled_reward/std": 0.28887510299682617,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 1055.890625,
"completions/mean_terminated_length": 971.8135375976562,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.18057142857142858,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.13212887942790985,
"learning_rate": 2.2089083427137329e-07,
"loss": 0.0164,
"num_tokens": 16602343.0,
"reward": 0.9118403196334839,
"reward_std": 0.5433474779129028,
"rewards/cosine_scaled_reward/mean": -0.03626735508441925,
"rewards/cosine_scaled_reward/std": 0.5205101370811462,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 1232.875,
"completions/mean_terminated_length": 1099.4908447265625,
"completions/min_length": 426.0,
"completions/min_terminated_length": 426.0,
"epoch": 0.18171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1433304101228714,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.066,
"num_tokens": 16692927.0,
"reward": 0.4464070796966553,
"reward_std": 0.5299515128135681,
"rewards/cosine_scaled_reward/mean": -0.22210896015167236,
"rewards/cosine_scaled_reward/std": 0.27688807249069214,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1549.0,
"completions/mean_length": 1081.25,
"completions/mean_terminated_length": 923.0545043945312,
"completions/min_length": 361.0,
"completions/min_terminated_length": 361.0,
"epoch": 0.18285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15510301291942596,
"learning_rate": 2.1038068889975259e-07,
"loss": 0.071,
"num_tokens": 16773711.0,
"reward": 0.8579483032226562,
"reward_std": 0.7331453561782837,
"rewards/cosine_scaled_reward/mean": -0.024150855839252472,
"rewards/cosine_scaled_reward/std": 0.43525949120521545,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1896.0,
"completions/mean_length": 1213.984375,
"completions/mean_terminated_length": 1059.5369873046875,
"completions/min_length": 421.0,
"completions/min_terminated_length": 421.0,
"epoch": 0.184,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1579800397157669,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0563,
"num_tokens": 16861398.0,
"reward": 0.790129542350769,
"reward_std": 0.8190513849258423,
"rewards/cosine_scaled_reward/mean": -0.04243520647287369,
"rewards/cosine_scaled_reward/std": 0.4257972538471222,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 1173.828125,
"completions/mean_terminated_length": 972.09619140625,
"completions/min_length": 513.0,
"completions/min_terminated_length": 513.0,
"epoch": 0.18514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14933519065380096,
"learning_rate": 2.0028431734436308e-07,
"loss": 0.0335,
"num_tokens": 16946827.0,
"reward": 0.6066349744796753,
"reward_std": 0.807995080947876,
"rewards/cosine_scaled_reward/mean": -0.11855749785900116,
"rewards/cosine_scaled_reward/std": 0.40160706639289856,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 1202.625,
"completions/mean_terminated_length": 920.8333740234375,
"completions/min_length": 410.0,
"completions/min_terminated_length": 410.0,
"epoch": 0.18628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17073573172092438,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.008,
"num_tokens": 17034979.0,
"reward": 1.164199709892273,
"reward_std": 0.6732690930366516,
"rewards/cosine_scaled_reward/mean": 0.22272484004497528,
"rewards/cosine_scaled_reward/std": 0.5151689648628235,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1055.265625,
"completions/mean_terminated_length": 933.3508911132812,
"completions/min_length": 278.0,
"completions/min_terminated_length": 278.0,
"epoch": 0.18742857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14955352246761322,
"learning_rate": 1.9061402047871833e-07,
"loss": 0.0617,
"num_tokens": 17113044.0,
"reward": 1.0745567083358765,
"reward_std": 0.44688692688941956,
"rewards/cosine_scaled_reward/mean": 0.07634085416793823,
"rewards/cosine_scaled_reward/std": 0.45942261815071106,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1950.0,
"completions/mean_length": 1082.15625,
"completions/mean_terminated_length": 982.2413940429688,
"completions/min_length": 425.0,
"completions/min_terminated_length": 425.0,
"epoch": 0.18857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15888644754886627,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0088,
"num_tokens": 17193718.0,
"reward": 0.9942861199378967,
"reward_std": 0.6299077272415161,
"rewards/cosine_scaled_reward/mean": 0.02058056741952896,
"rewards/cosine_scaled_reward/std": 0.46080252528190613,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 1271.890625,
"completions/mean_terminated_length": 1013.1875,
"completions/min_length": 363.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.18971428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14011530578136444,
"learning_rate": 1.8138158006995363e-07,
"loss": 0.0285,
"num_tokens": 17286695.0,
"reward": 0.5431326627731323,
"reward_std": 0.6457577347755432,
"rewards/cosine_scaled_reward/mean": -0.13468365371227264,
"rewards/cosine_scaled_reward/std": 0.3553418219089508,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1882.0,
"completions/mean_length": 1100.046875,
"completions/mean_terminated_length": 1001.9827270507812,
"completions/min_length": 476.0,
"completions/min_terminated_length": 476.0,
"epoch": 0.19085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14937180280685425,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0807,
"num_tokens": 17368642.0,
"reward": 0.6264936923980713,
"reward_std": 0.5748982429504395,
"rewards/cosine_scaled_reward/mean": -0.14769065380096436,
"rewards/cosine_scaled_reward/std": 0.2645467221736908,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 1130.84375,
"completions/mean_terminated_length": 1116.2857666015625,
"completions/min_length": 437.0,
"completions/min_terminated_length": 437.0,
"epoch": 0.192,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13785859942436218,
"learning_rate": 1.7259824442455923e-07,
"loss": -0.0247,
"num_tokens": 17451856.0,
"reward": 1.0183875560760498,
"reward_std": 0.7866266965866089,
"rewards/cosine_scaled_reward/mean": 0.017006313428282738,
"rewards/cosine_scaled_reward/std": 0.48554277420043945,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1877.0,
"completions/mean_length": 963.734375,
"completions/mean_terminated_length": 928.758056640625,
"completions/min_length": 498.0,
"completions/min_terminated_length": 498.0,
"epoch": 0.19314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.132929727435112,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0452,
"num_tokens": 17524159.0,
"reward": 1.5141942501068115,
"reward_std": 0.7578620910644531,
"rewards/cosine_scaled_reward/mean": 0.26490968465805054,
"rewards/cosine_scaled_reward/std": 0.53211909532547,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1697.0,
"completions/mean_length": 1058.453125,
"completions/mean_terminated_length": 781.3800048828125,
"completions/min_length": 337.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.19428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14723296463489532,
"learning_rate": 1.6427471468404952e-07,
"loss": 0.0659,
"num_tokens": 17601684.0,
"reward": 0.8584200739860535,
"reward_std": 0.4904913902282715,
"rewards/cosine_scaled_reward/mean": 0.007335059344768524,
"rewards/cosine_scaled_reward/std": 0.44158241152763367,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1732.0,
"completions/mean_length": 1232.28125,
"completions/mean_terminated_length": 960.375,
"completions/min_length": 414.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.19542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16656361520290375,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0264,
"num_tokens": 17690942.0,
"reward": 0.6898657083511353,
"reward_std": 0.6278946399688721,
"rewards/cosine_scaled_reward/mean": -0.030067168176174164,
"rewards/cosine_scaled_reward/std": 0.45971429347991943,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1040.625,
"completions/mean_terminated_length": 875.7817993164062,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.19657142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15691731870174408,
"learning_rate": 1.5642113178727193e-07,
"loss": 0.0625,
"num_tokens": 17768158.0,
"reward": 1.2213534116744995,
"reward_std": 0.6515992879867554,
"rewards/cosine_scaled_reward/mean": 0.17317672073841095,
"rewards/cosine_scaled_reward/std": 0.5265737771987915,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1873.0,
"completions/mean_length": 899.28125,
"completions/mean_terminated_length": 758.2105102539062,
"completions/min_length": 292.0,
"completions/min_terminated_length": 292.0,
"epoch": 0.1977142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1264735609292984,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0532,
"num_tokens": 17837024.0,
"reward": 0.7364885210990906,
"reward_std": 0.6678578853607178,
"rewards/cosine_scaled_reward/mean": -0.0848807543516159,
"rewards/cosine_scaled_reward/std": 0.4483066201210022,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1777.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 953.328125,
"completions/mean_terminated_length": 953.328125,
"completions/min_length": 508.0,
"completions/min_terminated_length": 508.0,
"epoch": 0.19885714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13611741364002228,
"learning_rate": 1.4904706411523448e-07,
"loss": 0.0037,
"num_tokens": 17908373.0,
"reward": 0.9751720428466797,
"reward_std": 0.5935230255126953,
"rewards/cosine_scaled_reward/mean": -0.012413978576660156,
"rewards/cosine_scaled_reward/std": 0.4495556354522705,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 1108.25,
"completions/mean_terminated_length": 974.0000610351562,
"completions/min_length": 390.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15151762962341309,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0591,
"num_tokens": 17990125.0,
"reward": 0.881943941116333,
"reward_std": 0.575822114944458,
"rewards/cosine_scaled_reward/mean": -0.0121530219912529,
"rewards/cosine_scaled_reward/std": 0.49256107211112976,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1940.0,
"completions/mean_length": 1168.84375,
"completions/mean_terminated_length": 986.3773803710938,
"completions/min_length": 257.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.20114285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15197090804576874,
"learning_rate": 1.4216149583350755e-07,
"loss": 0.0193,
"num_tokens": 18076099.0,
"reward": 0.5906968712806702,
"reward_std": 0.5817879438400269,
"rewards/cosine_scaled_reward/mean": -0.12652656435966492,
"rewards/cosine_scaled_reward/std": 0.3300129473209381,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1038.0625,
"completions/mean_terminated_length": 970.7333984375,
"completions/min_length": 390.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.2022857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15584589540958405,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0401,
"num_tokens": 18152759.0,
"reward": 1.083601474761963,
"reward_std": 0.8219331502914429,
"rewards/cosine_scaled_reward/mean": 0.08086325228214264,
"rewards/cosine_scaled_reward/std": 0.47295841574668884,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1839.0,
"completions/mean_length": 1048.5625,
"completions/mean_terminated_length": 905.7857666015625,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.20342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14128631353378296,
"learning_rate": 1.3577281594640182e-07,
"loss": 0.0298,
"num_tokens": 18231403.0,
"reward": 0.9733308553695679,
"reward_std": 0.6629190444946289,
"rewards/cosine_scaled_reward/mean": 0.02572791464626789,
"rewards/cosine_scaled_reward/std": 0.47114452719688416,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.265625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 1293.578125,
"completions/mean_terminated_length": 1020.7020874023438,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"epoch": 0.20457142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16287265717983246,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0196,
"num_tokens": 18325024.0,
"reward": 0.5872488617897034,
"reward_std": 0.6428846120834351,
"rewards/cosine_scaled_reward/mean": -0.08137557655572891,
"rewards/cosine_scaled_reward/std": 0.3453543484210968,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1951.0,
"completions/mean_length": 1036.78125,
"completions/mean_terminated_length": 932.1724243164062,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.2057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14584973454475403,
"learning_rate": 1.2988880807625927e-07,
"loss": 0.0066,
"num_tokens": 18402554.0,
"reward": 1.347097396850586,
"reward_std": 0.8030112385749817,
"rewards/cosine_scaled_reward/mean": 0.19698619842529297,
"rewards/cosine_scaled_reward/std": 0.48687708377838135,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1869.0,
"completions/mean_length": 1165.484375,
"completions/mean_terminated_length": 940.5294189453125,
"completions/min_length": 442.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.20685714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1534472554922104,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0795,
"num_tokens": 18488617.0,
"reward": 0.6842443346977234,
"reward_std": 0.6290575265884399,
"rewards/cosine_scaled_reward/mean": -0.0563153512775898,
"rewards/cosine_scaled_reward/std": 0.5009898543357849,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.265625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 1240.1875,
"completions/mean_terminated_length": 948.0,
"completions/min_length": 264.0,
"completions/min_terminated_length": 264.0,
"epoch": 0.208,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13502204418182373,
"learning_rate": 1.2451664098030743e-07,
"loss": 0.0042,
"num_tokens": 18577781.0,
"reward": 0.5206961631774902,
"reward_std": 0.6657352447509766,
"rewards/cosine_scaled_reward/mean": -0.1380893886089325,
"rewards/cosine_scaled_reward/std": 0.3631601333618164,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1898.0,
"completions/mean_length": 1029.140625,
"completions/mean_terminated_length": 942.796630859375,
"completions/min_length": 459.0,
"completions/min_terminated_length": 459.0,
"epoch": 0.20914285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12827463448047638,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0038,
"num_tokens": 18654262.0,
"reward": 1.1728923320770264,
"reward_std": 0.6444723010063171,
"rewards/cosine_scaled_reward/mean": 0.08644616603851318,
"rewards/cosine_scaled_reward/std": 0.49451789259910583,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 1162.984375,
"completions/mean_terminated_length": 979.3018798828125,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.2102857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1496909260749817,
"learning_rate": 1.1966285981663407e-07,
"loss": 0.0474,
"num_tokens": 18740045.0,
"reward": 0.738210916519165,
"reward_std": 0.540239155292511,
"rewards/cosine_scaled_reward/mean": -0.07620704174041748,
"rewards/cosine_scaled_reward/std": 0.37467995285987854,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 1057.15625,
"completions/mean_terminated_length": 991.1000366210938,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"epoch": 0.21142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13016612827777863,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0175,
"num_tokens": 18817887.0,
"reward": 0.5949590802192688,
"reward_std": 0.6293296813964844,
"rewards/cosine_scaled_reward/mean": -0.1868954598903656,
"rewards/cosine_scaled_reward/std": 0.4017287492752075,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1410.0,
"completions/mean_length": 1108.890625,
"completions/mean_terminated_length": 974.732177734375,
"completions/min_length": 354.0,
"completions/min_terminated_length": 354.0,
"epoch": 0.21257142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14262138307094574,
"learning_rate": 1.1533337816991931e-07,
"loss": 0.0015,
"num_tokens": 18899552.0,
"reward": 0.6897875070571899,
"reward_std": 0.5968158841133118,
"rewards/cosine_scaled_reward/mean": -0.08479373157024384,
"rewards/cosine_scaled_reward/std": 0.4098339378833771,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 1067.40625,
"completions/mean_terminated_length": 1002.0333862304688,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.21371428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16053920984268188,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0076,
"num_tokens": 18978290.0,
"reward": 0.7425481677055359,
"reward_std": 0.5081203579902649,
"rewards/cosine_scaled_reward/mean": -0.12091340124607086,
"rewards/cosine_scaled_reward/std": 0.43119898438453674,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1906.0,
"completions/mean_length": 1119.453125,
"completions/mean_terminated_length": 1005.4210815429688,
"completions/min_length": 563.0,
"completions/min_terminated_length": 563.0,
"epoch": 0.21485714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1582225263118744,
"learning_rate": 1.1153347084664419e-07,
"loss": 0.0305,
"num_tokens": 19061735.0,
"reward": 0.5219712257385254,
"reward_std": 0.5593596696853638,
"rewards/cosine_scaled_reward/mean": -0.1999519169330597,
"rewards/cosine_scaled_reward/std": 0.32119491696357727,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 872.5625,
"completions/mean_terminated_length": 814.7540283203125,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.216,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13409367203712463,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0456,
"num_tokens": 19126867.0,
"reward": 0.7454105615615845,
"reward_std": 0.605484127998352,
"rewards/cosine_scaled_reward/mean": -0.11166971176862717,
"rewards/cosine_scaled_reward/std": 0.4444236159324646,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 1080.25,
"completions/mean_terminated_length": 998.2373046875,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.21714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12813109159469604,
"learning_rate": 1.0826776744855121e-07,
"loss": -0.0287,
"num_tokens": 19205771.0,
"reward": 1.0522401332855225,
"reward_std": 0.5290870070457458,
"rewards/cosine_scaled_reward/mean": 0.026120096445083618,
"rewards/cosine_scaled_reward/std": 0.4774343967437744,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1906.0,
"completions/mean_length": 952.4375,
"completions/mean_terminated_length": 898.5573120117188,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.21828571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13046617805957794,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0316,
"num_tokens": 19277015.0,
"reward": 1.01558518409729,
"reward_std": 0.6485674381256104,
"rewards/cosine_scaled_reward/mean": 0.023417577147483826,
"rewards/cosine_scaled_reward/std": 0.4800501763820648,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1859.0,
"completions/mean_length": 1371.515625,
"completions/mean_terminated_length": 1041.1395263671875,
"completions/min_length": 382.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.21942857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14968900382518768,
"learning_rate": 1.0554024673218806e-07,
"loss": 0.0953,
"num_tokens": 19376088.0,
"reward": 0.3939949572086334,
"reward_std": 0.577399730682373,
"rewards/cosine_scaled_reward/mean": -0.19362753629684448,
"rewards/cosine_scaled_reward/std": 0.30269211530685425,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1859.0,
"completions/mean_length": 1186.921875,
"completions/mean_terminated_length": 945.8200073242188,
"completions/min_length": 493.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.22057142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16263115406036377,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0846,
"num_tokens": 19463195.0,
"reward": 0.6804449558258057,
"reward_std": 0.794600248336792,
"rewards/cosine_scaled_reward/mean": -0.058215029537677765,
"rewards/cosine_scaled_reward/std": 0.45185160636901855,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1870.0,
"completions/mean_length": 1393.921875,
"completions/mean_terminated_length": 1051.3095703125,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.22171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1629265695810318,
"learning_rate": 1.0335423176140511e-07,
"loss": -0.0049,
"num_tokens": 19563766.0,
"reward": 0.7986553907394409,
"reward_std": 0.874267578125,
"rewards/cosine_scaled_reward/mean": 0.03214021399617195,
"rewards/cosine_scaled_reward/std": 0.47694674134254456,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 1115.015625,
"completions/mean_terminated_length": 1035.9490966796875,
"completions/min_length": 458.0,
"completions/min_terminated_length": 458.0,
"epoch": 0.22285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.139028862118721,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0035,
"num_tokens": 19646271.0,
"reward": 0.7042949795722961,
"reward_std": 0.5829262733459473,
"rewards/cosine_scaled_reward/mean": -0.10879002511501312,
"rewards/cosine_scaled_reward/std": 0.38450202345848083,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1984.0,
"completions/mean_length": 1338.078125,
"completions/mean_terminated_length": 1139.2999267578125,
"completions/min_length": 390.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.224,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17117220163345337,
"learning_rate": 1.017123858587145e-07,
"loss": 0.0298,
"num_tokens": 19743500.0,
"reward": 0.3932352066040039,
"reward_std": 0.6573115587234497,
"rewards/cosine_scaled_reward/mean": -0.20181991159915924,
"rewards/cosine_scaled_reward/std": 0.3404424488544464,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1562.0,
"completions/mean_length": 872.078125,
"completions/mean_terminated_length": 853.4127197265625,
"completions/min_length": 416.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.22514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12287131696939468,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.0059,
"num_tokens": 19809681.0,
"reward": 1.2395715713500977,
"reward_std": 0.6934706568717957,
"rewards/cosine_scaled_reward/mean": 0.11978581547737122,
"rewards/cosine_scaled_reward/std": 0.5448962450027466,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1120.28125,
"completions/mean_terminated_length": 987.7500610351562,
"completions/min_length": 320.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.22628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15039725601673126,
"learning_rate": 1.0061670936044178e-07,
"loss": 0.0362,
"num_tokens": 19892883.0,
"reward": 1.0277272462844849,
"reward_std": 0.74528968334198,
"rewards/cosine_scaled_reward/mean": 0.021676115691661835,
"rewards/cosine_scaled_reward/std": 0.5368949174880981,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1958.0,
"completions/mean_length": 1294.71875,
"completions/mean_terminated_length": 1187.107177734375,
"completions/min_length": 577.0,
"completions/min_terminated_length": 577.0,
"epoch": 0.22742857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15263773500919342,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0084,
"num_tokens": 19987249.0,
"reward": 0.6131043434143066,
"reward_std": 0.7018917798995972,
"rewards/cosine_scaled_reward/mean": -0.1543852984905243,
"rewards/cosine_scaled_reward/std": 0.35418131947517395,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 1115.796875,
"completions/mean_terminated_length": 922.3207397460938,
"completions/min_length": 509.0,
"completions/min_terminated_length": 509.0,
"epoch": 0.22857142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16787739098072052,
"learning_rate": 1.0006853717962393e-07,
"loss": 0.0407,
"num_tokens": 20068780.0,
"reward": 0.9602231979370117,
"reward_std": 0.8039394617080688,
"rewards/cosine_scaled_reward/mean": 0.05823659524321556,
"rewards/cosine_scaled_reward/std": 0.5022075772285461,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 200
},
{
"epoch": 0.22857142857142856,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.03711814505979419,
"train_runtime": 10340.5912,
"train_samples_per_second": 1.238,
"train_steps_per_second": 0.019
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 20068780,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}