ACC_TT_L0.2_H0.2_dr_grpo / trainer_state.json
LLucass's picture
Model save
ef99db5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11428571428571428,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2005368024110794,
"learning_rate": 0.0,
"loss": 0.0427,
"num_tokens": 118418.0,
"reward": 0.17899775505065918,
"reward_std": 0.7650213241577148,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19502800703048706,
"learning_rate": 1e-07,
"loss": 0.0561,
"num_tokens": 239748.0,
"reward": 0.3848632574081421,
"reward_std": 0.9111153483390808,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 861.0,
"completions/mean_length": 1963.75,
"completions/mean_terminated_length": 700.0,
"completions/min_length": 498.0,
"completions/min_terminated_length": 498.0,
"epoch": 0.0034285714285714284,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2347632497549057,
"learning_rate": 2e-07,
"loss": 0.0465,
"num_tokens": 375900.0,
"reward": -0.33020514249801636,
"reward_std": 0.3351452350616455,
"rewards/cosine_scaled_reward/mean": -0.1963525414466858,
"rewards/cosine_scaled_reward/std": 0.16515092551708221,
"rewards/format_reward/mean": 0.0625,
"rewards/format_reward/std": 0.24397502839565277,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 1518.109375,
"completions/mean_terminated_length": 988.21875,
"completions/min_length": 447.0,
"completions/min_terminated_length": 447.0,
"epoch": 0.004571428571428572,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21861855685710907,
"learning_rate": 3e-07,
"loss": 0.0509,
"num_tokens": 482867.0,
"reward": 0.2307693362236023,
"reward_std": 0.7756893038749695,
"rewards/cosine_scaled_reward/mean": -0.15024033188819885,
"rewards/cosine_scaled_reward/std": 0.32144343852996826,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1849.0,
"completions/mean_length": 1964.09375,
"completions/mean_terminated_length": 1280.857177734375,
"completions/min_length": 531.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.005714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23443199694156647,
"learning_rate": 4e-07,
"loss": 0.0273,
"num_tokens": 619385.0,
"reward": -0.36384251713752747,
"reward_std": 0.4326132535934448,
"rewards/cosine_scaled_reward/mean": -0.24442125856876373,
"rewards/cosine_scaled_reward/std": 0.22642402350902557,
"rewards/format_reward/mean": 0.125,
"rewards/format_reward/std": 0.3333333432674408,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1733.0,
"completions/mean_length": 1865.78125,
"completions/mean_terminated_length": 881.7999877929688,
"completions/min_length": 520.0,
"completions/min_terminated_length": 520.0,
"epoch": 0.006857142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2527252733707428,
"learning_rate": 5e-07,
"loss": 0.0473,
"num_tokens": 750443.0,
"reward": -0.36761316657066345,
"reward_std": 0.4643300175666809,
"rewards/cosine_scaled_reward/mean": -0.2697440981864929,
"rewards/cosine_scaled_reward/std": 0.1977701485157013,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 1941.34375,
"completions/mean_terminated_length": 1365.4000244140625,
"completions/min_length": 607.0,
"completions/min_terminated_length": 607.0,
"epoch": 0.008,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21566396951675415,
"learning_rate": 6e-07,
"loss": 0.0344,
"num_tokens": 885097.0,
"reward": -0.08318325877189636,
"reward_std": 0.5441455841064453,
"rewards/cosine_scaled_reward/mean": -0.150966614484787,
"rewards/cosine_scaled_reward/std": 0.3548375070095062,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.25,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 1715.359375,
"completions/mean_terminated_length": 717.4375,
"completions/min_length": 311.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.009142857142857144,
"epsilon_high_adjusted": 0.25,
"epsilon_low_adjusted": 0.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19708961248397827,
"learning_rate": 7e-07,
"loss": -0.0031,
"num_tokens": 1005296.0,
"reward": 0.1628682017326355,
"reward_std": 0.6152325868606567,
"rewards/cosine_scaled_reward/mean": -0.09044088423252106,
"rewards/cosine_scaled_reward/std": 0.45745164155960083,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 1971.171875,
"completions/mean_terminated_length": 1433.375,
"completions/min_length": 578.0,
"completions/min_terminated_length": 578.0,
"epoch": 0.010285714285714285,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2231767773628235,
"learning_rate": 8e-07,
"loss": 0.0363,
"num_tokens": 1142907.0,
"reward": -0.22906573116779327,
"reward_std": 0.5889308452606201,
"rewards/cosine_scaled_reward/mean": -0.20828285813331604,
"rewards/cosine_scaled_reward/std": 0.2633083164691925,
"rewards/format_reward/mean": 0.1875,
"rewards/format_reward/std": 0.39339789748191833,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1642.0,
"completions/mean_length": 1645.703125,
"completions/mean_terminated_length": 877.6818237304688,
"completions/min_length": 316.0,
"completions/min_terminated_length": 316.0,
"epoch": 0.011428571428571429,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19458407163619995,
"learning_rate": 9e-07,
"loss": 0.0565,
"num_tokens": 1259064.0,
"reward": 0.11773137003183365,
"reward_std": 0.6405072212219238,
"rewards/cosine_scaled_reward/mean": -0.12082181870937347,
"rewards/cosine_scaled_reward/std": 0.33084097504615784,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1271.0,
"completions/mean_length": 1966.6875,
"completions/mean_terminated_length": 1007.2000122070312,
"completions/min_length": 789.0,
"completions/min_terminated_length": 789.0,
"epoch": 0.012571428571428572,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2074805498123169,
"learning_rate": 1e-06,
"loss": 0.0544,
"num_tokens": 1396604.0,
"reward": -0.38875678181648254,
"reward_std": 0.4678027033805847,
"rewards/cosine_scaled_reward/mean": -0.24906589090824127,
"rewards/cosine_scaled_reward/std": 0.22343340516090393,
"rewards/format_reward/mean": 0.109375,
"rewards/format_reward/std": 0.3145764470100403,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 1694.546875,
"completions/mean_terminated_length": 1105.4583740234375,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.013714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21040339767932892,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0575,
"num_tokens": 1515999.0,
"reward": 0.16435137391090393,
"reward_std": 0.7284502983093262,
"rewards/cosine_scaled_reward/mean": -0.13657432794570923,
"rewards/cosine_scaled_reward/std": 0.40020695328712463,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0625,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 1735.546875,
"completions/mean_terminated_length": 1095.761962890625,
"completions/min_length": 591.0,
"completions/min_terminated_length": 591.0,
"epoch": 0.014857142857142857,
"epsilon_high_adjusted": 0.22187500000000002,
"epsilon_low_adjusted": 0.22187500000000002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2161007821559906,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0381,
"num_tokens": 1638114.0,
"reward": 0.03836393356323242,
"reward_std": 0.6106836199760437,
"rewards/cosine_scaled_reward/mean": -0.1448805332183838,
"rewards/cosine_scaled_reward/std": 0.3520916700363159,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1844.0,
"completions/mean_length": 1819.390625,
"completions/mean_terminated_length": 1072.60009765625,
"completions/min_length": 482.0,
"completions/min_terminated_length": 482.0,
"epoch": 0.016,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18751561641693115,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0728,
"num_tokens": 1765163.0,
"reward": -0.1004548892378807,
"reward_std": 0.7881962060928345,
"rewards/cosine_scaled_reward/mean": -0.17522744834423065,
"rewards/cosine_scaled_reward/std": 0.3718147575855255,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1960.0,
"completions/mean_length": 1727.703125,
"completions/mean_terminated_length": 842.1764526367188,
"completions/min_length": 406.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.017142857142857144,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20928962528705597,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0091,
"num_tokens": 1886656.0,
"reward": 0.2893116772174835,
"reward_std": 0.44170767068862915,
"rewards/cosine_scaled_reward/mean": -0.0037816911935806274,
"rewards/cosine_scaled_reward/std": 0.493231862783432,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.018285714285714287,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23164308071136475,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0,
"num_tokens": 2028168.0,
"reward": -0.4323144555091858,
"reward_std": 0.27591273188591003,
"rewards/cosine_scaled_reward/mean": -0.2161572128534317,
"rewards/cosine_scaled_reward/std": 0.16956526041030884,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.03125,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1624.109375,
"completions/mean_terminated_length": 917.625,
"completions/min_length": 406.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.019428571428571427,
"epsilon_high_adjusted": 0.2109375,
"epsilon_low_adjusted": 0.2109375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22363872826099396,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0656,
"num_tokens": 2142631.0,
"reward": -0.03016360104084015,
"reward_std": 0.6786063313484192,
"rewards/cosine_scaled_reward/mean": -0.20258180797100067,
"rewards/cosine_scaled_reward/std": 0.34620094299316406,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.125,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 1762.875,
"completions/mean_terminated_length": 974.5882568359375,
"completions/min_length": 462.0,
"completions/min_terminated_length": 462.0,
"epoch": 0.02057142857142857,
"epsilon_high_adjusted": 0.23750000000000002,
"epsilon_low_adjusted": 0.23750000000000002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19568873941898346,
"learning_rate": 9.866330768241983e-07,
"loss": 0.015,
"num_tokens": 2265831.0,
"reward": -0.024384755641222,
"reward_std": 0.760321855545044,
"rewards/cosine_scaled_reward/mean": -0.16844238340854645,
"rewards/cosine_scaled_reward/std": 0.35202282667160034,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 1800.796875,
"completions/mean_terminated_length": 1117.3529052734375,
"completions/min_length": 510.0,
"completions/min_terminated_length": 510.0,
"epoch": 0.021714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20510442554950714,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0322,
"num_tokens": 2392338.0,
"reward": 0.2440589964389801,
"reward_std": 0.7532124519348145,
"rewards/cosine_scaled_reward/mean": -0.03422052040696144,
"rewards/cosine_scaled_reward/std": 0.49625054001808167,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.25,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 1650.203125,
"completions/mean_terminated_length": 987.2083740234375,
"completions/min_length": 392.0,
"completions/min_terminated_length": 392.0,
"epoch": 0.022857142857142857,
"epsilon_high_adjusted": 0.25,
"epsilon_low_adjusted": 0.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.179988294839859,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0019,
"num_tokens": 2509303.0,
"reward": 0.29894062876701355,
"reward_std": 0.6712355613708496,
"rewards/cosine_scaled_reward/mean": -0.06927968561649323,
"rewards/cosine_scaled_reward/std": 0.3939419388771057,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 1687.578125,
"completions/mean_terminated_length": 1045.0870361328125,
"completions/min_length": 312.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.024,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2064765989780426,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0494,
"num_tokens": 2628116.0,
"reward": -0.134785458445549,
"reward_std": 0.6005781888961792,
"rewards/cosine_scaled_reward/mean": -0.2392677217721939,
"rewards/cosine_scaled_reward/std": 0.2986987829208374,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1695.0,
"completions/mean_length": 1251.390625,
"completions/mean_terminated_length": 706.3421020507812,
"completions/min_length": 231.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.025142857142857144,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20877699553966522,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0521,
"num_tokens": 2717221.0,
"reward": 0.46772587299346924,
"reward_std": 0.5018335580825806,
"rewards/cosine_scaled_reward/mean": -0.07863706350326538,
"rewards/cosine_scaled_reward/std": 0.3792650103569031,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.03125,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1967.0,
"completions/mean_length": 1559.6875,
"completions/mean_terminated_length": 1006.2667236328125,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.026285714285714287,
"epsilon_high_adjusted": 0.2109375,
"epsilon_low_adjusted": 0.2109375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20191854238510132,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0816,
"num_tokens": 2827833.0,
"reward": 0.0568242147564888,
"reward_std": 0.5486289858818054,
"rewards/cosine_scaled_reward/mean": -0.2059628963470459,
"rewards/cosine_scaled_reward/std": 0.3543168008327484,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1648.0,
"completions/mean_length": 1799.359375,
"completions/mean_terminated_length": 1111.941162109375,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.027428571428571427,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20426414906978607,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0531,
"num_tokens": 2953920.0,
"reward": -0.08145741373300552,
"reward_std": 0.5499895811080933,
"rewards/cosine_scaled_reward/mean": -0.18916621804237366,
"rewards/cosine_scaled_reward/std": 0.33400654792785645,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1891.0,
"completions/mean_length": 1627.65625,
"completions/mean_terminated_length": 878.3478393554688,
"completions/min_length": 460.0,
"completions/min_terminated_length": 460.0,
"epoch": 0.02857142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2195703387260437,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0179,
"num_tokens": 3068386.0,
"reward": 0.166388601064682,
"reward_std": 0.6624079942703247,
"rewards/cosine_scaled_reward/mean": -0.1199306920170784,
"rewards/cosine_scaled_reward/std": 0.3789914548397064,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 1917.65625,
"completions/mean_terminated_length": 1452.1429443359375,
"completions/min_length": 652.0,
"completions/min_terminated_length": 652.0,
"epoch": 0.029714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21889279782772064,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0068,
"num_tokens": 3201748.0,
"reward": 0.21682679653167725,
"reward_std": 0.5016080737113953,
"rewards/cosine_scaled_reward/mean": -0.055649105459451675,
"rewards/cosine_scaled_reward/std": 0.3608931601047516,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1958.0,
"completions/mean_length": 1814.1875,
"completions/mean_terminated_length": 979.1428833007812,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.030857142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21265852451324463,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0397,
"num_tokens": 3328144.0,
"reward": -0.08447478711605072,
"reward_std": 0.48191577196121216,
"rewards/cosine_scaled_reward/mean": -0.17504990100860596,
"rewards/cosine_scaled_reward/std": 0.2491498440504074,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 1854.546875,
"completions/mean_terminated_length": 1222.60009765625,
"completions/min_length": 444.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.032,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22452472150325775,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0473,
"num_tokens": 3457171.0,
"reward": 0.11354446411132812,
"reward_std": 0.7764405608177185,
"rewards/cosine_scaled_reward/mean": -0.10729026794433594,
"rewards/cosine_scaled_reward/std": 0.42263516783714294,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1669.0,
"completions/mean_length": 1899.125,
"completions/mean_terminated_length": 1095.2000732421875,
"completions/min_length": 482.0,
"completions/min_terminated_length": 482.0,
"epoch": 0.03314285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21414655447006226,
"learning_rate": 9.140576474687263e-07,
"loss": 0.079,
"num_tokens": 3589187.0,
"reward": -0.2690792381763458,
"reward_std": 0.4953657388687134,
"rewards/cosine_scaled_reward/mean": -0.2282896190881729,
"rewards/cosine_scaled_reward/std": 0.20246519148349762,
"rewards/format_reward/mean": 0.1875,
"rewards/format_reward/std": 0.39339789748191833,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0625,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1905.0,
"completions/mean_length": 1904.59375,
"completions/mean_terminated_length": 1342.0,
"completions/min_length": 905.0,
"completions/min_terminated_length": 905.0,
"epoch": 0.03428571428571429,
"epsilon_high_adjusted": 0.22187500000000002,
"epsilon_low_adjusted": 0.22187500000000002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21034272015094757,
"learning_rate": 9.046048391230247e-07,
"loss": 0.011,
"num_tokens": 3721617.0,
"reward": 0.04454480856657028,
"reward_std": 0.765734851360321,
"rewards/cosine_scaled_reward/mean": -0.13397759199142456,
"rewards/cosine_scaled_reward/std": 0.35358336567878723,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 1835.0625,
"completions/mean_terminated_length": 912.3333740234375,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"epoch": 0.03542857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20196685194969177,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0248,
"num_tokens": 3849557.0,
"reward": -0.2954040765762329,
"reward_std": 0.5728512406349182,
"rewards/cosine_scaled_reward/mean": -0.26488950848579407,
"rewards/cosine_scaled_reward/std": 0.30517446994781494,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1934.859375,
"completions/mean_terminated_length": 1389.727294921875,
"completions/min_length": 696.0,
"completions/min_terminated_length": 696.0,
"epoch": 0.036571428571428574,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21605251729488373,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0572,
"num_tokens": 3983740.0,
"reward": -0.1224304735660553,
"reward_std": 0.7502877712249756,
"rewards/cosine_scaled_reward/mean": -0.17059023678302765,
"rewards/cosine_scaled_reward/std": 0.398355633020401,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1452.0,
"completions/mean_length": 1997.625,
"completions/mean_terminated_length": 1242.0,
"completions/min_length": 1108.0,
"completions/min_terminated_length": 1108.0,
"epoch": 0.037714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21664302051067352,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0025,
"num_tokens": 4122804.0,
"reward": -0.4877340793609619,
"reward_std": 0.35244429111480713,
"rewards/cosine_scaled_reward/mean": -0.29074203968048096,
"rewards/cosine_scaled_reward/std": 0.20779016613960266,
"rewards/format_reward/mean": 0.09375,
"rewards/format_reward/std": 0.29378482699394226,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.28125,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1607.84375,
"completions/mean_terminated_length": 1167.6875,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.038857142857142854,
"epsilon_high_adjusted": 0.2703125,
"epsilon_low_adjusted": 0.2703125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21048779785633087,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0735,
"num_tokens": 4235258.0,
"reward": 0.5191864967346191,
"reward_std": 0.8498681783676147,
"rewards/cosine_scaled_reward/mean": -0.006031747907400131,
"rewards/cosine_scaled_reward/std": 0.5057411193847656,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1834.703125,
"completions/mean_terminated_length": 807.0,
"completions/min_length": 429.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.04,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2056789994239807,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0218,
"num_tokens": 4364175.0,
"reward": -0.20920395851135254,
"reward_std": 0.6680670976638794,
"rewards/cosine_scaled_reward/mean": -0.20616447925567627,
"rewards/cosine_scaled_reward/std": 0.2824583351612091,
"rewards/format_reward/mean": 0.203125,
"rewards/format_reward/std": 0.40550529956817627,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 1948.0625,
"completions/mean_terminated_length": 1466.5455322265625,
"completions/min_length": 805.0,
"completions/min_terminated_length": 805.0,
"epoch": 0.04114285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23063796758651733,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0303,
"num_tokens": 4499963.0,
"reward": -0.2987564504146576,
"reward_std": 0.4857916235923767,
"rewards/cosine_scaled_reward/mean": -0.2353157252073288,
"rewards/cosine_scaled_reward/std": 0.23094965517520905,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 2002.296875,
"completions/mean_terminated_length": 1463.0,
"completions/min_length": 992.0,
"completions/min_terminated_length": 992.0,
"epoch": 0.04228571428571429,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21713794767856598,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0445,
"num_tokens": 4639134.0,
"reward": -0.38547438383102417,
"reward_std": 0.39840468764305115,
"rewards/cosine_scaled_reward/mean": -0.23961219191551208,
"rewards/cosine_scaled_reward/std": 0.19388997554779053,
"rewards/format_reward/mean": 0.09375,
"rewards/format_reward/std": 0.29378482699394226,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1437.0,
"completions/mean_length": 1932.40625,
"completions/mean_terminated_length": 1123.25,
"completions/min_length": 809.0,
"completions/min_terminated_length": 809.0,
"epoch": 0.04342857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23154403269290924,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0361,
"num_tokens": 4774520.0,
"reward": -0.15762007236480713,
"reward_std": 0.4076302647590637,
"rewards/cosine_scaled_reward/mean": -0.14912253618240356,
"rewards/cosine_scaled_reward/std": 0.30022993683815,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1862.0,
"completions/mean_length": 1762.90625,
"completions/mean_terminated_length": 831.6000366210938,
"completions/min_length": 281.0,
"completions/min_terminated_length": 281.0,
"epoch": 0.044571428571428574,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19119793176651,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0427,
"num_tokens": 4898130.0,
"reward": 0.09733833372592926,
"reward_std": 0.525000274181366,
"rewards/cosine_scaled_reward/mean": -0.09976834803819656,
"rewards/cosine_scaled_reward/std": 0.3302258253097534,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 1580.0625,
"completions/mean_terminated_length": 938.8148193359375,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.045714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.198013037443161,
"learning_rate": 7.884636689049422e-07,
"loss": 0.043,
"num_tokens": 5009326.0,
"reward": 0.2950524389743805,
"reward_std": 0.6312240958213806,
"rewards/cosine_scaled_reward/mean": -0.07903628796339035,
"rewards/cosine_scaled_reward/std": 0.4338403344154358,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.501733124256134,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.125,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1725.0,
"completions/mean_length": 1816.5,
"completions/mean_terminated_length": 1268.2105712890625,
"completions/min_length": 671.0,
"completions/min_terminated_length": 671.0,
"epoch": 0.046857142857142854,
"epsilon_high_adjusted": 0.2375,
"epsilon_low_adjusted": 0.2375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20052184164524078,
"learning_rate": 7.75e-07,
"loss": 0.049,
"num_tokens": 5136734.0,
"reward": -0.04274348169565201,
"reward_std": 0.6400065422058105,
"rewards/cosine_scaled_reward/mean": -0.2010592371225357,
"rewards/cosine_scaled_reward/std": 0.31733086705207825,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 1649.609375,
"completions/mean_terminated_length": 631.5,
"completions/min_length": 209.0,
"completions/min_terminated_length": 209.0,
"epoch": 0.048,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21507494151592255,
"learning_rate": 7.612622032536507e-07,
"loss": 0.035,
"num_tokens": 5252669.0,
"reward": -0.1898493468761444,
"reward_std": 0.3643849194049835,
"rewards/cosine_scaled_reward/mean": -0.2355496734380722,
"rewards/cosine_scaled_reward/std": 0.18636876344680786,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1699.6875,
"completions/mean_terminated_length": 933.4000244140625,
"completions/min_length": 456.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.04914285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1977306753396988,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0412,
"num_tokens": 5373065.0,
"reward": 0.10587131977081299,
"reward_std": 0.5526303052902222,
"rewards/cosine_scaled_reward/mean": -0.1033143401145935,
"rewards/cosine_scaled_reward/std": 0.3173230290412903,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1585.3125,
"completions/mean_terminated_length": 760.521728515625,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"epoch": 0.05028571428571429,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18540318310260773,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0628,
"num_tokens": 5485221.0,
"reward": 0.1987997144460678,
"reward_std": 0.7999765872955322,
"rewards/cosine_scaled_reward/mean": -0.10372515767812729,
"rewards/cosine_scaled_reward/std": 0.43109309673309326,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.1875,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1830.0,
"completions/mean_length": 2018.203125,
"completions/mean_terminated_length": 1412.3333740234375,
"completions/min_length": 1141.0,
"completions/min_terminated_length": 1141.0,
"epoch": 0.05142857142857143,
"epsilon_high_adjusted": 0.246875,
"epsilon_low_adjusted": 0.246875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22048652172088623,
"learning_rate": 7.185729670371604e-07,
"loss": -0.012,
"num_tokens": 5626042.0,
"reward": -0.2534261643886566,
"reward_std": 0.46692758798599243,
"rewards/cosine_scaled_reward/mean": -0.1892130821943283,
"rewards/cosine_scaled_reward/std": 0.3184278905391693,
"rewards/format_reward/mean": 0.125,
"rewards/format_reward/std": 0.3333333432674408,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1438.0,
"completions/mean_length": 1841.4375,
"completions/mean_terminated_length": 946.3333740234375,
"completions/min_length": 674.0,
"completions/min_terminated_length": 674.0,
"epoch": 0.052571428571428575,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2475695013999939,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0372,
"num_tokens": 5755062.0,
"reward": -0.2845799922943115,
"reward_std": 0.37563323974609375,
"rewards/cosine_scaled_reward/mean": -0.23603998124599457,
"rewards/cosine_scaled_reward/std": 0.17336885631084442,
"rewards/format_reward/mean": 0.1875,
"rewards/format_reward/std": 0.39339789748191833,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 1692.21875,
"completions/mean_terminated_length": 849.5789794921875,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.053714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20771269500255585,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0472,
"num_tokens": 5873764.0,
"reward": 0.05584225058555603,
"reward_std": 0.7296371459960938,
"rewards/cosine_scaled_reward/mean": -0.1361413598060608,
"rewards/cosine_scaled_reward/std": 0.43819770216941833,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.03125,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1544.0,
"completions/mean_length": 1739.1875,
"completions/mean_terminated_length": 950.0,
"completions/min_length": 531.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.054857142857142854,
"epsilon_high_adjusted": 0.2109375,
"epsilon_low_adjusted": 0.2109375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19528591632843018,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0163,
"num_tokens": 5995616.0,
"reward": 0.06744927167892456,
"reward_std": 0.6208744049072266,
"rewards/cosine_scaled_reward/mean": -0.10690036416053772,
"rewards/cosine_scaled_reward/std": 0.3725154995918274,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1689.0,
"completions/mean_length": 1523.296875,
"completions/mean_terminated_length": 890.0344848632812,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.056,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17805328965187073,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0665,
"num_tokens": 6103171.0,
"reward": 0.17353929579257965,
"reward_std": 0.6857056617736816,
"rewards/cosine_scaled_reward/mean": -0.14760535955429077,
"rewards/cosine_scaled_reward/std": 0.40281444787979126,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1965.0,
"completions/mean_length": 1694.578125,
"completions/mean_terminated_length": 857.5263061523438,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.05714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17739826440811157,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0495,
"num_tokens": 6222440.0,
"reward": 0.12661927938461304,
"reward_std": 0.588830828666687,
"rewards/cosine_scaled_reward/mean": -0.08512786030769348,
"rewards/cosine_scaled_reward/std": 0.43910878896713257,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0625,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 1466.984375,
"completions/mean_terminated_length": 885.96875,
"completions/min_length": 376.0,
"completions/min_terminated_length": 376.0,
"epoch": 0.05828571428571429,
"epsilon_high_adjusted": 0.22187500000000002,
"epsilon_low_adjusted": 0.22187500000000002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20479989051818848,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0226,
"num_tokens": 6326535.0,
"reward": 0.3101663589477539,
"reward_std": 0.6823086738586426,
"rewards/cosine_scaled_reward/mean": -0.10272930562496185,
"rewards/cosine_scaled_reward/std": 0.344821572303772,
"rewards/format_reward/mean": 0.515625,
"rewards/format_reward/std": 0.5037065148353577,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.125,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1510.0,
"completions/mean_length": 1744.640625,
"completions/mean_terminated_length": 834.5625,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.05942857142857143,
"epsilon_high_adjusted": 0.23750000000000002,
"epsilon_low_adjusted": 0.23750000000000002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19819842278957367,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0083,
"num_tokens": 6449544.0,
"reward": -0.00986415147781372,
"reward_std": 0.685615062713623,
"rewards/cosine_scaled_reward/mean": -0.14555707573890686,
"rewards/cosine_scaled_reward/std": 0.41420355439186096,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1733.171875,
"completions/mean_terminated_length": 1040.550048828125,
"completions/min_length": 525.0,
"completions/min_terminated_length": 525.0,
"epoch": 0.060571428571428575,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20193032920360565,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0262,
"num_tokens": 6571299.0,
"reward": 0.08663126826286316,
"reward_std": 0.661508321762085,
"rewards/cosine_scaled_reward/mean": -0.15980936586856842,
"rewards/cosine_scaled_reward/std": 0.30268651247024536,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1674.375,
"completions/mean_terminated_length": 1128.3077392578125,
"completions/min_length": 660.0,
"completions/min_terminated_length": 660.0,
"epoch": 0.061714285714285715,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19670119881629944,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0604,
"num_tokens": 6689603.0,
"reward": 0.6827424764633179,
"reward_std": 0.8742384910583496,
"rewards/cosine_scaled_reward/mean": 0.07574622333049774,
"rewards/cosine_scaled_reward/std": 0.5349056124687195,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1565.0,
"completions/mean_length": 1711.09375,
"completions/mean_terminated_length": 850.1111450195312,
"completions/min_length": 611.0,
"completions/min_terminated_length": 611.0,
"epoch": 0.06285714285714286,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20655271410942078,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0611,
"num_tokens": 6809401.0,
"reward": 0.08179579675197601,
"reward_std": 0.5935317277908325,
"rewards/cosine_scaled_reward/mean": -0.099727101624012,
"rewards/cosine_scaled_reward/std": 0.41786429286003113,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 1734.6875,
"completions/mean_terminated_length": 1093.142822265625,
"completions/min_length": 556.0,
"completions/min_terminated_length": 556.0,
"epoch": 0.064,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.202525794506073,
"learning_rate": 5.5e-07,
"loss": 0.0524,
"num_tokens": 6931381.0,
"reward": 0.15938684344291687,
"reward_std": 0.5165223479270935,
"rewards/cosine_scaled_reward/mean": -0.10780657827854156,
"rewards/cosine_scaled_reward/std": 0.4498305320739746,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1849.0,
"completions/mean_length": 1927.875,
"completions/mean_terminated_length": 1349.0909423828125,
"completions/min_length": 554.0,
"completions/min_terminated_length": 554.0,
"epoch": 0.06514285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19767992198467255,
"learning_rate": 5.342952264838747e-07,
"loss": 0.011,
"num_tokens": 7066333.0,
"reward": -0.3306891620159149,
"reward_std": 0.4264768958091736,
"rewards/cosine_scaled_reward/mean": -0.28253209590911865,
"rewards/cosine_scaled_reward/std": 0.2055179625749588,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1905.0,
"completions/mean_length": 1486.875,
"completions/mean_terminated_length": 1077.4053955078125,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.06628571428571428,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17724664509296417,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0289,
"num_tokens": 7171589.0,
"reward": 0.5129991769790649,
"reward_std": 0.8471602201461792,
"rewards/cosine_scaled_reward/mean": -0.048187918961048126,
"rewards/cosine_scaled_reward/std": 0.4703964293003082,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1912.0,
"completions/mean_length": 1684.71875,
"completions/mean_terminated_length": 1037.1304931640625,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"epoch": 0.06742857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17898693680763245,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0154,
"num_tokens": 7289875.0,
"reward": 0.19179533421993256,
"reward_std": 0.6819975972175598,
"rewards/cosine_scaled_reward/mean": -0.11503982543945312,
"rewards/cosine_scaled_reward/std": 0.4170202612876892,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1773.0,
"completions/mean_length": 1784.640625,
"completions/mean_terminated_length": 924.3333740234375,
"completions/min_length": 501.0,
"completions/min_terminated_length": 501.0,
"epoch": 0.06857142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18929247558116913,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0206,
"num_tokens": 7414980.0,
"reward": -0.2165871113538742,
"reward_std": 0.4006432890892029,
"rewards/cosine_scaled_reward/mean": -0.2411060631275177,
"rewards/cosine_scaled_reward/std": 0.22077766060829163,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1769.8125,
"completions/mean_terminated_length": 1058.888916015625,
"completions/min_length": 470.0,
"completions/min_terminated_length": 470.0,
"epoch": 0.06971428571428571,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18952307105064392,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0408,
"num_tokens": 7539768.0,
"reward": 0.1118076741695404,
"reward_std": 0.7766213417053223,
"rewards/cosine_scaled_reward/mean": -0.1315961629152298,
"rewards/cosine_scaled_reward/std": 0.3166446387767792,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 1529.890625,
"completions/mean_terminated_length": 904.586181640625,
"completions/min_length": 337.0,
"completions/min_terminated_length": 337.0,
"epoch": 0.07085714285714285,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17763926088809967,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.067,
"num_tokens": 7647913.0,
"reward": 0.33338215947151184,
"reward_std": 0.7777395248413086,
"rewards/cosine_scaled_reward/mean": -0.07549642026424408,
"rewards/cosine_scaled_reward/std": 0.42954275012016296,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.5037065148353577,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1890.0,
"completions/mean_length": 1445.671875,
"completions/mean_terminated_length": 1006.1351318359375,
"completions/min_length": 326.0,
"completions/min_terminated_length": 326.0,
"epoch": 0.072,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19693626463413239,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0716,
"num_tokens": 7750692.0,
"reward": 0.4808192253112793,
"reward_std": 0.7516437768936157,
"rewards/cosine_scaled_reward/mean": -0.05646540969610214,
"rewards/cosine_scaled_reward/std": 0.43912824988365173,
"rewards/format_reward/mean": 0.59375,
"rewards/format_reward/std": 0.49501484632492065,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1667.59375,
"completions/mean_terminated_length": 941.3636474609375,
"completions/min_length": 481.0,
"completions/min_terminated_length": 481.0,
"epoch": 0.07314285714285715,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21556542813777924,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0673,
"num_tokens": 7868370.0,
"reward": -0.013222754001617432,
"reward_std": 0.603374183177948,
"rewards/cosine_scaled_reward/mean": -0.17848637700080872,
"rewards/cosine_scaled_reward/std": 0.3722720146179199,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1521.0,
"completions/mean_length": 1636.25,
"completions/mean_terminated_length": 661.0526123046875,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.07428571428571429,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.196330264210701,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0079,
"num_tokens": 7983794.0,
"reward": -0.02694564312696457,
"reward_std": 0.43651264905929565,
"rewards/cosine_scaled_reward/mean": -0.17753534018993378,
"rewards/cosine_scaled_reward/std": 0.3935491740703583,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1875.0,
"completions/mean_length": 1233.84375,
"completions/mean_terminated_length": 745.3500366210938,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"epoch": 0.07542857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15430262684822083,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0418,
"num_tokens": 8072992.0,
"reward": 0.660415530204773,
"reward_std": 0.8181198835372925,
"rewards/cosine_scaled_reward/mean": 0.017707787454128265,
"rewards/cosine_scaled_reward/std": 0.4732138216495514,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1643.0,
"completions/mean_length": 1831.359375,
"completions/mean_terminated_length": 981.4615478515625,
"completions/min_length": 689.0,
"completions/min_terminated_length": 689.0,
"epoch": 0.07657142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18633423745632172,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0704,
"num_tokens": 8200935.0,
"reward": -0.28378796577453613,
"reward_std": 0.5433966517448425,
"rewards/cosine_scaled_reward/mean": -0.25908148288726807,
"rewards/cosine_scaled_reward/std": 0.23884578049182892,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1710.0,
"completions/mean_length": 945.390625,
"completions/mean_terminated_length": 636.6599731445312,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.07771428571428571,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1517961174249649,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.084,
"num_tokens": 8270696.0,
"reward": 0.8194406032562256,
"reward_std": 0.7585938572883606,
"rewards/cosine_scaled_reward/mean": 0.019095297902822495,
"rewards/cosine_scaled_reward/std": 0.46527862548828125,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1374.0,
"completions/mean_length": 1397.921875,
"completions/mean_terminated_length": 824.3235473632812,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"epoch": 0.07885714285714286,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1929248720407486,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0769,
"num_tokens": 8370507.0,
"reward": 0.018557976931333542,
"reward_std": 0.6102144122123718,
"rewards/cosine_scaled_reward/mean": -0.24072101712226868,
"rewards/cosine_scaled_reward/std": 0.2912290096282959,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1670.0,
"completions/mean_length": 1859.625,
"completions/mean_terminated_length": 842.4000244140625,
"completions/min_length": 510.0,
"completions/min_terminated_length": 510.0,
"epoch": 0.08,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20638665556907654,
"learning_rate": 3.387377967463493e-07,
"loss": 0.03,
"num_tokens": 8500979.0,
"reward": -0.2084158957004547,
"reward_std": 0.5423504114151001,
"rewards/cosine_scaled_reward/mean": -0.23702044785022736,
"rewards/cosine_scaled_reward/std": 0.24943575263023376,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.1875,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1880.0,
"completions/mean_length": 1677.53125,
"completions/mean_terminated_length": 1060.0833740234375,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.08114285714285714,
"epsilon_high_adjusted": 0.25625,
"epsilon_low_adjusted": 0.25625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20180849730968475,
"learning_rate": 3.250000000000001e-07,
"loss": 0.042,
"num_tokens": 8619061.0,
"reward": 0.19083726406097412,
"reward_std": 0.4570698142051697,
"rewards/cosine_scaled_reward/mean": -0.09989387542009354,
"rewards/cosine_scaled_reward/std": 0.5225576758384705,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1769.0,
"completions/mean_length": 1432.4375,
"completions/mean_terminated_length": 953.6666870117188,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.08228571428571428,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18141792714595795,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0705,
"num_tokens": 8721089.0,
"reward": 0.1964300572872162,
"reward_std": 0.4526848793029785,
"rewards/cosine_scaled_reward/mean": -0.1830349564552307,
"rewards/cosine_scaled_reward/std": 0.23432116210460663,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1711.0,
"completions/mean_length": 1940.59375,
"completions/mean_terminated_length": 1284.2222900390625,
"completions/min_length": 712.0,
"completions/min_terminated_length": 712.0,
"epoch": 0.08342857142857144,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19840198755264282,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0456,
"num_tokens": 8856015.0,
"reward": -0.27154141664505005,
"reward_std": 0.5744385719299316,
"rewards/cosine_scaled_reward/mean": -0.24514569342136383,
"rewards/cosine_scaled_reward/std": 0.2793368399143219,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.375,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 1764.953125,
"completions/mean_terminated_length": 1041.611083984375,
"completions/min_length": 508.0,
"completions/min_terminated_length": 508.0,
"epoch": 0.08457142857142858,
"epsilon_high_adjusted": 0.275,
"epsilon_low_adjusted": 0.275,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1787436455488205,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0899,
"num_tokens": 8978900.0,
"reward": 0.19575530290603638,
"reward_std": 0.7323085069656372,
"rewards/cosine_scaled_reward/mean": -0.06618484109640121,
"rewards/cosine_scaled_reward/std": 0.48987188935279846,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1696.3125,
"completions/mean_terminated_length": 1182.3077392578125,
"completions/min_length": 664.0,
"completions/min_terminated_length": 664.0,
"epoch": 0.08571428571428572,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18963930010795593,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0517,
"num_tokens": 9098424.0,
"reward": 0.3912990391254425,
"reward_std": 0.6338691115379333,
"rewards/cosine_scaled_reward/mean": -0.023100484162569046,
"rewards/cosine_scaled_reward/std": 0.47905832529067993,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1340.0,
"completions/mean_length": 1663.8125,
"completions/mean_terminated_length": 682.0,
"completions/min_length": 306.0,
"completions/min_terminated_length": 306.0,
"epoch": 0.08685714285714285,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1930101066827774,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0121,
"num_tokens": 9215308.0,
"reward": -0.3120652437210083,
"reward_std": 0.3665449023246765,
"rewards/cosine_scaled_reward/mean": -0.29665762186050415,
"rewards/cosine_scaled_reward/std": 0.17376884818077087,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1849.0,
"completions/mean_length": 1733.296875,
"completions/mean_terminated_length": 1040.9500732421875,
"completions/min_length": 568.0,
"completions/min_terminated_length": 568.0,
"epoch": 0.088,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1888083517551422,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0391,
"num_tokens": 9337815.0,
"reward": 0.12663224339485168,
"reward_std": 0.4842023551464081,
"rewards/cosine_scaled_reward/mean": -0.10855888575315475,
"rewards/cosine_scaled_reward/std": 0.3368559777736664,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1698.0,
"completions/mean_length": 1747.84375,
"completions/mean_terminated_length": 1036.9473876953125,
"completions/min_length": 634.0,
"completions/min_terminated_length": 634.0,
"epoch": 0.08914285714285715,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18638849258422852,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0141,
"num_tokens": 9460397.0,
"reward": 0.23273038864135742,
"reward_std": 0.6073111295700073,
"rewards/cosine_scaled_reward/mean": -0.047697313129901886,
"rewards/cosine_scaled_reward/std": 0.48325926065444946,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 1371.6875,
"completions/mean_terminated_length": 845.6666870117188,
"completions/min_length": 231.0,
"completions/min_terminated_length": 231.0,
"epoch": 0.09028571428571429,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1628364771604538,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0366,
"num_tokens": 9558281.0,
"reward": 0.29462432861328125,
"reward_std": 0.553225040435791,
"rewards/cosine_scaled_reward/mean": -0.13393783569335938,
"rewards/cosine_scaled_reward/std": 0.3635351061820984,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1994.0,
"completions/mean_length": 1768.25,
"completions/mean_terminated_length": 1234.181884765625,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.09142857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19675402343273163,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0099,
"num_tokens": 9681841.0,
"reward": 0.07091247290372849,
"reward_std": 0.7199949026107788,
"rewards/cosine_scaled_reward/mean": -0.17548125982284546,
"rewards/cosine_scaled_reward/std": 0.39285531640052795,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 1689.15625,
"completions/mean_terminated_length": 899.7000122070312,
"completions/min_length": 471.0,
"completions/min_terminated_length": 471.0,
"epoch": 0.09257142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2319558709859848,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0239,
"num_tokens": 9801219.0,
"reward": 0.11250954866409302,
"reward_std": 0.6092942953109741,
"rewards/cosine_scaled_reward/mean": -0.1468702107667923,
"rewards/cosine_scaled_reward/std": 0.3265360891819,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 1523.6875,
"completions/mean_terminated_length": 965.54833984375,
"completions/min_length": 474.0,
"completions/min_terminated_length": 474.0,
"epoch": 0.09371428571428571,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2026216685771942,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0721,
"num_tokens": 9909063.0,
"reward": 0.5980618000030518,
"reward_std": 0.7195340394973755,
"rewards/cosine_scaled_reward/mean": 0.04903092980384827,
"rewards/cosine_scaled_reward/std": 0.5532049536705017,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1559.015625,
"completions/mean_terminated_length": 1004.8333740234375,
"completions/min_length": 459.0,
"completions/min_terminated_length": 459.0,
"epoch": 0.09485714285714286,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19982481002807617,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.054,
"num_tokens": 10019928.0,
"reward": 0.3029702305793762,
"reward_std": 0.49975115060806274,
"rewards/cosine_scaled_reward/mean": -0.09851488471031189,
"rewards/cosine_scaled_reward/std": 0.3822130560874939,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1714.0,
"completions/mean_length": 1600.859375,
"completions/mean_terminated_length": 947.34619140625,
"completions/min_length": 433.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.096,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18633437156677246,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0865,
"num_tokens": 10133079.0,
"reward": 0.08319186419248581,
"reward_std": 0.6657856702804565,
"rewards/cosine_scaled_reward/mean": -0.1927790641784668,
"rewards/cosine_scaled_reward/std": 0.3336566388607025,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 1632.65625,
"completions/mean_terminated_length": 1242.48486328125,
"completions/min_length": 666.0,
"completions/min_terminated_length": 666.0,
"epoch": 0.09714285714285714,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16532936692237854,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0714,
"num_tokens": 10247889.0,
"reward": 0.259367436170578,
"reward_std": 0.7487314939498901,
"rewards/cosine_scaled_reward/mean": -0.151566281914711,
"rewards/cosine_scaled_reward/std": 0.3289223909378052,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 1671.6875,
"completions/mean_terminated_length": 953.2727661132812,
"completions/min_length": 617.0,
"completions/min_terminated_length": 617.0,
"epoch": 0.09828571428571428,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19505523145198822,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0388,
"num_tokens": 10365733.0,
"reward": 0.07625935971736908,
"reward_std": 0.707979679107666,
"rewards/cosine_scaled_reward/mean": -0.14937034249305725,
"rewards/cosine_scaled_reward/std": 0.39694419503211975,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1800.0,
"completions/mean_length": 1431.3125,
"completions/mean_terminated_length": 852.0,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.09942857142857142,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21713945269584656,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0879,
"num_tokens": 10467457.0,
"reward": 0.15931269526481628,
"reward_std": 0.6175140142440796,
"rewards/cosine_scaled_reward/mean": -0.18596863746643066,
"rewards/cosine_scaled_reward/std": 0.2911415696144104,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1565.265625,
"completions/mean_terminated_length": 944.607177734375,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.10057142857142858,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20137667655944824,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0868,
"num_tokens": 10578146.0,
"reward": 0.4069916307926178,
"reward_std": 0.7737945318222046,
"rewards/cosine_scaled_reward/mean": -0.023066692054271698,
"rewards/cosine_scaled_reward/std": 0.39842066168785095,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.501733124256134,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.1875,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1554.59375,
"completions/mean_terminated_length": 1217.0,
"completions/min_length": 537.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.10171428571428572,
"epsilon_high_adjusted": 0.246875,
"epsilon_low_adjusted": 0.246875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18177422881126404,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0376,
"num_tokens": 10688640.0,
"reward": 0.5939216017723083,
"reward_std": 0.7109141945838928,
"rewards/cosine_scaled_reward/mean": -0.046789199113845825,
"rewards/cosine_scaled_reward/std": 0.46814003586769104,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1729.0,
"completions/mean_length": 1442.46875,
"completions/mean_terminated_length": 971.5,
"completions/min_length": 374.0,
"completions/min_terminated_length": 374.0,
"epoch": 0.10285714285714286,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20881003141403198,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0727,
"num_tokens": 10790958.0,
"reward": 0.2375994473695755,
"reward_std": 0.6029033660888672,
"rewards/cosine_scaled_reward/mean": -0.17807528376579285,
"rewards/cosine_scaled_reward/std": 0.2965840995311737,
"rewards/format_reward/mean": 0.59375,
"rewards/format_reward/std": 0.49501484632492065,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 1856.3125,
"completions/mean_terminated_length": 1366.4444580078125,
"completions/min_length": 886.0,
"completions/min_terminated_length": 886.0,
"epoch": 0.104,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20903776586055756,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0415,
"num_tokens": 10920330.0,
"reward": -0.05261840671300888,
"reward_std": 0.6418163776397705,
"rewards/cosine_scaled_reward/mean": -0.19818420708179474,
"rewards/cosine_scaled_reward/std": 0.30021125078201294,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 1401.984375,
"completions/mean_terminated_length": 959.9736938476562,
"completions/min_length": 416.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.10514285714285715,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1638815999031067,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0186,
"num_tokens": 11020049.0,
"reward": 0.21965916454792023,
"reward_std": 0.5625333786010742,
"rewards/cosine_scaled_reward/mean": -0.19485794007778168,
"rewards/cosine_scaled_reward/std": 0.27887240052223206,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 1846.921875,
"completions/mean_terminated_length": 1128.7857666015625,
"completions/min_length": 544.0,
"completions/min_terminated_length": 544.0,
"epoch": 0.10628571428571429,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22502079606056213,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0765,
"num_tokens": 11149596.0,
"reward": -0.34294235706329346,
"reward_std": 0.4486418664455414,
"rewards/cosine_scaled_reward/mean": -0.28084617853164673,
"rewards/cosine_scaled_reward/std": 0.18859638273715973,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1641.0,
"completions/mean_length": 1736.5,
"completions/mean_terminated_length": 998.7368774414062,
"completions/min_length": 536.0,
"completions/min_terminated_length": 536.0,
"epoch": 0.10742857142857143,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2263384759426117,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0247,
"num_tokens": 11270500.0,
"reward": 0.068375363945961,
"reward_std": 0.3989260196685791,
"rewards/cosine_scaled_reward/mean": -0.1220623031258583,
"rewards/cosine_scaled_reward/std": 0.4283704161643982,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1739.734375,
"completions/mean_terminated_length": 1151.227294921875,
"completions/min_length": 620.0,
"completions/min_terminated_length": 620.0,
"epoch": 0.10857142857142857,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18776042759418488,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0606,
"num_tokens": 11392371.0,
"reward": -0.18143336474895477,
"reward_std": 0.5141602158546448,
"rewards/cosine_scaled_reward/mean": -0.2625916600227356,
"rewards/cosine_scaled_reward/std": 0.23184041678905487,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1686.0,
"completions/mean_length": 1542.140625,
"completions/mean_terminated_length": 931.6206665039062,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.10971428571428571,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17707978188991547,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0335,
"num_tokens": 11502204.0,
"reward": 0.4653927683830261,
"reward_std": 0.6023179292678833,
"rewards/cosine_scaled_reward/mean": 0.006133869290351868,
"rewards/cosine_scaled_reward/std": 0.4863370656967163,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.501733124256134,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.125,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 1533.03125,
"completions/mean_terminated_length": 1132.5,
"completions/min_length": 358.0,
"completions/min_terminated_length": 358.0,
"epoch": 0.11085714285714286,
"epsilon_high_adjusted": 0.24375000000000002,
"epsilon_low_adjusted": 0.24375000000000002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1965816169977188,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.036,
"num_tokens": 11610582.0,
"reward": 0.42514127492904663,
"reward_std": 0.6956003904342651,
"rewards/cosine_scaled_reward/mean": -0.10774186253547668,
"rewards/cosine_scaled_reward/std": 0.3644869327545166,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.453125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 1502.15625,
"completions/mean_terminated_length": 1049.8857421875,
"completions/min_length": 398.0,
"completions/min_terminated_length": 398.0,
"epoch": 0.112,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19055023789405823,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0435,
"num_tokens": 11717784.0,
"reward": 0.3386792242527008,
"reward_std": 0.560700535774231,
"rewards/cosine_scaled_reward/mean": -0.1197228878736496,
"rewards/cosine_scaled_reward/std": 0.3668956160545349,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.49776285886764526,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.375,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1972.0,
"completions/mean_length": 1663.796875,
"completions/mean_terminated_length": 753.8421020507812,
"completions/min_length": 384.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.11314285714285714,
"epsilon_high_adjusted": 0.275,
"epsilon_low_adjusted": 0.275,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17839431762695312,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0419,
"num_tokens": 11835435.0,
"reward": 0.13991518318653107,
"reward_std": 0.4746280312538147,
"rewards/cosine_scaled_reward/mean": -0.08629240095615387,
"rewards/cosine_scaled_reward/std": 0.40295156836509705,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completion_accuracy": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1838.0,
"completions/mean_length": 1376.015625,
"completions/mean_terminated_length": 972.8250122070312,
"completions/min_length": 583.0,
"completions/min_terminated_length": 583.0,
"epoch": 0.11428571428571428,
"epsilon_high_adjusted": 0.2,
"epsilon_low_adjusted": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16691775619983673,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0456,
"num_tokens": 11933212.0,
"reward": 0.7074599266052246,
"reward_std": 0.6797176003456116,
"rewards/cosine_scaled_reward/mean": 0.03341745585203171,
"rewards/cosine_scaled_reward/std": 0.4788829982280731,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 100
},
{
"epoch": 0.11428571428571428,
"step": 100,
"total_flos": 0.0,
"train_loss": 0.042700088126584886,
"train_runtime": 5405.613,
"train_samples_per_second": 1.184,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 11933212,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}