chenggong1995's picture
Model save
89fbd37 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.9960715844609345,
"eval_steps": 10000000000,
"global_step": 815,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1205357142857143,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 770.888427734375,
"completions/mean_terminated_length": 595.852783203125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.006110868616324749,
"grad_norm": 0.15881415948365019,
"learning_rate": 0.0,
"loss": 0.0224,
"num_tokens": 937540.0,
"reward": 0.1958705484867096,
"reward_std": 0.19535484910011292,
"rewards/accuracy_reward/mean": 0.1908482164144516,
"rewards/accuracy_reward/std": 0.39318913221359253,
"rewards/format_reward/mean": 0.010044642724096775,
"rewards/format_reward/std": 0.09977404028177261,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11858258928571427,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.5,
"completions/mean_length": 785.4727020263672,
"completions/mean_terminated_length": 615.7516174316406,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.030554343081623744,
"grad_norm": 0.1767328289689374,
"learning_rate": 4.878048780487805e-08,
"loss": 0.0243,
"num_tokens": 4684098.0,
"reward": 0.16782925091683865,
"reward_std": 0.21238624304533005,
"rewards/accuracy_reward/mean": 0.1618303582072258,
"rewards/accuracy_reward/std": 0.3662395253777504,
"rewards/format_reward/mean": 0.011997767956927419,
"rewards/format_reward/std": 0.10807658545672894,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11004464285714285,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.4,
"completions/mean_length": 759.0725830078125,
"completions/mean_terminated_length": 599.7246826171875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.06110868616324749,
"grad_norm": 0.18581806299325898,
"learning_rate": 1.097560975609756e-07,
"loss": 0.0139,
"num_tokens": 9296687.0,
"reward": 0.17388393878936767,
"reward_std": 0.20642079710960387,
"rewards/accuracy_reward/mean": 0.16919642835855483,
"rewards/accuracy_reward/std": 0.37258111238479613,
"rewards/format_reward/mean": 0.009375000186264516,
"rewards/format_reward/std": 0.09551291912794113,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10334821428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.6,
"completions/mean_length": 750.8370971679688,
"completions/mean_terminated_length": 601.4375366210937,
"completions/min_length": 2.2,
"completions/min_terminated_length": 2.2,
"epoch": 0.09166302924487124,
"grad_norm": 0.3736061943892377,
"learning_rate": 1.7073170731707317e-07,
"loss": 0.0204,
"num_tokens": 13852021.0,
"reward": 0.1564732164144516,
"reward_std": 0.2009236067533493,
"rewards/accuracy_reward/mean": 0.1524553567171097,
"rewards/accuracy_reward/std": 0.35877678990364076,
"rewards/format_reward/mean": 0.008035714365541935,
"rewards/format_reward/std": 0.08825668692588806,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11897321428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 774.8120849609375,
"completions/mean_terminated_length": 602.8999267578125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.12221737232649497,
"grad_norm": 0.19598270223049968,
"learning_rate": 2.3170731707317074e-07,
"loss": 0.019,
"num_tokens": 18537803.0,
"reward": 0.15993304252624513,
"reward_std": 0.19966006577014922,
"rewards/accuracy_reward/mean": 0.1562499985098839,
"rewards/accuracy_reward/std": 0.36170021891593934,
"rewards/format_reward/mean": 0.007366071408614516,
"rewards/format_reward/std": 0.08276789709925651,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11160714285714286,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.6,
"completions/mean_length": 782.6261474609375,
"completions/mean_terminated_length": 623.6066772460938,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.15277171540811874,
"grad_norm": 0.2223634191509812,
"learning_rate": 2.9268292682926825e-07,
"loss": 0.0215,
"num_tokens": 23263096.0,
"reward": 0.17544643878936766,
"reward_std": 0.22418100237846375,
"rewards/accuracy_reward/mean": 0.171875,
"rewards/accuracy_reward/std": 0.376137775182724,
"rewards/format_reward/mean": 0.007142857229337096,
"rewards/format_reward/std": 0.0823508433997631,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11004464285714285,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.6,
"completions/mean_length": 772.9310668945312,
"completions/mean_terminated_length": 615.305078125,
"completions/min_length": 2.4,
"completions/min_terminated_length": 2.4,
"epoch": 0.18332605848974248,
"grad_norm": 0.17733725929125416,
"learning_rate": 3.536585365853658e-07,
"loss": 0.0148,
"num_tokens": 27871043.0,
"reward": 0.18794643580913545,
"reward_std": 0.21233190596103668,
"rewards/accuracy_reward/mean": 0.1837053567171097,
"rewards/accuracy_reward/std": 0.3869286894798279,
"rewards/format_reward/mean": 0.008482143003493547,
"rewards/format_reward/std": 0.09140371978282928,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10602678571428573,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 771.5975708007812,
"completions/mean_terminated_length": 620.2588745117188,
"completions/min_length": 1.8,
"completions/min_terminated_length": 1.8,
"epoch": 0.2138804015713662,
"grad_norm": 0.1666671926822025,
"learning_rate": 4.146341463414634e-07,
"loss": 0.0378,
"num_tokens": 32528608.0,
"reward": 0.20000001192092895,
"reward_std": 0.22290224432945252,
"rewards/accuracy_reward/mean": 0.1939732164144516,
"rewards/accuracy_reward/std": 0.39474709033966066,
"rewards/format_reward/mean": 0.012053571455180646,
"rewards/format_reward/std": 0.10890180021524429,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11160714285714284,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 787.1473510742187,
"completions/mean_terminated_length": 628.6722290039063,
"completions/min_length": 3.8,
"completions/min_terminated_length": 3.8,
"epoch": 0.24443474465298995,
"grad_norm": 0.14488779442627023,
"learning_rate": 4.756097560975609e-07,
"loss": 0.0491,
"num_tokens": 37232716.0,
"reward": 0.18783482909202576,
"reward_std": 0.21656265556812287,
"rewards/accuracy_reward/mean": 0.18058035969734193,
"rewards/accuracy_reward/std": 0.38407248854637144,
"rewards/format_reward/mean": 0.01450892873108387,
"rewards/format_reward/std": 0.11844103038311005,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10714285714285712,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.8,
"completions/mean_length": 778.4797241210938,
"completions/mean_terminated_length": 626.24775390625,
"completions/min_length": 2.2,
"completions/min_terminated_length": 2.2,
"epoch": 0.2749890877346137,
"grad_norm": 0.42129197441608957,
"learning_rate": 5.365853658536586e-07,
"loss": 0.0574,
"num_tokens": 41873681.0,
"reward": 0.22678572237491607,
"reward_std": 0.22645699977874756,
"rewards/accuracy_reward/mean": 0.21785714328289033,
"rewards/accuracy_reward/std": 0.4128295660018921,
"rewards/format_reward/mean": 0.017857142351567747,
"rewards/format_reward/std": 0.1320108875632286,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09955357142857142,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.2,
"completions/mean_length": 773.639990234375,
"completions/mean_terminated_length": 632.7865966796875,
"completions/min_length": 2.4,
"completions/min_terminated_length": 2.4,
"epoch": 0.30554343081623747,
"grad_norm": 0.23363683442949074,
"learning_rate": 5.97560975609756e-07,
"loss": 0.0529,
"num_tokens": 46484964.0,
"reward": 0.21796876192092896,
"reward_std": 0.22946391999721527,
"rewards/accuracy_reward/mean": 0.20044642984867095,
"rewards/accuracy_reward/std": 0.39960110783576963,
"rewards/format_reward/mean": 0.03504464328289032,
"rewards/format_reward/std": 0.18281182646751404,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1029017857142857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.4,
"completions/mean_length": 776.3364013671875,
"completions/mean_terminated_length": 630.3668823242188,
"completions/min_length": 2.4,
"completions/min_terminated_length": 2.4,
"epoch": 0.3360977738978612,
"grad_norm": 1.5497170619543985,
"learning_rate": 6.585365853658536e-07,
"loss": 0.0618,
"num_tokens": 51153095.0,
"reward": 0.23917411863803864,
"reward_std": 0.251529660820961,
"rewards/accuracy_reward/mean": 0.2049107164144516,
"rewards/accuracy_reward/std": 0.40335396528244016,
"rewards/format_reward/mean": 0.0685267873108387,
"rewards/format_reward/std": 0.2510806113481522,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.2,
"completions/mean_length": 770.2853149414062,
"completions/mean_terminated_length": 628.3363647460938,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.36665211697948497,
"grad_norm": 1.3201348594744142,
"learning_rate": 7.195121951219512e-07,
"loss": 0.0424,
"num_tokens": 55789605.0,
"reward": 0.2799107253551483,
"reward_std": 0.2940471529960632,
"rewards/accuracy_reward/mean": 0.1872767835855484,
"rewards/accuracy_reward/std": 0.38942901492118837,
"rewards/format_reward/mean": 0.1852678582072258,
"rewards/format_reward/std": 0.3819054841995239,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08348214285714287,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.8,
"completions/mean_length": 775.243115234375,
"completions/mean_terminated_length": 659.3077392578125,
"completions/min_length": 3.2,
"completions/min_terminated_length": 3.2,
"epoch": 0.3972064600611087,
"grad_norm": 0.6241979041154287,
"learning_rate": 7.804878048780488e-07,
"loss": 0.0414,
"num_tokens": 60475678.0,
"reward": 0.38325895071029664,
"reward_std": 0.349049574136734,
"rewards/accuracy_reward/mean": 0.19776785969734192,
"rewards/accuracy_reward/std": 0.3972113490104675,
"rewards/format_reward/mean": 0.3709821403026581,
"rewards/format_reward/std": 0.47938224077224734,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.4,
"completions/mean_length": 789.9357421875,
"completions/mean_terminated_length": 647.59345703125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.4277608031427324,
"grad_norm": 3.0968235590707516,
"learning_rate": 8.414634146341463e-07,
"loss": 0.0367,
"num_tokens": 65204934.0,
"reward": 0.4755580544471741,
"reward_std": 0.323903352022171,
"rewards/accuracy_reward/mean": 0.18191964328289031,
"rewards/accuracy_reward/std": 0.3848208785057068,
"rewards/format_reward/mean": 0.5872767806053162,
"rewards/format_reward/std": 0.4898715138435364,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12388392857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.6,
"completions/mean_length": 852.7183349609375,
"completions/mean_terminated_length": 683.7888916015625,
"completions/min_length": 21.2,
"completions/min_terminated_length": 21.2,
"epoch": 0.45831514622435615,
"grad_norm": 1.5320665170830472,
"learning_rate": 9.024390243902439e-07,
"loss": 0.0641,
"num_tokens": 70214856.0,
"reward": 0.568526816368103,
"reward_std": 0.3136624157428741,
"rewards/accuracy_reward/mean": 0.1935267835855484,
"rewards/accuracy_reward/std": 0.39293824434280394,
"rewards/format_reward/mean": 0.7500000119209289,
"rewards/format_reward/std": 0.42248891592025756,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15066964285714285,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.2,
"completions/mean_length": 938.47548828125,
"completions/mean_terminated_length": 741.8728881835938,
"completions/min_length": 54.6,
"completions/min_terminated_length": 54.6,
"epoch": 0.4888694893059799,
"grad_norm": 0.7877123339546239,
"learning_rate": 9.634146341463414e-07,
"loss": 0.0511,
"num_tokens": 75612706.0,
"reward": 0.6741071701049804,
"reward_std": 0.280909526348114,
"rewards/accuracy_reward/mean": 0.2337053596973419,
"rewards/accuracy_reward/std": 0.42297143340110777,
"rewards/format_reward/mean": 0.8808035612106323,
"rewards/format_reward/std": 0.32103320956230164,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16651785714285713,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.8,
"completions/mean_length": 943.0940185546875,
"completions/mean_terminated_length": 722.4891723632812,
"completions/min_length": 74.2,
"completions/min_terminated_length": 74.2,
"epoch": 0.5194238323876037,
"grad_norm": 79.72392839014248,
"learning_rate": 9.999816308467719e-07,
"loss": 0.0431,
"num_tokens": 81047231.0,
"reward": 0.6685268163681031,
"reward_std": 0.2459824115037918,
"rewards/accuracy_reward/mean": 0.20245535671710968,
"rewards/accuracy_reward/std": 0.40179912447929383,
"rewards/format_reward/mean": 0.9321428537368774,
"rewards/format_reward/std": 0.24975821375846863,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.17700892857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 947.4239379882813,
"completions/mean_terminated_length": 710.8060668945312,
"completions/min_length": 39.4,
"completions/min_terminated_length": 39.4,
"epoch": 0.5499781754692274,
"grad_norm": 0.291932520320594,
"learning_rate": 9.997749933731396e-07,
"loss": 0.0619,
"num_tokens": 86464450.0,
"reward": 0.6984375357627869,
"reward_std": 0.22690512537956237,
"rewards/accuracy_reward/mean": 0.21964285969734193,
"rewards/accuracy_reward/std": 0.41324721574783324,
"rewards/format_reward/mean": 0.9575892806053161,
"rewards/format_reward/std": 0.20066657960414885,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.18214285714285713,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.8,
"completions/mean_length": 952.8315185546875,
"completions/mean_terminated_length": 708.9797485351562,
"completions/min_length": 86.8,
"completions/min_terminated_length": 86.8,
"epoch": 0.5805325185508512,
"grad_norm": 0.22931952800626815,
"learning_rate": 9.993388521915133e-07,
"loss": 0.0472,
"num_tokens": 91891095.0,
"reward": 0.6991071820259094,
"reward_std": 0.21199851036071776,
"rewards/accuracy_reward/mean": 0.21562499701976776,
"rewards/accuracy_reward/std": 0.4086563289165497,
"rewards/format_reward/mean": 0.9669642806053161,
"rewards/format_reward/std": 0.1772433876991272,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19553571428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.4,
"completions/mean_length": 958.6931274414062,
"completions/mean_terminated_length": 693.9798217773438,
"completions/min_length": 81.8,
"completions/min_terminated_length": 81.8,
"epoch": 0.6110868616324749,
"grad_norm": 0.27473928016336013,
"learning_rate": 9.98673407584059e-07,
"loss": 0.0569,
"num_tokens": 97331264.0,
"reward": 0.7299107432365417,
"reward_std": 0.22558192610740663,
"rewards/accuracy_reward/mean": 0.24129464626312255,
"rewards/accuracy_reward/std": 0.4275233745574951,
"rewards/format_reward/mean": 0.9772321343421936,
"rewards/format_reward/std": 0.1488254725933075,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19799107142857145,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.4,
"completions/mean_length": 958.1645629882812,
"completions/mean_terminated_length": 689.5414916992188,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.6416412047140987,
"grad_norm": 0.24105776096711912,
"learning_rate": 9.977789651323023e-07,
"loss": 0.057,
"num_tokens": 102735145.0,
"reward": 0.7639509320259095,
"reward_std": 0.20851088762283326,
"rewards/accuracy_reward/mean": 0.27120536267757417,
"rewards/accuracy_reward/std": 0.44363213181495664,
"rewards/format_reward/mean": 0.9854910612106323,
"rewards/format_reward/std": 0.11715486496686936,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.22165178571428573,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.2,
"completions/mean_length": 1011.181298828125,
"completions/mean_terminated_length": 716.3480346679687,
"completions/min_length": 87.6,
"completions/min_terminated_length": 87.6,
"epoch": 0.6721955477957224,
"grad_norm": 0.25752851995593745,
"learning_rate": 9.966559355768005e-07,
"loss": 0.0462,
"num_tokens": 108471237.0,
"reward": 0.711495578289032,
"reward_std": 0.2048900991678238,
"rewards/accuracy_reward/mean": 0.22165178656578063,
"rewards/accuracy_reward/std": 0.4142503201961517,
"rewards/format_reward/mean": 0.979687511920929,
"rewards/format_reward/std": 0.13945238292217255,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.17678571428571427,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.6,
"completions/mean_length": 911.7765991210938,
"completions/mean_terminated_length": 668.0103515625,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.7027498908773462,
"grad_norm": 0.7544051504203841,
"learning_rate": 9.953048346285245e-07,
"loss": 0.0427,
"num_tokens": 113687620.0,
"reward": 0.7469866514205933,
"reward_std": 0.21086025536060332,
"rewards/accuracy_reward/mean": 0.2558035671710968,
"rewards/accuracy_reward/std": 0.4357575476169586,
"rewards/format_reward/mean": 0.9823660850524902,
"rewards/format_reward/std": 0.13122147917747498,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12455357142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.4,
"completions/mean_length": 823.8100708007812,
"completions/mean_terminated_length": 649.6679077148438,
"completions/min_length": 65.6,
"completions/min_terminated_length": 65.6,
"epoch": 0.7333042339589699,
"grad_norm": 0.19620616918149955,
"learning_rate": 9.937262827320378e-07,
"loss": 0.0409,
"num_tokens": 118563265.0,
"reward": 0.7304687738418579,
"reward_std": 0.1787090927362442,
"rewards/accuracy_reward/mean": 0.23526785671710967,
"rewards/accuracy_reward/std": 0.4237225532531738,
"rewards/format_reward/mean": 0.9904017806053161,
"rewards/format_reward/std": 0.09243127331137657,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10111607142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.2,
"completions/mean_length": 781.3973510742187,
"completions/mean_terminated_length": 638.783203125,
"completions/min_length": 104.8,
"completions/min_terminated_length": 104.8,
"epoch": 0.7638585770405937,
"grad_norm": 0.3873163212234931,
"learning_rate": 9.919210047805791e-07,
"loss": 0.0432,
"num_tokens": 123244949.0,
"reward": 0.7643973588943481,
"reward_std": 0.1833624541759491,
"rewards/accuracy_reward/mean": 0.26763392686843873,
"rewards/accuracy_reward/std": 0.44135683178901675,
"rewards/format_reward/mean": 0.9935267925262451,
"rewards/format_reward/std": 0.0794298842549324,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08794642857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.2,
"completions/mean_length": 748.4192260742187,
"completions/mean_terminated_length": 623.27763671875,
"completions/min_length": 102.2,
"completions/min_terminated_length": 102.2,
"epoch": 0.7944129201222174,
"grad_norm": 0.2247654782468763,
"learning_rate": 9.898898297831805e-07,
"loss": 0.0344,
"num_tokens": 127697243.0,
"reward": 0.7719866394996643,
"reward_std": 0.18199025988578796,
"rewards/accuracy_reward/mean": 0.27522321939468386,
"rewards/accuracy_reward/std": 0.4465114951133728,
"rewards/format_reward/mean": 0.9935267806053162,
"rewards/format_reward/std": 0.07858637794852256,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08683035714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2026.2,
"completions/mean_length": 771.8107543945313,
"completions/mean_terminated_length": 650.6771118164063,
"completions/min_length": 97.2,
"completions/min_terminated_length": 97.2,
"epoch": 0.8249672632038412,
"grad_norm": 0.2798829097392128,
"learning_rate": 9.87633690483977e-07,
"loss": 0.0372,
"num_tokens": 132318403.0,
"reward": 0.758147370815277,
"reward_std": 0.18273472785949707,
"rewards/accuracy_reward/mean": 0.2638392835855484,
"rewards/accuracy_reward/std": 0.4399906635284424,
"rewards/format_reward/mean": 0.9886160731315613,
"rewards/format_reward/std": 0.09895561486482621,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09486607142857142,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.2,
"completions/mean_length": 795.43955078125,
"completions/mean_terminated_length": 664.3606079101562,
"completions/min_length": 93.8,
"completions/min_terminated_length": 93.8,
"epoch": 0.8555216062854648,
"grad_norm": 0.21329881145583074,
"learning_rate": 9.851536229338746e-07,
"loss": 0.0447,
"num_tokens": 137023468.0,
"reward": 0.7857143163681031,
"reward_std": 0.17908414602279663,
"rewards/accuracy_reward/mean": 0.28772321343421936,
"rewards/accuracy_reward/std": 0.45079903602600097,
"rewards/format_reward/mean": 0.9959821343421936,
"rewards/format_reward/std": 0.058868531882762906,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10825892857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2026.8,
"completions/mean_length": 813.8167724609375,
"completions/mean_terminated_length": 663.8405151367188,
"completions/min_length": 103.2,
"completions/min_terminated_length": 103.2,
"epoch": 0.8860759493670886,
"grad_norm": 0.2922638336693392,
"learning_rate": 9.824507660147829e-07,
"loss": 0.0269,
"num_tokens": 141827911.0,
"reward": 0.7977679014205933,
"reward_std": 0.1762310028076172,
"rewards/accuracy_reward/mean": 0.3008928596973419,
"rewards/accuracy_reward/std": 0.458146858215332,
"rewards/format_reward/mean": 0.99375,
"rewards/format_reward/std": 0.07701331004500389,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10334821428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.8,
"completions/mean_length": 815.0272583007812,
"completions/mean_terminated_length": 672.932568359375,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.9166302924487123,
"grad_norm": 0.2200709505164526,
"learning_rate": 9.795263609166241e-07,
"loss": 0.0201,
"num_tokens": 146688457.0,
"reward": 0.7667410969734192,
"reward_std": 0.1663602650165558,
"rewards/accuracy_reward/mean": 0.26874999701976776,
"rewards/accuracy_reward/std": 0.44311601519584654,
"rewards/format_reward/mean": 0.9959821343421936,
"rewards/format_reward/std": 0.06237732619047165,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.14174107142857142,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.8,
"completions/mean_length": 867.5750366210938,
"completions/mean_terminated_length": 672.6513916015625,
"completions/min_length": 104.2,
"completions/min_terminated_length": 104.2,
"epoch": 0.947184635530336,
"grad_norm": 3.9175247421368873,
"learning_rate": 9.763817505673613e-07,
"loss": 0.013,
"num_tokens": 151751633.0,
"reward": 0.7524554014205933,
"reward_std": 0.1689078688621521,
"rewards/accuracy_reward/mean": 0.25468750596046447,
"rewards/accuracy_reward/std": 0.434172123670578,
"rewards/format_reward/mean": 0.9955357193946839,
"rewards/format_reward/std": 0.06422311663627625,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13950892857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.4,
"completions/mean_length": 872.5841918945313,
"completions/mean_terminated_length": 681.9927001953125,
"completions/min_length": 118.8,
"completions/min_terminated_length": 118.8,
"epoch": 0.9777389786119598,
"grad_norm": 0.09237595076171519,
"learning_rate": 9.73018379016306e-07,
"loss": 0.026,
"num_tokens": 156840938.0,
"reward": 0.7686384320259094,
"reward_std": 0.1452054500579834,
"rewards/accuracy_reward/mean": 0.2703124970197678,
"rewards/accuracy_reward/std": 0.44216405153274535,
"rewards/format_reward/mean": 0.9966517806053161,
"rewards/format_reward/std": 0.05593275800347328,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11732755602240896,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.6,
"completions/mean_length": 871.1178344726562,
"completions/mean_terminated_length": 714.471337890625,
"completions/min_length": 121.4,
"completions/min_terminated_length": 121.4,
"epoch": 1.0122217372326494,
"grad_norm": 0.13333631435873888,
"learning_rate": 9.69437790770992e-07,
"loss": 0.0192,
"num_tokens": 161885270.0,
"reward": 0.7757812857627868,
"reward_std": 0.15525116622447968,
"rewards/accuracy_reward/mean": 0.27767857909202576,
"rewards/accuracy_reward/std": 0.44785426259040834,
"rewards/format_reward/mean": 0.9962053537368775,
"rewards/format_reward/std": 0.05871548354625702,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11071428571428572,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.6,
"completions/mean_length": 832.9765991210937,
"completions/mean_terminated_length": 681.9777954101562,
"completions/min_length": 71.6,
"completions/min_terminated_length": 71.6,
"epoch": 1.0427760803142732,
"grad_norm": 0.12564407803252095,
"learning_rate": 9.656416300879148e-07,
"loss": 0.0252,
"num_tokens": 166785109.0,
"reward": 0.7676339507102966,
"reward_std": 0.15091821551322937,
"rewards/accuracy_reward/mean": 0.2696428596973419,
"rewards/accuracy_reward/std": 0.4421641111373901,
"rewards/format_reward/mean": 0.9959821462631225,
"rewards/format_reward/std": 0.0620421789586544,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09486607142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.2,
"completions/mean_length": 826.4540771484375,
"completions/mean_terminated_length": 698.4597412109375,
"completions/min_length": 143.2,
"completions/min_terminated_length": 143.2,
"epoch": 1.073330423395897,
"grad_norm": 0.2544410900753877,
"learning_rate": 9.616316402174656e-07,
"loss": 0.0411,
"num_tokens": 171643703.0,
"reward": 0.7573660969734192,
"reward_std": 0.17742253839969635,
"rewards/accuracy_reward/mean": 0.26004464328289034,
"rewards/accuracy_reward/std": 0.43822175860404966,
"rewards/format_reward/mean": 0.9946428418159485,
"rewards/format_reward/std": 0.06691066473722458,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09955357142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.4,
"completions/mean_length": 841.521240234375,
"completions/mean_terminated_length": 708.1962890625,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 1.1038847664775207,
"grad_norm": 0.08747091378742429,
"learning_rate": 9.574096626034077e-07,
"loss": 0.0295,
"num_tokens": 176546414.0,
"reward": 0.7808035969734192,
"reward_std": 0.16267039477825165,
"rewards/accuracy_reward/mean": 0.28281249701976774,
"rewards/accuracy_reward/std": 0.4487378478050232,
"rewards/format_reward/mean": 0.9959821462631225,
"rewards/format_reward/std": 0.062224668264389035,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10758928571428575,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 854.6339721679688,
"completions/mean_terminated_length": 710.7416015625,
"completions/min_length": 136.2,
"completions/min_terminated_length": 136.2,
"epoch": 1.1344391095591444,
"grad_norm": 0.10836785274917456,
"learning_rate": 9.529776360372575e-07,
"loss": 0.0222,
"num_tokens": 181534502.0,
"reward": 0.7752232670783996,
"reward_std": 0.16252341270446777,
"rewards/accuracy_reward/mean": 0.2763392925262451,
"rewards/accuracy_reward/std": 0.446439266204834,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.04102582335472107,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09084821428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.4,
"completions/mean_length": 823.7250366210938,
"completions/mean_terminated_length": 701.4886596679687,
"completions/min_length": 138.8,
"completions/min_terminated_length": 138.8,
"epoch": 1.1649934526407681,
"grad_norm": 0.27365355800487584,
"learning_rate": 9.483375957679658e-07,
"loss": 0.0322,
"num_tokens": 186409470.0,
"reward": 0.7799107551574707,
"reward_std": 0.15788044035434723,
"rewards/accuracy_reward/mean": 0.2828125,
"rewards/accuracy_reward/std": 0.450066876411438,
"rewards/format_reward/mean": 0.9941964387893677,
"rewards/format_reward/std": 0.07516667991876602,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0513392857142857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 745.5102905273437,
"completions/mean_terminated_length": 675.1964477539062,
"completions/min_length": 100.6,
"completions/min_terminated_length": 100.6,
"epoch": 1.195547795722392,
"grad_norm": 0.11622896960479281,
"learning_rate": 9.434916725673022e-07,
"loss": 0.0206,
"num_tokens": 190930508.0,
"reward": 0.7508928775787354,
"reward_std": 0.1432833790779114,
"rewards/accuracy_reward/mean": 0.2517857164144516,
"rewards/accuracy_reward/std": 0.4329728066921234,
"rewards/format_reward/mean": 0.9982142806053161,
"rewards/format_reward/std": 0.03614754155278206,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0497767857142857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 741.8364379882812,
"completions/mean_terminated_length": 673.540771484375,
"completions/min_length": 102.2,
"completions/min_terminated_length": 102.2,
"epoch": 1.2261021388040156,
"grad_norm": 0.12025594911809087,
"learning_rate": 9.384420917513751e-07,
"loss": 0.0277,
"num_tokens": 195465647.0,
"reward": 0.7793527126312256,
"reward_std": 0.16832945346832276,
"rewards/accuracy_reward/mean": 0.28147320747375487,
"rewards/accuracy_reward/std": 0.44833584427833556,
"rewards/format_reward/mean": 0.9957589387893677,
"rewards/format_reward/std": 0.06345580816268921,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04241071428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.6,
"completions/mean_length": 721.6792724609375,
"completions/mean_terminated_length": 662.9951904296875,
"completions/min_length": 133.6,
"completions/min_terminated_length": 133.6,
"epoch": 1.2566564818856394,
"grad_norm": 0.16981863836060604,
"learning_rate": 9.331911721587345e-07,
"loss": 0.0258,
"num_tokens": 199837642.0,
"reward": 0.8217634320259094,
"reward_std": 0.1668698877096176,
"rewards/accuracy_reward/mean": 0.32321428060531615,
"rewards/accuracy_reward/std": 0.465910816192627,
"rewards/format_reward/mean": 0.9970982074737549,
"rewards/format_reward/std": 0.0474703922867775,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 702.5625244140625,
"completions/mean_terminated_length": 650.3322509765625,
"completions/min_length": 128.8,
"completions/min_terminated_length": 128.8,
"epoch": 1.2872108249672631,
"grad_norm": 0.1779204185282472,
"learning_rate": 9.277413250855295e-07,
"loss": 0.0209,
"num_tokens": 204120490.0,
"reward": 0.7945312976837158,
"reward_std": 0.14636826515197754,
"rewards/accuracy_reward/mean": 0.29598214030265807,
"rewards/accuracy_reward/std": 0.4555119574069977,
"rewards/format_reward/mean": 0.9970982074737549,
"rewards/format_reward/std": 0.050687269866466524,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.038616071428571444,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1970.2,
"completions/mean_length": 705.130615234375,
"completions/mean_terminated_length": 651.2255004882812,
"completions/min_length": 117.6,
"completions/min_terminated_length": 117.6,
"epoch": 1.3177651680488869,
"grad_norm": 0.11235672853706381,
"learning_rate": 9.220950531782068e-07,
"loss": 0.0205,
"num_tokens": 208492307.0,
"reward": 0.7922991394996644,
"reward_std": 0.14874889701604843,
"rewards/accuracy_reward/mean": 0.2935267806053162,
"rewards/accuracy_reward/std": 0.45490965247154236,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04770735502243042,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04531249999999998,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.4,
"completions/mean_length": 724.0094116210937,
"completions/mean_terminated_length": 661.1902099609375,
"completions/min_length": 116.2,
"completions/min_terminated_length": 116.2,
"epoch": 1.3483195111305106,
"grad_norm": 0.21618506319361352,
"learning_rate": 9.162549492842602e-07,
"loss": 0.0186,
"num_tokens": 212840781.0,
"reward": 0.7775670051574707,
"reward_std": 0.15426733791828157,
"rewards/accuracy_reward/mean": 0.27946428656578065,
"rewards/accuracy_reward/std": 0.44824106693267823,
"rewards/format_reward/mean": 0.9962053537368775,
"rewards/format_reward/std": 0.0598295733332634,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.048214285714285696,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.4,
"completions/mean_length": 738.7886474609375,
"completions/mean_terminated_length": 672.33486328125,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 1.3788738542121344,
"grad_norm": 0.5672502376903732,
"learning_rate": 9.102236952615588e-07,
"loss": 0.0233,
"num_tokens": 217336258.0,
"reward": 0.8011161088943481,
"reward_std": 0.15403492003679276,
"rewards/accuracy_reward/mean": 0.30245535969734194,
"rewards/accuracy_reward/std": 0.4566509246826172,
"rewards/format_reward/mean": 0.9973214149475098,
"rewards/format_reward/std": 0.0498233363032341,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05200892857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2007.2,
"completions/mean_length": 763.8837280273438,
"completions/mean_terminated_length": 693.3828979492188,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 1.4094281972937581,
"grad_norm": 0.1506768290492925,
"learning_rate": 9.040040607467998e-07,
"loss": 0.0213,
"num_tokens": 221888289.0,
"reward": 0.8059152126312256,
"reward_std": 0.16730209589004516,
"rewards/accuracy_reward/mean": 0.30691964030265806,
"rewards/accuracy_reward/std": 0.46105764508247377,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.0392449900507927,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0591517857142857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.2,
"completions/mean_length": 774.4962280273437,
"completions/mean_terminated_length": 694.5208862304687,
"completions/min_length": 139.4,
"completions/min_terminated_length": 139.4,
"epoch": 1.4399825403753819,
"grad_norm": 0.09504335900254936,
"learning_rate": 8.97598901883653e-07,
"loss": 0.0231,
"num_tokens": 226517008.0,
"reward": 0.7858259201049804,
"reward_std": 0.16012441515922546,
"rewards/accuracy_reward/mean": 0.2870535671710968,
"rewards/accuracy_reward/std": 0.4516939163208008,
"rewards/format_reward/mean": 0.9975446224212646,
"rewards/format_reward/std": 0.048042502254247665,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.047321428571428556,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.4,
"completions/mean_length": 753.5406616210937,
"completions/mean_terminated_length": 689.3065673828125,
"completions/min_length": 116.2,
"completions/min_terminated_length": 116.2,
"epoch": 1.4705368834570056,
"grad_norm": 0.1786738253060093,
"learning_rate": 8.910111600111784e-07,
"loss": 0.0166,
"num_tokens": 231081470.0,
"reward": 0.798995566368103,
"reward_std": 0.16656452119350434,
"rewards/accuracy_reward/mean": 0.2995535731315613,
"rewards/accuracy_reward/std": 0.4570247888565063,
"rewards/format_reward/mean": 0.9988839268684387,
"rewards/format_reward/std": 0.020022178441286086,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06272321428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.8,
"completions/mean_length": 779.9359741210938,
"completions/mean_terminated_length": 695.1734497070313,
"completions/min_length": 136.2,
"completions/min_terminated_length": 136.2,
"epoch": 1.5010912265386294,
"grad_norm": 0.18954232465278012,
"learning_rate": 8.842438603131231e-07,
"loss": 0.0215,
"num_tokens": 235752599.0,
"reward": 0.8042410969734192,
"reward_std": 0.16244812905788422,
"rewards/accuracy_reward/mean": 0.3051339268684387,
"rewards/accuracy_reward/std": 0.460322242975235,
"rewards/format_reward/mean": 0.9982142806053161,
"rewards/format_reward/std": 0.036147540807724,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.047321428571428584,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.6,
"completions/mean_length": 756.9152099609375,
"completions/mean_terminated_length": 692.7505004882812,
"completions/min_length": 148.2,
"completions/min_terminated_length": 148.2,
"epoch": 1.5316455696202531,
"grad_norm": 0.16452679161969938,
"learning_rate": 8.773001104287137e-07,
"loss": 0.0196,
"num_tokens": 240301251.0,
"reward": 0.7897321820259094,
"reward_std": 0.15674711763858795,
"rewards/accuracy_reward/mean": 0.29040178656578064,
"rewards/accuracy_reward/std": 0.4533629357814789,
"rewards/format_reward/mean": 0.9986607074737549,
"rewards/format_reward/std": 0.032250724732875824,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05044642857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.6,
"completions/mean_length": 779.5279296875,
"completions/mean_terminated_length": 712.1023315429687,
"completions/min_length": 137.8,
"completions/min_terminated_length": 137.8,
"epoch": 1.5621999127018769,
"grad_norm": 0.17402626013276218,
"learning_rate": 8.701830990255843e-07,
"loss": 0.0183,
"num_tokens": 244964872.0,
"reward": 0.8008929014205932,
"reward_std": 0.1571328818798065,
"rewards/accuracy_reward/mean": 0.3017857223749161,
"rewards/accuracy_reward/std": 0.45720450282096864,
"rewards/format_reward/mean": 0.9982142925262452,
"rewards/format_reward/std": 0.0257643923163414,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04285714285714284,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.6,
"completions/mean_length": 768.3616455078125,
"completions/mean_terminated_length": 711.1112915039063,
"completions/min_length": 144.2,
"completions/min_terminated_length": 144.2,
"epoch": 1.5927542557835006,
"grad_norm": 0.21275520014326793,
"learning_rate": 8.628960943354964e-07,
"loss": 0.0257,
"num_tokens": 249571852.0,
"reward": 0.7987723588943482,
"reward_std": 0.14928914606571198,
"rewards/accuracy_reward/mean": 0.2995535671710968,
"rewards/accuracy_reward/std": 0.45520797967910764,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670675873756,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03928571428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.6,
"completions/mean_length": 756.3906494140625,
"completions/mean_terminated_length": 703.5931640625,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 1.6233085988651244,
"grad_norm": 0.3399675848185139,
"learning_rate": 8.5544244265352e-07,
"loss": 0.019,
"num_tokens": 254129050.0,
"reward": 0.7928571701049805,
"reward_std": 0.15490830242633818,
"rewards/accuracy_reward/mean": 0.29352678656578063,
"rewards/accuracy_reward/std": 0.4540963590145111,
"rewards/format_reward/mean": 0.9986606955528259,
"rewards/format_reward/std": 0.036169955134391786,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04285714285714284,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.4,
"completions/mean_length": 784.441552734375,
"completions/mean_terminated_length": 727.8777465820312,
"completions/min_length": 130.8,
"completions/min_terminated_length": 130.8,
"epoch": 1.653862941946748,
"grad_norm": 0.13020680741503093,
"learning_rate": 8.478255668013688e-07,
"loss": 0.0189,
"num_tokens": 258782500.0,
"reward": 0.7985491514205932,
"reward_std": 0.16011734902858735,
"rewards/accuracy_reward/mean": 0.29910714030265806,
"rewards/accuracy_reward/std": 0.45711545944213866,
"rewards/format_reward/mean": 0.9988839268684387,
"rewards/format_reward/std": 0.025569194555282594,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0388392857142857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.8,
"completions/mean_length": 752.8703491210938,
"completions/mean_terminated_length": 700.4107543945313,
"completions/min_length": 131.6,
"completions/min_terminated_length": 131.6,
"epoch": 1.6844172850283718,
"grad_norm": 0.09516473489346335,
"learning_rate": 8.400489645555913e-07,
"loss": 0.0181,
"num_tokens": 263356343.0,
"reward": 0.7926339626312255,
"reward_std": 0.13618116676807404,
"rewards/accuracy_reward/mean": 0.2930803567171097,
"rewards/accuracy_reward/std": 0.45382532477378845,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.022806893289089202,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.035937500000000025,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1989.6,
"completions/mean_length": 744.4375366210937,
"completions/mean_terminated_length": 695.6819580078125,
"completions/min_length": 151.8,
"completions/min_terminated_length": 151.8,
"epoch": 1.7149716281099956,
"grad_norm": 0.15549132521172215,
"learning_rate": 8.32116207041343e-07,
"loss": 0.0157,
"num_tokens": 267909335.0,
"reward": 0.7906250357627869,
"reward_std": 0.13802923113107682,
"rewards/accuracy_reward/mean": 0.29107142686843873,
"rewards/accuracy_reward/std": 0.4523549020290375,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.022806893289089202,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.031696428571428556,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.4,
"completions/mean_length": 767.20673828125,
"completions/mean_terminated_length": 725.3736938476562,
"completions/min_length": 162.8,
"completions/min_terminated_length": 162.8,
"epoch": 1.7455259711916193,
"grad_norm": 0.13521071199400242,
"learning_rate": 8.240309370924757e-07,
"loss": 0.022,
"num_tokens": 272499365.0,
"reward": 0.8069196820259095,
"reward_std": 0.15525558590888977,
"rewards/accuracy_reward/mean": 0.3073660671710968,
"rewards/accuracy_reward/std": 0.4584523618221283,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.01824134439229965,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03236607142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 747.6482543945312,
"completions/mean_terminated_length": 704.1689819335937,
"completions/min_length": 136.6,
"completions/min_terminated_length": 136.6,
"epoch": 1.776080314273243,
"grad_norm": 0.09997634934328843,
"learning_rate": 8.15796867578697e-07,
"loss": 0.0135,
"num_tokens": 277009557.0,
"reward": 0.8188616394996643,
"reward_std": 0.14347042739391327,
"rewards/accuracy_reward/mean": 0.3191964328289032,
"rewards/accuracy_reward/std": 0.46494446992874144,
"rewards/format_reward/mean": 0.9993303418159485,
"rewards/format_reward/std": 0.020044592767953874,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03705357142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 777.629736328125,
"completions/mean_terminated_length": 728.7614379882813,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 1.8066346573548668,
"grad_norm": 0.11283666972583091,
"learning_rate": 8.074177797005677e-07,
"loss": 0.0165,
"num_tokens": 281669506.0,
"reward": 0.7588170051574707,
"reward_std": 0.15343317985534669,
"rewards/accuracy_reward/mean": 0.2595982104539871,
"rewards/accuracy_reward/std": 0.43734992146492,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.024350765347480773,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03325892857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1996.6,
"completions/mean_length": 760.6272583007812,
"completions/mean_terminated_length": 716.4204345703125,
"completions/min_length": 166.2,
"completions/min_terminated_length": 166.2,
"epoch": 1.8371890004364906,
"grad_norm": 0.2361778065235896,
"learning_rate": 7.988975212531219e-07,
"loss": 0.0213,
"num_tokens": 286265020.0,
"reward": 0.7878348588943481,
"reward_std": 0.15699843168258668,
"rewards/accuracy_reward/mean": 0.2886160731315613,
"rewards/accuracy_reward/std": 0.4517777502536774,
"rewards/format_reward/mean": 0.998437476158142,
"rewards/format_reward/std": 0.038285937160253525,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02991071428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.6,
"completions/mean_length": 737.7507080078125,
"completions/mean_terminated_length": 697.3485717773438,
"completions/min_length": 144.8,
"completions/min_terminated_length": 144.8,
"epoch": 1.8677433435181143,
"grad_norm": 0.09126999711545696,
"learning_rate": 7.902400048589051e-07,
"loss": 0.0182,
"num_tokens": 290702311.0,
"reward": 0.8001116394996644,
"reward_std": 0.14078624546527863,
"rewards/accuracy_reward/mean": 0.3002232164144516,
"rewards/accuracy_reward/std": 0.45618838667869566,
"rewards/format_reward/mean": 0.9997767806053162,
"rewards/format_reward/std": 0.006681530922651291,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0392857142857143,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.6,
"completions/mean_length": 762.6062866210938,
"completions/mean_terminated_length": 710.1842163085937,
"completions/min_length": 123.8,
"completions/min_terminated_length": 123.8,
"epoch": 1.898297686599738,
"grad_norm": 0.22123281107398743,
"learning_rate": 7.814492061712428e-07,
"loss": 0.0151,
"num_tokens": 295299843.0,
"reward": 0.8112723588943481,
"reward_std": 0.14834096431732177,
"rewards/accuracy_reward/mean": 0.31183035373687745,
"rewards/accuracy_reward/std": 0.46237130761146544,
"rewards/format_reward/mean": 0.9988839268684387,
"rewards/format_reward/std": 0.025569194555282594,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03705357142857142,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1994.6,
"completions/mean_length": 764.9411010742188,
"completions/mean_terminated_length": 715.6682983398438,
"completions/min_length": 140.2,
"completions/min_terminated_length": 140.2,
"epoch": 1.9288520296813618,
"grad_norm": 0.21704155377729267,
"learning_rate": 7.725291620485652e-07,
"loss": 0.0176,
"num_tokens": 299885843.0,
"reward": 0.7936384320259094,
"reward_std": 0.14965083748102187,
"rewards/accuracy_reward/mean": 0.29486607015132904,
"rewards/accuracy_reward/std": 0.45394750833511355,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.048688821494579315,
"step": 315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03727678571428572,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.6,
"completions/mean_length": 771.9966918945313,
"completions/mean_terminated_length": 722.7735107421875,
"completions/min_length": 184.4,
"completions/min_terminated_length": 184.4,
"epoch": 1.9594063727629856,
"grad_norm": 0.1619916913219656,
"learning_rate": 7.63483968700624e-07,
"loss": 0.0189,
"num_tokens": 304466700.0,
"reward": 0.8251116514205933,
"reward_std": 0.16231110244989394,
"rewards/accuracy_reward/mean": 0.32566964626312256,
"rewards/accuracy_reward/std": 0.4671235501766205,
"rewards/format_reward/mean": 0.9988839030265808,
"rewards/format_reward/std": 0.033407654613256454,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05647321428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.8,
"completions/mean_length": 825.4569580078125,
"completions/mean_terminated_length": 752.82509765625,
"completions/min_length": 180.4,
"completions/min_terminated_length": 180.4,
"epoch": 1.9899607158446093,
"grad_norm": 0.10053555689980577,
"learning_rate": 7.543177798074562e-07,
"loss": 0.0161,
"num_tokens": 309342371.0,
"reward": 0.8236607432365417,
"reward_std": 0.16026362478733064,
"rewards/accuracy_reward/mean": 0.3247767806053162,
"rewards/accuracy_reward/std": 0.4665621817111969,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.04102582409977913,
"step": 325
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06180847338935576,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.8,
"completions/mean_length": 844.2710327148437,
"completions/mean_terminated_length": 764.9605590820313,
"completions/min_length": 202.8,
"completions/min_terminated_length": 202.8,
"epoch": 2.024443474465299,
"grad_norm": 0.276734985009805,
"learning_rate": 7.45034804611955e-07,
"loss": 0.0177,
"num_tokens": 314315823.0,
"reward": 0.8000000357627869,
"reward_std": 0.14559195637702943,
"rewards/accuracy_reward/mean": 0.30022320747375486,
"rewards/accuracy_reward/std": 0.4581437647342682,
"rewards/format_reward/mean": 0.9995535612106323,
"rewards/format_reward/std": 0.013363061845302582,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0580357142857143,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.8,
"completions/mean_length": 846.123486328125,
"completions/mean_terminated_length": 771.8980712890625,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 2.054997817546923,
"grad_norm": 0.11027573622781695,
"learning_rate": 7.356393059869272e-07,
"loss": 0.0215,
"num_tokens": 319262760.0,
"reward": 0.8050223469734192,
"reward_std": 0.14204435050487518,
"rewards/accuracy_reward/mean": 0.3053571403026581,
"rewards/accuracy_reward/std": 0.4595285415649414,
"rewards/format_reward/mean": 0.9993303537368774,
"rewards/format_reward/std": 0.016125363111495972,
"step": 335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06964285714285715,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 845.8031494140625,
"completions/mean_terminated_length": 755.8747924804687,
"completions/min_length": 186.8,
"completions/min_terminated_length": 186.8,
"epoch": 2.0855521606285463,
"grad_norm": 0.1474712703415126,
"learning_rate": 7.261355984775207e-07,
"loss": 0.0159,
"num_tokens": 324183094.0,
"reward": 0.8159598469734192,
"reward_std": 0.15490956604480743,
"rewards/accuracy_reward/mean": 0.3165178596973419,
"rewards/accuracy_reward/std": 0.46469891667366026,
"rewards/format_reward/mean": 0.9988839268684387,
"rewards/format_reward/std": 0.025569193810224534,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.2,
"completions/mean_length": 850.1359741210938,
"completions/mean_terminated_length": 761.6943969726562,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 2.1161065037101703,
"grad_norm": 0.11472996418505006,
"learning_rate": 7.16528046319926e-07,
"loss": 0.0196,
"num_tokens": 329179287.0,
"reward": 0.8004464626312255,
"reward_std": 0.15879366397857667,
"rewards/accuracy_reward/mean": 0.3006696403026581,
"rewards/accuracy_reward/std": 0.45732685923576355,
"rewards/format_reward/mean": 0.9995535612106323,
"rewards/format_reward/std": 0.013363061845302582,
"step": 345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06830357142857146,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.8,
"completions/mean_length": 852.2819458007813,
"completions/mean_terminated_length": 764.2258666992187,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 2.146660846791794,
"grad_norm": 0.17200982778085797,
"learning_rate": 7.068210614372567e-07,
"loss": 0.0155,
"num_tokens": 334167246.0,
"reward": 0.8152902007102967,
"reward_std": 0.13903563916683198,
"rewards/accuracy_reward/mean": 0.3154017835855484,
"rewards/accuracy_reward/std": 0.46241012811660764,
"rewards/format_reward/mean": 0.9997767806053162,
"rewards/format_reward/std": 0.006681530922651291,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.06026785714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.2,
"completions/mean_length": 839.6986938476563,
"completions/mean_terminated_length": 762.2915649414062,
"completions/min_length": 164.4,
"completions/min_terminated_length": 164.4,
"epoch": 2.1772151898734178,
"grad_norm": 0.11842920414034579,
"learning_rate": 6.97019101413533e-07,
"loss": 0.0172,
"num_tokens": 339137416.0,
"reward": 0.8003348588943482,
"reward_std": 0.14564830660820008,
"rewards/accuracy_reward/mean": 0.3004464268684387,
"rewards/accuracy_reward/std": 0.45833338499069215,
"rewards/format_reward/mean": 0.9997767806053162,
"rewards/format_reward/std": 0.006681530922651291,
"step": 355
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.056026785714285744,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.4,
"completions/mean_length": 822.5317260742188,
"completions/mean_terminated_length": 749.818017578125,
"completions/min_length": 198.6,
"completions/min_terminated_length": 198.6,
"epoch": 2.2077695329550413,
"grad_norm": 0.14592027381013908,
"learning_rate": 6.871266674466954e-07,
"loss": 0.02,
"num_tokens": 343966622.0,
"reward": 0.8177455782890319,
"reward_std": 0.15409113913774491,
"rewards/accuracy_reward/mean": 0.3180803656578064,
"rewards/accuracy_reward/std": 0.4620179235935211,
"rewards/format_reward/mean": 0.9993303537368774,
"rewards/format_reward/std": 0.01155981346964836,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05401785714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.8,
"completions/mean_length": 825.13955078125,
"completions/mean_terminated_length": 755.38876953125,
"completions/min_length": 184.2,
"completions/min_terminated_length": 184.2,
"epoch": 2.2383238760366653,
"grad_norm": 0.19441801433524555,
"learning_rate": 6.771483022815924e-07,
"loss": 0.0123,
"num_tokens": 348798887.0,
"reward": 0.8023437976837158,
"reward_std": 0.15522640347480773,
"rewards/accuracy_reward/mean": 0.3031250059604645,
"rewards/accuracy_reward/std": 0.4593860566616058,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.033385240286588666,
"step": 365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.6,
"completions/mean_length": 817.0913330078125,
"completions/mean_terminated_length": 754.5521484375,
"completions/min_length": 139.8,
"completions/min_terminated_length": 139.8,
"epoch": 2.268878219118289,
"grad_norm": 0.16264501124178576,
"learning_rate": 6.670885881238877e-07,
"loss": 0.017,
"num_tokens": 353677000.0,
"reward": 0.795089316368103,
"reward_std": 0.13972520977258682,
"rewards/accuracy_reward/mean": 0.2970982164144516,
"rewards/accuracy_reward/std": 0.45542468428611754,
"rewards/format_reward/mean": 0.9959821343421936,
"rewards/format_reward/std": 0.0598499983549118,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03303571428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.4,
"completions/mean_length": 799.5964721679687,
"completions/mean_terminated_length": 757.1193237304688,
"completions/min_length": 159.4,
"completions/min_terminated_length": 159.4,
"epoch": 2.2994325621999128,
"grad_norm": 0.15997251979228794,
"learning_rate": 6.569521445358463e-07,
"loss": 0.0128,
"num_tokens": 358446768.0,
"reward": 0.7751116514205932,
"reward_std": 0.14272799640893935,
"rewards/accuracy_reward/mean": 0.27589285671710967,
"rewards/accuracy_reward/std": 0.44521453976631165,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.035013025254011156,
"step": 375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04174107142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.2,
"completions/mean_length": 831.6589599609375,
"completions/mean_terminated_length": 778.9645385742188,
"completions/min_length": 189.8,
"completions/min_terminated_length": 189.8,
"epoch": 2.3299869052815363,
"grad_norm": 0.14573344935368374,
"learning_rate": 6.467436263149678e-07,
"loss": 0.0198,
"num_tokens": 363303208.0,
"reward": 0.8113839626312256,
"reward_std": 0.16406364738941193,
"rewards/accuracy_reward/mean": 0.31183035373687745,
"rewards/accuracy_reward/std": 0.4620197117328644,
"rewards/format_reward/mean": 0.9991071462631226,
"rewards/format_reward/std": 0.018887662887573244,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03571428571428574,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1976.4,
"completions/mean_length": 804.5574096679687,
"completions/mean_terminated_length": 758.7299438476563,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 2.3605412483631603,
"grad_norm": 0.17866901932170098,
"learning_rate": 6.364677213564364e-07,
"loss": 0.0241,
"num_tokens": 368039513.0,
"reward": 0.8130580663681031,
"reward_std": 0.14618841260671617,
"rewards/accuracy_reward/mean": 0.3140625,
"rewards/accuracy_reward/std": 0.46417089700698855,
"rewards/format_reward/mean": 0.9979910731315613,
"rewards/format_reward/std": 0.03434429243206978,
"step": 385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.041294642857142835,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.2,
"completions/mean_length": 818.0640991210937,
"completions/mean_terminated_length": 765.2714233398438,
"completions/min_length": 133.2,
"completions/min_terminated_length": 133.2,
"epoch": 2.391095591444784,
"grad_norm": 0.13515766805571083,
"learning_rate": 6.261291485003751e-07,
"loss": 0.0227,
"num_tokens": 372900728.0,
"reward": 0.7949777126312256,
"reward_std": 0.13654857128858566,
"rewards/accuracy_reward/mean": 0.2959821462631226,
"rewards/accuracy_reward/std": 0.45374839901924136,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.0392449900507927,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029910714285714256,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.2,
"completions/mean_length": 791.8616455078125,
"completions/mean_terminated_length": 753.0618286132812,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 2.4216499345264078,
"grad_norm": 0.12669388650912664,
"learning_rate": 6.157326553648862e-07,
"loss": 0.0227,
"num_tokens": 377597444.0,
"reward": 0.8180803894996643,
"reward_std": 0.13929658234119416,
"rewards/accuracy_reward/mean": 0.3187499940395355,
"rewards/accuracy_reward/std": 0.4652796983718872,
"rewards/format_reward/mean": 0.9986606955528259,
"rewards/format_reward/std": 0.036169955134391786,
"step": 395
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02946428571428572,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.4,
"completions/mean_length": 770.8859619140625,
"completions/mean_terminated_length": 732.1171142578125,
"completions/min_length": 176.6,
"completions/min_terminated_length": 176.6,
"epoch": 2.4522042776080313,
"grad_norm": 0.3432253754177521,
"learning_rate": 6.052830161658798e-07,
"loss": 0.0235,
"num_tokens": 382175093.0,
"reward": 0.8391741394996644,
"reward_std": 0.1441566601395607,
"rewards/accuracy_reward/mean": 0.33995535373687746,
"rewards/accuracy_reward/std": 0.4726693868637085,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670750379563,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0314732142857143,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.8,
"completions/mean_length": 760.5685668945313,
"completions/mean_terminated_length": 718.9107055664062,
"completions/min_length": 153.6,
"completions/min_terminated_length": 153.6,
"epoch": 2.4827586206896552,
"grad_norm": 0.26608602720822144,
"learning_rate": 5.947850295246859e-07,
"loss": 0.0236,
"num_tokens": 386739008.0,
"reward": 0.8093750357627869,
"reward_std": 0.14069317430257797,
"rewards/accuracy_reward/mean": 0.31071428656578065,
"rewards/accuracy_reward/std": 0.462552535533905,
"rewards/format_reward/mean": 0.9973214149475098,
"rewards/format_reward/std": 0.050804803520441054,
"step": 405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.028571428571428602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.6,
"completions/mean_length": 767.4560668945312,
"completions/mean_terminated_length": 729.8543823242187,
"completions/min_length": 157.2,
"completions/min_terminated_length": 157.2,
"epoch": 2.5133129637712788,
"grad_norm": 0.13536643376627677,
"learning_rate": 5.842435162644601e-07,
"loss": 0.0242,
"num_tokens": 391350947.0,
"reward": 0.7974330663681031,
"reward_std": 0.1469767302274704,
"rewards/accuracy_reward/mean": 0.29888392686843873,
"rewards/accuracy_reward/std": 0.457661384344101,
"rewards/format_reward/mean": 0.9970982074737549,
"rewards/format_reward/std": 0.0453034907579422,
"step": 410
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03415178571428572,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2007.4,
"completions/mean_length": 763.2268188476562,
"completions/mean_terminated_length": 718.07421875,
"completions/min_length": 188.6,
"completions/min_terminated_length": 188.6,
"epoch": 2.5438673068529027,
"grad_norm": 0.12279594307566204,
"learning_rate": 5.736633171963936e-07,
"loss": 0.0267,
"num_tokens": 395891987.0,
"reward": 0.8217634320259094,
"reward_std": 0.14362162351608276,
"rewards/accuracy_reward/mean": 0.32366071343421937,
"rewards/accuracy_reward/std": 0.46651501059532163,
"rewards/format_reward/mean": 0.9962053537368775,
"rewards/format_reward/std": 0.054164633899927137,
"step": 415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029241071428571443,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.6,
"completions/mean_length": 765.0089599609375,
"completions/mean_terminated_length": 726.3901245117188,
"completions/min_length": 174.4,
"completions/min_terminated_length": 174.4,
"epoch": 2.5744216499345263,
"grad_norm": 0.27027816621800804,
"learning_rate": 5.63049290896745e-07,
"loss": 0.0266,
"num_tokens": 400494083.0,
"reward": 0.7960937738418579,
"reward_std": 0.14977291822433472,
"rewards/accuracy_reward/mean": 0.29933035373687744,
"rewards/accuracy_reward/std": 0.45718762278556824,
"rewards/format_reward/mean": 0.9935267806053162,
"rewards/format_reward/std": 0.07908380329608918,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.028124999999999976,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.4,
"completions/mean_length": 763.7607543945312,
"completions/mean_terminated_length": 726.5771484375,
"completions/min_length": 190.6,
"completions/min_terminated_length": 190.6,
"epoch": 2.6049759930161502,
"grad_norm": 0.15731913329109534,
"learning_rate": 5.524063114757138e-07,
"loss": 0.0191,
"num_tokens": 405093259.0,
"reward": 0.8109375238418579,
"reward_std": 0.13588204383850097,
"rewards/accuracy_reward/mean": 0.3120535731315613,
"rewards/accuracy_reward/std": 0.46348544359207156,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.04559137225151062,
"step": 425
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02991071428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1996.6,
"completions/mean_length": 773.5725830078125,
"completions/mean_terminated_length": 734.3085571289063,
"completions/min_length": 151.2,
"completions/min_terminated_length": 151.2,
"epoch": 2.6355303360977738,
"grad_norm": 0.24913693661690342,
"learning_rate": 5.417392663391796e-07,
"loss": 0.019,
"num_tokens": 409737920.0,
"reward": 0.8024553775787353,
"reward_std": 0.15582817792892456,
"rewards/accuracy_reward/mean": 0.3037946462631226,
"rewards/accuracy_reward/std": 0.4587751805782318,
"rewards/format_reward/mean": 0.9973214268684387,
"rewards/format_reward/std": 0.051451122760772704,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02991071428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 767.83642578125,
"completions/mean_terminated_length": 728.5034790039062,
"completions/min_length": 202.8,
"completions/min_terminated_length": 202.8,
"epoch": 2.6660846791793977,
"grad_norm": 0.08662935166762037,
"learning_rate": 5.310530539443374e-07,
"loss": 0.0229,
"num_tokens": 414338979.0,
"reward": 0.8100446820259094,
"reward_std": 0.14469670355319977,
"rewards/accuracy_reward/mean": 0.31049107313156127,
"rewards/accuracy_reward/std": 0.46080448031425475,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.022806893289089202,
"step": 435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.034375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 798.643115234375,
"completions/mean_terminated_length": 754.474267578125,
"completions/min_length": 184.4,
"completions/min_terminated_length": 184.4,
"epoch": 2.6966390222610213,
"grad_norm": 0.11484531741935144,
"learning_rate": 5.203525815502573e-07,
"loss": 0.0127,
"num_tokens": 419133124.0,
"reward": 0.8082589626312255,
"reward_std": 0.14231041073799133,
"rewards/accuracy_reward/mean": 0.308928570151329,
"rewards/accuracy_reward/std": 0.45954376459121704,
"rewards/format_reward/mean": 0.9986606955528259,
"rewards/format_reward/std": 0.03616995587944984,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03549107142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 777.7386596679687,
"completions/mean_terminated_length": 730.9507080078125,
"completions/min_length": 160.6,
"completions/min_terminated_length": 160.6,
"epoch": 2.7271933653426452,
"grad_norm": 0.20967762583445285,
"learning_rate": 5.096427629644007e-07,
"loss": 0.0273,
"num_tokens": 423777425.0,
"reward": 0.7706473588943481,
"reward_std": 0.14438892751932145,
"rewards/accuracy_reward/mean": 0.2723214328289032,
"rewards/accuracy_reward/std": 0.44458450078964235,
"rewards/format_reward/mean": 0.9966517806053161,
"rewards/format_reward/std": 0.05691422298550606,
"step": 445
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.031026785714285722,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 755.1810546875,
"completions/mean_terminated_length": 713.7764892578125,
"completions/min_length": 190.4,
"completions/min_terminated_length": 190.4,
"epoch": 2.7577477084242688,
"grad_norm": 0.13264170340300163,
"learning_rate": 4.989285162861326e-07,
"loss": 0.0132,
"num_tokens": 428330684.0,
"reward": 0.8100446820259094,
"reward_std": 0.1402748256921768,
"rewards/accuracy_reward/mean": 0.3109374940395355,
"rewards/accuracy_reward/std": 0.46209981441497805,
"rewards/format_reward/mean": 0.9982142806053161,
"rewards/format_reward/std": 0.036147540807724,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023660714285714278,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.8,
"completions/mean_length": 756.9578491210938,
"completions/mean_terminated_length": 725.7147583007812,
"completions/min_length": 182.4,
"completions/min_terminated_length": 182.4,
"epoch": 2.7883020515058927,
"grad_norm": 0.12615728523984254,
"learning_rate": 4.882147616482602e-07,
"loss": 0.0187,
"num_tokens": 432811527.0,
"reward": 0.8729911088943482,
"reward_std": 0.15758154690265655,
"rewards/accuracy_reward/mean": 0.37366071343421936,
"rewards/accuracy_reward/std": 0.4820841133594513,
"rewards/format_reward/mean": 0.9986607074737549,
"rewards/format_reward/std": 0.02311962693929672,
"step": 455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025446428571428602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.4,
"completions/mean_length": 763.37080078125,
"completions/mean_terminated_length": 729.8010131835938,
"completions/min_length": 156.8,
"completions/min_terminated_length": 156.8,
"epoch": 2.8188563945875162,
"grad_norm": 0.09243192733056833,
"learning_rate": 4.775064189576381e-07,
"loss": 0.0217,
"num_tokens": 437391948.0,
"reward": 0.8172991394996643,
"reward_std": 0.1438477709889412,
"rewards/accuracy_reward/mean": 0.31852678656578065,
"rewards/accuracy_reward/std": 0.46524045467376707,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.048688821494579315,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03147321428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.6,
"completions/mean_length": 776.061865234375,
"completions/mean_terminated_length": 734.75556640625,
"completions/min_length": 164.8,
"completions/min_terminated_length": 164.8,
"epoch": 2.84941073766914,
"grad_norm": 0.11621958803651176,
"learning_rate": 4.6680840563587956e-07,
"loss": 0.0173,
"num_tokens": 442009553.0,
"reward": 0.8102678894996643,
"reward_std": 0.14555521458387374,
"rewards/accuracy_reward/mean": 0.3127232134342194,
"rewards/accuracy_reward/std": 0.46230775117874146,
"rewards/format_reward/mean": 0.9950892925262451,
"rewards/format_reward/std": 0.0665913127362728,
"step": 465
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.027008928571428604,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1973.2,
"completions/mean_length": 760.6007080078125,
"completions/mean_terminated_length": 724.8645874023438,
"completions/min_length": 172.8,
"completions/min_terminated_length": 172.8,
"epoch": 2.8799650807507637,
"grad_norm": 0.33049107612273193,
"learning_rate": 4.5612563436120607e-07,
"loss": 0.0188,
"num_tokens": 446620780.0,
"reward": 0.7725446701049805,
"reward_std": 0.14168490767478942,
"rewards/accuracy_reward/mean": 0.2736607164144516,
"rewards/accuracy_reward/std": 0.4457698047161102,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.042007289826869965,
"step": 470
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03370535714285716,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 783.5509399414062,
"completions/mean_terminated_length": 739.5219116210938,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 2.9105194238323877,
"grad_norm": 0.09416582998102499,
"learning_rate": 4.4546301081247594e-07,
"loss": 0.0226,
"num_tokens": 451283440.0,
"reward": 0.8034598588943481,
"reward_std": 0.14468722343444823,
"rewards/accuracy_reward/mean": 0.3042410671710968,
"rewards/accuracy_reward/std": 0.4593252778053284,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.033385240286588666,
"step": 475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.027901785714285744,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 777.1678955078125,
"completions/mean_terminated_length": 740.9552124023437,
"completions/min_length": 166.6,
"completions/min_terminated_length": 166.6,
"epoch": 2.9410737669140112,
"grad_norm": 263.3850760782866,
"learning_rate": 4.3482543141642943e-07,
"loss": 0.0169,
"num_tokens": 455913688.0,
"reward": 0.8248884439468384,
"reward_std": 0.15477718710899352,
"rewards/accuracy_reward/mean": 0.32589285969734194,
"rewards/accuracy_reward/std": 0.4662268817424774,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.03924498930573463,
"step": 480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.6,
"completions/mean_length": 764.9087280273437,
"completions/mean_terminated_length": 736.3053466796875,
"completions/min_length": 181.8,
"completions/min_terminated_length": 181.8,
"epoch": 2.971628109995635,
"grad_norm": 1.5752341559837937,
"learning_rate": 4.2421778109917884e-07,
"loss": 0.0109,
"num_tokens": 460508287.0,
"reward": 0.8045759320259094,
"reward_std": 0.13703190684318542,
"rewards/accuracy_reward/mean": 0.3051339328289032,
"rewards/accuracy_reward/std": 0.4602676570415497,
"rewards/format_reward/mean": 0.9988839149475097,
"rewards/format_reward/std": 0.029488424956798553,
"step": 485
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.024903711484593825,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.2,
"completions/mean_length": 755.0973266601562,
"completions/mean_terminated_length": 722.012353515625,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 3.0061108686163247,
"grad_norm": 6.337153138467977,
"learning_rate": 4.1364493104298214e-07,
"loss": 0.0238,
"num_tokens": 465080956.0,
"reward": 0.8160714507102966,
"reward_std": 0.15418992638587953,
"rewards/accuracy_reward/mean": 0.3171874940395355,
"rewards/accuracy_reward/std": 0.46480806469917296,
"rewards/format_reward/mean": 0.9977678418159485,
"rewards/format_reward/std": 0.04494505375623703,
"step": 490
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02544642857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 787.8612060546875,
"completions/mean_terminated_length": 754.84599609375,
"completions/min_length": 169.6,
"completions/min_terminated_length": 169.6,
"epoch": 3.0366652116979487,
"grad_norm": 0.10215304155909323,
"learning_rate": 4.0311173644932897e-07,
"loss": 0.0159,
"num_tokens": 469756310.0,
"reward": 0.8203125357627868,
"reward_std": 0.14979312121868132,
"rewards/accuracy_reward/mean": 0.32254464030265806,
"rewards/accuracy_reward/std": 0.46628392338752744,
"rewards/format_reward/mean": 0.9955357074737549,
"rewards/format_reward/std": 0.06204817742109299,
"step": 495
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02834821428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 774.084619140625,
"completions/mean_terminated_length": 736.901904296875,
"completions/min_length": 163.4,
"completions/min_terminated_length": 163.4,
"epoch": 3.067219554779572,
"grad_norm": 0.2386996397732377,
"learning_rate": 3.926230343093616e-07,
"loss": 0.0231,
"num_tokens": 474334081.0,
"reward": 0.8320312857627868,
"reward_std": 0.14526015818119048,
"rewards/accuracy_reward/mean": 0.3339285671710968,
"rewards/accuracy_reward/std": 0.4708992302417755,
"rewards/format_reward/mean": 0.9962053656578064,
"rewards/format_reward/std": 0.059290457516908646,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02410714285714286,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1961.6,
"completions/mean_length": 771.1132080078125,
"completions/mean_terminated_length": 739.582470703125,
"completions/min_length": 182.4,
"completions/min_terminated_length": 182.4,
"epoch": 3.097773897861196,
"grad_norm": 0.1862212865138003,
"learning_rate": 3.8218364118266194e-07,
"loss": 0.0185,
"num_tokens": 478977172.0,
"reward": 0.8248884201049804,
"reward_std": 0.14697628915309907,
"rewards/accuracy_reward/mean": 0.32589285969734194,
"rewards/accuracy_reward/std": 0.46748438477516174,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.04381053820252419,
"step": 505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.029241071428571463,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.4,
"completions/mean_length": 792.349365234375,
"completions/mean_terminated_length": 754.631298828125,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 3.1283282409428197,
"grad_norm": 0.48045188200682915,
"learning_rate": 3.717983509854198e-07,
"loss": 0.0181,
"num_tokens": 483702281.0,
"reward": 0.8324777126312256,
"reward_std": 0.14449880719184877,
"rewards/accuracy_reward/mean": 0.33348214626312256,
"rewards/accuracy_reward/std": 0.47003584504127505,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.038263522833585736,
"step": 510
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.027455357142857118,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.8,
"completions/mean_length": 775.73798828125,
"completions/mean_terminated_length": 739.96357421875,
"completions/min_length": 169.6,
"completions/min_terminated_length": 169.6,
"epoch": 3.1588825840244437,
"grad_norm": 0.3283120662028003,
"learning_rate": 3.614719327889978e-07,
"loss": 0.0186,
"num_tokens": 488337539.0,
"reward": 0.8064732313156128,
"reward_std": 0.14136622548103334,
"rewards/accuracy_reward/mean": 0.3080357193946838,
"rewards/accuracy_reward/std": 0.4613425314426422,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.04735285863280296,
"step": 515
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02544642857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.6,
"completions/mean_length": 774.1507080078125,
"completions/mean_terminated_length": 740.759423828125,
"completions/min_length": 188.6,
"completions/min_terminated_length": 188.6,
"epoch": 3.189436927106067,
"grad_norm": 0.11515983895440855,
"learning_rate": 3.5120912862990806e-07,
"loss": 0.0189,
"num_tokens": 492952430.0,
"reward": 0.8037946820259094,
"reward_std": 0.14940813481807708,
"rewards/accuracy_reward/mean": 0.3049107134342194,
"rewards/accuracy_reward/std": 0.4603531539440155,
"rewards/format_reward/mean": 0.9977678418159485,
"rewards/format_reward/std": 0.039829809218645096,
"step": 520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025892857142857117,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.2,
"completions/mean_length": 765.4111938476562,
"completions/mean_terminated_length": 731.3354614257812,
"completions/min_length": 190.8,
"completions/min_terminated_length": 190.8,
"epoch": 3.219991270187691,
"grad_norm": 0.12201743402421254,
"learning_rate": 3.4101465133220197e-07,
"loss": 0.0189,
"num_tokens": 497568728.0,
"reward": 0.8258928775787353,
"reward_std": 0.13175857067108154,
"rewards/accuracy_reward/mean": 0.32700892090797423,
"rewards/accuracy_reward/std": 0.4685250222682953,
"rewards/format_reward/mean": 0.9977678418159485,
"rewards/format_reward/std": 0.039829809218645096,
"step": 525
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025892857142857162,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 767.7826293945312,
"completions/mean_terminated_length": 733.800537109375,
"completions/min_length": 172.2,
"completions/min_terminated_length": 172.2,
"epoch": 3.2505456132693147,
"grad_norm": 0.11607450802707917,
"learning_rate": 3.308931823432744e-07,
"loss": 0.0187,
"num_tokens": 502187602.0,
"reward": 0.8228794932365417,
"reward_std": 0.1379667639732361,
"rewards/accuracy_reward/mean": 0.32366071343421937,
"rewards/accuracy_reward/std": 0.46647522449493406,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.029466009885072707,
"step": 530
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03236607142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.4,
"completions/mean_length": 801.5406616210937,
"completions/mean_terminated_length": 759.765966796875,
"completions/min_length": 207.8,
"completions/min_terminated_length": 207.8,
"epoch": 3.281099956350938,
"grad_norm": 0.09332659149457807,
"learning_rate": 3.2084936958407803e-07,
"loss": 0.0168,
"num_tokens": 506972984.0,
"reward": 0.8069196701049804,
"reward_std": 0.13857145309448243,
"rewards/accuracy_reward/mean": 0.30803571045398714,
"rewards/accuracy_reward/std": 0.45904147028923037,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.042007289826869965,
"step": 535
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.8,
"completions/mean_length": 747.7482543945313,
"completions/mean_terminated_length": 716.5390014648438,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 3.311654299432562,
"grad_norm": 0.209554112692953,
"learning_rate": 3.108878253147326e-07,
"loss": 0.0156,
"num_tokens": 511502112.0,
"reward": 0.8238839745521546,
"reward_std": 0.13446195125579835,
"rewards/accuracy_reward/mean": 0.3243303596973419,
"rewards/accuracy_reward/std": 0.46747735142707825,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.01824134439229965,
"step": 540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023660714285714257,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.4,
"completions/mean_length": 736.8674438476562,
"completions/mean_terminated_length": 705.1341552734375,
"completions/min_length": 182.8,
"completions/min_terminated_length": 182.8,
"epoch": 3.342208642514186,
"grad_norm": 0.24913453230303073,
"learning_rate": 3.0101312401650933e-07,
"loss": 0.0206,
"num_tokens": 515959454.0,
"reward": 0.8094866394996643,
"reward_std": 0.13945239633321763,
"rewards/accuracy_reward/mean": 0.3104910671710968,
"rewards/accuracy_reward/std": 0.4625352382659912,
"rewards/format_reward/mean": 0.9979910492897034,
"rewards/format_reward/std": 0.043164219707250595,
"step": 545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02343750000000002,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1984.6,
"completions/mean_length": 761.5877563476563,
"completions/mean_terminated_length": 730.7889282226563,
"completions/min_length": 207.6,
"completions/min_terminated_length": 207.6,
"epoch": 3.3727629855958097,
"grad_norm": 0.22025725568792806,
"learning_rate": 2.9122980029116584e-07,
"loss": 0.0257,
"num_tokens": 520559911.0,
"reward": 0.7988839626312256,
"reward_std": 0.15252991318702697,
"rewards/accuracy_reward/mean": 0.3006696403026581,
"rewards/accuracy_reward/std": 0.45508008599281313,
"rewards/format_reward/mean": 0.9964285612106323,
"rewards/format_reward/std": 0.05040616616606712,
"step": 550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02343749999999998,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.8,
"completions/mean_length": 748.8795043945313,
"completions/mean_terminated_length": 717.6960571289062,
"completions/min_length": 181.6,
"completions/min_terminated_length": 181.6,
"epoch": 3.403317328677433,
"grad_norm": 0.2061435391247104,
"learning_rate": 2.815423467785925e-07,
"loss": 0.0222,
"num_tokens": 525092251.0,
"reward": 0.7954241275787354,
"reward_std": 0.1349016010761261,
"rewards/accuracy_reward/mean": 0.2964285731315613,
"rewards/accuracy_reward/std": 0.4558440089225769,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.03771382719278336,
"step": 555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.022321428571428558,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1989.2,
"completions/mean_length": 743.7830688476563,
"completions/mean_terminated_length": 714.1416259765625,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 3.433871671759057,
"grad_norm": 0.15863539766743784,
"learning_rate": 2.7195521209372896e-07,
"loss": 0.0191,
"num_tokens": 529581375.0,
"reward": 0.7804687738418579,
"reward_std": 0.1402986854314804,
"rewards/accuracy_reward/mean": 0.2816964238882065,
"rewards/accuracy_reward/std": 0.4474053859710693,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04770735427737236,
"step": 560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1952.2,
"completions/mean_length": 749.480615234375,
"completions/mean_terminated_length": 716.3030883789063,
"completions/min_length": 164.4,
"completions/min_terminated_length": 164.4,
"epoch": 3.464426014840681,
"grad_norm": 0.14033902876933363,
"learning_rate": 2.624727987836991e-07,
"loss": 0.0166,
"num_tokens": 534045608.0,
"reward": 0.8416294932365418,
"reward_std": 0.14161382615566254,
"rewards/accuracy_reward/mean": 0.3424107074737549,
"rewards/accuracy_reward/std": 0.4728982329368591,
"rewards/format_reward/mean": 0.998437476158142,
"rewards/format_reward/std": 0.038285937160253525,
"step": 565
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03214285714285716,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1970.8,
"completions/mean_length": 760.5319580078125,
"completions/mean_terminated_length": 718.0567016601562,
"completions/min_length": 181.6,
"completions/min_terminated_length": 181.6,
"epoch": 3.4949803579223047,
"grad_norm": 0.2024122400994545,
"learning_rate": 2.530994613060965e-07,
"loss": 0.0213,
"num_tokens": 538587247.0,
"reward": 0.8157366394996644,
"reward_std": 0.1593634992837906,
"rewards/accuracy_reward/mean": 0.3174107164144516,
"rewards/accuracy_reward/std": 0.46349750757217406,
"rewards/format_reward/mean": 0.9966517806053161,
"rewards/format_reward/std": 0.05556555166840553,
"step": 570
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.6,
"completions/mean_length": 763.4875244140625,
"completions/mean_terminated_length": 732.6395629882812,
"completions/min_length": 177.4,
"completions/min_terminated_length": 177.4,
"epoch": 3.525534701003928,
"grad_norm": 0.1767704593051593,
"learning_rate": 2.43839504029359e-07,
"loss": 0.0134,
"num_tokens": 543252887.0,
"reward": 0.7833705902099609,
"reward_std": 0.1322135180234909,
"rewards/accuracy_reward/mean": 0.2837053596973419,
"rewards/accuracy_reward/std": 0.44775116443634033,
"rewards/format_reward/mean": 0.9993303418159485,
"rewards/format_reward/std": 0.020044592767953874,
"step": 575
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.024553571428571442,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 744.0576171875,
"completions/mean_terminated_length": 711.3238403320313,
"completions/min_length": 170.6,
"completions/min_terminated_length": 170.6,
"epoch": 3.556089044085552,
"grad_norm": 0.1430344151818995,
"learning_rate": 2.346971792561413e-07,
"loss": 0.0147,
"num_tokens": 547711049.0,
"reward": 0.8356027245521546,
"reward_std": 0.1501772254705429,
"rewards/accuracy_reward/mean": 0.3363839268684387,
"rewards/accuracy_reward/std": 0.4726757764816284,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.029466009885072707,
"step": 580
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02388392857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 737.341552734375,
"completions/mean_terminated_length": 705.2665893554688,
"completions/min_length": 154.8,
"completions/min_terminated_length": 154.8,
"epoch": 3.586643387167176,
"grad_norm": 0.1800009617633983,
"learning_rate": 2.2567668527059668e-07,
"loss": 0.02,
"num_tokens": 552121939.0,
"reward": 0.8487723588943481,
"reward_std": 0.1420737773180008,
"rewards/accuracy_reward/mean": 0.3504464328289032,
"rewards/accuracy_reward/std": 0.47479273080825807,
"rewards/format_reward/mean": 0.9966517806053161,
"rewards/format_reward/std": 0.0574639193713665,
"step": 585
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02633928571428572,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.2,
"completions/mean_length": 746.2768188476563,
"completions/mean_terminated_length": 711.2746704101562,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 3.6171977302487996,
"grad_norm": 0.23027106629756255,
"learning_rate": 2.1678216441046732e-07,
"loss": 0.0137,
"num_tokens": 556618195.0,
"reward": 0.8133929014205933,
"reward_std": 0.13093625605106354,
"rewards/accuracy_reward/mean": 0.3138392806053162,
"rewards/accuracy_reward/std": 0.4636102974414825,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.022806894034147263,
"step": 590
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023660714285714257,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 748.0799438476563,
"completions/mean_terminated_length": 716.4669311523437,
"completions/min_length": 155.8,
"completions/min_terminated_length": 155.8,
"epoch": 3.647752073330423,
"grad_norm": 0.11781060271968649,
"learning_rate": 2.0801770116486443e-07,
"loss": 0.0197,
"num_tokens": 561164697.0,
"reward": 0.8264509201049804,
"reward_std": 0.12667974680662156,
"rewards/accuracy_reward/mean": 0.3279017865657806,
"rewards/accuracy_reward/std": 0.467680948972702,
"rewards/format_reward/mean": 0.9970982193946838,
"rewards/format_reward/std": 0.05133358836174011,
"step": 595
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02120535714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.6,
"completions/mean_length": 755.0018188476563,
"completions/mean_terminated_length": 727.0325317382812,
"completions/min_length": 184.2,
"completions/min_terminated_length": 184.2,
"epoch": 3.678306416412047,
"grad_norm": 0.2652502656450688,
"learning_rate": 1.993873202986119e-07,
"loss": 0.0112,
"num_tokens": 565692097.0,
"reward": 0.8014509320259094,
"reward_std": 0.136177259683609,
"rewards/accuracy_reward/mean": 0.3020089268684387,
"rewards/accuracy_reward/std": 0.4574079930782318,
"rewards/format_reward/mean": 0.9988839268684387,
"rewards/format_reward/std": 0.02556919530034065,
"step": 600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.018526785714285697,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2026.6,
"completions/mean_length": 752.8002563476563,
"completions/mean_terminated_length": 728.3860717773438,
"completions/min_length": 196.8,
"completions/min_terminated_length": 196.8,
"epoch": 3.708860759493671,
"grad_norm": 0.24669083110778534,
"learning_rate": 1.9089498500401913e-07,
"loss": 0.0107,
"num_tokens": 570260762.0,
"reward": 0.7993303894996643,
"reward_std": 0.12696305960416793,
"rewards/accuracy_reward/mean": 0.3,
"rewards/accuracy_reward/std": 0.45755314230918886,
"rewards/format_reward/mean": 0.9986607074737549,
"rewards/format_reward/std": 0.02670370936393738,
"step": 605
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02388392857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.2,
"completions/mean_length": 772.6015869140625,
"completions/mean_terminated_length": 741.4518676757813,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 3.7394151025752946,
"grad_norm": 0.28021718948531754,
"learning_rate": 1.8254459508092768e-07,
"loss": 0.0209,
"num_tokens": 574897761.0,
"reward": 0.8345982432365417,
"reward_std": 0.15114397704601287,
"rewards/accuracy_reward/mean": 0.3352678596973419,
"rewards/accuracy_reward/std": 0.46947299838066103,
"rewards/format_reward/mean": 0.9986607074737549,
"rewards/format_reward/std": 0.02670370936393738,
"step": 610
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0265625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.4,
"completions/mean_length": 773.546240234375,
"completions/mean_terminated_length": 738.8490478515625,
"completions/min_length": 183.2,
"completions/min_terminated_length": 183.2,
"epoch": 3.769969445656918,
"grad_norm": 0.17390405788122348,
"learning_rate": 1.7433998514586628e-07,
"loss": 0.0239,
"num_tokens": 579587960.0,
"reward": 0.8064732551574707,
"reward_std": 0.14708727300167085,
"rewards/accuracy_reward/mean": 0.30758928656578066,
"rewards/accuracy_reward/std": 0.45995755195617677,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.04047612771391869,
"step": 615
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02901785714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.8,
"completions/mean_length": 783.2547241210938,
"completions/mean_terminated_length": 745.5877197265625,
"completions/min_length": 184.8,
"completions/min_terminated_length": 184.8,
"epoch": 3.800523788738542,
"grad_norm": 0.6746711244364784,
"learning_rate": 1.6628492287114294e-07,
"loss": 0.0233,
"num_tokens": 584280101.0,
"reward": 0.827120566368103,
"reward_std": 0.14461604207754136,
"rewards/accuracy_reward/mean": 0.3283482134342194,
"rewards/accuracy_reward/std": 0.46811321973800657,
"rewards/format_reward/mean": 0.9975446224212646,
"rewards/format_reward/std": 0.048042502254247665,
"step": 620
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02991071428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.6,
"completions/mean_length": 769.333740234375,
"completions/mean_terminated_length": 730.0556396484375,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 3.831078131820166,
"grad_norm": 0.1567474234940952,
"learning_rate": 1.583831072546764e-07,
"loss": 0.0184,
"num_tokens": 588911764.0,
"reward": 0.8108259320259095,
"reward_std": 0.15171462893486024,
"rewards/accuracy_reward/mean": 0.31160714030265807,
"rewards/accuracy_reward/std": 0.46260055899620056,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.035013026744127276,
"step": 625
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02767857142857142,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1990.6,
"completions/mean_length": 768.8522705078125,
"completions/mean_terminated_length": 732.5064575195313,
"completions/min_length": 192.6,
"completions/min_terminated_length": 192.6,
"epoch": 3.8616324749017896,
"grad_norm": 0.14954945676291354,
"learning_rate": 1.506381669213637e-07,
"loss": 0.0173,
"num_tokens": 593477238.0,
"reward": 0.8109375357627868,
"reward_std": 0.1530637711286545,
"rewards/accuracy_reward/mean": 0.3120535671710968,
"rewards/accuracy_reward/std": 0.46293908953666685,
"rewards/format_reward/mean": 0.9977678418159485,
"rewards/format_reward/std": 0.043046686053276065,
"step": 630
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.6,
"completions/mean_length": 762.0721435546875,
"completions/mean_terminated_length": 731.2215698242187,
"completions/min_length": 151.6,
"completions/min_terminated_length": 151.6,
"epoch": 3.892186817983413,
"grad_norm": 0.11924662572229765,
"learning_rate": 1.4305365845676438e-07,
"loss": 0.0179,
"num_tokens": 598017601.0,
"reward": 0.8376116514205932,
"reward_std": 0.13201817125082016,
"rewards/accuracy_reward/mean": 0.33861607909202573,
"rewards/accuracy_reward/std": 0.471518212556839,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.038263522833585736,
"step": 635
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.022767857142857117,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.6,
"completions/mean_length": 760.8882080078125,
"completions/mean_terminated_length": 730.9736572265625,
"completions/min_length": 196.8,
"completions/min_terminated_length": 196.8,
"epoch": 3.922741161065037,
"grad_norm": 0.2074198049774532,
"learning_rate": 1.3563306477386783e-07,
"loss": 0.007,
"num_tokens": 602579660.0,
"reward": 0.8518973588943481,
"reward_std": 0.12488884776830673,
"rewards/accuracy_reward/mean": 0.3522321403026581,
"rewards/accuracy_reward/std": 0.474759179353714,
"rewards/format_reward/mean": 0.9993303537368774,
"rewards/format_reward/std": 0.016125362366437912,
"step": 640
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02901785714285716,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.2,
"completions/mean_length": 759.3109619140625,
"completions/mean_terminated_length": 720.9727416992188,
"completions/min_length": 177.8,
"completions/min_terminated_length": 177.8,
"epoch": 3.953295504146661,
"grad_norm": 2.2174198065804362,
"learning_rate": 1.283797935136891e-07,
"loss": 0.0184,
"num_tokens": 607132141.0,
"reward": 0.8058036208152771,
"reward_std": 0.13838455379009246,
"rewards/accuracy_reward/mean": 0.3071428596973419,
"rewards/accuracy_reward/std": 0.4608741343021393,
"rewards/format_reward/mean": 0.9973214268684387,
"rewards/format_reward/std": 0.0459041066467762,
"step": 645
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02388392857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.2,
"completions/mean_length": 753.8201293945312,
"completions/mean_terminated_length": 722.1459716796875,
"completions/min_length": 167.8,
"completions/min_terminated_length": 167.8,
"epoch": 3.9838498472282846,
"grad_norm": 0.1409830685996524,
"learning_rate": 1.2129717548043238e-07,
"loss": 0.0152,
"num_tokens": 611680479.0,
"reward": 0.8150669813156128,
"reward_std": 0.13196606040000916,
"rewards/accuracy_reward/mean": 0.31651785373687746,
"rewards/accuracy_reward/std": 0.4651065766811371,
"rewards/format_reward/mean": 0.997098195552826,
"rewards/format_reward/std": 0.051389621943235396,
"step": 650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02373949579831931,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.6,
"completions/mean_length": 756.08515625,
"completions/mean_terminated_length": 724.6867309570313,
"completions/min_length": 188.2,
"completions/min_terminated_length": 188.2,
"epoch": 4.018332605848975,
"grad_norm": 0.3105897352521659,
"learning_rate": 1.1438846311194023e-07,
"loss": 0.0207,
"num_tokens": 616251243.0,
"reward": 0.8137277126312256,
"reward_std": 0.1312696397304535,
"rewards/accuracy_reward/mean": 0.31450892686843873,
"rewards/accuracy_reward/std": 0.4644446551799774,
"rewards/format_reward/mean": 0.998437476158142,
"rewards/format_reward/std": 0.038285937160253525,
"step": 655
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023214285714285743,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.6,
"completions/mean_length": 767.8951293945313,
"completions/mean_terminated_length": 737.55,
"completions/min_length": 172.6,
"completions/min_terminated_length": 172.6,
"epoch": 4.048886948930598,
"grad_norm": 0.29276735287237543,
"learning_rate": 1.0765682898612655e-07,
"loss": 0.0195,
"num_tokens": 620884765.0,
"reward": 0.8213170051574707,
"reward_std": 0.12525852173566818,
"rewards/accuracy_reward/mean": 0.32209821939468386,
"rewards/accuracy_reward/std": 0.4670514702796936,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.030447476357221604,
"step": 660
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023660714285714323,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.2,
"completions/mean_length": 766.2964599609375,
"completions/mean_terminated_length": 735.2522216796875,
"completions/min_length": 200.2,
"completions/min_terminated_length": 200.2,
"epoch": 4.079441292012222,
"grad_norm": 0.14633777787437463,
"learning_rate": 1.0110536436408535e-07,
"loss": 0.0144,
"num_tokens": 625454277.0,
"reward": 0.8141741514205932,
"reward_std": 0.13741124123334886,
"rewards/accuracy_reward/mean": 0.3149553596973419,
"rewards/accuracy_reward/std": 0.46363803148269656,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670750379563,
"step": 665
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02857142857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.8,
"completions/mean_length": 777.2511474609375,
"completions/mean_terminated_length": 739.8740356445312,
"completions/min_length": 179.6,
"completions/min_terminated_length": 179.6,
"epoch": 4.109995635093846,
"grad_norm": 0.37118116639410365,
"learning_rate": 9.473707777053969e-08,
"loss": 0.0138,
"num_tokens": 630043018.0,
"reward": 0.8378348469734191,
"reward_std": 0.15374741554260254,
"rewards/accuracy_reward/mean": 0.33906249403953553,
"rewards/accuracy_reward/std": 0.47094921469688417,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04770735502243042,
"step": 670
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02120535714285716,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.6,
"completions/mean_length": 765.1116455078125,
"completions/mean_terminated_length": 737.5109741210938,
"completions/min_length": 193.8,
"completions/min_terminated_length": 193.8,
"epoch": 4.1405499781754695,
"grad_norm": 0.1618993630861369,
"learning_rate": 8.855489361228496e-08,
"loss": 0.0165,
"num_tokens": 634606062.0,
"reward": 0.831026828289032,
"reward_std": 0.1420398995280266,
"rewards/accuracy_reward/mean": 0.33147321343421937,
"rewards/accuracy_reward/std": 0.46809123158454896,
"rewards/format_reward/mean": 0.9991071343421936,
"rewards/format_reward/std": 0.022806893289089202,
"step": 675
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.020535714285714303,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 760.178173828125,
"completions/mean_terminated_length": 733.2706909179688,
"completions/min_length": 183.6,
"completions/min_terminated_length": 183.6,
"epoch": 4.171104321257093,
"grad_norm": 0.4936046835467819,
"learning_rate": 8.256165083526018e-08,
"loss": 0.0136,
"num_tokens": 639173940.0,
"reward": 0.7970982670783997,
"reward_std": 0.12565719038248063,
"rewards/accuracy_reward/mean": 0.29776785969734193,
"rewards/accuracy_reward/std": 0.45710530281066897,
"rewards/format_reward/mean": 0.9986606955528259,
"rewards/format_reward/std": 0.03160440623760223,
"step": 680
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02700892857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.2,
"completions/mean_length": 796.4911010742187,
"completions/mean_terminated_length": 761.7802124023438,
"completions/min_length": 195.8,
"completions/min_terminated_length": 195.8,
"epoch": 4.201658664338717,
"grad_norm": 0.17040374073973114,
"learning_rate": 7.676010162086388e-08,
"loss": 0.0235,
"num_tokens": 643877580.0,
"reward": 0.8481027126312256,
"reward_std": 0.16478358209133148,
"rewards/accuracy_reward/mean": 0.34933035373687743,
"rewards/accuracy_reward/std": 0.47519075870513916,
"rewards/format_reward/mean": 0.9975446224212646,
"rewards/format_reward/std": 0.04651134014129639,
"step": 685
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.2,
"completions/mean_length": 815.3140991210937,
"completions/mean_terminated_length": 775.48994140625,
"completions/min_length": 209.4,
"completions/min_terminated_length": 209.4,
"epoch": 4.232213007420341,
"grad_norm": 0.30309775860764243,
"learning_rate": 7.115291012211383e-08,
"loss": 0.0206,
"num_tokens": 648727331.0,
"reward": 0.8197545051574707,
"reward_std": 0.15248821377754213,
"rewards/accuracy_reward/mean": 0.32165178656578064,
"rewards/accuracy_reward/std": 0.4659262537956238,
"rewards/format_reward/mean": 0.9962053656578064,
"rewards/format_reward/std": 0.06102558821439743,
"step": 690
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03526785714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.2,
"completions/mean_length": 800.2846313476563,
"completions/mean_terminated_length": 754.6343017578125,
"completions/min_length": 216.8,
"completions/min_terminated_length": 216.8,
"epoch": 4.2627673505019645,
"grad_norm": 0.12563167093092303,
"learning_rate": 6.574265124023053e-08,
"loss": 0.0202,
"num_tokens": 653502278.0,
"reward": 0.8223214626312256,
"reward_std": 0.1450999230146408,
"rewards/accuracy_reward/mean": 0.3238839268684387,
"rewards/accuracy_reward/std": 0.46725066304206847,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.048269759863615036,
"step": 695
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.4,
"completions/mean_length": 792.3102905273438,
"completions/mean_terminated_length": 753.8573852539063,
"completions/min_length": 209.6,
"completions/min_terminated_length": 209.6,
"epoch": 4.293321693583588,
"grad_norm": 1.4024635061933521,
"learning_rate": 6.053180944220627e-08,
"loss": 0.0198,
"num_tokens": 658181276.0,
"reward": 0.828683054447174,
"reward_std": 0.13339326530694962,
"rewards/accuracy_reward/mean": 0.3296875,
"rewards/accuracy_reward/std": 0.4695268332958221,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.03924498930573463,
"step": 700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02276785714285714,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.8,
"completions/mean_length": 785.6281616210938,
"completions/mean_terminated_length": 756.338330078125,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 4.323876036665212,
"grad_norm": 0.21545676412663686,
"learning_rate": 5.552277761990293e-08,
"loss": 0.0191,
"num_tokens": 662827890.0,
"reward": 0.8563616394996643,
"reward_std": 0.1465895101428032,
"rewards/accuracy_reward/mean": 0.35714285373687743,
"rewards/accuracy_reward/std": 0.47914679646492003,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670675873756,
"step": 705
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02812500000000002,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.4,
"completions/mean_length": 793.2120727539062,
"completions/mean_terminated_length": 756.9038940429688,
"completions/min_length": 185.8,
"completions/min_terminated_length": 185.8,
"epoch": 4.3544303797468356,
"grad_norm": 1.0833248237653017,
"learning_rate": 5.071785599120243e-08,
"loss": 0.0164,
"num_tokens": 667620048.0,
"reward": 0.8035714626312256,
"reward_std": 0.13308998942375183,
"rewards/accuracy_reward/mean": 0.3046875,
"rewards/accuracy_reward/std": 0.45963962078094484,
"rewards/format_reward/mean": 0.9977678537368775,
"rewards/format_reward/std": 0.04657283872365951,
"step": 710
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02812500000000002,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.8,
"completions/mean_length": 789.0960327148438,
"completions/mean_terminated_length": 752.6344848632813,
"completions/min_length": 207.8,
"completions/min_terminated_length": 207.8,
"epoch": 4.3849847228284595,
"grad_norm": 0.19747588935303667,
"learning_rate": 4.611925104371423e-08,
"loss": 0.022,
"num_tokens": 672327366.0,
"reward": 0.8189732670783997,
"reward_std": 0.1410384178161621,
"rewards/accuracy_reward/mean": 0.3198660671710968,
"rewards/accuracy_reward/std": 0.46489458084106444,
"rewards/format_reward/mean": 0.9982142806053161,
"rewards/format_reward/std": 0.04169455766677856,
"step": 715
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.028125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 785.8915649414063,
"completions/mean_terminated_length": 749.4353637695312,
"completions/min_length": 180.2,
"completions/min_terminated_length": 180.2,
"epoch": 4.415539065910083,
"grad_norm": 0.1439524328667155,
"learning_rate": 4.1729074521525187e-08,
"loss": 0.0202,
"num_tokens": 677006864.0,
"reward": 0.8311384201049805,
"reward_std": 0.14120545536279677,
"rewards/accuracy_reward/mean": 0.3321428596973419,
"rewards/accuracy_reward/std": 0.47006452083587646,
"rewards/format_reward/mean": 0.9979910731315613,
"rewards/format_reward/std": 0.033794596791267395,
"step": 720
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02053571428571428,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.2,
"completions/mean_length": 758.9214721679688,
"completions/mean_terminated_length": 731.9757080078125,
"completions/min_length": 185.4,
"completions/min_terminated_length": 185.4,
"epoch": 4.446093408991707,
"grad_norm": 1.9313250022212012,
"learning_rate": 3.754934245545721e-08,
"loss": 0.021,
"num_tokens": 681578592.0,
"reward": 0.8079241394996644,
"reward_std": 0.12895339727401733,
"rewards/accuracy_reward/mean": 0.3087053596973419,
"rewards/accuracy_reward/std": 0.4609071433544159,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670750379563,
"step": 725
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.024553571428571418,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 776.86923828125,
"completions/mean_terminated_length": 744.994580078125,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 4.4766477520733305,
"grad_norm": 1.3059812380413065,
"learning_rate": 3.35819742372771e-08,
"loss": 0.0181,
"num_tokens": 686250246.0,
"reward": 0.8127232551574707,
"reward_std": 0.13722361177206038,
"rewards/accuracy_reward/mean": 0.31428571939468386,
"rewards/accuracy_reward/std": 0.46391156911849973,
"rewards/format_reward/mean": 0.9968749761581421,
"rewards/format_reward/std": 0.05042858049273491,
"step": 730
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03080357142857144,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.2,
"completions/mean_length": 770.19267578125,
"completions/mean_terminated_length": 729.6285400390625,
"completions/min_length": 187.8,
"completions/min_terminated_length": 187.8,
"epoch": 4.5072020951549545,
"grad_norm": 0.182945009864938,
"learning_rate": 2.982879173828523e-08,
"loss": 0.0168,
"num_tokens": 690868445.0,
"reward": 0.7860491394996643,
"reward_std": 0.14061158150434494,
"rewards/accuracy_reward/mean": 0.28705357015132904,
"rewards/accuracy_reward/std": 0.4502267956733704,
"rewards/format_reward/mean": 0.9979910731315613,
"rewards/format_reward/std": 0.03890984132885933,
"step": 735
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02946428571428572,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.2,
"completions/mean_length": 804.9870849609375,
"completions/mean_terminated_length": 767.5385986328125,
"completions/min_length": 200.4,
"completions/min_terminated_length": 200.4,
"epoch": 4.537756438236578,
"grad_norm": 0.1838967471327345,
"learning_rate": 2.6291518472686402e-08,
"loss": 0.0169,
"num_tokens": 695725163.0,
"reward": 0.7940848708152771,
"reward_std": 0.13515909463167192,
"rewards/accuracy_reward/mean": 0.2957589328289032,
"rewards/accuracy_reward/std": 0.4556801378726959,
"rewards/format_reward/mean": 0.9966517925262451,
"rewards/format_reward/std": 0.055230402201414106,
"step": 740
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02700892857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.8,
"completions/mean_length": 784.9819580078125,
"completions/mean_terminated_length": 750.2052001953125,
"completions/min_length": 174.2,
"completions/min_terminated_length": 174.2,
"epoch": 4.568310781318202,
"grad_norm": 0.11383203893243991,
"learning_rate": 2.2971778806127993e-08,
"loss": 0.0163,
"num_tokens": 700342410.0,
"reward": 0.8392857670783996,
"reward_std": 0.14468344449996948,
"rewards/accuracy_reward/mean": 0.34040178656578063,
"rewards/accuracy_reward/std": 0.47302438616752623,
"rewards/format_reward/mean": 0.9977678418159485,
"rewards/format_reward/std": 0.04494505375623703,
"step": 745
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03214285714285716,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.8,
"completions/mean_length": 773.4073974609375,
"completions/mean_terminated_length": 731.1708618164063,
"completions/min_length": 196.4,
"completions/min_terminated_length": 196.4,
"epoch": 4.5988651243998255,
"grad_norm": 0.4400992481788288,
"learning_rate": 1.9871097209768372e-08,
"loss": 0.025,
"num_tokens": 705047211.0,
"reward": 0.785714304447174,
"reward_std": 0.13290437459945678,
"rewards/accuracy_reward/mean": 0.28727678656578065,
"rewards/accuracy_reward/std": 0.45220935344696045,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.04552400186657905,
"step": 750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.022767857142857163,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1956.6,
"completions/mean_length": 759.9413208007812,
"completions/mean_terminated_length": 729.9281372070312,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 4.6294194674814495,
"grad_norm": 0.2102766441971714,
"learning_rate": 1.6990897560218208e-08,
"loss": 0.0154,
"num_tokens": 709627828.0,
"reward": 0.8046875357627868,
"reward_std": 0.12229991853237152,
"rewards/accuracy_reward/mean": 0.3055803656578064,
"rewards/accuracy_reward/std": 0.4593049645423889,
"rewards/format_reward/mean": 0.9982142686843872,
"rewards/format_reward/std": 0.0364826887845993,
"step": 755
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03325892857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.8,
"completions/mean_length": 790.7350952148438,
"completions/mean_terminated_length": 747.6206909179688,
"completions/min_length": 201.8,
"completions/min_terminated_length": 201.8,
"epoch": 4.659973810563073,
"grad_norm": 0.38647401180951185,
"learning_rate": 1.4332502485676357e-08,
"loss": 0.0196,
"num_tokens": 714338657.0,
"reward": 0.8329241514205933,
"reward_std": 0.13567108660936356,
"rewards/accuracy_reward/mean": 0.33415178656578065,
"rewards/accuracy_reward/std": 0.46947482228279114,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04770735427737236,
"step": 760
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.024553571428571418,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.4,
"completions/mean_length": 773.5127563476562,
"completions/mean_terminated_length": 741.4698608398437,
"completions/min_length": 187.6,
"completions/min_terminated_length": 187.6,
"epoch": 4.6905281536446966,
"grad_norm": 2.805510653477693,
"learning_rate": 1.1897132758560468e-08,
"loss": 0.014,
"num_tokens": 718989634.0,
"reward": 0.794308066368103,
"reward_std": 0.13785322159528732,
"rewards/accuracy_reward/mean": 0.2950892895460129,
"rewards/accuracy_reward/std": 0.4552138805389404,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670675873756,
"step": 765
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0247767857142857,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.6,
"completions/mean_length": 772.9489135742188,
"completions/mean_terminated_length": 740.5083984375,
"completions/min_length": 180.6,
"completions/min_terminated_length": 180.6,
"epoch": 4.7210824967263205,
"grad_norm": 0.20187582947048766,
"learning_rate": 9.685906734910987e-09,
"loss": 0.0211,
"num_tokens": 723551525.0,
"reward": 0.8534598708152771,
"reward_std": 0.14746141731739043,
"rewards/accuracy_reward/mean": 0.3546875,
"rewards/accuracy_reward/std": 0.47859745621681216,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04770735502243042,
"step": 770
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023214285714285722,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 766.0482543945312,
"completions/mean_terminated_length": 735.5724365234375,
"completions/min_length": 208.2,
"completions/min_terminated_length": 208.2,
"epoch": 4.7516368398079445,
"grad_norm": 0.13176327437541996,
"learning_rate": 7.69983984082634e-09,
"loss": 0.0174,
"num_tokens": 728130133.0,
"reward": 0.8250000357627869,
"reward_std": 0.14912801682949067,
"rewards/accuracy_reward/mean": 0.3256696403026581,
"rewards/accuracy_reward/std": 0.4685256540775299,
"rewards/format_reward/mean": 0.9986607074737549,
"rewards/format_reward/std": 0.02670370936393738,
"step": 775
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01919642857142856,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.2,
"completions/mean_length": 763.625927734375,
"completions/mean_terminated_length": 738.4434936523437,
"completions/min_length": 188.8,
"completions/min_terminated_length": 188.8,
"epoch": 4.782191182889568,
"grad_norm": 0.2053305617912015,
"learning_rate": 5.9398441061652704e-09,
"loss": 0.0119,
"num_tokens": 732737825.0,
"reward": 0.8334821820259094,
"reward_std": 0.12687831670045852,
"rewards/accuracy_reward/mean": 0.33482142686843874,
"rewards/accuracy_reward/std": 0.47021366357803346,
"rewards/format_reward/mean": 0.9973214268684387,
"rewards/format_reward/std": 0.05145112201571465,
"step": 780
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02700892857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.4,
"completions/mean_length": 799.6297241210938,
"completions/mean_terminated_length": 764.9281372070312,
"completions/min_length": 214.4,
"completions/min_terminated_length": 214.4,
"epoch": 4.8127455259711915,
"grad_norm": 0.13203428184197374,
"learning_rate": 4.406727745729255e-09,
"loss": 0.0155,
"num_tokens": 737482614.0,
"reward": 0.8324777126312256,
"reward_std": 0.14140197932720183,
"rewards/accuracy_reward/mean": 0.33370535969734194,
"rewards/accuracy_reward/std": 0.47152380347251893,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04770735502243042,
"step": 785
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02857142857142858,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1993.2,
"completions/mean_length": 776.2779418945313,
"completions/mean_terminated_length": 738.7794067382813,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 4.8432998690528155,
"grad_norm": 0.14199527601143866,
"learning_rate": 3.1011947881195986e-09,
"loss": 0.0133,
"num_tokens": 742148267.0,
"reward": 0.8156250357627869,
"reward_std": 0.1406142756342888,
"rewards/accuracy_reward/mean": 0.3171875,
"rewards/accuracy_reward/std": 0.4645317316055298,
"rewards/format_reward/mean": 0.996874988079071,
"rewards/format_reward/std": 0.053170456737279895,
"step": 790
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.027232142857142837,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 771.7283813476563,
"completions/mean_terminated_length": 736.1706298828125,
"completions/min_length": 181.8,
"completions/min_terminated_length": 181.8,
"epoch": 4.8738542121344395,
"grad_norm": 0.13648261004688908,
"learning_rate": 2.0238447524372204e-09,
"loss": 0.0167,
"num_tokens": 746735922.0,
"reward": 0.830245566368103,
"reward_std": 0.14905019402503966,
"rewards/accuracy_reward/mean": 0.3314732193946838,
"rewards/accuracy_reward/std": 0.47030400633811953,
"rewards/format_reward/mean": 0.9975446343421936,
"rewards/format_reward/std": 0.04042520821094513,
"step": 795
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.023214285714285722,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.6,
"completions/mean_length": 751.7683349609375,
"completions/mean_terminated_length": 720.94951171875,
"completions/min_length": 198.8,
"completions/min_terminated_length": 198.8,
"epoch": 4.904408555216063,
"grad_norm": 3.313764079369233,
"learning_rate": 1.1751723729750973e-09,
"loss": 0.0215,
"num_tokens": 751241948.0,
"reward": 0.8477678894996643,
"reward_std": 0.13650742769241334,
"rewards/accuracy_reward/mean": 0.34866071343421934,
"rewards/accuracy_reward/std": 0.4754593074321747,
"rewards/format_reward/mean": 0.9982142806053161,
"rewards/format_reward/std": 0.031032297015190124,
"step": 800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02566964285714286,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.6,
"completions/mean_length": 756.6982543945312,
"completions/mean_terminated_length": 722.67041015625,
"completions/min_length": 185.2,
"completions/min_terminated_length": 185.2,
"epoch": 4.9349628982976865,
"grad_norm": 0.27923466768374233,
"learning_rate": 5.555673720292753e-10,
"loss": 0.0213,
"num_tokens": 755765500.0,
"reward": 0.8094866394996643,
"reward_std": 0.1245961531996727,
"rewards/accuracy_reward/mean": 0.31026785969734194,
"rewards/accuracy_reward/std": 0.4614250659942627,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03893225565552712,
"step": 805
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02254464285714286,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.2,
"completions/mean_length": 766.8745971679688,
"completions/mean_terminated_length": 737.5291625976563,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 4.9655172413793105,
"grad_norm": 0.9332087446281511,
"learning_rate": 1.653142809331376e-10,
"loss": 0.0159,
"num_tokens": 760334674.0,
"reward": 0.8467634201049805,
"reward_std": 0.12525381296873092,
"rewards/accuracy_reward/mean": 0.3477678596973419,
"rewards/accuracy_reward/std": 0.4751604437828064,
"rewards/format_reward/mean": 0.9979910612106323,
"rewards/format_reward/std": 0.038263522833585736,
"step": 810
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025442051820728274,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 765.5223388671875,
"completions/mean_terminated_length": 732.3697021484375,
"completions/min_length": 223.4,
"completions/min_terminated_length": 223.4,
"epoch": 4.9960715844609345,
"grad_norm": 0.34825447986098135,
"learning_rate": 4.592309396311833e-12,
"loss": 0.0152,
"num_tokens": 764911746.0,
"reward": 0.8251116514205933,
"reward_std": 0.13371331244707108,
"rewards/accuracy_reward/mean": 0.3258928656578064,
"rewards/accuracy_reward/std": 0.4686496198177338,
"rewards/format_reward/mean": 0.9984374880790711,
"rewards/format_reward/std": 0.03436670675873756,
"step": 815
},
{
"epoch": 4.9960715844609345,
"step": 815,
"total_flos": 0.0,
"train_loss": 0.023123394832746384,
"train_runtime": 153605.2159,
"train_samples_per_second": 0.597,
"train_steps_per_second": 0.005
}
],
"logging_steps": 5,
"max_steps": 815,
"num_input_tokens_seen": 764911746,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}