DeepSeek-R1-Distill-Qwen-14B-GRPO / trainer_state.json
SoheylM's picture
Model save
f3a66ed verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9936305732484076,
"eval_steps": 500,
"global_step": 39,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.25,
"completions/mean_length": 1484.009765625,
"completions/mean_terminated_length": 1397.5816650390625,
"completions/min_length": 787.5,
"completions/min_terminated_length": 787.5,
"epoch": 0.025477707006369428,
"grad_norm": 6733.567763172914,
"kl": 0.6657562255859375,
"learning_rate": 0.0,
"loss": -0.0422,
"num_tokens": 1021957.0,
"reward": -0.77338707447052,
"reward_std": 0.32955513894557953,
"rewards/eps_simulator_reward/mean": -0.77338707447052,
"rewards/eps_simulator_reward/std": 0.4255572780966759,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.5,
"completions/mean_length": 1476.125,
"completions/mean_terminated_length": 1398.6347961425781,
"completions/min_length": 733.5,
"completions/min_terminated_length": 733.5,
"epoch": 0.050955414012738856,
"grad_norm": 970.9524031746012,
"kl": 0.0662841796875,
"learning_rate": 2.5e-07,
"loss": -0.0475,
"num_tokens": 2039877.0,
"reward": -0.791015625,
"reward_std": 0.2963574752211571,
"rewards/eps_simulator_reward/mean": -0.791015625,
"rewards/eps_simulator_reward/std": 0.40781307965517044,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.138671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.5,
"completions/mean_length": 1510.779296875,
"completions/mean_terminated_length": 1424.845703125,
"completions/min_length": 813.75,
"completions/min_terminated_length": 813.75,
"epoch": 0.07643312101910828,
"grad_norm": 1.343793674877022,
"kl": 0.00213623046875,
"learning_rate": 5e-07,
"loss": -0.0653,
"num_tokens": 3075540.0,
"reward": -0.763671875,
"reward_std": 0.3399305194616318,
"rewards/eps_simulator_reward/mean": -0.763671875,
"rewards/eps_simulator_reward/std": 0.42510994523763657,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.099609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.25,
"completions/mean_length": 1490.4921875,
"completions/mean_terminated_length": 1429.5967407226562,
"completions/min_length": 744.0,
"completions/min_terminated_length": 744.0,
"epoch": 0.10191082802547771,
"grad_norm": 1.0331254646479913,
"kl": 0.0016155242919921875,
"learning_rate": 7.5e-07,
"loss": -0.0522,
"num_tokens": 4100816.0,
"reward": -0.78515625,
"reward_std": 0.3134715184569359,
"rewards/eps_simulator_reward/mean": -0.78515625,
"rewards/eps_simulator_reward/std": 0.406184121966362,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.5,
"completions/mean_length": 1423.41015625,
"completions/mean_terminated_length": 1352.7940368652344,
"completions/min_length": 700.0,
"completions/min_terminated_length": 700.0,
"epoch": 0.12738853503184713,
"grad_norm": 1.8236196385202714,
"kl": 0.0016574859619140625,
"learning_rate": 1e-06,
"loss": -0.0699,
"num_tokens": 5091746.0,
"reward": -0.79296875,
"reward_std": 0.3344918265938759,
"rewards/eps_simulator_reward/mean": -0.79296875,
"rewards/eps_simulator_reward/std": 0.4023679941892624,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 1507.345703125,
"completions/mean_terminated_length": 1418.7192993164062,
"completions/min_length": 778.75,
"completions/min_terminated_length": 778.75,
"epoch": 0.15286624203821655,
"grad_norm": 0.2847883117730281,
"kl": 0.00554656982421875,
"learning_rate": 9.981884322978574e-07,
"loss": -0.0605,
"num_tokens": 6125651.0,
"reward": -0.751953125,
"reward_std": 0.3284922167658806,
"rewards/eps_simulator_reward/mean": -0.751953125,
"rewards/eps_simulator_reward/std": 0.43277325481176376,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.158203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.25,
"completions/mean_length": 1494.505859375,
"completions/mean_terminated_length": 1390.4645080566406,
"completions/min_length": 710.0,
"completions/min_terminated_length": 710.0,
"epoch": 0.17834394904458598,
"grad_norm": 0.3559502825960311,
"kl": 0.006683349609375,
"learning_rate": 9.927683148693833e-07,
"loss": -0.0737,
"num_tokens": 7152982.0,
"reward": -0.71875,
"reward_std": 0.3783887177705765,
"rewards/eps_simulator_reward/mean": -0.71875,
"rewards/eps_simulator_reward/std": 0.4498228207230568,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.25,
"completions/mean_length": 1505.7734375,
"completions/mean_terminated_length": 1434.4289855957031,
"completions/min_length": 730.5,
"completions/min_terminated_length": 730.5,
"epoch": 0.20382165605095542,
"grad_norm": 0.6393420547069347,
"kl": 0.0185089111328125,
"learning_rate": 9.83783287313134e-07,
"loss": -0.048,
"num_tokens": 8186082.0,
"reward": -0.75,
"reward_std": 0.3828992694616318,
"rewards/eps_simulator_reward/mean": -0.75,
"rewards/eps_simulator_reward/std": 0.4343789964914322,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 8
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.146484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 1525.02734375,
"completions/mean_terminated_length": 1435.8611755371094,
"completions/min_length": 713.75,
"completions/min_terminated_length": 713.75,
"epoch": 0.22929936305732485,
"grad_norm": 0.9103220051184144,
"kl": 0.03057861328125,
"learning_rate": 9.713056917878816e-07,
"loss": -0.0548,
"num_tokens": 9229040.0,
"reward": -0.650390625,
"reward_std": 0.42321375012397766,
"rewards/eps_simulator_reward/mean": -0.650390625,
"rewards/eps_simulator_reward/std": 0.47667451947927475,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1516.04296875,
"completions/mean_terminated_length": 1444.0675048828125,
"completions/min_length": 718.5,
"completions/min_terminated_length": 718.5,
"epoch": 0.25477707006369427,
"grad_norm": 0.31002656427107383,
"kl": 0.020538330078125,
"learning_rate": 9.554359905560885e-07,
"loss": -0.0549,
"num_tokens": 10267398.0,
"reward": -0.671875,
"reward_std": 0.41061024367809296,
"rewards/eps_simulator_reward/mean": -0.671875,
"rewards/eps_simulator_reward/std": 0.4707574099302292,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.169921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1524.63671875,
"completions/mean_terminated_length": 1417.4291687011719,
"completions/min_length": 737.5,
"completions/min_terminated_length": 737.5,
"epoch": 0.2802547770700637,
"grad_norm": 0.3975698195821328,
"kl": 0.020050048828125,
"learning_rate": 9.363019571208397e-07,
"loss": -0.0511,
"num_tokens": 11310156.0,
"reward": -0.6796875,
"reward_std": 0.41205591708421707,
"rewards/eps_simulator_reward/mean": -0.6796875,
"rewards/eps_simulator_reward/std": 0.46482934057712555,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.12890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.25,
"completions/mean_length": 1533.3046875,
"completions/mean_terminated_length": 1456.9059448242188,
"completions/min_length": 763.0,
"completions/min_terminated_length": 763.0,
"epoch": 0.3057324840764331,
"grad_norm": 1.0419265668973368,
"kl": 0.024017333984375,
"learning_rate": 9.140576474687263e-07,
"loss": -0.0551,
"num_tokens": 12357352.0,
"reward": -0.70703125,
"reward_std": 0.4042434096336365,
"rewards/eps_simulator_reward/mean": -0.70703125,
"rewards/eps_simulator_reward/std": 0.4536997899413109,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.166015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 1526.982421875,
"completions/mean_terminated_length": 1423.71337890625,
"completions/min_length": 637.0,
"completions/min_terminated_length": 637.0,
"epoch": 0.33121019108280253,
"grad_norm": 0.3341045274795088,
"kl": 0.020660400390625,
"learning_rate": 8.88882159701625e-07,
"loss": -0.0667,
"num_tokens": 13401311.0,
"reward": -0.724609375,
"reward_std": 0.3498363718390465,
"rewards/eps_simulator_reward/mean": -0.724609375,
"rewards/eps_simulator_reward/std": 0.44695496559143066,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.158203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.25,
"completions/mean_length": 1531.578125,
"completions/mean_terminated_length": 1435.1131286621094,
"completions/min_length": 718.5,
"completions/min_terminated_length": 718.5,
"epoch": 0.35668789808917195,
"grad_norm": 0.3415663995876473,
"kl": 0.02423095703125,
"learning_rate": 8.609781920440891e-07,
"loss": -0.052,
"num_tokens": 14447623.0,
"reward": -0.72265625,
"reward_std": 0.3862012252211571,
"rewards/eps_simulator_reward/mean": -0.72265625,
"rewards/eps_simulator_reward/std": 0.4454372450709343,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.15234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1516.60546875,
"completions/mean_terminated_length": 1420.9369506835938,
"completions/min_length": 754.75,
"completions/min_terminated_length": 754.75,
"epoch": 0.3821656050955414,
"grad_norm": 0.40113945682093677,
"kl": 0.026031494140625,
"learning_rate": 8.305704108364301e-07,
"loss": -0.0556,
"num_tokens": 15486269.0,
"reward": -0.673828125,
"reward_std": 0.4069410637021065,
"rewards/eps_simulator_reward/mean": -0.673828125,
"rewards/eps_simulator_reward/std": 0.4652545005083084,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.158203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.75,
"completions/mean_length": 1545.33984375,
"completions/mean_terminated_length": 1451.2561340332031,
"completions/min_length": 765.25,
"completions/min_terminated_length": 765.25,
"epoch": 0.40764331210191085,
"grad_norm": 0.3980926201477192,
"kl": 0.02734375,
"learning_rate": 7.979036416534461e-07,
"loss": -0.0547,
"num_tokens": 16539627.0,
"reward": -0.669921875,
"reward_std": 0.39735905081033707,
"rewards/eps_simulator_reward/mean": -0.669921875,
"rewards/eps_simulator_reward/std": 0.4680754914879799,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.119140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1541.90625,
"completions/mean_terminated_length": 1473.3203430175781,
"completions/min_length": 757.0,
"completions/min_terminated_length": 757.0,
"epoch": 0.43312101910828027,
"grad_norm": 0.293619293173836,
"kl": 0.027069091796875,
"learning_rate": 7.632408981128493e-07,
"loss": -0.0319,
"num_tokens": 17591227.0,
"reward": -0.681640625,
"reward_std": 0.37931685149669647,
"rewards/eps_simulator_reward/mean": -0.681640625,
"rewards/eps_simulator_reward/std": 0.46698828786611557,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.75,
"completions/mean_length": 1574.708984375,
"completions/mean_terminated_length": 1471.5007019042969,
"completions/min_length": 773.0,
"completions/min_terminated_length": 773.0,
"epoch": 0.4585987261146497,
"grad_norm": 0.3758960483732288,
"kl": 0.02618408203125,
"learning_rate": 7.268612642442656e-07,
"loss": -0.0438,
"num_tokens": 18659622.0,
"reward": -0.630859375,
"reward_std": 0.42619186639785767,
"rewards/eps_simulator_reward/mean": -0.630859375,
"rewards/eps_simulator_reward/std": 0.48388052731752396,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1560.658203125,
"completions/mean_terminated_length": 1453.4781494140625,
"completions/min_length": 833.25,
"completions/min_terminated_length": 833.25,
"epoch": 0.4840764331210191,
"grad_norm": 0.2591766871237973,
"kl": 0.02783203125,
"learning_rate": 6.890576474687263e-07,
"loss": -0.0392,
"num_tokens": 19720823.0,
"reward": -0.611328125,
"reward_std": 0.44004734605550766,
"rewards/eps_simulator_reward/mean": -0.611328125,
"rewards/eps_simulator_reward/std": 0.4889531210064888,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.166015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.75,
"completions/mean_length": 1587.4140625,
"completions/mean_terminated_length": 1495.7480163574219,
"completions/min_length": 803.5,
"completions/min_terminated_length": 803.5,
"epoch": 0.5095541401273885,
"grad_norm": 0.24419145488172941,
"kl": 0.02935791015625,
"learning_rate": 6.501344202803414e-07,
"loss": -0.0332,
"num_tokens": 20795723.0,
"reward": -0.548828125,
"reward_std": 0.46469344943761826,
"rewards/eps_simulator_reward/mean": -0.548828125,
"rewards/eps_simulator_reward/std": 0.49816230684518814,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.18359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1579.974609375,
"completions/mean_terminated_length": 1476.3168640136719,
"completions/min_length": 772.25,
"completions/min_terminated_length": 772.25,
"epoch": 0.535031847133758,
"grad_norm": 0.2887492844862219,
"kl": 0.035125732421875,
"learning_rate": 6.10404969617945e-07,
"loss": -0.0473,
"num_tokens": 21866814.0,
"reward": -0.548828125,
"reward_std": 0.46501730382442474,
"rewards/eps_simulator_reward/mean": -0.548828125,
"rewards/eps_simulator_reward/std": 0.49840257316827774,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.25,
"completions/mean_length": 1596.095703125,
"completions/mean_terminated_length": 1497.2264099121094,
"completions/min_length": 760.0,
"completions/min_terminated_length": 760.0,
"epoch": 0.5605095541401274,
"grad_norm": 0.4483927215331061,
"kl": 0.0399169921875,
"learning_rate": 5.701891736577317e-07,
"loss": -0.0322,
"num_tokens": 22946159.0,
"reward": -0.51953125,
"reward_std": 0.48454854637384415,
"rewards/eps_simulator_reward/mean": -0.51953125,
"rewards/eps_simulator_reward/std": 0.49943237751722336,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.18359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.5,
"completions/mean_length": 1587.29296875,
"completions/mean_terminated_length": 1483.8641662597656,
"completions/min_length": 828.25,
"completions/min_terminated_length": 828.25,
"epoch": 0.5859872611464968,
"grad_norm": 0.35917761276782173,
"kl": 0.0360107421875,
"learning_rate": 5.298108263422685e-07,
"loss": -0.0434,
"num_tokens": 24020997.0,
"reward": -0.50390625,
"reward_std": 0.47762079536914825,
"rewards/eps_simulator_reward/mean": -0.50390625,
"rewards/eps_simulator_reward/std": 0.5010738596320152,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.212890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.5,
"completions/mean_length": 1582.2890625,
"completions/mean_terminated_length": 1456.1675109863281,
"completions/min_length": 776.75,
"completions/min_terminated_length": 776.75,
"epoch": 0.6114649681528662,
"grad_norm": 0.317959150806763,
"kl": 0.03668212890625,
"learning_rate": 4.895950303820552e-07,
"loss": -0.0486,
"num_tokens": 25093273.0,
"reward": -0.447265625,
"reward_std": 0.46050675213336945,
"rewards/eps_simulator_reward/mean": -0.447265625,
"rewards/eps_simulator_reward/std": 0.4985574185848236,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.162109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.75,
"completions/mean_length": 1571.0625,
"completions/mean_terminated_length": 1479.2693176269531,
"completions/min_length": 790.75,
"completions/min_terminated_length": 790.75,
"epoch": 0.6369426751592356,
"grad_norm": 0.3007766407739895,
"kl": 0.03619384765625,
"learning_rate": 4.4986557971965856e-07,
"loss": -0.0381,
"num_tokens": 26159801.0,
"reward": -0.498046875,
"reward_std": 0.43191099166870117,
"rewards/eps_simulator_reward/mean": -0.498046875,
"rewards/eps_simulator_reward/std": 0.4973808750510216,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.25,
"completions/mean_length": 1576.947265625,
"completions/mean_terminated_length": 1474.1740112304688,
"completions/min_length": 756.25,
"completions/min_terminated_length": 756.25,
"epoch": 0.6624203821656051,
"grad_norm": 0.320328638853622,
"kl": 0.034881591796875,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0469,
"num_tokens": 27229342.0,
"reward": -0.490234375,
"reward_std": 0.45237039774656296,
"rewards/eps_simulator_reward/mean": -0.490234375,
"rewards/eps_simulator_reward/std": 0.5005030706524849,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.1640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.75,
"completions/mean_length": 1598.625,
"completions/mean_terminated_length": 1510.354736328125,
"completions/min_length": 738.0,
"completions/min_terminated_length": 738.0,
"epoch": 0.6878980891719745,
"grad_norm": 0.35008352061308967,
"kl": 0.03564453125,
"learning_rate": 3.731387357557344e-07,
"loss": -0.0477,
"num_tokens": 28309982.0,
"reward": -0.5546875,
"reward_std": 0.42381805181503296,
"rewards/eps_simulator_reward/mean": -0.5546875,
"rewards/eps_simulator_reward/std": 0.4947791174054146,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.21484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.5,
"completions/mean_length": 1600.478515625,
"completions/mean_terminated_length": 1478.0363159179688,
"completions/min_length": 757.25,
"completions/min_terminated_length": 757.25,
"epoch": 0.7133757961783439,
"grad_norm": 0.3095860436443937,
"kl": 0.035736083984375,
"learning_rate": 3.367591018871506e-07,
"loss": -0.0547,
"num_tokens": 29391571.0,
"reward": -0.51171875,
"reward_std": 0.47134073078632355,
"rewards/eps_simulator_reward/mean": -0.51171875,
"rewards/eps_simulator_reward/std": 0.5015044659376144,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.21484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.5,
"completions/mean_length": 1615.025390625,
"completions/mean_terminated_length": 1497.0940856933594,
"completions/min_length": 708.5,
"completions/min_terminated_length": 708.5,
"epoch": 0.7388535031847133,
"grad_norm": 0.27660703860323266,
"kl": 0.033172607421875,
"learning_rate": 3.020963583465539e-07,
"loss": -0.0513,
"num_tokens": 30480608.0,
"reward": -0.4765625,
"reward_std": 0.48036184906959534,
"rewards/eps_simulator_reward/mean": -0.4765625,
"rewards/eps_simulator_reward/std": 0.49983665347099304,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.17578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.75,
"completions/mean_length": 1601.826171875,
"completions/mean_terminated_length": 1506.8683166503906,
"completions/min_length": 787.5,
"completions/min_terminated_length": 787.5,
"epoch": 0.7643312101910829,
"grad_norm": 0.42748719862561546,
"kl": 0.034423828125,
"learning_rate": 2.6942958916356994e-07,
"loss": -0.0487,
"num_tokens": 31562887.0,
"reward": -0.525390625,
"reward_std": 0.49626730382442474,
"rewards/eps_simulator_reward/mean": -0.525390625,
"rewards/eps_simulator_reward/std": 0.4998583048582077,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.20703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1603.60546875,
"completions/mean_terminated_length": 1487.5808410644531,
"completions/min_length": 779.75,
"completions/min_terminated_length": 779.75,
"epoch": 0.7898089171974523,
"grad_norm": 0.26533857511488407,
"kl": 0.03125,
"learning_rate": 2.390218079559109e-07,
"loss": -0.06,
"num_tokens": 32646077.0,
"reward": -0.5234375,
"reward_std": 0.47222549468278885,
"rewards/eps_simulator_reward/mean": -0.5234375,
"rewards/eps_simulator_reward/std": 0.5008884444832802,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.20703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.5,
"completions/mean_length": 1610.716796875,
"completions/mean_terminated_length": 1495.678955078125,
"completions/min_length": 789.75,
"completions/min_terminated_length": 789.75,
"epoch": 0.8152866242038217,
"grad_norm": 0.25691683516929065,
"kl": 0.031829833984375,
"learning_rate": 2.1111784029837509e-07,
"loss": -0.0447,
"num_tokens": 33732908.0,
"reward": -0.515625,
"reward_std": 0.45176610350608826,
"rewards/eps_simulator_reward/mean": -0.515625,
"rewards/eps_simulator_reward/std": 0.5000600069761276,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 1604.3359375,
"completions/mean_terminated_length": 1491.3028564453125,
"completions/min_length": 795.25,
"completions/min_terminated_length": 795.25,
"epoch": 0.8407643312101911,
"grad_norm": 0.6246804081121883,
"kl": 0.041748046875,
"learning_rate": 1.8594235253127372e-07,
"loss": -0.0511,
"num_tokens": 34816472.0,
"reward": -0.517578125,
"reward_std": 0.45204655081033707,
"rewards/eps_simulator_reward/mean": -0.517578125,
"rewards/eps_simulator_reward/std": 0.5011512860655785,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.244140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.75,
"completions/mean_length": 1642.564453125,
"completions/mean_terminated_length": 1510.7035217285156,
"completions/min_length": 781.75,
"completions/min_terminated_length": 781.75,
"epoch": 0.8662420382165605,
"grad_norm": 0.2636823037232082,
"kl": 0.03643798828125,
"learning_rate": 1.6369804287916025e-07,
"loss": -0.0411,
"num_tokens": 35919609.0,
"reward": -0.470703125,
"reward_std": 0.45567234605550766,
"rewards/eps_simulator_reward/mean": -0.470703125,
"rewards/eps_simulator_reward/std": 0.4984956756234169,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.224609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 1605.212890625,
"completions/mean_terminated_length": 1476.3855895996094,
"completions/min_length": 839.25,
"completions/min_terminated_length": 839.25,
"epoch": 0.89171974522293,
"grad_norm": 0.32334944678197935,
"kl": 0.0355224609375,
"learning_rate": 1.4456400944391144e-07,
"loss": -0.0662,
"num_tokens": 37003622.0,
"reward": -0.458984375,
"reward_std": 0.46678680181503296,
"rewards/eps_simulator_reward/mean": -0.458984375,
"rewards/eps_simulator_reward/std": 0.49930109083652496,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.19921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1613.6328125,
"completions/mean_terminated_length": 1504.9357604980469,
"completions/min_length": 811.75,
"completions/min_terminated_length": 811.75,
"epoch": 0.9171974522292994,
"grad_norm": 0.2540670404555639,
"kl": 0.035858154296875,
"learning_rate": 1.2869430821211826e-07,
"loss": -0.0423,
"num_tokens": 38091946.0,
"reward": -0.5078125,
"reward_std": 0.46562159806489944,
"rewards/eps_simulator_reward/mean": -0.5078125,
"rewards/eps_simulator_reward/std": 0.5015047490596771,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.212890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.25,
"completions/mean_length": 1638.091796875,
"completions/mean_terminated_length": 1526.8819885253906,
"completions/min_length": 769.5,
"completions/min_terminated_length": 769.5,
"epoch": 0.9426751592356688,
"grad_norm": 0.6072482178147118,
"kl": 0.0433349609375,
"learning_rate": 1.1621671268686605e-07,
"loss": -0.0309,
"num_tokens": 39192793.0,
"reward": -0.49609375,
"reward_std": 0.45329853892326355,
"rewards/eps_simulator_reward/mean": -0.49609375,
"rewards/eps_simulator_reward/std": 0.5007057711482048,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.212890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1634.078125,
"completions/mean_terminated_length": 1521.7028198242188,
"completions/min_length": 768.25,
"completions/min_terminated_length": 768.25,
"epoch": 0.9681528662420382,
"grad_norm": 1.674101211279826,
"kl": 0.0494384765625,
"learning_rate": 1.0723168513061665e-07,
"loss": -0.0404,
"num_tokens": 40291585.0,
"reward": -0.5,
"reward_std": 0.46803878247737885,
"rewards/eps_simulator_reward/mean": -0.5,
"rewards/eps_simulator_reward/std": 0.49916671961545944,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2032.0,
"completions/max_terminated_length": 1812.75,
"completions/mean_length": 1557.90625,
"completions/mean_terminated_length": 1464.7232666015625,
"completions/min_length": 1071.25,
"completions/min_terminated_length": 1071.25,
"epoch": 0.9936305732484076,
"grad_norm": 0.2581163051289786,
"kl": 0.0352783203125,
"learning_rate": 1.0181156770214242e-07,
"loss": -0.0572,
"num_tokens": 41375402.0,
"reward": -0.51171875,
"reward_std": 0.47641219943761826,
"rewards/eps_simulator_reward/mean": -0.51171875,
"rewards/eps_simulator_reward/std": 0.5010442584753036,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 39
},
{
"epoch": 0.9936305732484076,
"step": 39,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 1.8663,
"train_samples_per_second": 2679.088,
"train_steps_per_second": 20.897
}
],
"logging_steps": 1,
"max_steps": 39,
"num_input_tokens_seen": 41375402,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}