|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9936305732484076, |
|
"eval_steps": 500, |
|
"global_step": 39, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1328125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2012.25, |
|
"completions/mean_length": 1484.009765625, |
|
"completions/mean_terminated_length": 1397.5816650390625, |
|
"completions/min_length": 787.5, |
|
"completions/min_terminated_length": 787.5, |
|
"epoch": 0.025477707006369428, |
|
"grad_norm": 6733.567763172914, |
|
"kl": 0.6657562255859375, |
|
"learning_rate": 0.0, |
|
"loss": -0.0422, |
|
"num_tokens": 1021957.0, |
|
"reward": -0.77338707447052, |
|
"reward_std": 0.32955513894557953, |
|
"rewards/eps_simulator_reward/mean": -0.77338707447052, |
|
"rewards/eps_simulator_reward/std": 0.4255572780966759, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.119140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2004.5, |
|
"completions/mean_length": 1476.125, |
|
"completions/mean_terminated_length": 1398.6347961425781, |
|
"completions/min_length": 733.5, |
|
"completions/min_terminated_length": 733.5, |
|
"epoch": 0.050955414012738856, |
|
"grad_norm": 970.9524031746012, |
|
"kl": 0.0662841796875, |
|
"learning_rate": 2.5e-07, |
|
"loss": -0.0475, |
|
"num_tokens": 2039877.0, |
|
"reward": -0.791015625, |
|
"reward_std": 0.2963574752211571, |
|
"rewards/eps_simulator_reward/mean": -0.791015625, |
|
"rewards/eps_simulator_reward/std": 0.40781307965517044, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.138671875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2029.5, |
|
"completions/mean_length": 1510.779296875, |
|
"completions/mean_terminated_length": 1424.845703125, |
|
"completions/min_length": 813.75, |
|
"completions/min_terminated_length": 813.75, |
|
"epoch": 0.07643312101910828, |
|
"grad_norm": 1.343793674877022, |
|
"kl": 0.00213623046875, |
|
"learning_rate": 5e-07, |
|
"loss": -0.0653, |
|
"num_tokens": 3075540.0, |
|
"reward": -0.763671875, |
|
"reward_std": 0.3399305194616318, |
|
"rewards/eps_simulator_reward/mean": -0.763671875, |
|
"rewards/eps_simulator_reward/std": 0.42510994523763657, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.099609375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2025.25, |
|
"completions/mean_length": 1490.4921875, |
|
"completions/mean_terminated_length": 1429.5967407226562, |
|
"completions/min_length": 744.0, |
|
"completions/min_terminated_length": 744.0, |
|
"epoch": 0.10191082802547771, |
|
"grad_norm": 1.0331254646479913, |
|
"kl": 0.0016155242919921875, |
|
"learning_rate": 7.5e-07, |
|
"loss": -0.0522, |
|
"num_tokens": 4100816.0, |
|
"reward": -0.78515625, |
|
"reward_std": 0.3134715184569359, |
|
"rewards/eps_simulator_reward/mean": -0.78515625, |
|
"rewards/eps_simulator_reward/std": 0.406184121966362, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1015625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2011.5, |
|
"completions/mean_length": 1423.41015625, |
|
"completions/mean_terminated_length": 1352.7940368652344, |
|
"completions/min_length": 700.0, |
|
"completions/min_terminated_length": 700.0, |
|
"epoch": 0.12738853503184713, |
|
"grad_norm": 1.8236196385202714, |
|
"kl": 0.0016574859619140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0699, |
|
"num_tokens": 5091746.0, |
|
"reward": -0.79296875, |
|
"reward_std": 0.3344918265938759, |
|
"rewards/eps_simulator_reward/mean": -0.79296875, |
|
"rewards/eps_simulator_reward/std": 0.4023679941892624, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2018.0, |
|
"completions/mean_length": 1507.345703125, |
|
"completions/mean_terminated_length": 1418.7192993164062, |
|
"completions/min_length": 778.75, |
|
"completions/min_terminated_length": 778.75, |
|
"epoch": 0.15286624203821655, |
|
"grad_norm": 0.2847883117730281, |
|
"kl": 0.00554656982421875, |
|
"learning_rate": 9.981884322978574e-07, |
|
"loss": -0.0605, |
|
"num_tokens": 6125651.0, |
|
"reward": -0.751953125, |
|
"reward_std": 0.3284922167658806, |
|
"rewards/eps_simulator_reward/mean": -0.751953125, |
|
"rewards/eps_simulator_reward/std": 0.43277325481176376, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.158203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2009.25, |
|
"completions/mean_length": 1494.505859375, |
|
"completions/mean_terminated_length": 1390.4645080566406, |
|
"completions/min_length": 710.0, |
|
"completions/min_terminated_length": 710.0, |
|
"epoch": 0.17834394904458598, |
|
"grad_norm": 0.3559502825960311, |
|
"kl": 0.006683349609375, |
|
"learning_rate": 9.927683148693833e-07, |
|
"loss": -0.0737, |
|
"num_tokens": 7152982.0, |
|
"reward": -0.71875, |
|
"reward_std": 0.3783887177705765, |
|
"rewards/eps_simulator_reward/mean": -0.71875, |
|
"rewards/eps_simulator_reward/std": 0.4498228207230568, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1171875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2013.25, |
|
"completions/mean_length": 1505.7734375, |
|
"completions/mean_terminated_length": 1434.4289855957031, |
|
"completions/min_length": 730.5, |
|
"completions/min_terminated_length": 730.5, |
|
"epoch": 0.20382165605095542, |
|
"grad_norm": 0.6393420547069347, |
|
"kl": 0.0185089111328125, |
|
"learning_rate": 9.83783287313134e-07, |
|
"loss": -0.048, |
|
"num_tokens": 8186082.0, |
|
"reward": -0.75, |
|
"reward_std": 0.3828992694616318, |
|
"rewards/eps_simulator_reward/mean": -0.75, |
|
"rewards/eps_simulator_reward/std": 0.4343789964914322, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.146484375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2038.0, |
|
"completions/mean_length": 1525.02734375, |
|
"completions/mean_terminated_length": 1435.8611755371094, |
|
"completions/min_length": 713.75, |
|
"completions/min_terminated_length": 713.75, |
|
"epoch": 0.22929936305732485, |
|
"grad_norm": 0.9103220051184144, |
|
"kl": 0.03057861328125, |
|
"learning_rate": 9.713056917878816e-07, |
|
"loss": -0.0548, |
|
"num_tokens": 9229040.0, |
|
"reward": -0.650390625, |
|
"reward_std": 0.42321375012397766, |
|
"rewards/eps_simulator_reward/mean": -0.650390625, |
|
"rewards/eps_simulator_reward/std": 0.47667451947927475, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.119140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2043.0, |
|
"completions/mean_length": 1516.04296875, |
|
"completions/mean_terminated_length": 1444.0675048828125, |
|
"completions/min_length": 718.5, |
|
"completions/min_terminated_length": 718.5, |
|
"epoch": 0.25477707006369427, |
|
"grad_norm": 0.31002656427107383, |
|
"kl": 0.020538330078125, |
|
"learning_rate": 9.554359905560885e-07, |
|
"loss": -0.0549, |
|
"num_tokens": 10267398.0, |
|
"reward": -0.671875, |
|
"reward_std": 0.41061024367809296, |
|
"rewards/eps_simulator_reward/mean": -0.671875, |
|
"rewards/eps_simulator_reward/std": 0.4707574099302292, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.169921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.0, |
|
"completions/mean_length": 1524.63671875, |
|
"completions/mean_terminated_length": 1417.4291687011719, |
|
"completions/min_length": 737.5, |
|
"completions/min_terminated_length": 737.5, |
|
"epoch": 0.2802547770700637, |
|
"grad_norm": 0.3975698195821328, |
|
"kl": 0.020050048828125, |
|
"learning_rate": 9.363019571208397e-07, |
|
"loss": -0.0511, |
|
"num_tokens": 11310156.0, |
|
"reward": -0.6796875, |
|
"reward_std": 0.41205591708421707, |
|
"rewards/eps_simulator_reward/mean": -0.6796875, |
|
"rewards/eps_simulator_reward/std": 0.46482934057712555, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.12890625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2032.25, |
|
"completions/mean_length": 1533.3046875, |
|
"completions/mean_terminated_length": 1456.9059448242188, |
|
"completions/min_length": 763.0, |
|
"completions/min_terminated_length": 763.0, |
|
"epoch": 0.3057324840764331, |
|
"grad_norm": 1.0419265668973368, |
|
"kl": 0.024017333984375, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": -0.0551, |
|
"num_tokens": 12357352.0, |
|
"reward": -0.70703125, |
|
"reward_std": 0.4042434096336365, |
|
"rewards/eps_simulator_reward/mean": -0.70703125, |
|
"rewards/eps_simulator_reward/std": 0.4536997899413109, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.166015625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2028.0, |
|
"completions/mean_length": 1526.982421875, |
|
"completions/mean_terminated_length": 1423.71337890625, |
|
"completions/min_length": 637.0, |
|
"completions/min_terminated_length": 637.0, |
|
"epoch": 0.33121019108280253, |
|
"grad_norm": 0.3341045274795088, |
|
"kl": 0.020660400390625, |
|
"learning_rate": 8.88882159701625e-07, |
|
"loss": -0.0667, |
|
"num_tokens": 13401311.0, |
|
"reward": -0.724609375, |
|
"reward_std": 0.3498363718390465, |
|
"rewards/eps_simulator_reward/mean": -0.724609375, |
|
"rewards/eps_simulator_reward/std": 0.44695496559143066, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.158203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2024.25, |
|
"completions/mean_length": 1531.578125, |
|
"completions/mean_terminated_length": 1435.1131286621094, |
|
"completions/min_length": 718.5, |
|
"completions/min_terminated_length": 718.5, |
|
"epoch": 0.35668789808917195, |
|
"grad_norm": 0.3415663995876473, |
|
"kl": 0.02423095703125, |
|
"learning_rate": 8.609781920440891e-07, |
|
"loss": -0.052, |
|
"num_tokens": 14447623.0, |
|
"reward": -0.72265625, |
|
"reward_std": 0.3862012252211571, |
|
"rewards/eps_simulator_reward/mean": -0.72265625, |
|
"rewards/eps_simulator_reward/std": 0.4454372450709343, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15234375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2029.0, |
|
"completions/mean_length": 1516.60546875, |
|
"completions/mean_terminated_length": 1420.9369506835938, |
|
"completions/min_length": 754.75, |
|
"completions/min_terminated_length": 754.75, |
|
"epoch": 0.3821656050955414, |
|
"grad_norm": 0.40113945682093677, |
|
"kl": 0.026031494140625, |
|
"learning_rate": 8.305704108364301e-07, |
|
"loss": -0.0556, |
|
"num_tokens": 15486269.0, |
|
"reward": -0.673828125, |
|
"reward_std": 0.4069410637021065, |
|
"rewards/eps_simulator_reward/mean": -0.673828125, |
|
"rewards/eps_simulator_reward/std": 0.4652545005083084, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.158203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2034.75, |
|
"completions/mean_length": 1545.33984375, |
|
"completions/mean_terminated_length": 1451.2561340332031, |
|
"completions/min_length": 765.25, |
|
"completions/min_terminated_length": 765.25, |
|
"epoch": 0.40764331210191085, |
|
"grad_norm": 0.3980926201477192, |
|
"kl": 0.02734375, |
|
"learning_rate": 7.979036416534461e-07, |
|
"loss": -0.0547, |
|
"num_tokens": 16539627.0, |
|
"reward": -0.669921875, |
|
"reward_std": 0.39735905081033707, |
|
"rewards/eps_simulator_reward/mean": -0.669921875, |
|
"rewards/eps_simulator_reward/std": 0.4680754914879799, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.119140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2039.0, |
|
"completions/mean_length": 1541.90625, |
|
"completions/mean_terminated_length": 1473.3203430175781, |
|
"completions/min_length": 757.0, |
|
"completions/min_terminated_length": 757.0, |
|
"epoch": 0.43312101910828027, |
|
"grad_norm": 0.293619293173836, |
|
"kl": 0.027069091796875, |
|
"learning_rate": 7.632408981128493e-07, |
|
"loss": -0.0319, |
|
"num_tokens": 17591227.0, |
|
"reward": -0.681640625, |
|
"reward_std": 0.37931685149669647, |
|
"rewards/eps_simulator_reward/mean": -0.681640625, |
|
"rewards/eps_simulator_reward/std": 0.46698828786611557, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2016.75, |
|
"completions/mean_length": 1574.708984375, |
|
"completions/mean_terminated_length": 1471.5007019042969, |
|
"completions/min_length": 773.0, |
|
"completions/min_terminated_length": 773.0, |
|
"epoch": 0.4585987261146497, |
|
"grad_norm": 0.3758960483732288, |
|
"kl": 0.02618408203125, |
|
"learning_rate": 7.268612642442656e-07, |
|
"loss": -0.0438, |
|
"num_tokens": 18659622.0, |
|
"reward": -0.630859375, |
|
"reward_std": 0.42619186639785767, |
|
"rewards/eps_simulator_reward/mean": -0.630859375, |
|
"rewards/eps_simulator_reward/std": 0.48388052731752396, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2032.0, |
|
"completions/mean_length": 1560.658203125, |
|
"completions/mean_terminated_length": 1453.4781494140625, |
|
"completions/min_length": 833.25, |
|
"completions/min_terminated_length": 833.25, |
|
"epoch": 0.4840764331210191, |
|
"grad_norm": 0.2591766871237973, |
|
"kl": 0.02783203125, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": -0.0392, |
|
"num_tokens": 19720823.0, |
|
"reward": -0.611328125, |
|
"reward_std": 0.44004734605550766, |
|
"rewards/eps_simulator_reward/mean": -0.611328125, |
|
"rewards/eps_simulator_reward/std": 0.4889531210064888, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.166015625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2041.75, |
|
"completions/mean_length": 1587.4140625, |
|
"completions/mean_terminated_length": 1495.7480163574219, |
|
"completions/min_length": 803.5, |
|
"completions/min_terminated_length": 803.5, |
|
"epoch": 0.5095541401273885, |
|
"grad_norm": 0.24419145488172941, |
|
"kl": 0.02935791015625, |
|
"learning_rate": 6.501344202803414e-07, |
|
"loss": -0.0332, |
|
"num_tokens": 20795723.0, |
|
"reward": -0.548828125, |
|
"reward_std": 0.46469344943761826, |
|
"rewards/eps_simulator_reward/mean": -0.548828125, |
|
"rewards/eps_simulator_reward/std": 0.49816230684518814, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.18359375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2035.0, |
|
"completions/mean_length": 1579.974609375, |
|
"completions/mean_terminated_length": 1476.3168640136719, |
|
"completions/min_length": 772.25, |
|
"completions/min_terminated_length": 772.25, |
|
"epoch": 0.535031847133758, |
|
"grad_norm": 0.2887492844862219, |
|
"kl": 0.035125732421875, |
|
"learning_rate": 6.10404969617945e-07, |
|
"loss": -0.0473, |
|
"num_tokens": 21866814.0, |
|
"reward": -0.548828125, |
|
"reward_std": 0.46501730382442474, |
|
"rewards/eps_simulator_reward/mean": -0.548828125, |
|
"rewards/eps_simulator_reward/std": 0.49840257316827774, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2037.25, |
|
"completions/mean_length": 1596.095703125, |
|
"completions/mean_terminated_length": 1497.2264099121094, |
|
"completions/min_length": 760.0, |
|
"completions/min_terminated_length": 760.0, |
|
"epoch": 0.5605095541401274, |
|
"grad_norm": 0.4483927215331061, |
|
"kl": 0.0399169921875, |
|
"learning_rate": 5.701891736577317e-07, |
|
"loss": -0.0322, |
|
"num_tokens": 22946159.0, |
|
"reward": -0.51953125, |
|
"reward_std": 0.48454854637384415, |
|
"rewards/eps_simulator_reward/mean": -0.51953125, |
|
"rewards/eps_simulator_reward/std": 0.49943237751722336, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.18359375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2028.5, |
|
"completions/mean_length": 1587.29296875, |
|
"completions/mean_terminated_length": 1483.8641662597656, |
|
"completions/min_length": 828.25, |
|
"completions/min_terminated_length": 828.25, |
|
"epoch": 0.5859872611464968, |
|
"grad_norm": 0.35917761276782173, |
|
"kl": 0.0360107421875, |
|
"learning_rate": 5.298108263422685e-07, |
|
"loss": -0.0434, |
|
"num_tokens": 24020997.0, |
|
"reward": -0.50390625, |
|
"reward_std": 0.47762079536914825, |
|
"rewards/eps_simulator_reward/mean": -0.50390625, |
|
"rewards/eps_simulator_reward/std": 0.5010738596320152, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.212890625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2027.5, |
|
"completions/mean_length": 1582.2890625, |
|
"completions/mean_terminated_length": 1456.1675109863281, |
|
"completions/min_length": 776.75, |
|
"completions/min_terminated_length": 776.75, |
|
"epoch": 0.6114649681528662, |
|
"grad_norm": 0.317959150806763, |
|
"kl": 0.03668212890625, |
|
"learning_rate": 4.895950303820552e-07, |
|
"loss": -0.0486, |
|
"num_tokens": 25093273.0, |
|
"reward": -0.447265625, |
|
"reward_std": 0.46050675213336945, |
|
"rewards/eps_simulator_reward/mean": -0.447265625, |
|
"rewards/eps_simulator_reward/std": 0.4985574185848236, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.162109375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2028.75, |
|
"completions/mean_length": 1571.0625, |
|
"completions/mean_terminated_length": 1479.2693176269531, |
|
"completions/min_length": 790.75, |
|
"completions/min_terminated_length": 790.75, |
|
"epoch": 0.6369426751592356, |
|
"grad_norm": 0.3007766407739895, |
|
"kl": 0.03619384765625, |
|
"learning_rate": 4.4986557971965856e-07, |
|
"loss": -0.0381, |
|
"num_tokens": 26159801.0, |
|
"reward": -0.498046875, |
|
"reward_std": 0.43191099166870117, |
|
"rewards/eps_simulator_reward/mean": -0.498046875, |
|
"rewards/eps_simulator_reward/std": 0.4973808750510216, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1796875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.25, |
|
"completions/mean_length": 1576.947265625, |
|
"completions/mean_terminated_length": 1474.1740112304688, |
|
"completions/min_length": 756.25, |
|
"completions/min_terminated_length": 756.25, |
|
"epoch": 0.6624203821656051, |
|
"grad_norm": 0.320328638853622, |
|
"kl": 0.034881591796875, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": -0.0469, |
|
"num_tokens": 27229342.0, |
|
"reward": -0.490234375, |
|
"reward_std": 0.45237039774656296, |
|
"rewards/eps_simulator_reward/mean": -0.490234375, |
|
"rewards/eps_simulator_reward/std": 0.5005030706524849, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1640625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.75, |
|
"completions/mean_length": 1598.625, |
|
"completions/mean_terminated_length": 1510.354736328125, |
|
"completions/min_length": 738.0, |
|
"completions/min_terminated_length": 738.0, |
|
"epoch": 0.6878980891719745, |
|
"grad_norm": 0.35008352061308967, |
|
"kl": 0.03564453125, |
|
"learning_rate": 3.731387357557344e-07, |
|
"loss": -0.0477, |
|
"num_tokens": 28309982.0, |
|
"reward": -0.5546875, |
|
"reward_std": 0.42381805181503296, |
|
"rewards/eps_simulator_reward/mean": -0.5546875, |
|
"rewards/eps_simulator_reward/std": 0.4947791174054146, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.21484375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2022.5, |
|
"completions/mean_length": 1600.478515625, |
|
"completions/mean_terminated_length": 1478.0363159179688, |
|
"completions/min_length": 757.25, |
|
"completions/min_terminated_length": 757.25, |
|
"epoch": 0.7133757961783439, |
|
"grad_norm": 0.3095860436443937, |
|
"kl": 0.035736083984375, |
|
"learning_rate": 3.367591018871506e-07, |
|
"loss": -0.0547, |
|
"num_tokens": 29391571.0, |
|
"reward": -0.51171875, |
|
"reward_std": 0.47134073078632355, |
|
"rewards/eps_simulator_reward/mean": -0.51171875, |
|
"rewards/eps_simulator_reward/std": 0.5015044659376144, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.21484375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2027.5, |
|
"completions/mean_length": 1615.025390625, |
|
"completions/mean_terminated_length": 1497.0940856933594, |
|
"completions/min_length": 708.5, |
|
"completions/min_terminated_length": 708.5, |
|
"epoch": 0.7388535031847133, |
|
"grad_norm": 0.27660703860323266, |
|
"kl": 0.033172607421875, |
|
"learning_rate": 3.020963583465539e-07, |
|
"loss": -0.0513, |
|
"num_tokens": 30480608.0, |
|
"reward": -0.4765625, |
|
"reward_std": 0.48036184906959534, |
|
"rewards/eps_simulator_reward/mean": -0.4765625, |
|
"rewards/eps_simulator_reward/std": 0.49983665347099304, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.17578125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2027.75, |
|
"completions/mean_length": 1601.826171875, |
|
"completions/mean_terminated_length": 1506.8683166503906, |
|
"completions/min_length": 787.5, |
|
"completions/min_terminated_length": 787.5, |
|
"epoch": 0.7643312101910829, |
|
"grad_norm": 0.42748719862561546, |
|
"kl": 0.034423828125, |
|
"learning_rate": 2.6942958916356994e-07, |
|
"loss": -0.0487, |
|
"num_tokens": 31562887.0, |
|
"reward": -0.525390625, |
|
"reward_std": 0.49626730382442474, |
|
"rewards/eps_simulator_reward/mean": -0.525390625, |
|
"rewards/eps_simulator_reward/std": 0.4998583048582077, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.20703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2040.0, |
|
"completions/mean_length": 1603.60546875, |
|
"completions/mean_terminated_length": 1487.5808410644531, |
|
"completions/min_length": 779.75, |
|
"completions/min_terminated_length": 779.75, |
|
"epoch": 0.7898089171974523, |
|
"grad_norm": 0.26533857511488407, |
|
"kl": 0.03125, |
|
"learning_rate": 2.390218079559109e-07, |
|
"loss": -0.06, |
|
"num_tokens": 32646077.0, |
|
"reward": -0.5234375, |
|
"reward_std": 0.47222549468278885, |
|
"rewards/eps_simulator_reward/mean": -0.5234375, |
|
"rewards/eps_simulator_reward/std": 0.5008884444832802, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.20703125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2025.5, |
|
"completions/mean_length": 1610.716796875, |
|
"completions/mean_terminated_length": 1495.678955078125, |
|
"completions/min_length": 789.75, |
|
"completions/min_terminated_length": 789.75, |
|
"epoch": 0.8152866242038217, |
|
"grad_norm": 0.25691683516929065, |
|
"kl": 0.031829833984375, |
|
"learning_rate": 2.1111784029837509e-07, |
|
"loss": -0.0447, |
|
"num_tokens": 33732908.0, |
|
"reward": -0.515625, |
|
"reward_std": 0.45176610350608826, |
|
"rewards/eps_simulator_reward/mean": -0.515625, |
|
"rewards/eps_simulator_reward/std": 0.5000600069761276, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.203125, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2027.0, |
|
"completions/mean_length": 1604.3359375, |
|
"completions/mean_terminated_length": 1491.3028564453125, |
|
"completions/min_length": 795.25, |
|
"completions/min_terminated_length": 795.25, |
|
"epoch": 0.8407643312101911, |
|
"grad_norm": 0.6246804081121883, |
|
"kl": 0.041748046875, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": -0.0511, |
|
"num_tokens": 34816472.0, |
|
"reward": -0.517578125, |
|
"reward_std": 0.45204655081033707, |
|
"rewards/eps_simulator_reward/mean": -0.517578125, |
|
"rewards/eps_simulator_reward/std": 0.5011512860655785, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.244140625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2041.75, |
|
"completions/mean_length": 1642.564453125, |
|
"completions/mean_terminated_length": 1510.7035217285156, |
|
"completions/min_length": 781.75, |
|
"completions/min_terminated_length": 781.75, |
|
"epoch": 0.8662420382165605, |
|
"grad_norm": 0.2636823037232082, |
|
"kl": 0.03643798828125, |
|
"learning_rate": 1.6369804287916025e-07, |
|
"loss": -0.0411, |
|
"num_tokens": 35919609.0, |
|
"reward": -0.470703125, |
|
"reward_std": 0.45567234605550766, |
|
"rewards/eps_simulator_reward/mean": -0.470703125, |
|
"rewards/eps_simulator_reward/std": 0.4984956756234169, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.224609375, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2031.0, |
|
"completions/mean_length": 1605.212890625, |
|
"completions/mean_terminated_length": 1476.3855895996094, |
|
"completions/min_length": 839.25, |
|
"completions/min_terminated_length": 839.25, |
|
"epoch": 0.89171974522293, |
|
"grad_norm": 0.32334944678197935, |
|
"kl": 0.0355224609375, |
|
"learning_rate": 1.4456400944391144e-07, |
|
"loss": -0.0662, |
|
"num_tokens": 37003622.0, |
|
"reward": -0.458984375, |
|
"reward_std": 0.46678680181503296, |
|
"rewards/eps_simulator_reward/mean": -0.458984375, |
|
"rewards/eps_simulator_reward/std": 0.49930109083652496, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.19921875, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2010.0, |
|
"completions/mean_length": 1613.6328125, |
|
"completions/mean_terminated_length": 1504.9357604980469, |
|
"completions/min_length": 811.75, |
|
"completions/min_terminated_length": 811.75, |
|
"epoch": 0.9171974522292994, |
|
"grad_norm": 0.2540670404555639, |
|
"kl": 0.035858154296875, |
|
"learning_rate": 1.2869430821211826e-07, |
|
"loss": -0.0423, |
|
"num_tokens": 38091946.0, |
|
"reward": -0.5078125, |
|
"reward_std": 0.46562159806489944, |
|
"rewards/eps_simulator_reward/mean": -0.5078125, |
|
"rewards/eps_simulator_reward/std": 0.5015047490596771, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.212890625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2037.25, |
|
"completions/mean_length": 1638.091796875, |
|
"completions/mean_terminated_length": 1526.8819885253906, |
|
"completions/min_length": 769.5, |
|
"completions/min_terminated_length": 769.5, |
|
"epoch": 0.9426751592356688, |
|
"grad_norm": 0.6072482178147118, |
|
"kl": 0.0433349609375, |
|
"learning_rate": 1.1621671268686605e-07, |
|
"loss": -0.0309, |
|
"num_tokens": 39192793.0, |
|
"reward": -0.49609375, |
|
"reward_std": 0.45329853892326355, |
|
"rewards/eps_simulator_reward/mean": -0.49609375, |
|
"rewards/eps_simulator_reward/std": 0.5007057711482048, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.212890625, |
|
"completions/max_length": 2048.0, |
|
"completions/max_terminated_length": 2042.0, |
|
"completions/mean_length": 1634.078125, |
|
"completions/mean_terminated_length": 1521.7028198242188, |
|
"completions/min_length": 768.25, |
|
"completions/min_terminated_length": 768.25, |
|
"epoch": 0.9681528662420382, |
|
"grad_norm": 1.674101211279826, |
|
"kl": 0.0494384765625, |
|
"learning_rate": 1.0723168513061665e-07, |
|
"loss": -0.0404, |
|
"num_tokens": 40291585.0, |
|
"reward": -0.5, |
|
"reward_std": 0.46803878247737885, |
|
"rewards/eps_simulator_reward/mean": -0.5, |
|
"rewards/eps_simulator_reward/std": 0.49916671961545944, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 2032.0, |
|
"completions/max_terminated_length": 1812.75, |
|
"completions/mean_length": 1557.90625, |
|
"completions/mean_terminated_length": 1464.7232666015625, |
|
"completions/min_length": 1071.25, |
|
"completions/min_terminated_length": 1071.25, |
|
"epoch": 0.9936305732484076, |
|
"grad_norm": 0.2581163051289786, |
|
"kl": 0.0352783203125, |
|
"learning_rate": 1.0181156770214242e-07, |
|
"loss": -0.0572, |
|
"num_tokens": 41375402.0, |
|
"reward": -0.51171875, |
|
"reward_std": 0.47641219943761826, |
|
"rewards/eps_simulator_reward/mean": -0.51171875, |
|
"rewards/eps_simulator_reward/std": 0.5010442584753036, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.9936305732484076, |
|
"step": 39, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.8663, |
|
"train_samples_per_second": 2679.088, |
|
"train_steps_per_second": 20.897 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 39, |
|
"num_input_tokens_seen": 41375402, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|