{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9936305732484076, "eval_steps": 500, "global_step": 39, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.25, "completions/mean_length": 1484.009765625, "completions/mean_terminated_length": 1397.5816650390625, "completions/min_length": 787.5, "completions/min_terminated_length": 787.5, "epoch": 0.025477707006369428, "grad_norm": 6733.567763172914, "kl": 0.6657562255859375, "learning_rate": 0.0, "loss": -0.0422, "num_tokens": 1021957.0, "reward": -0.77338707447052, "reward_std": 0.32955513894557953, "rewards/eps_simulator_reward/mean": -0.77338707447052, "rewards/eps_simulator_reward/std": 0.4255572780966759, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.5, "completions/mean_length": 1476.125, "completions/mean_terminated_length": 1398.6347961425781, "completions/min_length": 733.5, "completions/min_terminated_length": 733.5, "epoch": 0.050955414012738856, "grad_norm": 970.9524031746012, "kl": 0.0662841796875, "learning_rate": 2.5e-07, "loss": -0.0475, "num_tokens": 2039877.0, "reward": -0.791015625, "reward_std": 0.2963574752211571, "rewards/eps_simulator_reward/mean": -0.791015625, "rewards/eps_simulator_reward/std": 0.40781307965517044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.5, "completions/mean_length": 1510.779296875, "completions/mean_terminated_length": 1424.845703125, "completions/min_length": 813.75, "completions/min_terminated_length": 813.75, "epoch": 0.07643312101910828, "grad_norm": 1.343793674877022, "kl": 0.00213623046875, "learning_rate": 5e-07, "loss": -0.0653, "num_tokens": 3075540.0, "reward": -0.763671875, "reward_std": 0.3399305194616318, "rewards/eps_simulator_reward/mean": -0.763671875, "rewards/eps_simulator_reward/std": 0.42510994523763657, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.25, "completions/mean_length": 1490.4921875, "completions/mean_terminated_length": 1429.5967407226562, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.10191082802547771, "grad_norm": 1.0331254646479913, "kl": 0.0016155242919921875, "learning_rate": 7.5e-07, "loss": -0.0522, "num_tokens": 4100816.0, "reward": -0.78515625, "reward_std": 0.3134715184569359, "rewards/eps_simulator_reward/mean": -0.78515625, "rewards/eps_simulator_reward/std": 0.406184121966362, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.5, "completions/mean_length": 1423.41015625, "completions/mean_terminated_length": 1352.7940368652344, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.12738853503184713, "grad_norm": 1.8236196385202714, "kl": 0.0016574859619140625, "learning_rate": 1e-06, "loss": -0.0699, "num_tokens": 5091746.0, "reward": -0.79296875, "reward_std": 0.3344918265938759, "rewards/eps_simulator_reward/mean": -0.79296875, "rewards/eps_simulator_reward/std": 0.4023679941892624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1507.345703125, "completions/mean_terminated_length": 1418.7192993164062, "completions/min_length": 778.75, "completions/min_terminated_length": 778.75, "epoch": 0.15286624203821655, "grad_norm": 0.2847883117730281, "kl": 0.00554656982421875, "learning_rate": 9.981884322978574e-07, "loss": -0.0605, "num_tokens": 6125651.0, "reward": -0.751953125, "reward_std": 0.3284922167658806, "rewards/eps_simulator_reward/mean": -0.751953125, "rewards/eps_simulator_reward/std": 0.43277325481176376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.25, "completions/mean_length": 1494.505859375, "completions/mean_terminated_length": 1390.4645080566406, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.17834394904458598, "grad_norm": 0.3559502825960311, "kl": 0.006683349609375, "learning_rate": 9.927683148693833e-07, "loss": -0.0737, "num_tokens": 7152982.0, "reward": -0.71875, "reward_std": 0.3783887177705765, "rewards/eps_simulator_reward/mean": -0.71875, "rewards/eps_simulator_reward/std": 0.4498228207230568, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.25, "completions/mean_length": 1505.7734375, "completions/mean_terminated_length": 1434.4289855957031, "completions/min_length": 730.5, "completions/min_terminated_length": 730.5, "epoch": 0.20382165605095542, "grad_norm": 0.6393420547069347, "kl": 0.0185089111328125, "learning_rate": 9.83783287313134e-07, "loss": -0.048, "num_tokens": 8186082.0, "reward": -0.75, "reward_std": 0.3828992694616318, "rewards/eps_simulator_reward/mean": -0.75, "rewards/eps_simulator_reward/std": 0.4343789964914322, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1525.02734375, "completions/mean_terminated_length": 1435.8611755371094, "completions/min_length": 713.75, "completions/min_terminated_length": 713.75, "epoch": 0.22929936305732485, "grad_norm": 0.9103220051184144, "kl": 0.03057861328125, "learning_rate": 9.713056917878816e-07, "loss": -0.0548, "num_tokens": 9229040.0, "reward": -0.650390625, "reward_std": 0.42321375012397766, "rewards/eps_simulator_reward/mean": -0.650390625, "rewards/eps_simulator_reward/std": 0.47667451947927475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1516.04296875, "completions/mean_terminated_length": 1444.0675048828125, "completions/min_length": 718.5, "completions/min_terminated_length": 718.5, "epoch": 0.25477707006369427, "grad_norm": 0.31002656427107383, "kl": 0.020538330078125, "learning_rate": 9.554359905560885e-07, "loss": -0.0549, "num_tokens": 10267398.0, "reward": -0.671875, "reward_std": 0.41061024367809296, "rewards/eps_simulator_reward/mean": -0.671875, "rewards/eps_simulator_reward/std": 0.4707574099302292, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.169921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1524.63671875, "completions/mean_terminated_length": 1417.4291687011719, "completions/min_length": 737.5, "completions/min_terminated_length": 737.5, "epoch": 0.2802547770700637, "grad_norm": 0.3975698195821328, "kl": 0.020050048828125, "learning_rate": 9.363019571208397e-07, "loss": -0.0511, "num_tokens": 11310156.0, "reward": -0.6796875, "reward_std": 0.41205591708421707, "rewards/eps_simulator_reward/mean": -0.6796875, "rewards/eps_simulator_reward/std": 0.46482934057712555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.12890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.25, "completions/mean_length": 1533.3046875, "completions/mean_terminated_length": 1456.9059448242188, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.3057324840764331, "grad_norm": 1.0419265668973368, "kl": 0.024017333984375, "learning_rate": 9.140576474687263e-07, "loss": -0.0551, "num_tokens": 12357352.0, "reward": -0.70703125, "reward_std": 0.4042434096336365, "rewards/eps_simulator_reward/mean": -0.70703125, "rewards/eps_simulator_reward/std": 0.4536997899413109, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1526.982421875, "completions/mean_terminated_length": 1423.71337890625, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.33121019108280253, "grad_norm": 0.3341045274795088, "kl": 0.020660400390625, "learning_rate": 8.88882159701625e-07, "loss": -0.0667, "num_tokens": 13401311.0, "reward": -0.724609375, "reward_std": 0.3498363718390465, "rewards/eps_simulator_reward/mean": -0.724609375, "rewards/eps_simulator_reward/std": 0.44695496559143066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.25, "completions/mean_length": 1531.578125, "completions/mean_terminated_length": 1435.1131286621094, "completions/min_length": 718.5, "completions/min_terminated_length": 718.5, "epoch": 0.35668789808917195, "grad_norm": 0.3415663995876473, "kl": 0.02423095703125, "learning_rate": 8.609781920440891e-07, "loss": -0.052, "num_tokens": 14447623.0, "reward": -0.72265625, "reward_std": 0.3862012252211571, "rewards/eps_simulator_reward/mean": -0.72265625, "rewards/eps_simulator_reward/std": 0.4454372450709343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1516.60546875, "completions/mean_terminated_length": 1420.9369506835938, "completions/min_length": 754.75, "completions/min_terminated_length": 754.75, "epoch": 0.3821656050955414, "grad_norm": 0.40113945682093677, "kl": 0.026031494140625, "learning_rate": 8.305704108364301e-07, "loss": -0.0556, "num_tokens": 15486269.0, "reward": -0.673828125, "reward_std": 0.4069410637021065, "rewards/eps_simulator_reward/mean": -0.673828125, "rewards/eps_simulator_reward/std": 0.4652545005083084, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.158203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.75, "completions/mean_length": 1545.33984375, "completions/mean_terminated_length": 1451.2561340332031, "completions/min_length": 765.25, "completions/min_terminated_length": 765.25, "epoch": 0.40764331210191085, "grad_norm": 0.3980926201477192, "kl": 0.02734375, "learning_rate": 7.979036416534461e-07, "loss": -0.0547, "num_tokens": 16539627.0, "reward": -0.669921875, "reward_std": 0.39735905081033707, "rewards/eps_simulator_reward/mean": -0.669921875, "rewards/eps_simulator_reward/std": 0.4680754914879799, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1541.90625, "completions/mean_terminated_length": 1473.3203430175781, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 0.43312101910828027, "grad_norm": 0.293619293173836, "kl": 0.027069091796875, "learning_rate": 7.632408981128493e-07, "loss": -0.0319, "num_tokens": 17591227.0, "reward": -0.681640625, "reward_std": 0.37931685149669647, "rewards/eps_simulator_reward/mean": -0.681640625, "rewards/eps_simulator_reward/std": 0.46698828786611557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.75, "completions/mean_length": 1574.708984375, "completions/mean_terminated_length": 1471.5007019042969, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 0.4585987261146497, "grad_norm": 0.3758960483732288, "kl": 0.02618408203125, "learning_rate": 7.268612642442656e-07, "loss": -0.0438, "num_tokens": 18659622.0, "reward": -0.630859375, "reward_std": 0.42619186639785767, "rewards/eps_simulator_reward/mean": -0.630859375, "rewards/eps_simulator_reward/std": 0.48388052731752396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1560.658203125, "completions/mean_terminated_length": 1453.4781494140625, "completions/min_length": 833.25, "completions/min_terminated_length": 833.25, "epoch": 0.4840764331210191, "grad_norm": 0.2591766871237973, "kl": 0.02783203125, "learning_rate": 6.890576474687263e-07, "loss": -0.0392, "num_tokens": 19720823.0, "reward": -0.611328125, "reward_std": 0.44004734605550766, "rewards/eps_simulator_reward/mean": -0.611328125, "rewards/eps_simulator_reward/std": 0.4889531210064888, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.75, "completions/mean_length": 1587.4140625, "completions/mean_terminated_length": 1495.7480163574219, "completions/min_length": 803.5, "completions/min_terminated_length": 803.5, "epoch": 0.5095541401273885, "grad_norm": 0.24419145488172941, "kl": 0.02935791015625, "learning_rate": 6.501344202803414e-07, "loss": -0.0332, "num_tokens": 20795723.0, "reward": -0.548828125, "reward_std": 0.46469344943761826, "rewards/eps_simulator_reward/mean": -0.548828125, "rewards/eps_simulator_reward/std": 0.49816230684518814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1579.974609375, "completions/mean_terminated_length": 1476.3168640136719, "completions/min_length": 772.25, "completions/min_terminated_length": 772.25, "epoch": 0.535031847133758, "grad_norm": 0.2887492844862219, "kl": 0.035125732421875, "learning_rate": 6.10404969617945e-07, "loss": -0.0473, "num_tokens": 21866814.0, "reward": -0.548828125, "reward_std": 0.46501730382442474, "rewards/eps_simulator_reward/mean": -0.548828125, "rewards/eps_simulator_reward/std": 0.49840257316827774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.25, "completions/mean_length": 1596.095703125, "completions/mean_terminated_length": 1497.2264099121094, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 0.5605095541401274, "grad_norm": 0.4483927215331061, "kl": 0.0399169921875, "learning_rate": 5.701891736577317e-07, "loss": -0.0322, "num_tokens": 22946159.0, "reward": -0.51953125, "reward_std": 0.48454854637384415, "rewards/eps_simulator_reward/mean": -0.51953125, "rewards/eps_simulator_reward/std": 0.49943237751722336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.18359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.5, "completions/mean_length": 1587.29296875, "completions/mean_terminated_length": 1483.8641662597656, "completions/min_length": 828.25, "completions/min_terminated_length": 828.25, "epoch": 0.5859872611464968, "grad_norm": 0.35917761276782173, "kl": 0.0360107421875, "learning_rate": 5.298108263422685e-07, "loss": -0.0434, "num_tokens": 24020997.0, "reward": -0.50390625, "reward_std": 0.47762079536914825, "rewards/eps_simulator_reward/mean": -0.50390625, "rewards/eps_simulator_reward/std": 0.5010738596320152, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.5, "completions/mean_length": 1582.2890625, "completions/mean_terminated_length": 1456.1675109863281, "completions/min_length": 776.75, "completions/min_terminated_length": 776.75, "epoch": 0.6114649681528662, "grad_norm": 0.317959150806763, "kl": 0.03668212890625, "learning_rate": 4.895950303820552e-07, "loss": -0.0486, "num_tokens": 25093273.0, "reward": -0.447265625, "reward_std": 0.46050675213336945, "rewards/eps_simulator_reward/mean": -0.447265625, "rewards/eps_simulator_reward/std": 0.4985574185848236, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.162109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.75, "completions/mean_length": 1571.0625, "completions/mean_terminated_length": 1479.2693176269531, "completions/min_length": 790.75, "completions/min_terminated_length": 790.75, "epoch": 0.6369426751592356, "grad_norm": 0.3007766407739895, "kl": 0.03619384765625, "learning_rate": 4.4986557971965856e-07, "loss": -0.0381, "num_tokens": 26159801.0, "reward": -0.498046875, "reward_std": 0.43191099166870117, "rewards/eps_simulator_reward/mean": -0.498046875, "rewards/eps_simulator_reward/std": 0.4973808750510216, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.25, "completions/mean_length": 1576.947265625, "completions/mean_terminated_length": 1474.1740112304688, "completions/min_length": 756.25, "completions/min_terminated_length": 756.25, "epoch": 0.6624203821656051, "grad_norm": 0.320328638853622, "kl": 0.034881591796875, "learning_rate": 4.1094235253127374e-07, "loss": -0.0469, "num_tokens": 27229342.0, "reward": -0.490234375, "reward_std": 0.45237039774656296, "rewards/eps_simulator_reward/mean": -0.490234375, "rewards/eps_simulator_reward/std": 0.5005030706524849, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.75, "completions/mean_length": 1598.625, "completions/mean_terminated_length": 1510.354736328125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 0.6878980891719745, "grad_norm": 0.35008352061308967, "kl": 0.03564453125, "learning_rate": 3.731387357557344e-07, "loss": -0.0477, "num_tokens": 28309982.0, "reward": -0.5546875, "reward_std": 0.42381805181503296, "rewards/eps_simulator_reward/mean": -0.5546875, "rewards/eps_simulator_reward/std": 0.4947791174054146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.5, "completions/mean_length": 1600.478515625, "completions/mean_terminated_length": 1478.0363159179688, "completions/min_length": 757.25, "completions/min_terminated_length": 757.25, "epoch": 0.7133757961783439, "grad_norm": 0.3095860436443937, "kl": 0.035736083984375, "learning_rate": 3.367591018871506e-07, "loss": -0.0547, "num_tokens": 29391571.0, "reward": -0.51171875, "reward_std": 0.47134073078632355, "rewards/eps_simulator_reward/mean": -0.51171875, "rewards/eps_simulator_reward/std": 0.5015044659376144, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.5, "completions/mean_length": 1615.025390625, "completions/mean_terminated_length": 1497.0940856933594, "completions/min_length": 708.5, "completions/min_terminated_length": 708.5, "epoch": 0.7388535031847133, "grad_norm": 0.27660703860323266, "kl": 0.033172607421875, "learning_rate": 3.020963583465539e-07, "loss": -0.0513, "num_tokens": 30480608.0, "reward": -0.4765625, "reward_std": 0.48036184906959534, "rewards/eps_simulator_reward/mean": -0.4765625, "rewards/eps_simulator_reward/std": 0.49983665347099304, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.17578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.75, "completions/mean_length": 1601.826171875, "completions/mean_terminated_length": 1506.8683166503906, "completions/min_length": 787.5, "completions/min_terminated_length": 787.5, "epoch": 0.7643312101910829, "grad_norm": 0.42748719862561546, "kl": 0.034423828125, "learning_rate": 2.6942958916356994e-07, "loss": -0.0487, "num_tokens": 31562887.0, "reward": -0.525390625, "reward_std": 0.49626730382442474, "rewards/eps_simulator_reward/mean": -0.525390625, "rewards/eps_simulator_reward/std": 0.4998583048582077, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1603.60546875, "completions/mean_terminated_length": 1487.5808410644531, "completions/min_length": 779.75, "completions/min_terminated_length": 779.75, "epoch": 0.7898089171974523, "grad_norm": 0.26533857511488407, "kl": 0.03125, "learning_rate": 2.390218079559109e-07, "loss": -0.06, "num_tokens": 32646077.0, "reward": -0.5234375, "reward_std": 0.47222549468278885, "rewards/eps_simulator_reward/mean": -0.5234375, "rewards/eps_simulator_reward/std": 0.5008884444832802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.20703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.5, "completions/mean_length": 1610.716796875, "completions/mean_terminated_length": 1495.678955078125, "completions/min_length": 789.75, "completions/min_terminated_length": 789.75, "epoch": 0.8152866242038217, "grad_norm": 0.25691683516929065, "kl": 0.031829833984375, "learning_rate": 2.1111784029837509e-07, "loss": -0.0447, "num_tokens": 33732908.0, "reward": -0.515625, "reward_std": 0.45176610350608826, "rewards/eps_simulator_reward/mean": -0.515625, "rewards/eps_simulator_reward/std": 0.5000600069761276, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1604.3359375, "completions/mean_terminated_length": 1491.3028564453125, "completions/min_length": 795.25, "completions/min_terminated_length": 795.25, "epoch": 0.8407643312101911, "grad_norm": 0.6246804081121883, "kl": 0.041748046875, "learning_rate": 1.8594235253127372e-07, "loss": -0.0511, "num_tokens": 34816472.0, "reward": -0.517578125, "reward_std": 0.45204655081033707, "rewards/eps_simulator_reward/mean": -0.517578125, "rewards/eps_simulator_reward/std": 0.5011512860655785, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.244140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.75, "completions/mean_length": 1642.564453125, "completions/mean_terminated_length": 1510.7035217285156, "completions/min_length": 781.75, "completions/min_terminated_length": 781.75, "epoch": 0.8662420382165605, "grad_norm": 0.2636823037232082, "kl": 0.03643798828125, "learning_rate": 1.6369804287916025e-07, "loss": -0.0411, "num_tokens": 35919609.0, "reward": -0.470703125, "reward_std": 0.45567234605550766, "rewards/eps_simulator_reward/mean": -0.470703125, "rewards/eps_simulator_reward/std": 0.4984956756234169, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.224609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1605.212890625, "completions/mean_terminated_length": 1476.3855895996094, "completions/min_length": 839.25, "completions/min_terminated_length": 839.25, "epoch": 0.89171974522293, "grad_norm": 0.32334944678197935, "kl": 0.0355224609375, "learning_rate": 1.4456400944391144e-07, "loss": -0.0662, "num_tokens": 37003622.0, "reward": -0.458984375, "reward_std": 0.46678680181503296, "rewards/eps_simulator_reward/mean": -0.458984375, "rewards/eps_simulator_reward/std": 0.49930109083652496, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1613.6328125, "completions/mean_terminated_length": 1504.9357604980469, "completions/min_length": 811.75, "completions/min_terminated_length": 811.75, "epoch": 0.9171974522292994, "grad_norm": 0.2540670404555639, "kl": 0.035858154296875, "learning_rate": 1.2869430821211826e-07, "loss": -0.0423, "num_tokens": 38091946.0, "reward": -0.5078125, "reward_std": 0.46562159806489944, "rewards/eps_simulator_reward/mean": -0.5078125, "rewards/eps_simulator_reward/std": 0.5015047490596771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.25, "completions/mean_length": 1638.091796875, "completions/mean_terminated_length": 1526.8819885253906, "completions/min_length": 769.5, "completions/min_terminated_length": 769.5, "epoch": 0.9426751592356688, "grad_norm": 0.6072482178147118, "kl": 0.0433349609375, "learning_rate": 1.1621671268686605e-07, "loss": -0.0309, "num_tokens": 39192793.0, "reward": -0.49609375, "reward_std": 0.45329853892326355, "rewards/eps_simulator_reward/mean": -0.49609375, "rewards/eps_simulator_reward/std": 0.5007057711482048, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.212890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1634.078125, "completions/mean_terminated_length": 1521.7028198242188, "completions/min_length": 768.25, "completions/min_terminated_length": 768.25, "epoch": 0.9681528662420382, "grad_norm": 1.674101211279826, "kl": 0.0494384765625, "learning_rate": 1.0723168513061665e-07, "loss": -0.0404, "num_tokens": 40291585.0, "reward": -0.5, "reward_std": 0.46803878247737885, "rewards/eps_simulator_reward/mean": -0.5, "rewards/eps_simulator_reward/std": 0.49916671961545944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2032.0, "completions/max_terminated_length": 1812.75, "completions/mean_length": 1557.90625, "completions/mean_terminated_length": 1464.7232666015625, "completions/min_length": 1071.25, "completions/min_terminated_length": 1071.25, "epoch": 0.9936305732484076, "grad_norm": 0.2581163051289786, "kl": 0.0352783203125, "learning_rate": 1.0181156770214242e-07, "loss": -0.0572, "num_tokens": 41375402.0, "reward": -0.51171875, "reward_std": 0.47641219943761826, "rewards/eps_simulator_reward/mean": -0.51171875, "rewards/eps_simulator_reward/std": 0.5010442584753036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 39 }, { "epoch": 0.9936305732484076, "step": 39, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 1.8663, "train_samples_per_second": 2679.088, "train_steps_per_second": 20.897 } ], "logging_steps": 1, "max_steps": 39, "num_input_tokens_seen": 41375402, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }