{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9936305732484076,
  "eval_steps": 500,
  "global_step": 39,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.25,
      "completions/mean_length": 1484.009765625,
      "completions/mean_terminated_length": 1397.5816650390625,
      "completions/min_length": 787.5,
      "completions/min_terminated_length": 787.5,
      "epoch": 0.025477707006369428,
      "grad_norm": 6733.567763172914,
      "kl": 0.6657562255859375,
      "learning_rate": 0.0,
      "loss": -0.0422,
      "num_tokens": 1021957.0,
      "reward": -0.77338707447052,
      "reward_std": 0.32955513894557953,
      "rewards/eps_simulator_reward/mean": -0.77338707447052,
      "rewards/eps_simulator_reward/std": 0.4255572780966759,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.119140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.5,
      "completions/mean_length": 1476.125,
      "completions/mean_terminated_length": 1398.6347961425781,
      "completions/min_length": 733.5,
      "completions/min_terminated_length": 733.5,
      "epoch": 0.050955414012738856,
      "grad_norm": 970.9524031746012,
      "kl": 0.0662841796875,
      "learning_rate": 2.5e-07,
      "loss": -0.0475,
      "num_tokens": 2039877.0,
      "reward": -0.791015625,
      "reward_std": 0.2963574752211571,
      "rewards/eps_simulator_reward/mean": -0.791015625,
      "rewards/eps_simulator_reward/std": 0.40781307965517044,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.138671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.5,
      "completions/mean_length": 1510.779296875,
      "completions/mean_terminated_length": 1424.845703125,
      "completions/min_length": 813.75,
      "completions/min_terminated_length": 813.75,
      "epoch": 0.07643312101910828,
      "grad_norm": 1.343793674877022,
      "kl": 0.00213623046875,
      "learning_rate": 5e-07,
      "loss": -0.0653,
      "num_tokens": 3075540.0,
      "reward": -0.763671875,
      "reward_std": 0.3399305194616318,
      "rewards/eps_simulator_reward/mean": -0.763671875,
      "rewards/eps_simulator_reward/std": 0.42510994523763657,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.099609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.25,
      "completions/mean_length": 1490.4921875,
      "completions/mean_terminated_length": 1429.5967407226562,
      "completions/min_length": 744.0,
      "completions/min_terminated_length": 744.0,
      "epoch": 0.10191082802547771,
      "grad_norm": 1.0331254646479913,
      "kl": 0.0016155242919921875,
      "learning_rate": 7.5e-07,
      "loss": -0.0522,
      "num_tokens": 4100816.0,
      "reward": -0.78515625,
      "reward_std": 0.3134715184569359,
      "rewards/eps_simulator_reward/mean": -0.78515625,
      "rewards/eps_simulator_reward/std": 0.406184121966362,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.5,
      "completions/mean_length": 1423.41015625,
      "completions/mean_terminated_length": 1352.7940368652344,
      "completions/min_length": 700.0,
      "completions/min_terminated_length": 700.0,
      "epoch": 0.12738853503184713,
      "grad_norm": 1.8236196385202714,
      "kl": 0.0016574859619140625,
      "learning_rate": 1e-06,
      "loss": -0.0699,
      "num_tokens": 5091746.0,
      "reward": -0.79296875,
      "reward_std": 0.3344918265938759,
      "rewards/eps_simulator_reward/mean": -0.79296875,
      "rewards/eps_simulator_reward/std": 0.4023679941892624,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1507.345703125,
      "completions/mean_terminated_length": 1418.7192993164062,
      "completions/min_length": 778.75,
      "completions/min_terminated_length": 778.75,
      "epoch": 0.15286624203821655,
      "grad_norm": 0.2847883117730281,
      "kl": 0.00554656982421875,
      "learning_rate": 9.981884322978574e-07,
      "loss": -0.0605,
      "num_tokens": 6125651.0,
      "reward": -0.751953125,
      "reward_std": 0.3284922167658806,
      "rewards/eps_simulator_reward/mean": -0.751953125,
      "rewards/eps_simulator_reward/std": 0.43277325481176376,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.158203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.25,
      "completions/mean_length": 1494.505859375,
      "completions/mean_terminated_length": 1390.4645080566406,
      "completions/min_length": 710.0,
      "completions/min_terminated_length": 710.0,
      "epoch": 0.17834394904458598,
      "grad_norm": 0.3559502825960311,
      "kl": 0.006683349609375,
      "learning_rate": 9.927683148693833e-07,
      "loss": -0.0737,
      "num_tokens": 7152982.0,
      "reward": -0.71875,
      "reward_std": 0.3783887177705765,
      "rewards/eps_simulator_reward/mean": -0.71875,
      "rewards/eps_simulator_reward/std": 0.4498228207230568,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.25,
      "completions/mean_length": 1505.7734375,
      "completions/mean_terminated_length": 1434.4289855957031,
      "completions/min_length": 730.5,
      "completions/min_terminated_length": 730.5,
      "epoch": 0.20382165605095542,
      "grad_norm": 0.6393420547069347,
      "kl": 0.0185089111328125,
      "learning_rate": 9.83783287313134e-07,
      "loss": -0.048,
      "num_tokens": 8186082.0,
      "reward": -0.75,
      "reward_std": 0.3828992694616318,
      "rewards/eps_simulator_reward/mean": -0.75,
      "rewards/eps_simulator_reward/std": 0.4343789964914322,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.146484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1525.02734375,
      "completions/mean_terminated_length": 1435.8611755371094,
      "completions/min_length": 713.75,
      "completions/min_terminated_length": 713.75,
      "epoch": 0.22929936305732485,
      "grad_norm": 0.9103220051184144,
      "kl": 0.03057861328125,
      "learning_rate": 9.713056917878816e-07,
      "loss": -0.0548,
      "num_tokens": 9229040.0,
      "reward": -0.650390625,
      "reward_std": 0.42321375012397766,
      "rewards/eps_simulator_reward/mean": -0.650390625,
      "rewards/eps_simulator_reward/std": 0.47667451947927475,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.119140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1516.04296875,
      "completions/mean_terminated_length": 1444.0675048828125,
      "completions/min_length": 718.5,
      "completions/min_terminated_length": 718.5,
      "epoch": 0.25477707006369427,
      "grad_norm": 0.31002656427107383,
      "kl": 0.020538330078125,
      "learning_rate": 9.554359905560885e-07,
      "loss": -0.0549,
      "num_tokens": 10267398.0,
      "reward": -0.671875,
      "reward_std": 0.41061024367809296,
      "rewards/eps_simulator_reward/mean": -0.671875,
      "rewards/eps_simulator_reward/std": 0.4707574099302292,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.169921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1524.63671875,
      "completions/mean_terminated_length": 1417.4291687011719,
      "completions/min_length": 737.5,
      "completions/min_terminated_length": 737.5,
      "epoch": 0.2802547770700637,
      "grad_norm": 0.3975698195821328,
      "kl": 0.020050048828125,
      "learning_rate": 9.363019571208397e-07,
      "loss": -0.0511,
      "num_tokens": 11310156.0,
      "reward": -0.6796875,
      "reward_std": 0.41205591708421707,
      "rewards/eps_simulator_reward/mean": -0.6796875,
      "rewards/eps_simulator_reward/std": 0.46482934057712555,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.12890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.25,
      "completions/mean_length": 1533.3046875,
      "completions/mean_terminated_length": 1456.9059448242188,
      "completions/min_length": 763.0,
      "completions/min_terminated_length": 763.0,
      "epoch": 0.3057324840764331,
      "grad_norm": 1.0419265668973368,
      "kl": 0.024017333984375,
      "learning_rate": 9.140576474687263e-07,
      "loss": -0.0551,
      "num_tokens": 12357352.0,
      "reward": -0.70703125,
      "reward_std": 0.4042434096336365,
      "rewards/eps_simulator_reward/mean": -0.70703125,
      "rewards/eps_simulator_reward/std": 0.4536997899413109,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.166015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1526.982421875,
      "completions/mean_terminated_length": 1423.71337890625,
      "completions/min_length": 637.0,
      "completions/min_terminated_length": 637.0,
      "epoch": 0.33121019108280253,
      "grad_norm": 0.3341045274795088,
      "kl": 0.020660400390625,
      "learning_rate": 8.88882159701625e-07,
      "loss": -0.0667,
      "num_tokens": 13401311.0,
      "reward": -0.724609375,
      "reward_std": 0.3498363718390465,
      "rewards/eps_simulator_reward/mean": -0.724609375,
      "rewards/eps_simulator_reward/std": 0.44695496559143066,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.158203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.25,
      "completions/mean_length": 1531.578125,
      "completions/mean_terminated_length": 1435.1131286621094,
      "completions/min_length": 718.5,
      "completions/min_terminated_length": 718.5,
      "epoch": 0.35668789808917195,
      "grad_norm": 0.3415663995876473,
      "kl": 0.02423095703125,
      "learning_rate": 8.609781920440891e-07,
      "loss": -0.052,
      "num_tokens": 14447623.0,
      "reward": -0.72265625,
      "reward_std": 0.3862012252211571,
      "rewards/eps_simulator_reward/mean": -0.72265625,
      "rewards/eps_simulator_reward/std": 0.4454372450709343,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.15234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1516.60546875,
      "completions/mean_terminated_length": 1420.9369506835938,
      "completions/min_length": 754.75,
      "completions/min_terminated_length": 754.75,
      "epoch": 0.3821656050955414,
      "grad_norm": 0.40113945682093677,
      "kl": 0.026031494140625,
      "learning_rate": 8.305704108364301e-07,
      "loss": -0.0556,
      "num_tokens": 15486269.0,
      "reward": -0.673828125,
      "reward_std": 0.4069410637021065,
      "rewards/eps_simulator_reward/mean": -0.673828125,
      "rewards/eps_simulator_reward/std": 0.4652545005083084,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.158203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.75,
      "completions/mean_length": 1545.33984375,
      "completions/mean_terminated_length": 1451.2561340332031,
      "completions/min_length": 765.25,
      "completions/min_terminated_length": 765.25,
      "epoch": 0.40764331210191085,
      "grad_norm": 0.3980926201477192,
      "kl": 0.02734375,
      "learning_rate": 7.979036416534461e-07,
      "loss": -0.0547,
      "num_tokens": 16539627.0,
      "reward": -0.669921875,
      "reward_std": 0.39735905081033707,
      "rewards/eps_simulator_reward/mean": -0.669921875,
      "rewards/eps_simulator_reward/std": 0.4680754914879799,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.119140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1541.90625,
      "completions/mean_terminated_length": 1473.3203430175781,
      "completions/min_length": 757.0,
      "completions/min_terminated_length": 757.0,
      "epoch": 0.43312101910828027,
      "grad_norm": 0.293619293173836,
      "kl": 0.027069091796875,
      "learning_rate": 7.632408981128493e-07,
      "loss": -0.0319,
      "num_tokens": 17591227.0,
      "reward": -0.681640625,
      "reward_std": 0.37931685149669647,
      "rewards/eps_simulator_reward/mean": -0.681640625,
      "rewards/eps_simulator_reward/std": 0.46698828786611557,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.75,
      "completions/mean_length": 1574.708984375,
      "completions/mean_terminated_length": 1471.5007019042969,
      "completions/min_length": 773.0,
      "completions/min_terminated_length": 773.0,
      "epoch": 0.4585987261146497,
      "grad_norm": 0.3758960483732288,
      "kl": 0.02618408203125,
      "learning_rate": 7.268612642442656e-07,
      "loss": -0.0438,
      "num_tokens": 18659622.0,
      "reward": -0.630859375,
      "reward_std": 0.42619186639785767,
      "rewards/eps_simulator_reward/mean": -0.630859375,
      "rewards/eps_simulator_reward/std": 0.48388052731752396,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1560.658203125,
      "completions/mean_terminated_length": 1453.4781494140625,
      "completions/min_length": 833.25,
      "completions/min_terminated_length": 833.25,
      "epoch": 0.4840764331210191,
      "grad_norm": 0.2591766871237973,
      "kl": 0.02783203125,
      "learning_rate": 6.890576474687263e-07,
      "loss": -0.0392,
      "num_tokens": 19720823.0,
      "reward": -0.611328125,
      "reward_std": 0.44004734605550766,
      "rewards/eps_simulator_reward/mean": -0.611328125,
      "rewards/eps_simulator_reward/std": 0.4889531210064888,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.166015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.75,
      "completions/mean_length": 1587.4140625,
      "completions/mean_terminated_length": 1495.7480163574219,
      "completions/min_length": 803.5,
      "completions/min_terminated_length": 803.5,
      "epoch": 0.5095541401273885,
      "grad_norm": 0.24419145488172941,
      "kl": 0.02935791015625,
      "learning_rate": 6.501344202803414e-07,
      "loss": -0.0332,
      "num_tokens": 20795723.0,
      "reward": -0.548828125,
      "reward_std": 0.46469344943761826,
      "rewards/eps_simulator_reward/mean": -0.548828125,
      "rewards/eps_simulator_reward/std": 0.49816230684518814,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.18359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1579.974609375,
      "completions/mean_terminated_length": 1476.3168640136719,
      "completions/min_length": 772.25,
      "completions/min_terminated_length": 772.25,
      "epoch": 0.535031847133758,
      "grad_norm": 0.2887492844862219,
      "kl": 0.035125732421875,
      "learning_rate": 6.10404969617945e-07,
      "loss": -0.0473,
      "num_tokens": 21866814.0,
      "reward": -0.548828125,
      "reward_std": 0.46501730382442474,
      "rewards/eps_simulator_reward/mean": -0.548828125,
      "rewards/eps_simulator_reward/std": 0.49840257316827774,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.25,
      "completions/mean_length": 1596.095703125,
      "completions/mean_terminated_length": 1497.2264099121094,
      "completions/min_length": 760.0,
      "completions/min_terminated_length": 760.0,
      "epoch": 0.5605095541401274,
      "grad_norm": 0.4483927215331061,
      "kl": 0.0399169921875,
      "learning_rate": 5.701891736577317e-07,
      "loss": -0.0322,
      "num_tokens": 22946159.0,
      "reward": -0.51953125,
      "reward_std": 0.48454854637384415,
      "rewards/eps_simulator_reward/mean": -0.51953125,
      "rewards/eps_simulator_reward/std": 0.49943237751722336,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.18359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.5,
      "completions/mean_length": 1587.29296875,
      "completions/mean_terminated_length": 1483.8641662597656,
      "completions/min_length": 828.25,
      "completions/min_terminated_length": 828.25,
      "epoch": 0.5859872611464968,
      "grad_norm": 0.35917761276782173,
      "kl": 0.0360107421875,
      "learning_rate": 5.298108263422685e-07,
      "loss": -0.0434,
      "num_tokens": 24020997.0,
      "reward": -0.50390625,
      "reward_std": 0.47762079536914825,
      "rewards/eps_simulator_reward/mean": -0.50390625,
      "rewards/eps_simulator_reward/std": 0.5010738596320152,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.212890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.5,
      "completions/mean_length": 1582.2890625,
      "completions/mean_terminated_length": 1456.1675109863281,
      "completions/min_length": 776.75,
      "completions/min_terminated_length": 776.75,
      "epoch": 0.6114649681528662,
      "grad_norm": 0.317959150806763,
      "kl": 0.03668212890625,
      "learning_rate": 4.895950303820552e-07,
      "loss": -0.0486,
      "num_tokens": 25093273.0,
      "reward": -0.447265625,
      "reward_std": 0.46050675213336945,
      "rewards/eps_simulator_reward/mean": -0.447265625,
      "rewards/eps_simulator_reward/std": 0.4985574185848236,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.162109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.75,
      "completions/mean_length": 1571.0625,
      "completions/mean_terminated_length": 1479.2693176269531,
      "completions/min_length": 790.75,
      "completions/min_terminated_length": 790.75,
      "epoch": 0.6369426751592356,
      "grad_norm": 0.3007766407739895,
      "kl": 0.03619384765625,
      "learning_rate": 4.4986557971965856e-07,
      "loss": -0.0381,
      "num_tokens": 26159801.0,
      "reward": -0.498046875,
      "reward_std": 0.43191099166870117,
      "rewards/eps_simulator_reward/mean": -0.498046875,
      "rewards/eps_simulator_reward/std": 0.4973808750510216,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.25,
      "completions/mean_length": 1576.947265625,
      "completions/mean_terminated_length": 1474.1740112304688,
      "completions/min_length": 756.25,
      "completions/min_terminated_length": 756.25,
      "epoch": 0.6624203821656051,
      "grad_norm": 0.320328638853622,
      "kl": 0.034881591796875,
      "learning_rate": 4.1094235253127374e-07,
      "loss": -0.0469,
      "num_tokens": 27229342.0,
      "reward": -0.490234375,
      "reward_std": 0.45237039774656296,
      "rewards/eps_simulator_reward/mean": -0.490234375,
      "rewards/eps_simulator_reward/std": 0.5005030706524849,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.75,
      "completions/mean_length": 1598.625,
      "completions/mean_terminated_length": 1510.354736328125,
      "completions/min_length": 738.0,
      "completions/min_terminated_length": 738.0,
      "epoch": 0.6878980891719745,
      "grad_norm": 0.35008352061308967,
      "kl": 0.03564453125,
      "learning_rate": 3.731387357557344e-07,
      "loss": -0.0477,
      "num_tokens": 28309982.0,
      "reward": -0.5546875,
      "reward_std": 0.42381805181503296,
      "rewards/eps_simulator_reward/mean": -0.5546875,
      "rewards/eps_simulator_reward/std": 0.4947791174054146,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.5,
      "completions/mean_length": 1600.478515625,
      "completions/mean_terminated_length": 1478.0363159179688,
      "completions/min_length": 757.25,
      "completions/min_terminated_length": 757.25,
      "epoch": 0.7133757961783439,
      "grad_norm": 0.3095860436443937,
      "kl": 0.035736083984375,
      "learning_rate": 3.367591018871506e-07,
      "loss": -0.0547,
      "num_tokens": 29391571.0,
      "reward": -0.51171875,
      "reward_std": 0.47134073078632355,
      "rewards/eps_simulator_reward/mean": -0.51171875,
      "rewards/eps_simulator_reward/std": 0.5015044659376144,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.5,
      "completions/mean_length": 1615.025390625,
      "completions/mean_terminated_length": 1497.0940856933594,
      "completions/min_length": 708.5,
      "completions/min_terminated_length": 708.5,
      "epoch": 0.7388535031847133,
      "grad_norm": 0.27660703860323266,
      "kl": 0.033172607421875,
      "learning_rate": 3.020963583465539e-07,
      "loss": -0.0513,
      "num_tokens": 30480608.0,
      "reward": -0.4765625,
      "reward_std": 0.48036184906959534,
      "rewards/eps_simulator_reward/mean": -0.4765625,
      "rewards/eps_simulator_reward/std": 0.49983665347099304,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.17578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.75,
      "completions/mean_length": 1601.826171875,
      "completions/mean_terminated_length": 1506.8683166503906,
      "completions/min_length": 787.5,
      "completions/min_terminated_length": 787.5,
      "epoch": 0.7643312101910829,
      "grad_norm": 0.42748719862561546,
      "kl": 0.034423828125,
      "learning_rate": 2.6942958916356994e-07,
      "loss": -0.0487,
      "num_tokens": 31562887.0,
      "reward": -0.525390625,
      "reward_std": 0.49626730382442474,
      "rewards/eps_simulator_reward/mean": -0.525390625,
      "rewards/eps_simulator_reward/std": 0.4998583048582077,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.20703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1603.60546875,
      "completions/mean_terminated_length": 1487.5808410644531,
      "completions/min_length": 779.75,
      "completions/min_terminated_length": 779.75,
      "epoch": 0.7898089171974523,
      "grad_norm": 0.26533857511488407,
      "kl": 0.03125,
      "learning_rate": 2.390218079559109e-07,
      "loss": -0.06,
      "num_tokens": 32646077.0,
      "reward": -0.5234375,
      "reward_std": 0.47222549468278885,
      "rewards/eps_simulator_reward/mean": -0.5234375,
      "rewards/eps_simulator_reward/std": 0.5008884444832802,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.20703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.5,
      "completions/mean_length": 1610.716796875,
      "completions/mean_terminated_length": 1495.678955078125,
      "completions/min_length": 789.75,
      "completions/min_terminated_length": 789.75,
      "epoch": 0.8152866242038217,
      "grad_norm": 0.25691683516929065,
      "kl": 0.031829833984375,
      "learning_rate": 2.1111784029837509e-07,
      "loss": -0.0447,
      "num_tokens": 33732908.0,
      "reward": -0.515625,
      "reward_std": 0.45176610350608826,
      "rewards/eps_simulator_reward/mean": -0.515625,
      "rewards/eps_simulator_reward/std": 0.5000600069761276,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1604.3359375,
      "completions/mean_terminated_length": 1491.3028564453125,
      "completions/min_length": 795.25,
      "completions/min_terminated_length": 795.25,
      "epoch": 0.8407643312101911,
      "grad_norm": 0.6246804081121883,
      "kl": 0.041748046875,
      "learning_rate": 1.8594235253127372e-07,
      "loss": -0.0511,
      "num_tokens": 34816472.0,
      "reward": -0.517578125,
      "reward_std": 0.45204655081033707,
      "rewards/eps_simulator_reward/mean": -0.517578125,
      "rewards/eps_simulator_reward/std": 0.5011512860655785,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.244140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.75,
      "completions/mean_length": 1642.564453125,
      "completions/mean_terminated_length": 1510.7035217285156,
      "completions/min_length": 781.75,
      "completions/min_terminated_length": 781.75,
      "epoch": 0.8662420382165605,
      "grad_norm": 0.2636823037232082,
      "kl": 0.03643798828125,
      "learning_rate": 1.6369804287916025e-07,
      "loss": -0.0411,
      "num_tokens": 35919609.0,
      "reward": -0.470703125,
      "reward_std": 0.45567234605550766,
      "rewards/eps_simulator_reward/mean": -0.470703125,
      "rewards/eps_simulator_reward/std": 0.4984956756234169,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.224609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1605.212890625,
      "completions/mean_terminated_length": 1476.3855895996094,
      "completions/min_length": 839.25,
      "completions/min_terminated_length": 839.25,
      "epoch": 0.89171974522293,
      "grad_norm": 0.32334944678197935,
      "kl": 0.0355224609375,
      "learning_rate": 1.4456400944391144e-07,
      "loss": -0.0662,
      "num_tokens": 37003622.0,
      "reward": -0.458984375,
      "reward_std": 0.46678680181503296,
      "rewards/eps_simulator_reward/mean": -0.458984375,
      "rewards/eps_simulator_reward/std": 0.49930109083652496,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.19921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1613.6328125,
      "completions/mean_terminated_length": 1504.9357604980469,
      "completions/min_length": 811.75,
      "completions/min_terminated_length": 811.75,
      "epoch": 0.9171974522292994,
      "grad_norm": 0.2540670404555639,
      "kl": 0.035858154296875,
      "learning_rate": 1.2869430821211826e-07,
      "loss": -0.0423,
      "num_tokens": 38091946.0,
      "reward": -0.5078125,
      "reward_std": 0.46562159806489944,
      "rewards/eps_simulator_reward/mean": -0.5078125,
      "rewards/eps_simulator_reward/std": 0.5015047490596771,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.212890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.25,
      "completions/mean_length": 1638.091796875,
      "completions/mean_terminated_length": 1526.8819885253906,
      "completions/min_length": 769.5,
      "completions/min_terminated_length": 769.5,
      "epoch": 0.9426751592356688,
      "grad_norm": 0.6072482178147118,
      "kl": 0.0433349609375,
      "learning_rate": 1.1621671268686605e-07,
      "loss": -0.0309,
      "num_tokens": 39192793.0,
      "reward": -0.49609375,
      "reward_std": 0.45329853892326355,
      "rewards/eps_simulator_reward/mean": -0.49609375,
      "rewards/eps_simulator_reward/std": 0.5007057711482048,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.212890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1634.078125,
      "completions/mean_terminated_length": 1521.7028198242188,
      "completions/min_length": 768.25,
      "completions/min_terminated_length": 768.25,
      "epoch": 0.9681528662420382,
      "grad_norm": 1.674101211279826,
      "kl": 0.0494384765625,
      "learning_rate": 1.0723168513061665e-07,
      "loss": -0.0404,
      "num_tokens": 40291585.0,
      "reward": -0.5,
      "reward_std": 0.46803878247737885,
      "rewards/eps_simulator_reward/mean": -0.5,
      "rewards/eps_simulator_reward/std": 0.49916671961545944,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2032.0,
      "completions/max_terminated_length": 1812.75,
      "completions/mean_length": 1557.90625,
      "completions/mean_terminated_length": 1464.7232666015625,
      "completions/min_length": 1071.25,
      "completions/min_terminated_length": 1071.25,
      "epoch": 0.9936305732484076,
      "grad_norm": 0.2581163051289786,
      "kl": 0.0352783203125,
      "learning_rate": 1.0181156770214242e-07,
      "loss": -0.0572,
      "num_tokens": 41375402.0,
      "reward": -0.51171875,
      "reward_std": 0.47641219943761826,
      "rewards/eps_simulator_reward/mean": -0.51171875,
      "rewards/eps_simulator_reward/std": 0.5010442584753036,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 39
    },
    {
      "epoch": 0.9936305732484076,
      "step": 39,
      "total_flos": 0.0,
      "train_loss": 0.0,
      "train_runtime": 1.8663,
      "train_samples_per_second": 2679.088,
      "train_steps_per_second": 20.897
    }
  ],
  "logging_steps": 1,
  "max_steps": 39,
  "num_input_tokens_seen": 41375402,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}