{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.997867803837953, "eval_steps": 100, "global_step": 117, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.259765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 622.638671875, "completions/mean_terminated_length": 481.79156494140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.008528784648187633, "grad_norm": 0.8145178930391833, "kl": 0.00037097930908203125, "learning_rate": 0.0, "loss": 0.0196, "num_tokens": 392023.0, "reward": 0.634765625, "reward_std": 0.33141425251960754, "rewards/accuracy_reward/mean": 0.634765625, "rewards/accuracy_reward/std": 0.4819667339324951, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2314453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.75, "completions/mean_length": 595.36865234375, "completions/mean_terminated_length": 466.6201705932617, "completions/min_length": 7.25, "completions/min_terminated_length": 7.25, "epoch": 0.042643923240938165, "grad_norm": 0.4773668247144155, "kl": 0.00041562318801879883, "learning_rate": 1e-06, "loss": 0.0477, "num_tokens": 1910946.0, "reward": 0.64501953125, "reward_std": 0.37129127234220505, "rewards/accuracy_reward/mean": 0.64501953125, "rewards/accuracy_reward/std": 0.47847116738557816, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.2, "completions/mean_length": 618.060546875, "completions/mean_terminated_length": 491.431787109375, "completions/min_length": 10.8, "completions/min_terminated_length": 10.8, "epoch": 0.08528784648187633, "grad_norm": 0.3298661973983867, "kl": 0.0007388591766357422, "learning_rate": 2.25e-06, "loss": 0.0181, "num_tokens": 3860773.0, "reward": 0.6421875, "reward_std": 0.33047744631767273, "rewards/accuracy_reward/mean": 0.6421875, "rewards/accuracy_reward/std": 0.4783483386039734, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.6, "completions/mean_length": 603.215625, "completions/mean_terminated_length": 495.9770202636719, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "epoch": 0.1279317697228145, "grad_norm": 0.25911245136387523, "kl": 0.0020538330078125, "learning_rate": 2.9973151946516025e-06, "loss": 0.0687, "num_tokens": 5749941.0, "reward": 0.734375, "reward_std": 0.2392146944999695, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44130025506019593, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.2, "completions/mean_length": 585.84765625, "completions/mean_terminated_length": 508.5999755859375, "completions/min_length": 148.8, "completions/min_terminated_length": 148.8, "epoch": 0.17057569296375266, "grad_norm": 0.19464089548857677, "kl": 0.0031925201416015624, "learning_rate": 2.9672214011007086e-06, "loss": 0.0434, "num_tokens": 7604799.0, "reward": 0.76953125, "reward_std": 0.181648451089859, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.41696075201034544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.4, "completions/mean_length": 569.831640625, "completions/mean_terminated_length": 505.2327087402344, "completions/min_length": 126.2, "completions/min_terminated_length": 126.2, "epoch": 0.21321961620469082, "grad_norm": 0.11280922307838523, "kl": 0.003424835205078125, "learning_rate": 2.904352305959606e-06, "loss": 0.0414, "num_tokens": 9406560.0, "reward": 0.800390625, "reward_std": 0.1668152093887329, "rewards/accuracy_reward/mean": 0.800390625, "rewards/accuracy_reward/std": 0.3962217092514038, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.123828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.4, "completions/mean_length": 572.0875, "completions/mean_terminated_length": 508.3062744140625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.255863539445629, "grad_norm": 0.17292784666143843, "kl": 0.0038557052612304688, "learning_rate": 2.8101123009256945e-06, "loss": 0.0444, "num_tokens": 11235280.0, "reward": 0.779296875, "reward_std": 0.18240008652210235, "rewards/accuracy_reward/mean": 0.779296875, "rewards/accuracy_reward/std": 0.4136933445930481, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.2, "completions/mean_length": 563.6625, "completions/mean_terminated_length": 509.4048156738281, "completions/min_length": 91.2, "completions/min_terminated_length": 91.2, "epoch": 0.29850746268656714, "grad_norm": 0.1417198452884755, "kl": 0.004206085205078125, "learning_rate": 2.6866065519845123e-06, "loss": 0.0393, "num_tokens": 13029192.0, "reward": 0.796875, "reward_std": 0.1687217354774475, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40032246708869934, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.8, "completions/mean_length": 555.61328125, "completions/mean_terminated_length": 498.0840698242188, "completions/min_length": 122.6, "completions/min_terminated_length": 122.6, "epoch": 0.3411513859275053, "grad_norm": 0.2464658893666007, "kl": 0.003921127319335938, "learning_rate": 2.5365939734802974e-06, "loss": 0.0384, "num_tokens": 14806018.0, "reward": 0.796484375, "reward_std": 0.16222528517246246, "rewards/accuracy_reward/mean": 0.796484375, "rewards/accuracy_reward/std": 0.40181103348731995, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.121875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.4, "completions/mean_length": 568.50703125, "completions/mean_terminated_length": 505.3303649902344, "completions/min_length": 146.8, "completions/min_terminated_length": 146.8, "epoch": 0.3837953091684435, "grad_norm": 0.229924275356646, "kl": 0.004242706298828125, "learning_rate": 2.3634255985285104e-06, "loss": 0.0377, "num_tokens": 16620316.0, "reward": 0.81171875, "reward_std": 0.16615791916847228, "rewards/accuracy_reward/mean": 0.81171875, "rewards/accuracy_reward/std": 0.39103145599365235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.122265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.2, "completions/mean_length": 565.183984375, "completions/mean_terminated_length": 501.5477783203125, "completions/min_length": 119.6, "completions/min_terminated_length": 119.6, "epoch": 0.42643923240938164, "grad_norm": 0.2797579550491506, "kl": 0.00439910888671875, "learning_rate": 2.1709697224734488e-06, "loss": 0.0355, "num_tokens": 18421059.0, "reward": 0.766015625, "reward_std": 0.1700347661972046, "rewards/accuracy_reward/mean": 0.766015625, "rewards/accuracy_reward/std": 0.41948946118354796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 558.638671875, "completions/mean_terminated_length": 500.31190795898436, "completions/min_length": 114.4, "completions/min_terminated_length": 114.4, "epoch": 0.4690831556503198, "grad_norm": 0.2632961965405764, "kl": 0.0050487518310546875, "learning_rate": 1.963525491562421e-06, "loss": 0.0222, "num_tokens": 20196126.0, "reward": 0.7828125, "reward_std": 0.1633252739906311, "rewards/accuracy_reward/mean": 0.7828125, "rewards/accuracy_reward/std": 0.41108490228652955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.117578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.4, "completions/mean_length": 581.921875, "completions/mean_terminated_length": 522.9397216796875, "completions/min_length": 131.6, "completions/min_terminated_length": 131.6, "epoch": 0.511727078891258, "grad_norm": 0.0965979552879068, "kl": 0.004494857788085937, "learning_rate": 1.7457268671227065e-06, "loss": 0.0396, "num_tokens": 22047438.0, "reward": 0.77734375, "reward_std": 0.16624618470668792, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41559609174728396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.116015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.8, "completions/mean_length": 552.442578125, "completions/mean_terminated_length": 490.71544189453124, "completions/min_length": 101.6, "completions/min_terminated_length": 101.6, "epoch": 0.5543710021321961, "grad_norm": 0.17017618787674732, "kl": 0.004914474487304687, "learning_rate": 1.5224391105228955e-06, "loss": 0.0323, "num_tokens": 23838739.0, "reward": 0.774609375, "reward_std": 0.1651999369263649, "rewards/accuracy_reward/mean": 0.774609375, "rewards/accuracy_reward/std": 0.4162830650806427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.122265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.2, "completions/mean_length": 572.6484375, "completions/mean_terminated_length": 510.603759765625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5970149253731343, "grad_norm": 0.12570880852383362, "kl": 0.004351425170898438, "learning_rate": 1.2986501012735172e-06, "loss": 0.0361, "num_tokens": 25673351.0, "reward": 0.75234375, "reward_std": 0.18619680404663086, "rewards/accuracy_reward/mean": 0.75234375, "rewards/accuracy_reward/std": 0.42559565901756286, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.105078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.8, "completions/mean_length": 539.875, "completions/mean_terminated_length": 483.08538818359375, "completions/min_length": 105.2, "completions/min_terminated_length": 105.2, "epoch": 0.6396588486140725, "grad_norm": 0.19371389016243035, "kl": 0.005275344848632813, "learning_rate": 1.079358916040996e-06, "loss": 0.0372, "num_tokens": 27420743.0, "reward": 0.78046875, "reward_std": 0.17587369978427886, "rewards/accuracy_reward/mean": 0.78046875, "rewards/accuracy_reward/std": 0.41278024911880495, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.6, "completions/mean_length": 551.752734375, "completions/mean_terminated_length": 495.41151733398436, "completions/min_length": 92.4, "completions/min_terminated_length": 92.4, "epoch": 0.6823027718550106, "grad_norm": 0.12351401523997702, "kl": 0.004492950439453125, "learning_rate": 8.69464157535652e-07, "loss": 0.0355, "num_tokens": 29190430.0, "reward": 0.775390625, "reward_std": 0.16248664855957032, "rewards/accuracy_reward/mean": 0.775390625, "rewards/accuracy_reward/std": 0.4170423984527588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.8, "completions/mean_length": 564.80859375, "completions/mean_terminated_length": 510.5605102539063, "completions/min_length": 99.2, "completions/min_terminated_length": 99.2, "epoch": 0.7249466950959488, "grad_norm": 0.11800011260860456, "kl": 0.004195022583007813, "learning_rate": 6.736545278218464e-07, "loss": 0.0232, "num_tokens": 30987452.0, "reward": 0.792578125, "reward_std": 0.17066446840763091, "rewards/accuracy_reward/mean": 0.7951016902923584, "rewards/accuracy_reward/std": 0.40229706168174745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.108203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.4, "completions/mean_length": 555.931640625, "completions/mean_terminated_length": 499.55921630859376, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.767590618336887, "grad_norm": 0.1766286378559812, "kl": 0.004351806640625, "learning_rate": 4.963040904617131e-07, "loss": 0.0305, "num_tokens": 32791613.0, "reward": 0.762890625, "reward_std": 0.16622219383716583, "rewards/accuracy_reward/mean": 0.762890625, "rewards/accuracy_reward/std": 0.42507564425468447, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.110546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.4, "completions/mean_length": 567.742578125, "completions/mean_terminated_length": 511.2064636230469, "completions/min_length": 89.6, "completions/min_terminated_length": 89.6, "epoch": 0.8102345415778252, "grad_norm": 0.13097760910471143, "kl": 0.004387664794921875, "learning_rate": 3.4137456116310087e-07, "loss": 0.0181, "num_tokens": 34613034.0, "reward": 0.76953125, "reward_std": 0.16518225371837617, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.42095457315444945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.8, "completions/mean_length": 570.152734375, "completions/mean_terminated_length": 518.7150634765625, "completions/min_length": 120.8, "completions/min_terminated_length": 120.8, "epoch": 0.8528784648187633, "grad_norm": 0.1265224718572913, "kl": 0.004515457153320313, "learning_rate": 2.1232680959720086e-07, "loss": 0.0369, "num_tokens": 36434305.0, "reward": 0.77890625, "reward_std": 0.16256613433361053, "rewards/accuracy_reward/mean": 0.7815476179122924, "rewards/accuracy_reward/std": 0.41042966246604917, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 100 }, { "epoch": 0.8528784648187633, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.093025, "eval_completions/max_length": 1017.8096, "eval_completions/max_terminated_length": 956.9056, "eval_completions/mean_length": 544.63845, "eval_completions/mean_terminated_length": 497.0462825683594, "eval_completions/min_length": 184.3936, "eval_completions/min_terminated_length": 184.3936, "eval_kl": 0.00679013671875, "eval_loss": 0.01577703095972538, "eval_num_tokens": 36434305.0, "eval_reward": 0.694925, "eval_reward_std": 0.20438497549295426, "eval_rewards/accuracy_reward/mean": 0.694925, "eval_rewards/accuracy_reward/std": 0.44015464553833006, "eval_rewards/format_reward/mean": 0.0, "eval_rewards/format_reward/std": 0.0, "eval_runtime": 4105.356, "eval_samples_per_second": 1.218, "eval_steps_per_second": 0.019, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.093359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.6, "completions/mean_length": 551.971875, "completions/mean_terminated_length": 503.73955688476565, "completions/min_length": 124.4, "completions/min_terminated_length": 124.4, "epoch": 0.8955223880597015, "grad_norm": 0.11809578314174589, "kl": 0.004218292236328125, "learning_rate": 1.1204354928900495e-07, "loss": 0.0265, "num_tokens": 38198833.0, "reward": 0.803125, "reward_std": 0.1608460694551468, "rewards/accuracy_reward/mean": 0.803125, "rewards/accuracy_reward/std": 0.39661717414855957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 559.37890625, "completions/mean_terminated_length": 514.0422424316406, "completions/min_length": 139.2, "completions/min_terminated_length": 139.2, "epoch": 0.9381663113006397, "grad_norm": 0.1543236952292962, "kl": 0.0042022705078125, "learning_rate": 4.276494256069874e-08, "loss": 0.0293, "num_tokens": 39998331.0, "reward": 0.817578125, "reward_std": 0.17398287057876588, "rewards/accuracy_reward/mean": 0.817578125, "rewards/accuracy_reward/std": 0.3809166610240936, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.093359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.8, "completions/mean_length": 548.9046875, "completions/mean_terminated_length": 499.97105102539064, "completions/min_length": 94.4, "completions/min_terminated_length": 94.4, "epoch": 0.9808102345415778, "grad_norm": 0.15606339132567215, "kl": 0.00447540283203125, "learning_rate": 6.038559007141398e-09, "loss": 0.0288, "num_tokens": 41769847.0, "reward": 0.787890625, "reward_std": 0.1641687572002411, "rewards/accuracy_reward/mean": 0.787890625, "rewards/accuracy_reward/std": 0.4066555380821228, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08664344879518071, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 558.7720031738281, "completions/mean_terminated_length": 514.6327362060547, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.997867803837953, "kl": 0.003989219665527344, "num_tokens": 42497058.0, "reward": 0.7802734375, "reward_std": 0.19359246641397476, "rewards/accuracy_reward/mean": 0.7802734375, "rewards/accuracy_reward/std": 0.41445116698741913, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 117, "total_flos": 0.0, "train_loss": 0.03524629572708892, "train_runtime": 9378.8625, "train_samples_per_second": 0.8, "train_steps_per_second": 0.012 } ], "logging_steps": 5, "max_steps": 117, "num_input_tokens_seen": 42497058, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }