diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11114 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4998933394594772, + "eval_steps": 500, + "global_step": 1904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3807.0, + "completions/max_terminated_length": 3807.0, + "completions/mean_length": 915.0521240234375, + "completions/mean_terminated_length": 915.0521240234375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.0002625490228253557, + "grad_norm": 0.5600714880500142, + "kl": 0.0009694099426269531, + "learning_rate": 0.0, + "loss": 0.0388, + "num_tokens": 413820.0, + "reward": 0.9820963740348816, + "reward_std": 0.22425100207328796, + "rewards/format_reward/mean": 0.0026041667442768812, + "rewards/format_reward/std": 0.05103103443980217, + "rewards/mcq_accuracy_reward/mean": 0.765625, + "rewards/mcq_accuracy_reward/std": 0.4241601824760437, + "rewards/tag_count_reward/mean": 0.86328125, + "rewards/tag_count_reward/std": 0.2134343385696411, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4019.75, + "completions/max_terminated_length": 4019.75, + "completions/mean_length": 972.7415618896484, + "completions/mean_terminated_length": 972.7415618896484, + "completions/min_length": 245.75, + "completions/min_terminated_length": 245.75, + "epoch": 0.0013127451141267783, + "grad_norm": 0.5902006739832311, + "kl": 0.000987410545349121, + "learning_rate": 6.282722513089006e-08, + "loss": 0.0362, + "num_tokens": 2148903.0, + "reward": 0.9148356467485428, + "reward_std": 0.22137761488556862, + "rewards/format_reward/mean": 0.0026041667442768812, + "rewards/format_reward/std": 0.04353414382785559, + "rewards/mcq_accuracy_reward/mean": 0.6972656100988388, + "rewards/mcq_accuracy_reward/std": 0.4569668620824814, + "rewards/tag_count_reward/mean": 0.86767578125, + "rewards/tag_count_reward/std": 0.20981235057115555, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3982.6, + "completions/max_terminated_length": 3982.6, + "completions/mean_length": 1011.59169921875, + "completions/mean_terminated_length": 1011.59169921875, + "completions/min_length": 228.6, + "completions/min_terminated_length": 228.6, + "epoch": 0.0026254902282535565, + "grad_norm": 0.4814943076321377, + "kl": 0.000986623764038086, + "learning_rate": 1.4136125654450263e-07, + "loss": 0.0197, + "num_tokens": 4390631.0, + "reward": 0.8633789181709289, + "reward_std": 0.17433639168739318, + "rewards/format_reward/mean": 0.004166666744276881, + "rewards/format_reward/std": 0.05666746348142624, + "rewards/mcq_accuracy_reward/mean": 0.6432291626930237, + "rewards/mcq_accuracy_reward/std": 0.47391964197158815, + "rewards/tag_count_reward/mean": 0.8764322996139526, + "rewards/tag_count_reward/std": 0.20409679412841797, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5356.8, + "completions/max_terminated_length": 5356.8, + "completions/mean_length": 1066.7468994140625, + "completions/mean_terminated_length": 1066.7468994140625, + "completions/min_length": 244.8, + "completions/min_terminated_length": 244.8, + "epoch": 0.003938235342380335, + "grad_norm": 0.5298022494586494, + "kl": 0.0009682655334472656, + "learning_rate": 2.1989528795811518e-07, + "loss": 0.0283, + "num_tokens": 6742057.0, + "reward": 0.9032877683639526, + "reward_std": 0.21250698268413543, + "rewards/format_reward/mean": 0.0010416666977107526, + "rewards/format_reward/std": 0.014414900541305542, + "rewards/mcq_accuracy_reward/mean": 0.6796874880790711, + "rewards/mcq_accuracy_reward/std": 0.46576390266418455, + "rewards/tag_count_reward/mean": 0.893359375, + "rewards/tag_count_reward/std": 0.19493359625339507, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4865.0, + "completions/max_terminated_length": 4865.0, + "completions/mean_length": 1038.8906494140624, + "completions/mean_terminated_length": 1038.8906494140624, + "completions/min_length": 239.6, + "completions/min_terminated_length": 239.6, + "epoch": 0.005250980456507113, + "grad_norm": 0.4309993436245974, + "kl": 0.0011208057403564453, + "learning_rate": 2.9842931937172774e-07, + "loss": 0.0246, + "num_tokens": 9034735.0, + "reward": 0.887402355670929, + "reward_std": 0.18113622069358826, + "rewards/format_reward/mean": 0.003645833441987634, + "rewards/format_reward/std": 0.0449534185230732, + "rewards/mcq_accuracy_reward/mean": 0.6598958373069763, + "rewards/mcq_accuracy_reward/std": 0.4723894000053406, + "rewards/tag_count_reward/mean": 0.9063802003860474, + "rewards/tag_count_reward/std": 0.18173416554927826, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3925.8, + "completions/max_terminated_length": 3925.8, + "completions/mean_length": 945.2312744140625, + "completions/mean_terminated_length": 945.2312744140625, + "completions/min_length": 206.2, + "completions/min_terminated_length": 206.2, + "epoch": 0.006563725570633892, + "grad_norm": 0.5322996591395511, + "kl": 0.0016646862030029296, + "learning_rate": 3.769633507853403e-07, + "loss": 0.0317, + "num_tokens": 11149131.0, + "reward": 0.9009114861488342, + "reward_std": 0.1493413269519806, + "rewards/format_reward/mean": 0.002083333348855376, + "rewards/format_reward/std": 0.027837660163640976, + "rewards/mcq_accuracy_reward/mean": 0.6671875, + "rewards/mcq_accuracy_reward/std": 0.4703998565673828, + "rewards/tag_count_reward/mean": 0.9328125, + "rewards/tag_count_reward/std": 0.1592049092054367, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3823.0, + "completions/max_terminated_length": 3823.0, + "completions/mean_length": 859.3396118164062, + "completions/mean_terminated_length": 859.3396118164062, + "completions/min_length": 207.6, + "completions/min_terminated_length": 207.6, + "epoch": 0.00787647068476067, + "grad_norm": 0.19308745288392534, + "kl": 0.0033286094665527345, + "learning_rate": 4.5549738219895285e-07, + "loss": 0.0069, + "num_tokens": 13095175.0, + "reward": 0.9616211295127869, + "reward_std": 0.11504709869623184, + "rewards/format_reward/mean": 0.0015625000465661286, + "rewards/format_reward/std": 0.024621108919382094, + "rewards/mcq_accuracy_reward/mean": 0.7161458373069763, + "rewards/mcq_accuracy_reward/std": 0.4499928534030914, + "rewards/tag_count_reward/mean": 0.9803385496139526, + "rewards/tag_count_reward/std": 0.08263763412833214, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4245.0, + "completions/max_terminated_length": 4245.0, + "completions/mean_length": 914.0500244140625, + "completions/mean_terminated_length": 914.0500244140625, + "completions/min_length": 213.2, + "completions/min_terminated_length": 213.2, + "epoch": 0.009189215798887448, + "grad_norm": 0.2814592297002721, + "kl": 0.004398536682128906, + "learning_rate": 5.340314136125655e-07, + "loss": 0.0204, + "num_tokens": 15148551.0, + "reward": 0.9290690302848816, + "reward_std": 0.1563553810119629, + "rewards/format_reward/mean": 0.0005208333488553763, + "rewards/format_reward/std": 0.010206206887960433, + "rewards/mcq_accuracy_reward/mean": 0.6802083253860474, + "rewards/mcq_accuracy_reward/std": 0.4631023585796356, + "rewards/tag_count_reward/mean": 0.994921875, + "rewards/tag_count_reward/std": 0.03888525143265724, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4360.2, + "completions/max_terminated_length": 4360.2, + "completions/mean_length": 884.5177368164062, + "completions/mean_terminated_length": 884.5177368164062, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.010501960913014226, + "grad_norm": 0.21668814775296738, + "kl": 0.004182052612304687, + "learning_rate": 6.12565445026178e-07, + "loss": 0.0079, + "num_tokens": 17142537.0, + "reward": 0.9423828244209289, + "reward_std": 0.12639484107494353, + "rewards/format_reward/mean": 0.0010416666977107526, + "rewards/format_reward/std": 0.020412414520978927, + "rewards/mcq_accuracy_reward/mean": 0.6927083373069763, + "rewards/mcq_accuracy_reward/std": 0.46153690814971926, + "rewards/tag_count_reward/mean": 0.9976562619209289, + "rewards/tag_count_reward/std": 0.0302169531583786, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4127.6, + "completions/max_terminated_length": 4127.6, + "completions/mean_length": 932.9922241210937, + "completions/mean_terminated_length": 932.9922241210937, + "completions/min_length": 233.6, + "completions/min_terminated_length": 233.6, + "epoch": 0.011814706027141006, + "grad_norm": 0.18322482842473176, + "kl": 0.0040074348449707035, + "learning_rate": 6.910994764397906e-07, + "loss": 0.0174, + "num_tokens": 19233346.0, + "reward": 0.9326497554779053, + "reward_std": 0.13236641138792038, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6828125, + "rewards/mcq_accuracy_reward/std": 0.4628955662250519, + "rewards/tag_count_reward/mean": 0.9993489623069763, + "rewards/tag_count_reward/std": 0.011258380860090256, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3836.0, + "completions/max_terminated_length": 3836.0, + "completions/mean_length": 875.6781616210938, + "completions/mean_terminated_length": 875.6781616210938, + "completions/min_length": 229.6, + "completions/min_terminated_length": 229.6, + "epoch": 0.013127451141267784, + "grad_norm": 0.18563360840223864, + "kl": 0.004321479797363281, + "learning_rate": 7.696335078534032e-07, + "loss": 0.0081, + "num_tokens": 21213792.0, + "reward": 0.8936849117279053, + "reward_std": 0.1276272490620613, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.643749988079071, + "rewards/mcq_accuracy_reward/std": 0.47782949209213255, + "rewards/tag_count_reward/mean": 0.9997395873069763, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4194.4, + "completions/max_terminated_length": 4194.4, + "completions/mean_length": 837.1468872070312, + "completions/mean_terminated_length": 837.1468872070312, + "completions/min_length": 211.2, + "completions/min_terminated_length": 211.2, + "epoch": 0.014440196255394562, + "grad_norm": 0.24106583558723774, + "kl": 0.004308032989501953, + "learning_rate": 8.481675392670158e-07, + "loss": 0.0179, + "num_tokens": 23118898.0, + "reward": 0.9015299677848816, + "reward_std": 0.11953569799661637, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6515625, + "rewards/mcq_accuracy_reward/std": 0.4750236213207245, + "rewards/tag_count_reward/mean": 0.9998697876930237, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4411.0, + "completions/max_terminated_length": 4411.0, + "completions/mean_length": 941.3682373046875, + "completions/mean_terminated_length": 941.3682373046875, + "completions/min_length": 235.6, + "completions/min_terminated_length": 235.6, + "epoch": 0.01575294136952134, + "grad_norm": 0.23675445544527454, + "kl": 0.003926849365234375, + "learning_rate": 9.267015706806283e-07, + "loss": 0.0124, + "num_tokens": 25228053.0, + "reward": 0.9048177242279053, + "reward_std": 0.12503997683525087, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6552083253860473, + "rewards/mcq_accuracy_reward/std": 0.4749483704566956, + "rewards/tag_count_reward/mean": 0.9984375, + "rewards/tag_count_reward/std": 0.01938077136874199, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3957.2, + "completions/max_terminated_length": 3957.2, + "completions/mean_length": 861.5698120117188, + "completions/mean_terminated_length": 861.5698120117188, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.017065686483648118, + "grad_norm": 0.2084139684080682, + "kl": 0.004307174682617187, + "learning_rate": 1.005235602094241e-06, + "loss": 0.0042, + "num_tokens": 27184019.0, + "reward": 0.9222330927848816, + "reward_std": 0.11201669871807099, + "rewards/format_reward/mean": 0.0005208333488553763, + "rewards/format_reward/std": 0.010206206887960433, + "rewards/mcq_accuracy_reward/mean": 0.6723958253860474, + "rewards/mcq_accuracy_reward/std": 0.46497212052345277, + "rewards/tag_count_reward/mean": 0.998828125, + "rewards/tag_count_reward/std": 0.015995388478040697, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5187.0, + "completions/max_terminated_length": 4377.6, + "completions/mean_length": 900.5296997070312, + "completions/mean_terminated_length": 896.767431640625, + "completions/min_length": 206.2, + "completions/min_terminated_length": 206.2, + "epoch": 0.018378431597774896, + "grad_norm": 0.2815092224743427, + "kl": 0.004451370239257813, + "learning_rate": 1.0837696335078534e-06, + "loss": 0.0143, + "num_tokens": 29209436.0, + "reward": 0.9232747673988342, + "reward_std": 0.1456607848405838, + "rewards/format_reward/mean": 0.003125, + "rewards/format_reward/std": 0.035262906551361085, + "rewards/mcq_accuracy_reward/mean": 0.6734375, + "rewards/mcq_accuracy_reward/std": 0.4688749730587006, + "rewards/tag_count_reward/mean": 0.9962239503860474, + "rewards/tag_count_reward/std": 0.034351322799921036, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3981.2, + "completions/max_terminated_length": 3981.2, + "completions/mean_length": 971.86875, + "completions/mean_terminated_length": 971.86875, + "completions/min_length": 222.6, + "completions/min_terminated_length": 222.6, + "epoch": 0.019691176711901674, + "grad_norm": 0.28319375745370917, + "kl": 0.00325469970703125, + "learning_rate": 1.162303664921466e-06, + "loss": 0.0097, + "num_tokens": 31373384.0, + "reward": 0.8909179925918579, + "reward_std": 0.15230139791965486, + "rewards/format_reward/mean": 0.0078124999534338714, + "rewards/format_reward/std": 0.08345819935202599, + "rewards/mcq_accuracy_reward/mean": 0.6411458373069763, + "rewards/mcq_accuracy_reward/std": 0.4798282980918884, + "rewards/tag_count_reward/mean": 0.9912760376930236, + "rewards/tag_count_reward/std": 0.053227897733449936, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4836.4, + "completions/max_terminated_length": 4114.8, + "completions/mean_length": 894.0838745117187, + "completions/mean_terminated_length": 890.3379760742188, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "epoch": 0.021003921826028452, + "grad_norm": 0.3414670173950181, + "kl": 0.002822208404541016, + "learning_rate": 1.2408376963350786e-06, + "loss": 0.0088, + "num_tokens": 33383145.0, + "reward": 0.9288737297058105, + "reward_std": 0.12165495455265045, + "rewards/format_reward/mean": 0.01458333320915699, + "rewards/format_reward/std": 0.10183531641960145, + "rewards/mcq_accuracy_reward/mean": 0.6791666626930237, + "rewards/mcq_accuracy_reward/std": 0.4610283553600311, + "rewards/tag_count_reward/mean": 0.9842447996139526, + "rewards/tag_count_reward/std": 0.07519266158342361, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3679.8, + "completions/max_terminated_length": 3679.8, + "completions/mean_length": 980.1281616210938, + "completions/mean_terminated_length": 980.1281616210938, + "completions/min_length": 236.6, + "completions/min_terminated_length": 236.6, + "epoch": 0.022316666940155234, + "grad_norm": 0.46527258724051174, + "kl": 0.006236839294433594, + "learning_rate": 1.3193717277486912e-06, + "loss": 0.0089, + "num_tokens": 35563751.0, + "reward": 1.0026042103767394, + "reward_std": 0.17879453003406526, + "rewards/format_reward/mean": 0.3296874985098839, + "rewards/format_reward/std": 0.37602974772453307, + "rewards/mcq_accuracy_reward/mean": 0.692187511920929, + "rewards/mcq_accuracy_reward/std": 0.456982284784317, + "rewards/tag_count_reward/mean": 0.9119791626930237, + "rewards/tag_count_reward/std": 0.10359178185462951, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3764.6, + "completions/max_terminated_length": 3764.6, + "completions/mean_length": 1014.7625244140625, + "completions/mean_terminated_length": 1014.7625244140625, + "completions/min_length": 319.6, + "completions/min_terminated_length": 319.6, + "epoch": 0.023629412054282012, + "grad_norm": 0.23665230700021717, + "kl": 0.013361358642578125, + "learning_rate": 1.3979057591623036e-06, + "loss": 0.0194, + "num_tokens": 37809919.0, + "reward": 1.107910192012787, + "reward_std": 0.1419330820441246, + "rewards/format_reward/mean": 0.9494791746139526, + "rewards/format_reward/std": 0.20899733603000642, + "rewards/mcq_accuracy_reward/mean": 0.6807291626930236, + "rewards/mcq_accuracy_reward/std": 0.4589788854122162, + "rewards/tag_count_reward/mean": 0.7592447996139526, + "rewards/tag_count_reward/std": 0.04565813317894936, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3839.8, + "completions/max_terminated_length": 3839.8, + "completions/mean_length": 975.8276123046875, + "completions/mean_terminated_length": 975.8276123046875, + "completions/min_length": 316.6, + "completions/min_terminated_length": 316.6, + "epoch": 0.02494215716840879, + "grad_norm": 0.14823223443222375, + "kl": 0.016013717651367186, + "learning_rate": 1.4764397905759164e-06, + "loss": 0.0051, + "num_tokens": 39985420.0, + "reward": 1.1560872793197632, + "reward_std": 0.10409260243177414, + "rewards/format_reward/mean": 0.993749988079071, + "rewards/format_reward/std": 0.07549194097518921, + "rewards/mcq_accuracy_reward/mean": 0.7197916626930236, + "rewards/mcq_accuracy_reward/std": 0.44616584181785585, + "rewards/tag_count_reward/mean": 0.7514322996139526, + "rewards/tag_count_reward/std": 0.018197770044207574, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4453.6, + "completions/max_terminated_length": 4453.6, + "completions/mean_length": 1036.6390625, + "completions/mean_terminated_length": 1036.6390625, + "completions/min_length": 299.6, + "completions/min_terminated_length": 299.6, + "epoch": 0.026254902282535568, + "grad_norm": 0.20014345036957168, + "kl": 0.01590728759765625, + "learning_rate": 1.554973821989529e-06, + "loss": 0.0065, + "num_tokens": 42275519.0, + "reward": 1.1000000715255738, + "reward_std": 0.12915935069322587, + "rewards/format_reward/mean": 0.9973958373069763, + "rewards/format_reward/std": 0.03204635381698608, + "rewards/mcq_accuracy_reward/mean": 0.6630208373069764, + "rewards/mcq_accuracy_reward/std": 0.4720790863037109, + "rewards/tag_count_reward/mean": 0.7505208253860474, + "rewards/tag_count_reward/std": 0.007207450270652771, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4729.8, + "completions/max_terminated_length": 4729.8, + "completions/mean_length": 967.3614990234375, + "completions/mean_terminated_length": 967.3614990234375, + "completions/min_length": 331.8, + "completions/min_terminated_length": 331.8, + "epoch": 0.027567647396662346, + "grad_norm": 0.2023314682954963, + "kl": 0.01651458740234375, + "learning_rate": 1.6335078534031414e-06, + "loss": 0.0006, + "num_tokens": 44424341.0, + "reward": 1.1651041984558106, + "reward_std": 0.13515677601099013, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7276041746139527, + "rewards/mcq_accuracy_reward/std": 0.44498780369758606, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4038.0, + "completions/max_terminated_length": 4038.0, + "completions/mean_length": 960.7849243164062, + "completions/mean_terminated_length": 960.7849243164062, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.028880392510789124, + "grad_norm": 0.19257847122058988, + "kl": 0.0166168212890625, + "learning_rate": 1.712041884816754e-06, + "loss": -0.001, + "num_tokens": 46566376.0, + "reward": 1.1156250476837157, + "reward_std": 0.11993977576494216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.678125011920929, + "rewards/mcq_accuracy_reward/std": 0.4642050087451935, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4187.6, + "completions/max_terminated_length": 4187.6, + "completions/mean_length": 1067.5010620117187, + "completions/mean_terminated_length": 1067.5010620117187, + "completions/min_length": 308.8, + "completions/min_terminated_length": 308.8, + "epoch": 0.030193137624915902, + "grad_norm": 0.2017458742071714, + "kl": 0.015716552734375, + "learning_rate": 1.7905759162303664e-06, + "loss": 0.0134, + "num_tokens": 48916730.0, + "reward": 1.0925130605697633, + "reward_std": 0.12151644229888917, + "rewards/format_reward/mean": 0.9963541626930237, + "rewards/format_reward/std": 0.05245876908302307, + "rewards/mcq_accuracy_reward/mean": 0.6557291507720947, + "rewards/mcq_accuracy_reward/std": 0.4730843961238861, + "rewards/tag_count_reward/mean": 0.75078125, + "rewards/tag_count_reward/std": 0.010563140362501144, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666674, + "completions/max_length": 4980.2, + "completions/max_terminated_length": 4108.8, + "completions/mean_length": 1024.264599609375, + "completions/mean_terminated_length": 994.69462890625, + "completions/min_length": 343.4, + "completions/min_terminated_length": 343.4, + "epoch": 0.03150588273904268, + "grad_norm": 0.22094033325989748, + "kl": 0.015886688232421876, + "learning_rate": 1.8691099476439793e-06, + "loss": 0.0099, + "num_tokens": 51184590.0, + "reward": 1.1372070789337159, + "reward_std": 0.12987421602010726, + "rewards/format_reward/mean": 0.9942708373069763, + "rewards/format_reward/std": 0.05322360098361969, + "rewards/mcq_accuracy_reward/mean": 0.7015625, + "rewards/mcq_accuracy_reward/std": 0.4520592331886292, + "rewards/tag_count_reward/mean": 0.7483072876930237, + "rewards/tag_count_reward/std": 0.020456523448228837, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4318.4, + "completions/max_terminated_length": 4318.4, + "completions/mean_length": 1049.1615112304687, + "completions/mean_terminated_length": 1049.1615112304687, + "completions/min_length": 309.8, + "completions/min_terminated_length": 309.8, + "epoch": 0.03281862785316946, + "grad_norm": 0.19709543911988747, + "kl": 0.016326141357421876, + "learning_rate": 1.9476439790575916e-06, + "loss": 0.0119, + "num_tokens": 53501332.0, + "reward": 1.1492839097976684, + "reward_std": 0.1298315316438675, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.7119791626930236, + "rewards/mcq_accuracy_reward/std": 0.45076271891593933, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5118.6, + "completions/max_terminated_length": 5118.6, + "completions/mean_length": 1078.6104614257813, + "completions/mean_terminated_length": 1078.6104614257813, + "completions/min_length": 340.2, + "completions/min_terminated_length": 340.2, + "epoch": 0.034131372967296236, + "grad_norm": 0.26753136371737135, + "kl": 0.015799713134765626, + "learning_rate": 2.0261780104712043e-06, + "loss": 0.0048, + "num_tokens": 55870072.0, + "reward": 1.096386766433716, + "reward_std": 0.1485609829425812, + "rewards/format_reward/mean": 0.9973958253860473, + "rewards/format_reward/std": 0.045033523440361024, + "rewards/mcq_accuracy_reward/mean": 0.659375, + "rewards/mcq_accuracy_reward/std": 0.47198906540870667, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.011258380860090256, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4929.6, + "completions/max_terminated_length": 4929.6, + "completions/mean_length": 1026.62138671875, + "completions/mean_terminated_length": 1026.62138671875, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.03544411808142302, + "grad_norm": 0.20838769464486284, + "kl": 0.01702423095703125, + "learning_rate": 2.104712041884817e-06, + "loss": 0.0115, + "num_tokens": 58144049.0, + "reward": 1.1482096672058106, + "reward_std": 0.14524158835411072, + "rewards/format_reward/mean": 0.9963541626930237, + "rewards/format_reward/std": 0.05245876908302307, + "rewards/mcq_accuracy_reward/mean": 0.7114583253860474, + "rewards/mcq_accuracy_reward/std": 0.45284959077835085, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.01313009075820446, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3528.2, + "completions/max_terminated_length": 3528.2, + "completions/mean_length": 1007.7364624023437, + "completions/mean_terminated_length": 1007.7364624023437, + "completions/min_length": 320.8, + "completions/min_terminated_length": 320.8, + "epoch": 0.03675686319554979, + "grad_norm": 0.15900920958806017, + "kl": 0.017884063720703124, + "learning_rate": 2.1832460732984295e-06, + "loss": 0.0048, + "num_tokens": 60376055.0, + "reward": 1.111067771911621, + "reward_std": 0.1403453230857849, + "rewards/format_reward/mean": 0.9979166626930237, + "rewards/format_reward/std": 0.03482731580734253, + "rewards/mcq_accuracy_reward/mean": 0.6739583373069763, + "rewards/mcq_accuracy_reward/std": 0.4687348544597626, + "rewards/tag_count_reward/mean": 0.7505208373069763, + "rewards/tag_count_reward/std": 0.008706828951835633, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3979.6, + "completions/max_terminated_length": 3979.6, + "completions/mean_length": 992.67607421875, + "completions/mean_terminated_length": 992.67607421875, + "completions/min_length": 304.4, + "completions/min_terminated_length": 304.4, + "epoch": 0.038069608309676574, + "grad_norm": 0.22537387404474848, + "kl": 0.020897674560546874, + "learning_rate": 2.261780104712042e-06, + "loss": 0.0061, + "num_tokens": 62575993.0, + "reward": 1.1391927242279052, + "reward_std": 0.136134535074234, + "rewards/format_reward/mean": 0.9979166746139526, + "rewards/format_reward/std": 0.028829801082611083, + "rewards/mcq_accuracy_reward/mean": 0.7020833253860473, + "rewards/mcq_accuracy_reward/std": 0.45655394792556764, + "rewards/tag_count_reward/mean": 0.7505208253860474, + "rewards/tag_count_reward/std": 0.007207450270652771, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4118.0, + "completions/max_terminated_length": 4118.0, + "completions/mean_length": 987.42138671875, + "completions/mean_terminated_length": 987.42138671875, + "completions/min_length": 325.2, + "completions/min_terminated_length": 325.2, + "epoch": 0.03938235342380335, + "grad_norm": 0.21610268300811203, + "kl": 0.025011444091796876, + "learning_rate": 2.3403141361256547e-06, + "loss": 0.0112, + "num_tokens": 64764562.0, + "reward": 1.083691453933716, + "reward_std": 0.13813820779323577, + "rewards/format_reward/mean": 0.9963541626930237, + "rewards/format_reward/std": 0.05245876908302307, + "rewards/mcq_accuracy_reward/mean": 0.646875, + "rewards/mcq_accuracy_reward/std": 0.47466925978660585, + "rewards/tag_count_reward/mean": 0.7509114623069764, + "rewards/tag_count_reward/std": 0.013114692270755767, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5917.2, + "completions/max_terminated_length": 4925.0, + "completions/mean_length": 991.1208618164062, + "completions/mean_terminated_length": 983.62021484375, + "completions/min_length": 308.4, + "completions/min_terminated_length": 308.4, + "epoch": 0.04069509853793013, + "grad_norm": 0.20173276460384038, + "kl": 0.02845306396484375, + "learning_rate": 2.418848167539267e-06, + "loss": 0.01, + "num_tokens": 66966074.0, + "reward": 1.1273437976837157, + "reward_std": 0.10823013186454773, + "rewards/format_reward/mean": 0.9942708253860474, + "rewards/format_reward/std": 0.06528573334217072, + "rewards/mcq_accuracy_reward/mean": 0.6911458253860474, + "rewards/mcq_accuracy_reward/std": 0.46151432394981384, + "rewards/tag_count_reward/mean": 0.7505208492279053, + "rewards/tag_count_reward/std": 0.01844913251698017, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 3826.0, + "completions/mean_length": 1018.38388671875, + "completions/mean_terminated_length": 1018.38388671875, + "completions/min_length": 325.2, + "completions/min_terminated_length": 325.2, + "epoch": 0.042007843652056905, + "grad_norm": 0.18267483756807298, + "kl": 0.0232086181640625, + "learning_rate": 2.49738219895288e-06, + "loss": 0.0012, + "num_tokens": 69220155.0, + "reward": 1.0927083611488342, + "reward_std": 0.13507107496261597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6552083373069764, + "rewards/mcq_accuracy_reward/std": 0.4687082588672638, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4219.8, + "completions/max_terminated_length": 4219.8, + "completions/mean_length": 1010.44638671875, + "completions/mean_terminated_length": 1010.44638671875, + "completions/min_length": 300.6, + "completions/min_terminated_length": 300.6, + "epoch": 0.043320588766183686, + "grad_norm": 0.16873994230588218, + "kl": 0.02104339599609375, + "learning_rate": 2.575916230366492e-06, + "loss": 0.0071, + "num_tokens": 71459772.0, + "reward": 1.1238607168197632, + "reward_std": 0.10564412921667099, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6864583373069764, + "rewards/mcq_accuracy_reward/std": 0.4609563410282135, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4356.6, + "completions/max_terminated_length": 4356.6, + "completions/mean_length": 996.4901245117187, + "completions/mean_terminated_length": 996.4901245117187, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.04463333388031047, + "grad_norm": 0.250891354783539, + "kl": 0.0210235595703125, + "learning_rate": 2.6544502617801047e-06, + "loss": 0.0065, + "num_tokens": 73670689.0, + "reward": 1.1228190422058106, + "reward_std": 0.15216803103685378, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6854166626930237, + "rewards/mcq_accuracy_reward/std": 0.4592389941215515, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4457.0, + "completions/max_terminated_length": 4457.0, + "completions/mean_length": 984.8484741210938, + "completions/mean_terminated_length": 984.8484741210938, + "completions/min_length": 311.4, + "completions/min_terminated_length": 311.4, + "epoch": 0.04594607899443724, + "grad_norm": 0.21907212493302594, + "kl": 0.022550201416015624, + "learning_rate": 2.7329842931937173e-06, + "loss": 0.0123, + "num_tokens": 75861398.0, + "reward": 1.1744792222976685, + "reward_std": 0.12148265987634659, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7369791746139527, + "rewards/mcq_accuracy_reward/std": 0.43844847083091737, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5094.4, + "completions/max_terminated_length": 5094.4, + "completions/mean_length": 1032.9802490234374, + "completions/mean_terminated_length": 1032.9802490234374, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.047258824108564024, + "grad_norm": 0.23309652992518912, + "kl": 0.022772979736328126, + "learning_rate": 2.81151832460733e-06, + "loss": 0.0014, + "num_tokens": 78140024.0, + "reward": 1.1265299797058106, + "reward_std": 0.1179361805319786, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.689062488079071, + "rewards/mcq_accuracy_reward/std": 0.46171254515647886, + "rewards/tag_count_reward/mean": 0.7498697876930237, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4534.2, + "completions/max_terminated_length": 4534.2, + "completions/mean_length": 1045.909423828125, + "completions/mean_terminated_length": 1045.909423828125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.0485715692226908, + "grad_norm": 0.23330613529437708, + "kl": 0.02296905517578125, + "learning_rate": 2.8900523560209425e-06, + "loss": 0.0016, + "num_tokens": 80448178.0, + "reward": 1.1164713859558106, + "reward_std": 0.13200942873954774, + "rewards/format_reward/mean": 0.9989583373069764, + "rewards/format_reward/std": 0.014414900541305542, + "rewards/mcq_accuracy_reward/mean": 0.6791666626930237, + "rewards/mcq_accuracy_reward/std": 0.4624324977397919, + "rewards/tag_count_reward/mean": 0.7502604126930237, + "rewards/tag_count_reward/std": 0.0036037251353263854, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3616.8, + "completions/max_terminated_length": 3616.8, + "completions/mean_length": 943.230224609375, + "completions/mean_terminated_length": 943.230224609375, + "completions/min_length": 303.6, + "completions/min_terminated_length": 303.6, + "epoch": 0.04988431433681758, + "grad_norm": 0.22201886109634597, + "kl": 0.02301788330078125, + "learning_rate": 2.968586387434555e-06, + "loss": 0.0042, + "num_tokens": 82556940.0, + "reward": 1.151562547683716, + "reward_std": 0.11185846626758575, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7140625, + "rewards/mcq_accuracy_reward/std": 0.4490456938743591, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4932.4, + "completions/max_terminated_length": 4932.4, + "completions/mean_length": 1072.140673828125, + "completions/mean_terminated_length": 1072.140673828125, + "completions/min_length": 325.8, + "completions/min_terminated_length": 325.8, + "epoch": 0.051197059450944354, + "grad_norm": 0.25191710328074673, + "kl": 0.0211456298828125, + "learning_rate": 2.999977296767498e-06, + "loss": 0.0064, + "num_tokens": 84913458.0, + "reward": 1.0848958611488342, + "reward_std": 0.1388065218925476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6473958253860473, + "rewards/mcq_accuracy_reward/std": 0.47163227796554563, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5665.8, + "completions/max_terminated_length": 4958.2, + "completions/mean_length": 1007.6521118164062, + "completions/mean_terminated_length": 1003.9624389648437, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.052509804565071136, + "grad_norm": 0.2374888919991407, + "kl": 0.025051116943359375, + "learning_rate": 2.9998385572798936e-06, + "loss": 0.0137, + "num_tokens": 87145934.0, + "reward": 1.1438802242279054, + "reward_std": 0.11781122833490372, + "rewards/format_reward/mean": 0.9984374880790711, + "rewards/format_reward/std": 0.03061862289905548, + "rewards/mcq_accuracy_reward/mean": 0.7067708373069763, + "rewards/mcq_accuracy_reward/std": 0.4523226022720337, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4059.2, + "completions/max_terminated_length": 4059.2, + "completions/mean_length": 969.873974609375, + "completions/mean_terminated_length": 969.873974609375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.05382254967919791, + "grad_norm": 0.2150302532657217, + "kl": 0.0292572021484375, + "learning_rate": 2.999573702863491e-06, + "loss": 0.0167, + "num_tokens": 89305956.0, + "reward": 1.1479166984558105, + "reward_std": 0.11885059028863906, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7104166626930237, + "rewards/mcq_accuracy_reward/std": 0.451763379573822, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4066.2, + "completions/max_terminated_length": 4066.2, + "completions/mean_length": 1068.4500244140625, + "completions/mean_terminated_length": 1068.4500244140625, + "completions/min_length": 322.8, + "completions/min_terminated_length": 322.8, + "epoch": 0.05513529479332469, + "grad_norm": 0.21220982277630238, + "kl": 0.025278472900390626, + "learning_rate": 2.9991827557887514e-06, + "loss": 0.0118, + "num_tokens": 91654460.0, + "reward": 1.0993815422058106, + "reward_std": 0.1643467515707016, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6619791626930237, + "rewards/mcq_accuracy_reward/std": 0.472586578130722, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4159.2, + "completions/max_terminated_length": 4159.2, + "completions/mean_length": 1071.8323486328125, + "completions/mean_terminated_length": 1071.8323486328125, + "completions/min_length": 342.6, + "completions/min_terminated_length": 342.6, + "epoch": 0.05644803990745147, + "grad_norm": 0.19922757787331763, + "kl": 0.02652587890625, + "learning_rate": 2.9986657489287217e-06, + "loss": 0.0187, + "num_tokens": 94011874.0, + "reward": 1.1180338859558105, + "reward_std": 0.1331680089235306, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6807291746139527, + "rewards/mcq_accuracy_reward/std": 0.463273698091507, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5161.6, + "completions/max_terminated_length": 4304.6, + "completions/mean_length": 1072.8344116210938, + "completions/mean_terminated_length": 1069.0624755859376, + "completions/min_length": 325.8, + "completions/min_terminated_length": 325.8, + "epoch": 0.05776078502157825, + "grad_norm": 0.21184878910744143, + "kl": 0.02801666259765625, + "learning_rate": 2.9980227257562692e-06, + "loss": 0.0139, + "num_tokens": 96371676.0, + "reward": 1.190657615661621, + "reward_std": 0.1365644320845604, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.05988401472568512, + "rewards/mcq_accuracy_reward/mean": 0.7541666626930237, + "rewards/mcq_accuracy_reward/std": 0.42858466506004333, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.01647038236260414, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4275.2, + "completions/max_terminated_length": 4275.2, + "completions/mean_length": 1021.97451171875, + "completions/mean_terminated_length": 1021.97451171875, + "completions/min_length": 316.6, + "completions/min_terminated_length": 316.6, + "epoch": 0.05907353013570503, + "grad_norm": 0.23626927859400262, + "kl": 0.028619384765625, + "learning_rate": 2.997253740340428e-06, + "loss": 0.0083, + "num_tokens": 98632019.0, + "reward": 1.1561523914337157, + "reward_std": 0.12944005876779557, + "rewards/format_reward/mean": 0.9942708373069763, + "rewards/format_reward/std": 0.07378322035074233, + "rewards/mcq_accuracy_reward/mean": 0.7197916746139527, + "rewards/mcq_accuracy_reward/std": 0.4463343620300293, + "rewards/tag_count_reward/mean": 0.751171863079071, + "rewards/tag_count_reward/std": 0.018465830758213998, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3801.6, + "completions/max_terminated_length": 3801.6, + "completions/mean_length": 1030.4307861328125, + "completions/mean_terminated_length": 1030.4307861328125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.060386275249831804, + "grad_norm": 0.5532627008634082, + "kl": 0.02605438232421875, + "learning_rate": 2.9963588573418514e-06, + "loss": 0.0017, + "num_tokens": 100907086.0, + "reward": 1.1284180164337159, + "reward_std": 0.14278736859560012, + "rewards/format_reward/mean": 0.990625011920929, + "rewards/format_reward/std": 0.09399981647729874, + "rewards/mcq_accuracy_reward/mean": 0.6927083373069763, + "rewards/mcq_accuracy_reward/std": 0.459688264131546, + "rewards/tag_count_reward/mean": 0.7522135376930237, + "rewards/tag_count_reward/std": 0.022447781078517436, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4143.6, + "completions/max_terminated_length": 4143.6, + "completions/mean_length": 1030.836474609375, + "completions/mean_terminated_length": 1030.836474609375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.061699020363958586, + "grad_norm": 0.2514607352568713, + "kl": 0.02453460693359375, + "learning_rate": 2.9953381520073745e-06, + "loss": 0.0069, + "num_tokens": 103183852.0, + "reward": 1.1106445789337158, + "reward_std": 0.15580974966287614, + "rewards/format_reward/mean": 0.9848958253860474, + "rewards/format_reward/std": 0.11988793760538101, + "rewards/mcq_accuracy_reward/mean": 0.6760416626930237, + "rewards/mcq_accuracy_reward/std": 0.4647082030773163, + "rewards/tag_count_reward/mean": 0.7535156369209289, + "rewards/tag_count_reward/std": 0.029095379635691643, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3941.8, + "completions/max_terminated_length": 3941.8, + "completions/mean_length": 1001.7906616210937, + "completions/mean_terminated_length": 1001.7906616210937, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.06301176547808536, + "grad_norm": 0.34613585367425415, + "kl": 0.02639312744140625, + "learning_rate": 2.9941917101636894e-06, + "loss": 0.0103, + "num_tokens": 105402210.0, + "reward": 1.1089518785476684, + "reward_std": 0.14578705430030822, + "rewards/format_reward/mean": 0.9630208373069763, + "rewards/format_reward/std": 0.16518850326538087, + "rewards/mcq_accuracy_reward/mean": 0.6796874880790711, + "rewards/mcq_accuracy_reward/std": 0.46075564026832583, + "rewards/tag_count_reward/mean": 0.7540364742279053, + "rewards/tag_count_reward/std": 0.030129339545965195, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4183.2, + "completions/max_terminated_length": 4183.2, + "completions/mean_length": 979.314599609375, + "completions/mean_terminated_length": 979.314599609375, + "completions/min_length": 322.8, + "completions/min_terminated_length": 322.8, + "epoch": 0.06432451059221214, + "grad_norm": 0.20761736028492528, + "kl": 0.0290008544921875, + "learning_rate": 2.9929196282101254e-06, + "loss": 0.0159, + "num_tokens": 107581102.0, + "reward": 1.0987956047058105, + "reward_std": 0.15559289008378982, + "rewards/format_reward/mean": 0.93125, + "rewards/format_reward/std": 0.1825384944677353, + "rewards/mcq_accuracy_reward/mean": 0.6776041746139526, + "rewards/mcq_accuracy_reward/std": 0.46558194160461425, + "rewards/tag_count_reward/mean": 0.753515625, + "rewards/tag_count_reward/std": 0.030161596089601516, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3886.0, + "completions/max_terminated_length": 3886.0, + "completions/mean_length": 984.1218872070312, + "completions/mean_terminated_length": 984.1218872070312, + "completions/min_length": 311.6, + "completions/min_terminated_length": 311.6, + "epoch": 0.06563725570633892, + "grad_norm": 0.22659356532290936, + "kl": 0.0353729248046875, + "learning_rate": 2.9915220131105448e-06, + "loss": 0.0087, + "num_tokens": 109767864.0, + "reward": 1.1377930164337158, + "reward_std": 0.14986170530319215, + "rewards/format_reward/mean": 0.990625, + "rewards/format_reward/std": 0.08424023538827896, + "rewards/mcq_accuracy_reward/mean": 0.7020833253860473, + "rewards/mcq_accuracy_reward/std": 0.45472397804260256, + "rewards/tag_count_reward/mean": 0.7522135376930237, + "rewards/tag_count_reward/std": 0.020571443811059, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4994.2, + "completions/max_terminated_length": 4278.2, + "completions/mean_length": 1090.2104248046876, + "completions/mean_terminated_length": 1086.580517578125, + "completions/min_length": 314.4, + "completions/min_terminated_length": 314.4, + "epoch": 0.06695000082046569, + "grad_norm": 0.14311356598874359, + "kl": 0.0318939208984375, + "learning_rate": 2.989998982384348e-06, + "loss": 0.0135, + "num_tokens": 112161268.0, + "reward": 1.1673502922058105, + "reward_std": 0.12497571557760238, + "rewards/format_reward/mean": 0.9901041626930237, + "rewards/format_reward/std": 0.09260772317647933, + "rewards/mcq_accuracy_reward/mean": 0.7317708253860473, + "rewards/mcq_accuracy_reward/std": 0.43985254168510435, + "rewards/tag_count_reward/mean": 0.7522135376930237, + "rewards/tag_count_reward/std": 0.02318231761455536, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4579.2, + "completions/max_terminated_length": 4579.2, + "completions/mean_length": 1027.9755615234376, + "completions/mean_terminated_length": 1027.9755615234376, + "completions/min_length": 313.8, + "completions/min_terminated_length": 313.8, + "epoch": 0.06826274593459247, + "grad_norm": 0.2337911029316405, + "kl": 0.03267822265625, + "learning_rate": 2.9883506640965938e-06, + "loss": 0.0088, + "num_tokens": 114430805.0, + "reward": 1.1212890863418579, + "reward_std": 0.11923356056213379, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.07807708233594894, + "rewards/mcq_accuracy_reward/mean": 0.6854166626930237, + "rewards/mcq_accuracy_reward/std": 0.4634483814239502, + "rewards/tag_count_reward/mean": 0.7518229246139526, + "rewards/tag_count_reward/std": 0.0205176517367363, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4742.0, + "completions/max_terminated_length": 4742.0, + "completions/mean_length": 942.0177368164062, + "completions/mean_terminated_length": 942.0177368164062, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.06957549104871925, + "grad_norm": 0.23606556675320234, + "kl": 0.03384552001953125, + "learning_rate": 2.986577196847228e-06, + "loss": 0.0161, + "num_tokens": 116533055.0, + "reward": 1.1155599355697632, + "reward_std": 0.13383112102746964, + "rewards/format_reward/mean": 0.9942708253860474, + "rewards/format_reward/std": 0.0727910801768303, + "rewards/mcq_accuracy_reward/mean": 0.6791666746139526, + "rewards/mcq_accuracy_reward/std": 0.46549699306488035, + "rewards/tag_count_reward/mean": 0.7513020873069763, + "rewards/tag_count_reward/std": 0.01739363223314285, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4084.4, + "completions/max_terminated_length": 4084.4, + "completions/mean_length": 1029.0109741210938, + "completions/mean_terminated_length": 1029.0109741210938, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.07088823616284604, + "grad_norm": 0.17402568738720683, + "kl": 0.0336669921875, + "learning_rate": 2.984678729759431e-06, + "loss": 0.0015, + "num_tokens": 118812340.0, + "reward": 1.1317057371139527, + "reward_std": 0.11408141851425171, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6942708253860473, + "rewards/mcq_accuracy_reward/std": 0.45725122690200803, + "rewards/tag_count_reward/mean": 0.7497395873069763, + "rewards/tag_count_reward/std": 0.0036037251353263854, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4788.0, + "completions/max_terminated_length": 4788.0, + "completions/mean_length": 1066.65888671875, + "completions/mean_terminated_length": 1066.65888671875, + "completions/min_length": 336.6, + "completions/min_terminated_length": 336.6, + "epoch": 0.0722009812769728, + "grad_norm": 0.2022085908444333, + "kl": 0.03605499267578125, + "learning_rate": 2.9826554224670782e-06, + "loss": 0.0012, + "num_tokens": 121157309.0, + "reward": 1.1613607168197633, + "reward_std": 0.13188395649194717, + "rewards/format_reward/mean": 0.996874988079071, + "rewards/format_reward/std": 0.04825007617473602, + "rewards/mcq_accuracy_reward/mean": 0.7244791626930237, + "rewards/mcq_accuracy_reward/std": 0.44537928104400637, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.011258380860090256, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5094.6, + "completions/max_terminated_length": 5094.6, + "completions/mean_length": 1127.7750244140625, + "completions/mean_terminated_length": 1127.7750244140625, + "completions/min_length": 324.6, + "completions/min_terminated_length": 324.6, + "epoch": 0.07351372639109958, + "grad_norm": 0.24414018191018827, + "kl": 0.03695220947265625, + "learning_rate": 2.980507445101318e-06, + "loss": 0.004, + "num_tokens": 123620709.0, + "reward": 1.152669334411621, + "reward_std": 0.12334871292114258, + "rewards/format_reward/mean": 0.9979166746139526, + "rewards/format_reward/std": 0.028829801082611083, + "rewards/mcq_accuracy_reward/mean": 0.715625, + "rewards/mcq_accuracy_reward/std": 0.4434922933578491, + "rewards/tag_count_reward/mean": 0.7502604126930237, + "rewards/tag_count_reward/std": 0.008713486418128014, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6217.2, + "completions/max_terminated_length": 6217.2, + "completions/mean_length": 1029.028662109375, + "completions/mean_terminated_length": 1029.028662109375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.07482647150522637, + "grad_norm": 0.22434409470233285, + "kl": 0.03756256103515625, + "learning_rate": 2.9782349782762646e-06, + "loss": 0.0088, + "num_tokens": 125894052.0, + "reward": 1.1321614742279054, + "reward_std": 0.12179401963949203, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6947916626930237, + "rewards/mcq_accuracy_reward/std": 0.45902812480926514, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4193.6, + "completions/max_terminated_length": 4193.6, + "completions/mean_length": 1005.5505249023438, + "completions/mean_terminated_length": 1005.5505249023438, + "completions/min_length": 321.4, + "completions/min_terminated_length": 321.4, + "epoch": 0.07613921661935315, + "grad_norm": 0.22577030529609984, + "kl": 0.035418701171875, + "learning_rate": 2.975838213073811e-06, + "loss": 0.0066, + "num_tokens": 128127341.0, + "reward": 1.1895508289337158, + "reward_std": 0.13039956539869307, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7520833373069763, + "rewards/mcq_accuracy_reward/std": 0.4281717538833618, + "rewards/tag_count_reward/mean": 0.7498697876930237, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4432.2, + "completions/max_terminated_length": 4432.2, + "completions/mean_length": 1003.1448364257812, + "completions/mean_terminated_length": 1003.1448364257812, + "completions/min_length": 301.8, + "completions/min_terminated_length": 301.8, + "epoch": 0.07745196173347993, + "grad_norm": 0.18658743625546148, + "kl": 0.0338043212890625, + "learning_rate": 2.9733173510275632e-06, + "loss": 0.0149, + "num_tokens": 130351811.0, + "reward": 1.1479818105697632, + "reward_std": 0.12679630517959595, + "rewards/format_reward/mean": 0.9979166507720947, + "rewards/format_reward/std": 0.040824830532073975, + "rewards/mcq_accuracy_reward/mean": 0.7109375, + "rewards/mcq_accuracy_reward/std": 0.4502442955970764, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.010206207633018494, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4482.0, + "completions/max_terminated_length": 4482.0, + "completions/mean_length": 994.4219116210937, + "completions/mean_terminated_length": 994.4219116210937, + "completions/min_length": 314.6, + "completions/min_terminated_length": 314.6, + "epoch": 0.0787647068476067, + "grad_norm": 0.1935442138608805, + "kl": 0.03246002197265625, + "learning_rate": 2.9706726041058943e-06, + "loss": 0.0036, + "num_tokens": 132563533.0, + "reward": 1.167089867591858, + "reward_std": 0.11183267682790757, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.729687488079071, + "rewards/mcq_accuracy_reward/std": 0.4386721014976501, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3957.6, + "completions/max_terminated_length": 3957.6, + "completions/mean_length": 1047.6417114257813, + "completions/mean_terminated_length": 1047.6417114257813, + "completions/min_length": 330.2, + "completions/min_terminated_length": 330.2, + "epoch": 0.08007745196173348, + "grad_norm": 0.32899185402335096, + "kl": 0.0323333740234375, + "learning_rate": 2.9679041946941183e-06, + "loss": 0.0162, + "num_tokens": 134870333.0, + "reward": 1.1627929925918579, + "reward_std": 0.11780945211648941, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.7255208373069764, + "rewards/mcq_accuracy_reward/std": 0.44467177987098694, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.00765465572476387, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3908.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 990.3031494140625, + "completions/mean_terminated_length": 990.3031494140625, + "completions/min_length": 314.8, + "completions/min_terminated_length": 314.8, + "epoch": 0.08139019707586026, + "grad_norm": 0.21139800666039962, + "kl": 0.038262939453125, + "learning_rate": 2.965012355575794e-06, + "loss": 0.0056, + "num_tokens": 137071811.0, + "reward": 1.1192708969116212, + "reward_std": 0.12102733254432678, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6817708373069763, + "rewards/mcq_accuracy_reward/std": 0.4634950816631317, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5447.2, + "completions/max_terminated_length": 5447.2, + "completions/mean_length": 1113.3771240234375, + "completions/mean_terminated_length": 1113.3771240234375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.08270294218998704, + "grad_norm": 0.201307272479087, + "kl": 0.03599395751953125, + "learning_rate": 2.9619973299131474e-06, + "loss": 0.0101, + "num_tokens": 139514999.0, + "reward": 1.1358398914337158, + "reward_std": 0.13782176226377488, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6984375, + "rewards/mcq_accuracy_reward/std": 0.4522415041923523, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3986.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 975.2984497070313, + "completions/mean_terminated_length": 975.2984497070313, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.08401568730411381, + "grad_norm": 0.2533131446602347, + "kl": 0.03861236572265625, + "learning_rate": 2.9588593712266307e-06, + "loss": 0.015, + "num_tokens": 141687140.0, + "reward": 1.175716185569763, + "reward_std": 0.13188568651676177, + "rewards/format_reward/mean": 0.9984374880790711, + "rewards/format_reward/std": 0.03061862215399742, + "rewards/mcq_accuracy_reward/mean": 0.7385416746139526, + "rewards/mcq_accuracy_reward/std": 0.4337004065513611, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4331.8, + "completions/max_terminated_length": 4331.8, + "completions/mean_length": 1055.0307739257812, + "completions/mean_terminated_length": 1055.0307739257812, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.08532843241824059, + "grad_norm": 0.19223914330913347, + "kl": 0.03333740234375, + "learning_rate": 2.955598743373599e-06, + "loss": 0.0086, + "num_tokens": 144009671.0, + "reward": 1.0899089097976684, + "reward_std": 0.1461899310350418, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6526041746139526, + "rewards/mcq_accuracy_reward/std": 0.4737109839916229, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4023.4, + "completions/max_terminated_length": 4023.4, + "completions/mean_length": 1031.7583618164062, + "completions/mean_terminated_length": 1031.7583618164062, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.08664117753236737, + "grad_norm": 0.22321548290368853, + "kl": 0.033099365234375, + "learning_rate": 2.952215720526128e-06, + "loss": -0.0, + "num_tokens": 146289239.0, + "reward": 1.1442708730697633, + "reward_std": 0.13757950067520142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7067708373069763, + "rewards/mcq_accuracy_reward/std": 0.4495010733604431, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4104.2, + "completions/max_terminated_length": 4104.2, + "completions/mean_length": 1069.9166748046875, + "completions/mean_terminated_length": 1069.9166748046875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.08795392264649415, + "grad_norm": 0.20740477286429523, + "kl": 0.036529541015625, + "learning_rate": 2.9487105871479574e-06, + "loss": 0.0147, + "num_tokens": 148643055.0, + "reward": 1.1513672113418578, + "reward_std": 0.14243375211954118, + "rewards/format_reward/mean": 0.9989583373069764, + "rewards/format_reward/std": 0.014414900541305542, + "rewards/mcq_accuracy_reward/mean": 0.7140625, + "rewards/mcq_accuracy_reward/std": 0.44832395315170287, + "rewards/tag_count_reward/mean": 0.7502604126930237, + "rewards/tag_count_reward/std": 0.0036037251353263854, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4008.4, + "completions/max_terminated_length": 4008.4, + "completions/mean_length": 1026.2192993164062, + "completions/mean_terminated_length": 1026.2192993164062, + "completions/min_length": 312.4, + "completions/min_terminated_length": 312.4, + "epoch": 0.08926666776062094, + "grad_norm": 0.20951274008844561, + "kl": 0.0352203369140625, + "learning_rate": 2.945083637970573e-06, + "loss": 0.0128, + "num_tokens": 150914964.0, + "reward": 1.1210937976837159, + "reward_std": 0.1250404566526413, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412414520978927, + "rewards/mcq_accuracy_reward/mean": 0.6838541626930237, + "rewards/mcq_accuracy_reward/std": 0.46301427483558655, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.005103103630244732, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3883.6, + "completions/max_terminated_length": 3883.6, + "completions/mean_length": 959.7328369140625, + "completions/mean_terminated_length": 959.7328369140625, + "completions/min_length": 330.6, + "completions/min_terminated_length": 330.6, + "epoch": 0.0905794128747477, + "grad_norm": 0.1567229897208322, + "kl": 0.03869781494140625, + "learning_rate": 2.9413351779684235e-06, + "loss": 0.0165, + "num_tokens": 153054323.0, + "reward": 1.1731120347976685, + "reward_std": 0.10713299810886383, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.017631453275680543, + "rewards/mcq_accuracy_reward/mean": 0.7359375119209289, + "rewards/mcq_accuracy_reward/std": 0.4377618730068207, + "rewards/tag_count_reward/mean": 0.7502604126930237, + "rewards/tag_count_reward/std": 0.0036037251353263854, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3348.6, + "completions/max_terminated_length": 3348.6, + "completions/mean_length": 912.9442993164063, + "completions/mean_terminated_length": 912.9442993164063, + "completions/min_length": 286.8, + "completions/min_terminated_length": 286.8, + "epoch": 0.09189215798887448, + "grad_norm": 0.19354318520678146, + "kl": 0.04102935791015625, + "learning_rate": 2.937465522333277e-06, + "loss": 0.0159, + "num_tokens": 155100016.0, + "reward": 1.155208373069763, + "reward_std": 0.13475794196128846, + "rewards/format_reward/mean": 0.9973958373069763, + "rewards/format_reward/std": 0.03204635381698608, + "rewards/mcq_accuracy_reward/mean": 0.7182291746139526, + "rewards/mcq_accuracy_reward/std": 0.4463015913963318, + "rewards/tag_count_reward/mean": 0.7505208253860474, + "rewards/tag_count_reward/std": 0.007207450270652771, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3784.4, + "completions/max_terminated_length": 3784.4, + "completions/mean_length": 918.192724609375, + "completions/mean_terminated_length": 918.192724609375, + "completions/min_length": 303.4, + "completions/min_terminated_length": 303.4, + "epoch": 0.09320490310300127, + "grad_norm": 0.15314380123149077, + "kl": 0.03994903564453125, + "learning_rate": 2.933474996447717e-06, + "loss": 0.0089, + "num_tokens": 157160498.0, + "reward": 1.109798216819763, + "reward_std": 0.12845655977725984, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6723958373069763, + "rewards/mcq_accuracy_reward/std": 0.4589682936668396, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4832.6, + "completions/max_terminated_length": 3925.4, + "completions/mean_length": 1013.578173828125, + "completions/mean_terminated_length": 1009.7974487304688, + "completions/min_length": 307.8, + "completions/min_terminated_length": 307.8, + "epoch": 0.09451764821712805, + "grad_norm": 0.23978329231332055, + "kl": 0.03802947998046875, + "learning_rate": 2.9293639358577835e-06, + "loss": 0.0074, + "num_tokens": 159410048.0, + "reward": 1.1422526359558105, + "reward_std": 0.1519577294588089, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.024621108174324037, + "rewards/mcq_accuracy_reward/mean": 0.7052083253860474, + "rewards/mcq_accuracy_reward/std": 0.44966967701911925, + "rewards/tag_count_reward/mean": 0.7497395873069763, + "rewards/tag_count_reward/std": 0.008716250583529473, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 5520.0, + "completions/max_terminated_length": 5518.8, + "completions/mean_length": 1105.1140625, + "completions/mean_terminated_length": 1093.9616455078126, + "completions/min_length": 313.2, + "completions/min_terminated_length": 313.2, + "epoch": 0.09583039333125482, + "grad_norm": 0.2370306805515519, + "kl": 0.0391845703125, + "learning_rate": 2.9251326862447565e-06, + "loss": 0.0056, + "num_tokens": 161833307.0, + "reward": 1.1424479484558105, + "reward_std": 0.13019494712352753, + "rewards/format_reward/mean": 0.9973958253860473, + "rewards/format_reward/std": 0.030538519471883775, + "rewards/mcq_accuracy_reward/mean": 0.7057291626930237, + "rewards/mcq_accuracy_reward/std": 0.45453677177429197, + "rewards/tag_count_reward/mean": 0.7494791746139526, + "rewards/tag_count_reward/std": 0.011740208975970745, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5963.0, + "completions/max_terminated_length": 5876.0, + "completions/mean_length": 1057.2646118164062, + "completions/mean_terminated_length": 1053.5991821289062, + "completions/min_length": 315.8, + "completions/min_terminated_length": 315.8, + "epoch": 0.0971431384453816, + "grad_norm": 0.20234448413575182, + "kl": 0.04073333740234375, + "learning_rate": 2.920781603396092e-06, + "loss": 0.0148, + "num_tokens": 164158775.0, + "reward": 1.1438802242279054, + "reward_std": 0.14484989494085312, + "rewards/format_reward/mean": 0.9984374880790711, + "rewards/format_reward/std": 0.03061862289905548, + "rewards/mcq_accuracy_reward/mean": 0.7067708253860474, + "rewards/mcq_accuracy_reward/std": 0.45467503666877745, + "rewards/tag_count_reward/mean": 0.7500000119209289, + "rewards/tag_count_reward/std": 0.010206207633018494, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4499.4, + "completions/max_terminated_length": 4499.4, + "completions/mean_length": 1091.4484619140626, + "completions/mean_terminated_length": 1091.4484619140626, + "completions/min_length": 328.4, + "completions/min_terminated_length": 328.4, + "epoch": 0.09845588355950838, + "grad_norm": 0.21529014562372936, + "kl": 0.037982177734375, + "learning_rate": 2.916311053175503e-06, + "loss": 0.0103, + "num_tokens": 166553556.0, + "reward": 1.123958373069763, + "reward_std": 0.14746845364570618, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6864583373069764, + "rewards/mcq_accuracy_reward/std": 0.4606847584247589, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4030.6, + "completions/max_terminated_length": 4030.6, + "completions/mean_length": 981.2666748046875, + "completions/mean_terminated_length": 981.2666748046875, + "completions/min_length": 312.6, + "completions/min_terminated_length": 312.6, + "epoch": 0.09976862867363516, + "grad_norm": 0.19669355950220319, + "kl": 0.04024658203125, + "learning_rate": 2.911721411492196e-06, + "loss": 0.0102, + "num_tokens": 168731516.0, + "reward": 1.1050130605697632, + "reward_std": 0.12965331375598907, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6677083253860474, + "rewards/mcq_accuracy_reward/std": 0.46371673941612246, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4679.4, + "completions/max_terminated_length": 4679.4, + "completions/mean_length": 1058.3802368164063, + "completions/mean_terminated_length": 1058.3802368164063, + "completions/min_length": 323.8, + "completions/min_terminated_length": 323.8, + "epoch": 0.10108137378776193, + "grad_norm": 0.19738187333166138, + "kl": 0.04134979248046875, + "learning_rate": 2.9070130642692644e-06, + "loss": 0.0118, + "num_tokens": 171065230.0, + "reward": 1.1357422113418578, + "reward_std": 0.12654991894960405, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.698437488079071, + "rewards/mcq_accuracy_reward/std": 0.45783511400222776, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4729.8, + "completions/max_terminated_length": 4729.8, + "completions/mean_length": 1037.7041870117187, + "completions/mean_terminated_length": 1037.7041870117187, + "completions/min_length": 309.6, + "completions/min_terminated_length": 309.6, + "epoch": 0.10239411890188871, + "grad_norm": 0.20893768141632535, + "kl": 0.04337921142578125, + "learning_rate": 2.9021864074112343e-06, + "loss": 0.0008, + "num_tokens": 173352934.0, + "reward": 1.1367838859558106, + "reward_std": 0.11311140209436417, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6994791626930237, + "rewards/mcq_accuracy_reward/std": 0.4534127116203308, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3961.2, + "completions/max_terminated_length": 3961.2, + "completions/mean_length": 1020.6994995117187, + "completions/mean_terminated_length": 1020.6994995117187, + "completions/min_length": 276.8, + "completions/min_terminated_length": 276.8, + "epoch": 0.10370686401601549, + "grad_norm": 0.2648662688034053, + "kl": 0.04385833740234375, + "learning_rate": 2.8972418467707787e-06, + "loss": 0.0134, + "num_tokens": 175613421.0, + "reward": 1.0958333730697631, + "reward_std": 0.1485915958881378, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6583333373069763, + "rewards/mcq_accuracy_reward/std": 0.4721109449863434, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3979.8, + "completions/max_terminated_length": 3979.8, + "completions/mean_length": 997.191162109375, + "completions/mean_terminated_length": 997.191162109375, + "completions/min_length": 350.4, + "completions/min_terminated_length": 350.4, + "epoch": 0.10501960913014227, + "grad_norm": 0.22074320722409232, + "kl": 0.04484710693359375, + "learning_rate": 2.8921797981145873e-06, + "loss": 0.0155, + "num_tokens": 177827052.0, + "reward": 1.1201172351837159, + "reward_std": 0.14905780106782912, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6828125, + "rewards/mcq_accuracy_reward/std": 0.46074681878089907, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4591.4, + "completions/max_terminated_length": 4591.4, + "completions/mean_length": 1148.679736328125, + "completions/mean_terminated_length": 1148.679736328125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.10633235424426905, + "grad_norm": 0.18660512022787928, + "kl": 0.04748382568359375, + "learning_rate": 2.8870006870884088e-06, + "loss": 0.0083, + "num_tokens": 180330805.0, + "reward": 1.1873047351837158, + "reward_std": 0.13910053074359893, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.7500000119209289, + "rewards/mcq_accuracy_reward/std": 0.43143912553787234, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5085.2, + "completions/max_terminated_length": 5085.2, + "completions/mean_length": 1038.3880493164063, + "completions/mean_terminated_length": 1038.3880493164063, + "completions/min_length": 319.8, + "completions/min_terminated_length": 319.8, + "epoch": 0.10764509935839582, + "grad_norm": 0.24525296238584263, + "kl": 0.0441741943359375, + "learning_rate": 2.88170494918126e-06, + "loss": 0.0164, + "num_tokens": 182621382.0, + "reward": 1.1647461175918579, + "reward_std": 0.12996296137571334, + "rewards/format_reward/mean": 0.9984374880790711, + "rewards/format_reward/std": 0.03061862289905548, + "rewards/mcq_accuracy_reward/mean": 0.7276041746139527, + "rewards/mcq_accuracy_reward/std": 0.44393226504325867, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.00765465572476387, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4580.8, + "completions/max_terminated_length": 4580.8, + "completions/mean_length": 982.8547241210938, + "completions/mean_terminated_length": 982.8547241210938, + "completions/min_length": 305.6, + "completions/min_terminated_length": 305.6, + "epoch": 0.1089578444725226, + "grad_norm": 0.18841517820288065, + "kl": 0.04103851318359375, + "learning_rate": 2.876293029688807e-06, + "loss": 0.0095, + "num_tokens": 184803167.0, + "reward": 1.121875023841858, + "reward_std": 0.14617051631212236, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.684374988079071, + "rewards/mcq_accuracy_reward/std": 0.4598234474658966, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4499.6, + "completions/max_terminated_length": 4499.6, + "completions/mean_length": 1042.9515869140625, + "completions/mean_terminated_length": 1042.9515869140625, + "completions/min_length": 335.4, + "completions/min_terminated_length": 335.4, + "epoch": 0.11027058958664938, + "grad_norm": 0.18762774776751226, + "kl": 0.0423126220703125, + "learning_rate": 2.8707653836759217e-06, + "loss": 0.0174, + "num_tokens": 187107346.0, + "reward": 1.1656250476837158, + "reward_std": 0.12856559455394745, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.728125, + "rewards/mcq_accuracy_reward/std": 0.44129165410995486, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 3974.0, + "completions/mean_length": 1131.3771240234375, + "completions/mean_terminated_length": 1131.3771240234375, + "completions/min_length": 334.8, + "completions/min_terminated_length": 334.8, + "epoch": 0.11158333470077617, + "grad_norm": 0.2230517183363958, + "kl": 0.04215087890625, + "learning_rate": 2.8651224759384187e-06, + "loss": 0.0094, + "num_tokens": 189576366.0, + "reward": 1.1322917222976685, + "reward_std": 0.14240131080150603, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6947916626930237, + "rewards/mcq_accuracy_reward/std": 0.45904839038848877, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4577.2, + "completions/max_terminated_length": 4577.2, + "completions/mean_length": 1015.2953491210938, + "completions/mean_terminated_length": 1015.2953491210938, + "completions/min_length": 295.8, + "completions/min_terminated_length": 295.8, + "epoch": 0.11289607981490293, + "grad_norm": 0.14708276728513894, + "kl": 0.0443939208984375, + "learning_rate": 2.859364780963971e-06, + "loss": 0.0069, + "num_tokens": 191822053.0, + "reward": 1.1196940183639525, + "reward_std": 0.1310182273387909, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6822916626930237, + "rewards/mcq_accuracy_reward/std": 0.46062104105949403, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5103.6, + "completions/max_terminated_length": 4303.0, + "completions/mean_length": 1002.6463745117187, + "completions/mean_terminated_length": 998.8788208007812, + "completions/min_length": 306.2, + "completions/min_terminated_length": 306.2, + "epoch": 0.11420882492902971, + "grad_norm": 0.2098542895231453, + "kl": 0.0451385498046875, + "learning_rate": 2.8534927828922143e-06, + "loss": 0.0006, + "num_tokens": 194047550.0, + "reward": 1.1366862535476685, + "reward_std": 0.1288916677236557, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6994791626930237, + "rewards/mcq_accuracy_reward/std": 0.457518196105957, + "rewards/tag_count_reward/mean": 0.7498697996139526, + "rewards/tag_count_reward/std": 0.00765465572476387, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4727.8, + "completions/max_terminated_length": 4727.8, + "completions/mean_length": 1096.03388671875, + "completions/mean_terminated_length": 1096.03388671875, + "completions/min_length": 325.2, + "completions/min_terminated_length": 325.2, + "epoch": 0.1155215700431565, + "grad_norm": 0.19636314540755623, + "kl": 0.04406890869140625, + "learning_rate": 2.8475069754740346e-06, + "loss": 0.0075, + "num_tokens": 196455895.0, + "reward": 1.1352214097976685, + "reward_std": 0.1213495135307312, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.6979166746139527, + "rewards/mcq_accuracy_reward/std": 0.45844133496284484, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3955.0, + "completions/max_terminated_length": 3955.0, + "completions/mean_length": 1080.3458618164063, + "completions/mean_terminated_length": 1080.3458618164063, + "completions/min_length": 332.2, + "completions/min_terminated_length": 332.2, + "epoch": 0.11683431515728328, + "grad_norm": 0.14367028901550888, + "kl": 0.04437103271484375, + "learning_rate": 2.841407862030056e-06, + "loss": 0.0078, + "num_tokens": 198826975.0, + "reward": 1.1103190422058105, + "reward_std": 0.1273256078362465, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6729166626930236, + "rewards/mcq_accuracy_reward/std": 0.4691579341888428, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3705.8, + "completions/max_terminated_length": 3705.8, + "completions/mean_length": 1027.7901245117187, + "completions/mean_terminated_length": 1027.7901245117187, + "completions/min_length": 313.2, + "completions/min_terminated_length": 313.2, + "epoch": 0.11814706027141006, + "grad_norm": 0.23131706197506913, + "kl": 0.04774627685546875, + "learning_rate": 2.835195955408313e-06, + "loss": 0.0036, + "num_tokens": 201097524.0, + "reward": 1.1567708492279052, + "reward_std": 0.13514232039451599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.7192708253860474, + "rewards/mcq_accuracy_reward/std": 0.4459089934825897, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4686.6, + "completions/max_terminated_length": 4686.6, + "completions/mean_length": 1009.730224609375, + "completions/mean_terminated_length": 1009.730224609375, + "completions/min_length": 264.2, + "completions/min_terminated_length": 264.2, + "epoch": 0.11945980538553683, + "grad_norm": 0.8662825356287738, + "kl": 0.0483062744140625, + "learning_rate": 2.8288717779411317e-06, + "loss": 0.0084, + "num_tokens": 203337622.0, + "reward": 1.1162760972976684, + "reward_std": 0.11033509969711304, + "rewards/format_reward/mean": 0.9979166626930237, + "rewards/format_reward/std": 0.03482731580734253, + "rewards/mcq_accuracy_reward/mean": 0.6791666626930237, + "rewards/mcq_accuracy_reward/std": 0.4658125936985016, + "rewards/tag_count_reward/mean": 0.7505208373069763, + "rewards/tag_count_reward/std": 0.008706828951835633, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5116.2, + "completions/max_terminated_length": 5116.2, + "completions/mean_length": 989.9885864257812, + "completions/mean_terminated_length": 989.9885864257812, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.12077255049966361, + "grad_norm": 0.1748034544842212, + "kl": 0.04725341796875, + "learning_rate": 2.8224358614012063e-06, + "loss": 0.0038, + "num_tokens": 205534096.0, + "reward": 1.1321940422058105, + "reward_std": 0.13878467231988906, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.6947916626930237, + "rewards/mcq_accuracy_reward/std": 0.4569283425807953, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4338.2, + "completions/max_terminated_length": 4338.2, + "completions/mean_length": 991.6979248046875, + "completions/mean_terminated_length": 991.6979248046875, + "completions/min_length": 323.4, + "completions/min_terminated_length": 323.4, + "epoch": 0.12208529561379039, + "grad_norm": 0.25320454897739286, + "kl": 0.045721435546875, + "learning_rate": 2.8158887469568856e-06, + "loss": 0.0156, + "num_tokens": 207734012.0, + "reward": 1.1461588859558105, + "reward_std": 0.154278764128685, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.7088541507720947, + "rewards/mcq_accuracy_reward/std": 0.4543288826942444, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4106.0, + "completions/max_terminated_length": 4106.0, + "completions/mean_length": 1093.31669921875, + "completions/mean_terminated_length": 1093.31669921875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.12339804072791717, + "grad_norm": 0.2190086805652383, + "kl": 0.04641876220703125, + "learning_rate": 2.8092309851266697e-06, + "loss": 0.0089, + "num_tokens": 210135804.0, + "reward": 1.0983724355697633, + "reward_std": 0.1476101756095886, + "rewards/format_reward/mean": 0.996874988079071, + "rewards/format_reward/std": 0.040744727849960326, + "rewards/mcq_accuracy_reward/mean": 0.6614583373069763, + "rewards/mcq_accuracy_reward/std": 0.47056936621665957, + "rewards/tag_count_reward/mean": 0.750781261920929, + "rewards/tag_count_reward/std": 0.010186181962490081, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3982.6, + "completions/max_terminated_length": 3982.6, + "completions/mean_length": 1031.648486328125, + "completions/mean_terminated_length": 1031.648486328125, + "completions/min_length": 310.4, + "completions/min_terminated_length": 310.4, + "epoch": 0.12471078584204394, + "grad_norm": 0.2118878094802896, + "kl": 0.0532867431640625, + "learning_rate": 2.8024631357329178e-06, + "loss": 0.0136, + "num_tokens": 212416233.0, + "reward": 1.154882836341858, + "reward_std": 0.10870791971683502, + "rewards/format_reward/mean": 0.9927083253860474, + "rewards/format_reward/std": 0.07300624921917916, + "rewards/mcq_accuracy_reward/mean": 0.71875, + "rewards/mcq_accuracy_reward/std": 0.445217365026474, + "rewards/tag_count_reward/mean": 0.7518229246139526, + "rewards/tag_count_reward/std": 0.01825156230479479, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3592.8, + "completions/max_terminated_length": 3592.8, + "completions/mean_length": 1053.66513671875, + "completions/mean_terminated_length": 1053.66513671875, + "completions/min_length": 317.2, + "completions/min_terminated_length": 317.2, + "epoch": 0.12602353095617072, + "grad_norm": 0.26869130056320445, + "kl": 0.04827880859375, + "learning_rate": 2.795585767854774e-06, + "loss": 0.0084, + "num_tokens": 214729502.0, + "reward": 1.1073568344116211, + "reward_std": 0.13634911626577378, + "rewards/format_reward/mean": 0.9697916626930236, + "rewards/format_reward/std": 0.16930488049983977, + "rewards/mcq_accuracy_reward/mean": 0.6755208373069763, + "rewards/mcq_accuracy_reward/std": 0.466803765296936, + "rewards/tag_count_reward/mean": 0.7575520873069763, + "rewards/tag_count_reward/std": 0.04232621863484383, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4503.0, + "completions/max_terminated_length": 4503.0, + "completions/mean_length": 1067.3990234375, + "completions/mean_terminated_length": 1067.3990234375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.1273362760702975, + "grad_norm": 0.21101949067608478, + "kl": 0.0453826904296875, + "learning_rate": 2.7885994597803203e-06, + "loss": 0.0034, + "num_tokens": 217079548.0, + "reward": 1.1095052480697631, + "reward_std": 0.13806882351636887, + "rewards/format_reward/mean": 0.9895833373069763, + "rewards/format_reward/std": 0.0863472655415535, + "rewards/mcq_accuracy_reward/mean": 0.6739583373069763, + "rewards/mcq_accuracy_reward/std": 0.46705670952796935, + "rewards/tag_count_reward/mean": 0.7526041626930237, + "rewards/tag_count_reward/std": 0.021586816012859344, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4696.4, + "completions/max_terminated_length": 4696.4, + "completions/mean_length": 1018.1213623046875, + "completions/mean_terminated_length": 1018.1213623046875, + "completions/min_length": 331.8, + "completions/min_terminated_length": 331.8, + "epoch": 0.12864902118442428, + "grad_norm": 0.17157451196891385, + "kl": 0.046221923828125, + "learning_rate": 2.7815047989579454e-06, + "loss": 0.001, + "num_tokens": 219330493.0, + "reward": 1.1589193105697633, + "reward_std": 0.11324364095926284, + "rewards/format_reward/mean": 0.9947916626930237, + "rewards/format_reward/std": 0.06258487403392791, + "rewards/mcq_accuracy_reward/mean": 0.7223958253860474, + "rewards/mcq_accuracy_reward/std": 0.4433377683162689, + "rewards/tag_count_reward/mean": 0.7513020873069763, + "rewards/tag_count_reward/std": 0.01564621850848198, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4478.0, + "completions/max_terminated_length": 4478.0, + "completions/mean_length": 1071.9989624023438, + "completions/mean_terminated_length": 1071.9989624023438, + "completions/min_length": 333.8, + "completions/min_terminated_length": 333.8, + "epoch": 0.12996176629855105, + "grad_norm": 0.21686205250112542, + "kl": 0.04475250244140625, + "learning_rate": 2.7743023819469527e-06, + "loss": 0.0109, + "num_tokens": 221689027.0, + "reward": 1.1631185293197632, + "reward_std": 0.12640594244003295, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08420459926128387, + "rewards/mcq_accuracy_reward/mean": 0.7270833253860474, + "rewards/mcq_accuracy_reward/std": 0.44429651498794553, + "rewards/tag_count_reward/mean": 0.751953125, + "rewards/tag_count_reward/std": 0.02105114981532097, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4606.0, + "completions/max_terminated_length": 4606.0, + "completions/mean_length": 1108.1776611328125, + "completions/mean_terminated_length": 1108.1776611328125, + "completions/min_length": 322.2, + "completions/min_terminated_length": 322.2, + "epoch": 0.13127451141267785, + "grad_norm": 0.220079975944011, + "kl": 0.0483062744140625, + "learning_rate": 2.7669928143673966e-06, + "loss": 0.0108, + "num_tokens": 224114304.0, + "reward": 1.1401041984558105, + "reward_std": 0.11760788708925247, + "rewards/format_reward/mean": 0.9916666746139526, + "rewards/format_reward/std": 0.08766689300537109, + "rewards/mcq_accuracy_reward/mean": 0.7041666626930236, + "rewards/mcq_accuracy_reward/std": 0.45539613366127013, + "rewards/tag_count_reward/mean": 0.7520833253860474, + "rewards/tag_count_reward/std": 0.021916723251342772, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5796.4, + "completions/max_terminated_length": 5714.4, + "completions/mean_length": 1051.8364868164062, + "completions/mean_terminated_length": 1048.1223999023437, + "completions/min_length": 334.6, + "completions/min_terminated_length": 334.6, + "epoch": 0.13258725652680461, + "grad_norm": 0.265670163416576, + "kl": 0.04952392578125, + "learning_rate": 2.759576710849159e-06, + "loss": 0.0219, + "num_tokens": 226432590.0, + "reward": 1.107617235183716, + "reward_std": 0.1576975926756859, + "rewards/format_reward/mean": 0.9744791746139526, + "rewards/format_reward/std": 0.15659859776496887, + "rewards/mcq_accuracy_reward/mean": 0.675, + "rewards/mcq_accuracy_reward/std": 0.45978304743766785, + "rewards/tag_count_reward/mean": 0.7559895753860474, + "rewards/tag_count_reward/std": 0.03963593617081642, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4940.0, + "completions/max_terminated_length": 4940.0, + "completions/mean_length": 1010.5802490234375, + "completions/mean_terminated_length": 1010.5802490234375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.13390000164093138, + "grad_norm": 0.2148254586238118, + "kl": 0.046527099609375, + "learning_rate": 2.752054694980266e-06, + "loss": 0.0124, + "num_tokens": 228670672.0, + "reward": 1.1529948472976685, + "reward_std": 0.14415046721696853, + "rewards/format_reward/mean": 0.9828124880790711, + "rewards/format_reward/std": 0.12713556587696076, + "rewards/mcq_accuracy_reward/mean": 0.71875, + "rewards/mcq_accuracy_reward/std": 0.4496320605278015, + "rewards/tag_count_reward/mean": 0.7541666746139526, + "rewards/tag_count_reward/std": 0.03110867626965046, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4110.4, + "completions/max_terminated_length": 4110.4, + "completions/mean_length": 1005.0484619140625, + "completions/mean_terminated_length": 1005.0484619140625, + "completions/min_length": 299.6, + "completions/min_terminated_length": 299.6, + "epoch": 0.13521274675505818, + "grad_norm": 0.26907621126413234, + "kl": 0.0470611572265625, + "learning_rate": 2.744427399254457e-06, + "loss": 0.0115, + "num_tokens": 230896821.0, + "reward": 1.1107096672058105, + "reward_std": 0.15524463653564452, + "rewards/format_reward/mean": 0.9572916746139526, + "rewards/format_reward/std": 0.18706629574298858, + "rewards/mcq_accuracy_reward/mean": 0.681249988079071, + "rewards/mcq_accuracy_reward/std": 0.4618394076824188, + "rewards/tag_count_reward/mean": 0.760546875, + "rewards/tag_count_reward/std": 0.04647269546985626, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3898.2, + "completions/max_terminated_length": 3898.2, + "completions/mean_length": 992.600537109375, + "completions/mean_terminated_length": 992.600537109375, + "completions/min_length": 302.8, + "completions/min_terminated_length": 302.8, + "epoch": 0.13652549186918495, + "grad_norm": 0.21977916345604828, + "kl": 0.0468994140625, + "learning_rate": 2.7366954650179973e-06, + "loss": 0.0143, + "num_tokens": 233099382.0, + "reward": 1.1469076156616211, + "reward_std": 0.13330337703227996, + "rewards/format_reward/mean": 0.9973958253860473, + "rewards/format_reward/std": 0.038043868541717527, + "rewards/mcq_accuracy_reward/mean": 0.7098958373069764, + "rewards/mcq_accuracy_reward/std": 0.4519686996936798, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.009510967135429382, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4921.4, + "completions/max_terminated_length": 4921.4, + "completions/mean_length": 1077.63701171875, + "completions/mean_terminated_length": 1077.63701171875, + "completions/min_length": 317.6, + "completions/min_terminated_length": 317.6, + "epoch": 0.13783823698331174, + "grad_norm": 0.2112484909819163, + "kl": 0.046099853515625, + "learning_rate": 2.7288595424157503e-06, + "loss": 0.0052, + "num_tokens": 235468645.0, + "reward": 1.128645896911621, + "reward_std": 0.1344786450266838, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/mcq_accuracy_reward/mean": 0.6911458492279052, + "rewards/mcq_accuracy_reward/std": 0.45899441838264465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4256.8, + "completions/max_terminated_length": 4256.8, + "completions/mean_length": 1009.775537109375, + "completions/mean_terminated_length": 1009.775537109375, + "completions/min_length": 292.4, + "completions/min_terminated_length": 292.4, + "epoch": 0.1391509820974385, + "grad_norm": 0.22813560000029065, + "kl": 0.051544189453125, + "learning_rate": 2.7209202903365133e-06, + "loss": 0.0129, + "num_tokens": 237706174.0, + "reward": 1.1440755367279052, + "reward_std": 0.12200047373771668, + "rewards/format_reward/mean": 0.9989583253860473, + "rewards/format_reward/std": 0.020412415266036987, + "rewards/mcq_accuracy_reward/mean": 0.7067708253860474, + "rewards/mcq_accuracy_reward/std": 0.4542596101760864, + "rewards/tag_count_reward/mean": 0.7502604246139526, + "rewards/tag_count_reward/std": 0.005103103816509247, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4892.6, + "completions/max_terminated_length": 4892.6, + "completions/mean_length": 1056.46357421875, + "completions/mean_terminated_length": 1056.46357421875, + "completions/min_length": 328.4, + "completions/min_terminated_length": 328.4, + "epoch": 0.14046372721156528, + "grad_norm": 0.2106400361865077, + "kl": 0.0487518310546875, + "learning_rate": 2.712878376357609e-06, + "loss": 0.0009, + "num_tokens": 240033648.0, + "reward": 1.1613607168197633, + "reward_std": 0.1544968828558922, + "rewards/format_reward/mean": 0.9994791626930237, + "rewards/format_reward/std": 0.010206207633018494, + "rewards/mcq_accuracy_reward/mean": 0.7239583253860473, + "rewards/mcq_accuracy_reward/std": 0.44602025747299195, + "rewards/tag_count_reward/mean": 0.7501302123069763, + "rewards/tag_count_reward/std": 0.0025515519082546234, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4366.0, + "completions/max_terminated_length": 4366.0, + "completions/mean_length": 1032.29013671875, + "completions/mean_terminated_length": 1032.29013671875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.14177647232569207, + "grad_norm": 0.16625089256406675, + "kl": 0.05540771484375, + "learning_rate": 2.704734476688757e-06, + "loss": 0.0126, + "num_tokens": 242315829.0, + "reward": 1.0630534172058106, + "reward_std": 0.13148987144231797, + "rewards/format_reward/mean": 0.9973958253860473, + "rewards/format_reward/std": 0.045033523440361024, + "rewards/mcq_accuracy_reward/mean": 0.6260416626930236, + "rewards/mcq_accuracy_reward/std": 0.48251315355300906, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.011258380860090256, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3565.6, + "completions/max_terminated_length": 3565.6, + "completions/mean_length": 959.4375122070312, + "completions/mean_terminated_length": 959.4375122070312, + "completions/min_length": 311.4, + "completions/min_terminated_length": 311.4, + "epoch": 0.14308921743981884, + "grad_norm": 0.22464259659779204, + "kl": 0.055169677734375, + "learning_rate": 2.6964892761152113e-06, + "loss": 0.0045, + "num_tokens": 244456525.0, + "reward": 1.1574870347976685, + "reward_std": 0.11304931193590165, + "rewards/format_reward/mean": 0.9927083373069763, + "rewards/format_reward/std": 0.0820706069469452, + "rewards/mcq_accuracy_reward/mean": 0.7213541626930237, + "rewards/mcq_accuracy_reward/std": 0.4449552297592163, + "rewards/tag_count_reward/mean": 0.7518229126930237, + "rewards/tag_count_reward/std": 0.0205176517367363, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4356.4, + "completions/max_terminated_length": 4356.4, + "completions/mean_length": 1031.305224609375, + "completions/mean_terminated_length": 1031.305224609375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.1444019625539456, + "grad_norm": 0.2949922514002113, + "kl": 0.0519500732421875, + "learning_rate": 2.688143467940179e-06, + "loss": 0.0136, + "num_tokens": 246734719.0, + "reward": 1.1007162094116212, + "reward_std": 0.13727524280548095, + "rewards/format_reward/mean": 0.9682291507720947, + "rewards/format_reward/std": 0.17280667126178742, + "rewards/mcq_accuracy_reward/mean": 0.6692708373069763, + "rewards/mcq_accuracy_reward/std": 0.4672414779663086, + "rewards/tag_count_reward/mean": 0.7575520992279052, + "rewards/tag_count_reward/std": 0.04285155162215233, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4932.4, + "completions/max_terminated_length": 4932.4, + "completions/mean_length": 1104.8286865234375, + "completions/mean_terminated_length": 1104.8286865234375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.1457147076680724, + "grad_norm": 0.37569944999617755, + "kl": 0.0444305419921875, + "learning_rate": 2.6796977539265263e-06, + "loss": 0.0115, + "num_tokens": 249151998.0, + "reward": 1.0762370228767395, + "reward_std": 0.17273767292499542, + "rewards/format_reward/mean": 0.909375, + "rewards/format_reward/std": 0.2841670572757721, + "rewards/mcq_accuracy_reward/mean": 0.6557291746139526, + "rewards/mcq_accuracy_reward/std": 0.47235715985298155, + "rewards/tag_count_reward/mean": 0.77265625, + "rewards/tag_count_reward/std": 0.07104176431894302, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4706.6, + "completions/max_terminated_length": 4706.6, + "completions/mean_length": 1098.668798828125, + "completions/mean_terminated_length": 1098.668798828125, + "completions/min_length": 317.8, + "completions/min_terminated_length": 317.8, + "epoch": 0.14702745278219917, + "grad_norm": 0.20458097527753633, + "kl": 0.04848785400390625, + "learning_rate": 2.671152844237767e-06, + "loss": 0.012, + "num_tokens": 251560162.0, + "reward": 1.1113607168197632, + "reward_std": 0.1699225127696991, + "rewards/format_reward/mean": 0.9744791746139526, + "rewards/format_reward/std": 0.1472677320241928, + "rewards/mcq_accuracy_reward/mean": 0.6786458373069764, + "rewards/mcq_accuracy_reward/std": 0.46689584851264954, + "rewards/tag_count_reward/mean": 0.7563802003860474, + "rewards/tag_count_reward/std": 0.0368169330060482, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4197.2, + "completions/max_terminated_length": 4197.2, + "completions/mean_length": 1027.9234741210937, + "completions/mean_terminated_length": 1027.9234741210937, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.14834019789632596, + "grad_norm": 0.19738662626067227, + "kl": 0.0525482177734375, + "learning_rate": 2.6625094573783525e-06, + "loss": 0.0051, + "num_tokens": 253831783.0, + "reward": 1.148535180091858, + "reward_std": 0.1445666506886482, + "rewards/format_reward/mean": 0.9921875119209289, + "rewards/format_reward/std": 0.08679499328136445, + "rewards/mcq_accuracy_reward/mean": 0.7125, + "rewards/mcq_accuracy_reward/std": 0.4487602710723877, + "rewards/tag_count_reward/mean": 0.7519531130790711, + "rewards/tag_count_reward/std": 0.02169874832034111, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5731.8, + "completions/max_terminated_length": 5731.8, + "completions/mean_length": 1000.9666870117187, + "completions/mean_terminated_length": 1000.9666870117187, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.14965294301045273, + "grad_norm": 0.6629376539645357, + "kl": 0.0606658935546875, + "learning_rate": 2.653768320133249e-06, + "loss": 0.0036, + "num_tokens": 256050487.0, + "reward": 1.1544271230697631, + "reward_std": 0.14360256791114806, + "rewards/format_reward/mean": 0.9739583253860473, + "rewards/format_reward/std": 0.15649746656417846, + "rewards/mcq_accuracy_reward/mean": 0.721874988079071, + "rewards/mcq_accuracy_reward/std": 0.44708335399627686, + "rewards/tag_count_reward/mean": 0.75625, + "rewards/tag_count_reward/std": 0.03805009052157402, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3642.2, + "completions/max_terminated_length": 3642.2, + "completions/mean_length": 921.5651245117188, + "completions/mean_terminated_length": 921.5651245117188, + "completions/min_length": 290.6, + "completions/min_terminated_length": 290.6, + "epoch": 0.1509656881245795, + "grad_norm": 0.2833419177880032, + "kl": 0.0765777587890625, + "learning_rate": 2.6449301675068333e-06, + "loss": 0.0036, + "num_tokens": 258119052.0, + "reward": 1.1480794668197631, + "reward_std": 0.11862675398588181, + "rewards/format_reward/mean": 0.9510416746139526, + "rewards/format_reward/std": 0.20895527303218842, + "rewards/mcq_accuracy_reward/mean": 0.7208333253860474, + "rewards/mcq_accuracy_reward/std": 0.4487335324287415, + "rewards/tag_count_reward/mean": 0.7579427003860474, + "rewards/tag_count_reward/std": 0.052790357172489165, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015624999999999778, + "completions/max_length": 6627.4, + "completions/max_terminated_length": 4780.6, + "completions/mean_length": 1146.55732421875, + "completions/mean_terminated_length": 1135.5760009765625, + "completions/min_length": 305.8, + "completions/min_terminated_length": 305.8, + "epoch": 0.1522784332387063, + "grad_norm": 0.33358201038036384, + "kl": 0.082269287109375, + "learning_rate": 2.6359957426610815e-06, + "loss": 0.0269, + "num_tokens": 260614746.0, + "reward": 1.0840820789337158, + "reward_std": 0.19902680516242982, + "rewards/format_reward/mean": 0.8432291507720947, + "rewards/format_reward/std": 0.3611042559146881, + "rewards/mcq_accuracy_reward/mean": 0.6802083373069763, + "rewards/mcq_accuracy_reward/std": 0.46300193667411804, + "rewards/tag_count_reward/mean": 0.772265636920929, + "rewards/tag_count_reward/std": 0.09754357039928437, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5595.6, + "completions/max_terminated_length": 4531.6, + "completions/mean_length": 1012.8130615234375, + "completions/mean_terminated_length": 1005.4170166015625, + "completions/min_length": 310.6, + "completions/min_terminated_length": 310.6, + "epoch": 0.15359117835283306, + "grad_norm": 0.2523109475747213, + "kl": 0.08853759765625, + "learning_rate": 2.626965796853088e-06, + "loss": 0.0217, + "num_tokens": 262861883.0, + "reward": 1.1301106929779052, + "reward_std": 0.138107630610466, + "rewards/format_reward/mean": 0.9494791626930237, + "rewards/format_reward/std": 0.20458437204360963, + "rewards/mcq_accuracy_reward/mean": 0.703125, + "rewards/mcq_accuracy_reward/std": 0.4504086494445801, + "rewards/tag_count_reward/mean": 0.7584635376930237, + "rewards/tag_count_reward/std": 0.056847980618476866, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3845.4, + "completions/max_terminated_length": 3845.4, + "completions/mean_length": 984.94326171875, + "completions/mean_terminated_length": 984.94326171875, + "completions/min_length": 347.8, + "completions/min_terminated_length": 347.8, + "epoch": 0.15490392346695986, + "grad_norm": 0.28113354464842577, + "kl": 0.090582275390625, + "learning_rate": 2.6178410893718873e-06, + "loss": 0.0182, + "num_tokens": 265058926.0, + "reward": 1.1351562976837157, + "reward_std": 0.15236479938030242, + "rewards/format_reward/mean": 0.95625, + "rewards/format_reward/std": 0.20097582340240477, + "rewards/mcq_accuracy_reward/mean": 0.706250011920929, + "rewards/mcq_accuracy_reward/std": 0.4482737958431244, + "rewards/tag_count_reward/mean": 0.759375, + "rewards/tag_count_reward/std": 0.05579227134585381, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4442.0, + "completions/max_terminated_length": 4442.0, + "completions/mean_length": 1068.7864990234375, + "completions/mean_terminated_length": 1068.7864990234375, + "completions/min_length": 329.6, + "completions/min_terminated_length": 329.6, + "epoch": 0.15621666858108663, + "grad_norm": 0.32968623215666726, + "kl": 0.095965576171875, + "learning_rate": 2.6086223874746145e-06, + "loss": -0.0022, + "num_tokens": 267409396.0, + "reward": 1.1165039539337158, + "reward_std": 0.1593833565711975, + "rewards/format_reward/mean": 0.8791666626930237, + "rewards/format_reward/std": 0.30755018889904023, + "rewards/mcq_accuracy_reward/mean": 0.7026041626930237, + "rewards/mcq_accuracy_reward/std": 0.45486966967582704, + "rewards/tag_count_reward/mean": 0.7764322996139527, + "rewards/tag_count_reward/std": 0.08141107484698296, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4753.8, + "completions/max_terminated_length": 4040.0, + "completions/mean_length": 1079.21044921875, + "completions/mean_terminated_length": 1075.47734375, + "completions/min_length": 180.6, + "completions/min_terminated_length": 180.6, + "epoch": 0.1575294136952134, + "grad_norm": 0.4245217689242739, + "kl": 0.0919097900390625, + "learning_rate": 2.5993104663219893e-06, + "loss": -0.0006, + "num_tokens": 269777136.0, + "reward": 1.0943685293197631, + "reward_std": 0.18973935544490814, + "rewards/format_reward/mean": 0.8140625119209289, + "rewards/format_reward/std": 0.388655948638916, + "rewards/mcq_accuracy_reward/mean": 0.6958333373069763, + "rewards/mcq_accuracy_reward/std": 0.4564329445362091, + "rewards/tag_count_reward/mean": 0.780078136920929, + "rewards/tag_count_reward/std": 0.09761925488710403, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5854.8, + "completions/max_terminated_length": 5026.0, + "completions/mean_length": 988.61826171875, + "completions/mean_terminated_length": 984.8109252929687, + "completions/min_length": 252.4, + "completions/min_terminated_length": 252.4, + "epoch": 0.1588421588093402, + "grad_norm": 0.31175086195733365, + "kl": 0.10859375, + "learning_rate": 2.589906108913132e-06, + "loss": 0.0053, + "num_tokens": 271972347.0, + "reward": 1.1373372554779053, + "reward_std": 0.16227784454822541, + "rewards/format_reward/mean": 0.9276041626930237, + "rewards/format_reward/std": 0.2523292779922485, + "rewards/mcq_accuracy_reward/mean": 0.7171875, + "rewards/mcq_accuracy_reward/std": 0.4466317713260651, + "rewards/tag_count_reward/mean": 0.7529947996139527, + "rewards/tag_count_reward/std": 0.06086684390902519, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.002083333333333304, + "completions/max_length": 7422.2, + "completions/max_terminated_length": 5019.0, + "completions/mean_length": 1024.456298828125, + "completions/mean_terminated_length": 1009.5465209960937, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.16015490392346696, + "grad_norm": 0.3407885548504301, + "kl": 0.1013671875, + "learning_rate": 2.5804101060197294e-06, + "loss": 0.0296, + "num_tokens": 274239439.0, + "reward": 1.1067057609558106, + "reward_std": 0.15819495618343354, + "rewards/format_reward/mean": 0.95, + "rewards/format_reward/std": 0.21575847566127776, + "rewards/mcq_accuracy_reward/mean": 0.6802083373069763, + "rewards/mcq_accuracy_reward/std": 0.46360679268836974, + "rewards/tag_count_reward/mean": 0.7559895873069763, + "rewards/tag_count_reward/std": 0.055414225161075595, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5708.4, + "completions/max_terminated_length": 4399.2, + "completions/mean_length": 1024.0218994140625, + "completions/mean_terminated_length": 1016.4803100585938, + "completions/min_length": 176.4, + "completions/min_terminated_length": 176.4, + "epoch": 0.16146764903759372, + "grad_norm": 0.3057735273353973, + "kl": 0.102655029296875, + "learning_rate": 2.5708232561195393e-06, + "loss": 0.01, + "num_tokens": 276503977.0, + "reward": 1.1376953601837159, + "reward_std": 0.15699831247329712, + "rewards/format_reward/mean": 0.9072916626930236, + "rewards/format_reward/std": 0.2886850744485855, + "rewards/mcq_accuracy_reward/mean": 0.7197916626930236, + "rewards/mcq_accuracy_reward/std": 0.44838468432426454, + "rewards/tag_count_reward/mean": 0.7643229246139527, + "rewards/tag_count_reward/std": 0.070687335729599, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5456.8, + "completions/max_terminated_length": 4621.2, + "completions/mean_length": 1022.0906494140625, + "completions/mean_terminated_length": 1018.36103515625, + "completions/min_length": 281.4, + "completions/min_terminated_length": 281.4, + "epoch": 0.16278039415172052, + "grad_norm": 0.28255538839800876, + "kl": 0.10247802734375, + "learning_rate": 2.5611463653292526e-06, + "loss": 0.009, + "num_tokens": 278763647.0, + "reward": 1.1120443105697633, + "reward_std": 0.14500748366117477, + "rewards/format_reward/mean": 0.9619791626930236, + "rewards/format_reward/std": 0.18564997911453246, + "rewards/mcq_accuracy_reward/mean": 0.6833333373069763, + "rewards/mcq_accuracy_reward/std": 0.4617635548114777, + "rewards/tag_count_reward/mean": 0.7528645873069764, + "rewards/tag_count_reward/std": 0.05139733701944351, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5036.0, + "completions/max_terminated_length": 5036.0, + "completions/mean_length": 1099.5187744140626, + "completions/mean_terminated_length": 1099.5187744140626, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.1640931392658473, + "grad_norm": 0.2674078471694753, + "kl": 0.0955718994140625, + "learning_rate": 2.551380247336705e-06, + "loss": 0.0119, + "num_tokens": 281171915.0, + "reward": 1.14716796875, + "reward_std": 0.13423814177513121, + "rewards/format_reward/mean": 0.9859375, + "rewards/format_reward/std": 0.11680568009614944, + "rewards/mcq_accuracy_reward/mean": 0.7130208373069763, + "rewards/mcq_accuracy_reward/std": 0.4499955832958221, + "rewards/tag_count_reward/mean": 0.7506510496139527, + "rewards/tag_count_reward/std": 0.03345811441540718, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3848.0, + "completions/max_terminated_length": 3848.0, + "completions/mean_length": 1002.9067993164062, + "completions/mean_terminated_length": 1002.9067993164062, + "completions/min_length": 318.4, + "completions/min_terminated_length": 318.4, + "epoch": 0.16540588437997408, + "grad_norm": 0.2588835570779672, + "kl": 0.101873779296875, + "learning_rate": 2.5415257233324646e-06, + "loss": 0.0131, + "num_tokens": 283390496.0, + "reward": 1.1674479484558105, + "reward_std": 0.11202247887849807, + "rewards/format_reward/mean": 0.9833333253860473, + "rewards/format_reward/std": 0.12443470656871795, + "rewards/mcq_accuracy_reward/mean": 0.7338541626930237, + "rewards/mcq_accuracy_reward/std": 0.43885971903800963, + "rewards/tag_count_reward/mean": 0.7510416626930236, + "rewards/tag_count_reward/std": 0.029071030020713807, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5375.6, + "completions/max_terminated_length": 5375.6, + "completions/mean_length": 1154.6885498046875, + "completions/mean_terminated_length": 1154.6885498046875, + "completions/min_length": 316.4, + "completions/min_terminated_length": 316.4, + "epoch": 0.16671862949410085, + "grad_norm": 0.25099269544522196, + "kl": 0.0910186767578125, + "learning_rate": 2.5315836219407777e-06, + "loss": 0.0074, + "num_tokens": 285910282.0, + "reward": 1.101985716819763, + "reward_std": 0.17357454299926758, + "rewards/format_reward/mean": 0.9567708373069763, + "rewards/format_reward/std": 0.20307721197605133, + "rewards/mcq_accuracy_reward/mean": 0.674999988079071, + "rewards/mcq_accuracy_reward/std": 0.4670876502990723, + "rewards/tag_count_reward/mean": 0.751171875, + "rewards/tag_count_reward/std": 0.05102614611387253, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4645.8, + "completions/max_terminated_length": 4645.8, + "completions/mean_length": 1014.6609741210938, + "completions/mean_terminated_length": 1014.6609741210938, + "completions/min_length": 297.2, + "completions/min_terminated_length": 297.2, + "epoch": 0.16803137460822762, + "grad_norm": 0.2762476527966176, + "kl": 0.0971466064453125, + "learning_rate": 2.5215547791498923e-06, + "loss": 0.006, + "num_tokens": 288156479.0, + "reward": 1.1187174797058106, + "reward_std": 0.1557531863451004, + "rewards/format_reward/mean": 0.9604166626930237, + "rewards/format_reward/std": 0.19489842057228088, + "rewards/mcq_accuracy_reward/mean": 0.6901041746139527, + "rewards/mcq_accuracy_reward/std": 0.4620574116706848, + "rewards/tag_count_reward/mean": 0.7540364503860474, + "rewards/tag_count_reward/std": 0.04608112797141075, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6067.4, + "completions/max_terminated_length": 6067.4, + "completions/mean_length": 1071.4401245117188, + "completions/mean_terminated_length": 1071.4401245117188, + "completions/min_length": 264.2, + "completions/min_terminated_length": 264.2, + "epoch": 0.1693441197223544, + "grad_norm": 0.3397691304064965, + "kl": 0.092620849609375, + "learning_rate": 2.511440038241766e-06, + "loss": 0.0085, + "num_tokens": 290512636.0, + "reward": 1.1400065422058105, + "reward_std": 0.17376405298709868, + "rewards/format_reward/mean": 0.9677083373069764, + "rewards/format_reward/std": 0.17367684543132783, + "rewards/mcq_accuracy_reward/mean": 0.7098958253860473, + "rewards/mcq_accuracy_reward/std": 0.4493175148963928, + "rewards/tag_count_reward/mean": 0.752734386920929, + "rewards/tag_count_reward/std": 0.04237373508512974, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4338.4, + "completions/max_terminated_length": 4338.4, + "completions/mean_length": 1013.9291748046875, + "completions/mean_terminated_length": 1013.9291748046875, + "completions/min_length": 278.4, + "completions/min_terminated_length": 278.4, + "epoch": 0.17065686483648118, + "grad_norm": 0.28143597809949467, + "kl": 0.09576416015625, + "learning_rate": 2.501240249721157e-06, + "loss": 0.0093, + "num_tokens": 292753500.0, + "reward": 1.1295247793197631, + "reward_std": 0.12028966844081879, + "rewards/format_reward/mean": 0.9677083373069764, + "rewards/format_reward/std": 0.17512830197811127, + "rewards/mcq_accuracy_reward/mean": 0.6994791626930237, + "rewards/mcq_accuracy_reward/std": 0.4571451902389526, + "rewards/tag_count_reward/mean": 0.7524739503860474, + "rewards/tag_count_reward/std": 0.04040663465857506, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4301.6, + "completions/max_terminated_length": 4301.6, + "completions/mean_length": 1149.34169921875, + "completions/mean_terminated_length": 1149.34169921875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.17196960995060798, + "grad_norm": 0.22386904325769802, + "kl": 0.089056396484375, + "learning_rate": 2.4909562712441092e-06, + "loss": 0.0053, + "num_tokens": 295258940.0, + "reward": 1.0865234851837158, + "reward_std": 0.1424835965037346, + "rewards/format_reward/mean": 0.9911458373069764, + "rewards/format_reward/std": 0.09219671040773392, + "rewards/mcq_accuracy_reward/mean": 0.6510416626930237, + "rewards/mcq_accuracy_reward/std": 0.4731388509273529, + "rewards/tag_count_reward/mean": 0.750781238079071, + "rewards/tag_count_reward/std": 0.026772572472691537, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4617.4, + "completions/max_terminated_length": 3890.0, + "completions/mean_length": 1031.8010620117188, + "completions/mean_terminated_length": 1028.1223754882812, + "completions/min_length": 329.4, + "completions/min_terminated_length": 329.4, + "epoch": 0.17328235506473474, + "grad_norm": 0.2931320702379845, + "kl": 0.09725341796875, + "learning_rate": 2.480588967545835e-06, + "loss": 0.0148, + "num_tokens": 297537750.0, + "reward": 1.1160807609558105, + "reward_std": 0.11506828516721726, + "rewards/format_reward/mean": 0.9692708253860474, + "rewards/format_reward/std": 0.171831214427948, + "rewards/mcq_accuracy_reward/mean": 0.6848958373069763, + "rewards/mcq_accuracy_reward/std": 0.4624066174030304, + "rewards/tag_count_reward/mean": 0.75546875, + "rewards/tag_count_reward/std": 0.042606911063194274, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4014.0, + "completions/max_terminated_length": 4014.0, + "completions/mean_length": 1045.518798828125, + "completions/mean_terminated_length": 1045.518798828125, + "completions/min_length": 261.6, + "completions/min_terminated_length": 261.6, + "epoch": 0.1745951001788615, + "grad_norm": 0.4029213201646986, + "kl": 0.098211669921875, + "learning_rate": 2.4701392103680032e-06, + "loss": 0.0302, + "num_tokens": 299843994.0, + "reward": 1.1408854484558106, + "reward_std": 0.16271174997091292, + "rewards/format_reward/mean": 0.9630208373069763, + "rewards/format_reward/std": 0.18720515668392182, + "rewards/mcq_accuracy_reward/mean": 0.7072916626930237, + "rewards/mcq_accuracy_reward/std": 0.45043286085128786, + "rewards/tag_count_reward/mean": 0.7713541626930237, + "rewards/tag_count_reward/std": 0.07240235581994056, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4697.4, + "completions/max_terminated_length": 4697.4, + "completions/mean_length": 950.3083618164062, + "completions/mean_terminated_length": 950.3083618164062, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.1759078452929883, + "grad_norm": 0.30937156743704614, + "kl": 0.0913604736328125, + "learning_rate": 2.4596078783854395e-06, + "loss": 0.0527, + "num_tokens": 301970506.0, + "reward": 1.2098958969116211, + "reward_std": 0.15293447375297547, + "rewards/format_reward/mean": 0.9744791746139526, + "rewards/format_reward/std": 0.1493392065167427, + "rewards/mcq_accuracy_reward/mean": 0.7343749880790711, + "rewards/mcq_accuracy_reward/std": 0.4410438001155853, + "rewards/tag_count_reward/mean": 0.9276041746139526, + "rewards/tag_count_reward/std": 0.10165828615427017, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3577.0, + "completions/max_terminated_length": 3577.0, + "completions/mean_length": 810.6135620117187, + "completions/mean_terminated_length": 810.6135620117187, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.17722059040711508, + "grad_norm": 0.24334673661454137, + "kl": 0.0971405029296875, + "learning_rate": 2.448995857132242e-06, + "loss": 0.0138, + "num_tokens": 303819620.0, + "reward": 1.1772786855697632, + "reward_std": 0.13102926015853883, + "rewards/format_reward/mean": 0.9963541626930237, + "rewards/format_reward/std": 0.05944842398166657, + "rewards/mcq_accuracy_reward/mean": 0.6791666626930237, + "rewards/mcq_accuracy_reward/std": 0.46516323685646055, + "rewards/tag_count_reward/mean": 0.9960937619209289, + "rewards/tag_count_reward/std": 0.04132447391748428, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4475.4, + "completions/max_terminated_length": 4475.4, + "completions/mean_length": 891.2484619140625, + "completions/mean_terminated_length": 891.2484619140625, + "completions/min_length": 236.4, + "completions/min_terminated_length": 236.4, + "epoch": 0.17853333552124187, + "grad_norm": 0.20827476934192937, + "kl": 0.09639892578125, + "learning_rate": 2.4383040389273177e-06, + "loss": 0.0098, + "num_tokens": 305831441.0, + "reward": 1.206445336341858, + "reward_std": 0.12271949425339698, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.05988401472568512, + "rewards/mcq_accuracy_reward/mean": 0.7083333373069763, + "rewards/mcq_accuracy_reward/std": 0.447719019651413, + "rewards/tag_count_reward/mean": 0.9971354246139527, + "rewards/tag_count_reward/std": 0.03434977605938912, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5095.2, + "completions/max_terminated_length": 4058.8, + "completions/mean_length": 830.3276245117188, + "completions/mean_terminated_length": 826.486279296875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.17984608063536864, + "grad_norm": 0.3633038076338155, + "kl": 0.107373046875, + "learning_rate": 2.4275333227993577e-06, + "loss": 0.007, + "num_tokens": 307722086.0, + "reward": 1.2303711414337157, + "reward_std": 0.10993013381958008, + "rewards/format_reward/mean": 0.9864583373069763, + "rewards/format_reward/std": 0.11256087720394134, + "rewards/mcq_accuracy_reward/mean": 0.7354166746139527, + "rewards/mcq_accuracy_reward/std": 0.4411444842815399, + "rewards/tag_count_reward/mean": 0.993359375, + "rewards/tag_count_reward/std": 0.050104308500885965, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5100.4, + "completions/max_terminated_length": 4223.6, + "completions/mean_length": 828.2906494140625, + "completions/mean_terminated_length": 824.4663940429688, + "completions/min_length": 95.8, + "completions/min_terminated_length": 95.8, + "epoch": 0.1811588257494954, + "grad_norm": 0.378669584225402, + "kl": 0.1189208984375, + "learning_rate": 2.416684614411234e-06, + "loss": 0.0304, + "num_tokens": 309606964.0, + "reward": 1.1665039539337159, + "reward_std": 0.1571077048778534, + "rewards/format_reward/mean": 0.9671875, + "rewards/format_reward/std": 0.1764684796333313, + "rewards/mcq_accuracy_reward/mean": 0.678125, + "rewards/mcq_accuracy_reward/std": 0.4639050602912903, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.06384196206927299, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3740.6, + "completions/max_terminated_length": 3740.6, + "completions/mean_length": 798.8682495117188, + "completions/mean_terminated_length": 798.8682495117188, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.1824715708636222, + "grad_norm": 0.38229619321322356, + "kl": 0.11539306640625, + "learning_rate": 2.4057588259838516e-06, + "loss": 0.0151, + "num_tokens": 311443463.0, + "reward": 1.1821940660476684, + "reward_std": 0.1349425196647644, + "rewards/format_reward/mean": 0.9661458253860473, + "rewards/format_reward/std": 0.17904312610626222, + "rewards/mcq_accuracy_reward/mean": 0.6932291626930237, + "rewards/mcq_accuracy_reward/std": 0.4592526853084564, + "rewards/tag_count_reward/mean": 0.9897135496139526, + "rewards/tag_count_reward/std": 0.05400870144367218, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3791.4, + "completions/max_terminated_length": 3791.4, + "completions/mean_length": 837.1546997070312, + "completions/mean_terminated_length": 837.1546997070312, + "completions/min_length": 127.6, + "completions/min_terminated_length": 127.6, + "epoch": 0.18378431597774897, + "grad_norm": 0.30946545828778366, + "kl": 0.105487060546875, + "learning_rate": 2.394756876219443e-06, + "loss": 0.0195, + "num_tokens": 313350720.0, + "reward": 1.2077474355697633, + "reward_std": 0.13557008504867554, + "rewards/format_reward/mean": 0.975, + "rewards/format_reward/std": 0.15502002239227294, + "rewards/mcq_accuracy_reward/mean": 0.715624988079071, + "rewards/mcq_accuracy_reward/std": 0.44962496161460874, + "rewards/tag_count_reward/mean": 0.9934895753860473, + "rewards/tag_count_reward/std": 0.04399415552616119, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4273.8, + "completions/max_terminated_length": 4273.8, + "completions/mean_length": 977.7422119140625, + "completions/mean_terminated_length": 977.7422119140625, + "completions/min_length": 211.6, + "completions/min_terminated_length": 211.6, + "epoch": 0.18509706109187574, + "grad_norm": 0.3570018799807185, + "kl": 0.09617919921875, + "learning_rate": 2.383679690224316e-06, + "loss": 0.0151, + "num_tokens": 315527665.0, + "reward": 1.1819010734558106, + "reward_std": 0.14875901490449905, + "rewards/format_reward/mean": 0.9604166746139526, + "rewards/format_reward/std": 0.19436180591583252, + "rewards/mcq_accuracy_reward/mean": 0.6958333253860474, + "rewards/mcq_accuracy_reward/std": 0.4571181952953339, + "rewards/tag_count_reward/mean": 0.9838541746139526, + "rewards/tag_count_reward/std": 0.06378985792398453, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4853.4, + "completions/max_terminated_length": 4853.4, + "completions/mean_length": 1073.3177368164063, + "completions/mean_terminated_length": 1073.3177368164063, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.18640980620600253, + "grad_norm": 0.2415514436805403, + "kl": 0.093927001953125, + "learning_rate": 2.3725281994310674e-06, + "loss": 0.0264, + "num_tokens": 317889003.0, + "reward": 1.1770833730697632, + "reward_std": 0.1562931627035141, + "rewards/format_reward/mean": 0.9697916746139527, + "rewards/format_reward/std": 0.16718428432941437, + "rewards/mcq_accuracy_reward/mean": 0.6869791626930237, + "rewards/mcq_accuracy_reward/std": 0.459024053812027, + "rewards/tag_count_reward/mean": 0.990625, + "rewards/tag_count_reward/std": 0.047656980156898496, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3354.4, + "completions/max_terminated_length": 3354.4, + "completions/mean_length": 804.5828369140625, + "completions/mean_terminated_length": 804.5828369140625, + "completions/min_length": 211.2, + "completions/min_terminated_length": 211.2, + "epoch": 0.1877225513201293, + "grad_norm": 0.3037091660090706, + "kl": 0.1019287109375, + "learning_rate": 2.361303341520265e-06, + "loss": 0.0181, + "num_tokens": 319733258.0, + "reward": 1.2174479722976685, + "reward_std": 0.1454225778579712, + "rewards/format_reward/mean": 0.9828125, + "rewards/format_reward/std": 0.12946273684501647, + "rewards/mcq_accuracy_reward/mean": 0.7229166626930237, + "rewards/mcq_accuracy_reward/std": 0.4425632655620575, + "rewards/tag_count_reward/mean": 0.9953125, + "rewards/tag_count_reward/std": 0.03198712095618248, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4266.2, + "completions/max_terminated_length": 4266.2, + "completions/mean_length": 877.1494995117188, + "completions/mean_terminated_length": 877.1494995117188, + "completions/min_length": 191.2, + "completions/min_terminated_length": 191.2, + "epoch": 0.1890352964342561, + "grad_norm": 0.30282678832070237, + "kl": 0.09903564453125, + "learning_rate": 2.3500060603415964e-06, + "loss": 0.017, + "num_tokens": 321714697.0, + "reward": 1.2045573234558105, + "reward_std": 0.11696214526891709, + "rewards/format_reward/mean": 0.9895833373069763, + "rewards/format_reward/std": 0.10099979043006897, + "rewards/mcq_accuracy_reward/mean": 0.7088541626930237, + "rewards/mcq_accuracy_reward/std": 0.45111257433891294, + "rewards/tag_count_reward/mean": 0.9932291626930236, + "rewards/tag_count_reward/std": 0.04236720055341721, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4630.2, + "completions/max_terminated_length": 4630.2, + "completions/mean_length": 904.8312622070313, + "completions/mean_terminated_length": 904.8312622070313, + "completions/min_length": 214.6, + "completions/min_terminated_length": 214.6, + "epoch": 0.19034804154838286, + "grad_norm": 0.2813692043039447, + "kl": 0.107977294921875, + "learning_rate": 2.33863730583451e-06, + "loss": 0.0205, + "num_tokens": 323746629.0, + "reward": 1.2071940660476685, + "reward_std": 0.1388883501291275, + "rewards/format_reward/mean": 0.9932291746139527, + "rewards/format_reward/std": 0.07936974763870239, + "rewards/mcq_accuracy_reward/mean": 0.7114583373069763, + "rewards/mcq_accuracy_reward/std": 0.45128917694091797, + "rewards/tag_count_reward/mean": 0.9897135496139526, + "rewards/tag_count_reward/std": 0.056111722439527514, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4240.4, + "completions/max_terminated_length": 4240.4, + "completions/mean_length": 830.8906494140625, + "completions/mean_terminated_length": 830.8906494140625, + "completions/min_length": 199.6, + "completions/min_terminated_length": 199.6, + "epoch": 0.19166078666250963, + "grad_norm": 0.3418030958149265, + "kl": 0.117681884765625, + "learning_rate": 2.3271980339483376e-06, + "loss": 0.0249, + "num_tokens": 325638643.0, + "reward": 1.2385091543197633, + "reward_std": 0.13196541368961334, + "rewards/format_reward/mean": 0.9864583373069763, + "rewards/format_reward/std": 0.11410482078790665, + "rewards/mcq_accuracy_reward/mean": 0.7442708373069763, + "rewards/mcq_accuracy_reward/std": 0.429628324508667, + "rewards/tag_count_reward/mean": 0.9904947996139526, + "rewards/tag_count_reward/std": 0.05054264217615127, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4042.2, + "completions/max_terminated_length": 4042.2, + "completions/mean_length": 867.1703247070312, + "completions/mean_terminated_length": 867.1703247070312, + "completions/min_length": 158.6, + "completions/min_terminated_length": 158.6, + "epoch": 0.19297353177663643, + "grad_norm": 0.3070057635549287, + "kl": 0.112152099609375, + "learning_rate": 2.3156892065619104e-06, + "loss": 0.0219, + "num_tokens": 327597746.0, + "reward": 1.2247396469116212, + "reward_std": 0.14504944980144502, + "rewards/format_reward/mean": 0.9854166746139527, + "rewards/format_reward/std": 0.11686484217643738, + "rewards/mcq_accuracy_reward/mean": 0.7307291746139526, + "rewards/mcq_accuracy_reward/std": 0.4406039535999298, + "rewards/tag_count_reward/mean": 0.990625, + "rewards/tag_count_reward/std": 0.04738780185580253, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5245.2, + "completions/max_terminated_length": 4408.0, + "completions/mean_length": 1005.84013671875, + "completions/mean_terminated_length": 1002.1602294921875, + "completions/min_length": 230.8, + "completions/min_terminated_length": 230.8, + "epoch": 0.1942862768907632, + "grad_norm": 0.2801749263835448, + "kl": 0.10025634765625, + "learning_rate": 2.304111791402681e-06, + "loss": 0.0184, + "num_tokens": 329827815.0, + "reward": 1.2093099355697632, + "reward_std": 0.14343093633651732, + "rewards/format_reward/mean": 0.9864583253860474, + "rewards/format_reward/std": 0.11499392688274383, + "rewards/mcq_accuracy_reward/mean": 0.715625, + "rewards/mcq_accuracy_reward/std": 0.4490103185176849, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.05368418544530869, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4766.4, + "completions/max_terminated_length": 4766.4, + "completions/mean_length": 1018.3146118164062, + "completions/mean_terminated_length": 1018.3146118164062, + "completions/min_length": 233.8, + "completions/min_terminated_length": 233.8, + "epoch": 0.19559902200489, + "grad_norm": 0.27471521636326646, + "kl": 0.0914703369140625, + "learning_rate": 2.292466761965351e-06, + "loss": 0.0153, + "num_tokens": 332085059.0, + "reward": 1.1575196027755736, + "reward_std": 0.13835495710372925, + "rewards/format_reward/mean": 0.9848958253860474, + "rewards/format_reward/std": 0.11914153993129731, + "rewards/mcq_accuracy_reward/mean": 0.6635416746139526, + "rewards/mcq_accuracy_reward/std": 0.47156142592430117, + "rewards/tag_count_reward/mean": 0.991015625, + "rewards/tag_count_reward/std": 0.04784061089158058, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5084.2, + "completions/max_terminated_length": 5084.2, + "completions/mean_length": 894.000537109375, + "completions/mean_terminated_length": 894.000537109375, + "completions/min_length": 191.8, + "completions/min_terminated_length": 191.8, + "epoch": 0.19691176711901676, + "grad_norm": 0.3768333858860703, + "kl": 0.099725341796875, + "learning_rate": 2.2807550974300127e-06, + "loss": 0.0154, + "num_tokens": 334097268.0, + "reward": 1.189388060569763, + "reward_std": 0.12904826253652574, + "rewards/format_reward/mean": 0.9880208253860474, + "rewards/format_reward/std": 0.10168309658765792, + "rewards/mcq_accuracy_reward/mean": 0.6947916626930237, + "rewards/mcq_accuracy_reward/std": 0.4547412574291229, + "rewards/tag_count_reward/mean": 0.9903645873069763, + "rewards/tag_count_reward/std": 0.049686431884765625, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3930.4, + "completions/max_terminated_length": 3930.4, + "completions/mean_length": 844.750537109375, + "completions/mean_terminated_length": 844.750537109375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.19822451223314352, + "grad_norm": 0.2988832459603959, + "kl": 0.103009033203125, + "learning_rate": 2.2689777825798176e-06, + "loss": 0.0126, + "num_tokens": 336012189.0, + "reward": 1.2101237297058105, + "reward_std": 0.15491734445095062, + "rewards/format_reward/mean": 0.9848958253860474, + "rewards/format_reward/std": 0.1214523509144783, + "rewards/mcq_accuracy_reward/mean": 0.715624988079071, + "rewards/mcq_accuracy_reward/std": 0.44589301347732546, + "rewards/tag_count_reward/mean": 0.9930989503860473, + "rewards/tag_count_reward/std": 0.0409212127327919, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3499.4, + "completions/max_terminated_length": 3499.4, + "completions/mean_length": 823.6994995117187, + "completions/mean_terminated_length": 823.6994995117187, + "completions/min_length": 216.2, + "completions/min_terminated_length": 216.2, + "epoch": 0.19953725734727032, + "grad_norm": 0.3134942167542027, + "kl": 0.109033203125, + "learning_rate": 2.2571358077181657e-06, + "loss": 0.0186, + "num_tokens": 337889132.0, + "reward": 1.1647786855697633, + "reward_std": 0.15706235468387603, + "rewards/format_reward/mean": 0.9869791626930237, + "rewards/format_reward/std": 0.10998225510120392, + "rewards/mcq_accuracy_reward/mean": 0.6703125, + "rewards/mcq_accuracy_reward/std": 0.47007850408554075, + "rewards/tag_count_reward/mean": 0.9908854246139527, + "rewards/tag_count_reward/std": 0.049467197805643084, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4582.0, + "completions/max_terminated_length": 4495.6, + "completions/mean_length": 888.8443115234375, + "completions/mean_terminated_length": 885.1675048828125, + "completions/min_length": 198.6, + "completions/min_terminated_length": 198.6, + "epoch": 0.2008500024613971, + "grad_norm": 0.3462508352844443, + "kl": 0.108099365234375, + "learning_rate": 2.2452301685854377e-06, + "loss": 0.0141, + "num_tokens": 339893625.0, + "reward": 1.1956380605697632, + "reward_std": 0.1353281930088997, + "rewards/format_reward/mean": 0.9848958373069763, + "rewards/format_reward/std": 0.11764556318521499, + "rewards/mcq_accuracy_reward/mean": 0.7020833253860473, + "rewards/mcq_accuracy_reward/std": 0.45713489055633544, + "rewards/tag_count_reward/mean": 0.9893229126930236, + "rewards/tag_count_reward/std": 0.051869630068540576, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4027.6, + "completions/max_terminated_length": 4027.6, + "completions/mean_length": 877.772412109375, + "completions/mean_terminated_length": 877.772412109375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.20216274757552385, + "grad_norm": 0.34425566253117373, + "kl": 0.1074462890625, + "learning_rate": 2.233261866275268e-06, + "loss": 0.0215, + "num_tokens": 341877396.0, + "reward": 1.1619466543197632, + "reward_std": 0.13299514204263688, + "rewards/format_reward/mean": 0.9911458373069764, + "rewards/format_reward/std": 0.09271240532398224, + "rewards/mcq_accuracy_reward/mean": 0.667187511920929, + "rewards/mcq_accuracy_reward/std": 0.4692698299884796, + "rewards/tag_count_reward/mean": 0.9878906369209289, + "rewards/tag_count_reward/std": 0.054040233790874484, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5088.6, + "completions/max_terminated_length": 4923.8, + "completions/mean_length": 892.339599609375, + "completions/mean_terminated_length": 888.6330810546875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.20347549268965065, + "grad_norm": 0.336511321766566, + "kl": 0.110089111328125, + "learning_rate": 2.2212319071503655e-06, + "loss": 0.0312, + "num_tokens": 343888232.0, + "reward": 1.2074870109558105, + "reward_std": 0.11935290098190307, + "rewards/format_reward/mean": 0.9973958373069763, + "rewards/format_reward/std": 0.03903600871562958, + "rewards/mcq_accuracy_reward/mean": 0.7114583373069763, + "rewards/mcq_accuracy_reward/std": 0.451280277967453, + "rewards/tag_count_reward/mean": 0.9867187380790711, + "rewards/tag_count_reward/std": 0.05807850882411003, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3880.8, + "completions/max_terminated_length": 3880.8, + "completions/mean_length": 829.2047119140625, + "completions/mean_terminated_length": 829.2047119140625, + "completions/min_length": 199.4, + "completions/min_terminated_length": 199.4, + "epoch": 0.20478823780377742, + "grad_norm": 0.23578657166141415, + "kl": 0.10751953125, + "learning_rate": 2.2091413027578944e-06, + "loss": 0.0246, + "num_tokens": 345777649.0, + "reward": 1.1837565660476685, + "reward_std": 0.12974026203155517, + "rewards/format_reward/mean": 0.9973958253860473, + "rewards/format_reward/std": 0.038043868541717527, + "rewards/mcq_accuracy_reward/mean": 0.6869791626930237, + "rewards/mcq_accuracy_reward/std": 0.45958949327468873, + "rewards/tag_count_reward/mean": 0.9897135496139526, + "rewards/tag_count_reward/std": 0.0489323228597641, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3892.4, + "completions/max_terminated_length": 3892.4, + "completions/mean_length": 833.5875244140625, + "completions/mean_terminated_length": 833.5875244140625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.2061009829179042, + "grad_norm": 0.4512967934737037, + "kl": 0.1091552734375, + "learning_rate": 2.196991069744415e-06, + "loss": 0.0212, + "num_tokens": 347675849.0, + "reward": 1.1695964097976685, + "reward_std": 0.12535380721092224, + "rewards/format_reward/mean": 0.9973958373069763, + "rewards/format_reward/std": 0.03903600871562958, + "rewards/mcq_accuracy_reward/mean": 0.6723958373069763, + "rewards/mcq_accuracy_reward/std": 0.4641672134399414, + "rewards/tag_count_reward/mean": 0.99140625, + "rewards/tag_count_reward/std": 0.04575772508978844, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5297.0, + "completions/max_terminated_length": 5178.8, + "completions/mean_length": 919.3922119140625, + "completions/mean_terminated_length": 915.6112426757812, + "completions/min_length": 146.2, + "completions/min_terminated_length": 146.2, + "epoch": 0.20741372803203098, + "grad_norm": 0.2764327244680889, + "kl": 0.10242919921875, + "learning_rate": 2.1847822297704015e-06, + "loss": 0.0253, + "num_tokens": 349737986.0, + "reward": 1.1909831285476684, + "reward_std": 0.14769960641860963, + "rewards/format_reward/mean": 0.9984374880790711, + "rewards/format_reward/std": 0.03061862289905548, + "rewards/mcq_accuracy_reward/mean": 0.69375, + "rewards/mcq_accuracy_reward/std": 0.45690102577209474, + "rewards/tag_count_reward/mean": 0.9904947876930237, + "rewards/tag_count_reward/std": 0.04974740073084831, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4754.0, + "completions/max_terminated_length": 4754.0, + "completions/mean_length": 929.5567993164062, + "completions/mean_terminated_length": 929.5567993164062, + "completions/min_length": 213.2, + "completions/min_terminated_length": 213.2, + "epoch": 0.20872647314615775, + "grad_norm": 0.3373887535960412, + "kl": 0.1104156494140625, + "learning_rate": 2.1725158094243343e-06, + "loss": 0.025, + "num_tokens": 351820967.0, + "reward": 1.2060872554779052, + "reward_std": 0.14268118292093276, + "rewards/format_reward/mean": 0.995312511920929, + "rewards/format_reward/std": 0.06087615489959717, + "rewards/mcq_accuracy_reward/mean": 0.7098958253860473, + "rewards/mcq_accuracy_reward/std": 0.45349709391593934, + "rewards/tag_count_reward/mean": 0.989453125, + "rewards/tag_count_reward/std": 0.0499441422522068, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4833.0, + "completions/max_terminated_length": 4833.0, + "completions/mean_length": 879.928662109375, + "completions/mean_terminated_length": 879.928662109375, + "completions/min_length": 208.8, + "completions/min_terminated_length": 208.8, + "epoch": 0.21003921826028454, + "grad_norm": 0.2818023588916798, + "kl": 0.111529541015625, + "learning_rate": 2.1601928401363765e-06, + "loss": 0.0218, + "num_tokens": 353806494.0, + "reward": 1.181250023841858, + "reward_std": 0.123849318921566, + "rewards/format_reward/mean": 0.9958333253860474, + "rewards/format_reward/std": 0.05567532181739807, + "rewards/mcq_accuracy_reward/mean": 0.6854166626930237, + "rewards/mcq_accuracy_reward/std": 0.4643314599990845, + "rewards/tag_count_reward/mean": 0.9875, + "rewards/tag_count_reward/std": 0.05521019995212555, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4347.4, + "completions/max_terminated_length": 4347.4, + "completions/mean_length": 988.566162109375, + "completions/mean_terminated_length": 988.566162109375, + "completions/min_length": 192.4, + "completions/min_terminated_length": 192.4, + "epoch": 0.2113519633744113, + "grad_norm": 0.3011991669770241, + "kl": 0.105316162109375, + "learning_rate": 2.147814358091647e-06, + "loss": 0.0196, + "num_tokens": 356002053.0, + "reward": 1.1870117664337159, + "reward_std": 0.1630198687314987, + "rewards/format_reward/mean": 0.9911458373069764, + "rewards/format_reward/std": 0.0903677523136139, + "rewards/mcq_accuracy_reward/mean": 0.6927083373069763, + "rewards/mcq_accuracy_reward/std": 0.4560886979103088, + "rewards/tag_count_reward/mean": 0.9860677003860474, + "rewards/tag_count_reward/std": 0.0572124108672142, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3987.8, + "completions/max_terminated_length": 3987.8, + "completions/mean_length": 902.5734375, + "completions/mean_terminated_length": 902.5734375, + "completions/min_length": 241.6, + "completions/min_terminated_length": 241.6, + "epoch": 0.2126647084885381, + "grad_norm": 0.8213012076926964, + "kl": 0.10941162109375, + "learning_rate": 2.135381404143093e-06, + "loss": 0.0376, + "num_tokens": 358030274.0, + "reward": 1.1996094465255738, + "reward_std": 0.1476485699415207, + "rewards/format_reward/mean": 0.9932291626930236, + "rewards/format_reward/std": 0.07322667241096496, + "rewards/mcq_accuracy_reward/mean": 0.7041666626930236, + "rewards/mcq_accuracy_reward/std": 0.45592594146728516, + "rewards/tag_count_reward/mean": 0.9885416626930237, + "rewards/tag_count_reward/std": 0.05387606173753738, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4307.6, + "completions/max_terminated_length": 4307.6, + "completions/mean_length": 902.6661743164062, + "completions/mean_terminated_length": 902.6661743164062, + "completions/min_length": 207.8, + "completions/min_terminated_length": 207.8, + "epoch": 0.21397745360266487, + "grad_norm": 0.24638487415575316, + "kl": 0.10924072265625, + "learning_rate": 2.122895023723967e-06, + "loss": 0.0217, + "num_tokens": 360063209.0, + "reward": 1.1451823234558105, + "reward_std": 0.12560669779777528, + "rewards/format_reward/mean": 0.990625, + "rewards/format_reward/std": 0.09018117189407349, + "rewards/mcq_accuracy_reward/mean": 0.6505208253860474, + "rewards/mcq_accuracy_reward/std": 0.47175200581550597, + "rewards/tag_count_reward/mean": 0.9880208373069763, + "rewards/tag_count_reward/std": 0.0554523803293705, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3780.0, + "completions/max_terminated_length": 3780.0, + "completions/mean_length": 940.577099609375, + "completions/mean_terminated_length": 940.577099609375, + "completions/min_length": 244.6, + "completions/min_terminated_length": 244.6, + "epoch": 0.21529019871679164, + "grad_norm": 0.2948823227595438, + "kl": 0.1004638671875, + "learning_rate": 2.1103562667599234e-06, + "loss": 0.0249, + "num_tokens": 362171253.0, + "reward": 1.2054036855697632, + "reward_std": 0.13165797889232636, + "rewards/format_reward/mean": 0.9885416746139526, + "rewards/format_reward/std": 0.10076216459274293, + "rewards/mcq_accuracy_reward/mean": 0.7109375, + "rewards/mcq_accuracy_reward/std": 0.4504377841949463, + "rewards/tag_count_reward/mean": 0.9893229246139527, + "rewards/tag_count_reward/std": 0.05169776976108551, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5362.6, + "completions/max_terminated_length": 4448.6, + "completions/mean_length": 984.2328369140625, + "completions/mean_terminated_length": 980.4164306640625, + "completions/min_length": 251.6, + "completions/min_terminated_length": 251.6, + "epoch": 0.21660294383091844, + "grad_norm": 0.27016037254188985, + "kl": 0.0974761962890625, + "learning_rate": 2.0977661875807316e-06, + "loss": 0.0274, + "num_tokens": 364359324.0, + "reward": 1.1861003160476684, + "reward_std": 0.14154660403728486, + "rewards/format_reward/mean": 0.9947916626930237, + "rewards/format_reward/std": 0.07009022235870362, + "rewards/mcq_accuracy_reward/mean": 0.6901041626930237, + "rewards/mcq_accuracy_reward/std": 0.4561116933822632, + "rewards/tag_count_reward/mean": 0.9891927123069764, + "rewards/tag_count_reward/std": 0.05274080410599709, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4238.2, + "completions/max_terminated_length": 4238.2, + "completions/mean_length": 883.0401245117188, + "completions/mean_terminated_length": 883.0401245117188, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.2179156889450452, + "grad_norm": 0.3164725939462402, + "kl": 0.1029632568359375, + "learning_rate": 2.085125844831625e-06, + "loss": 0.0289, + "num_tokens": 366355097.0, + "reward": 1.1899088859558105, + "reward_std": 0.11709032207727432, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.08558243215084076, + "rewards/mcq_accuracy_reward/mean": 0.69375, + "rewards/mcq_accuracy_reward/std": 0.4574463129043579, + "rewards/tag_count_reward/mean": 0.992968761920929, + "rewards/tag_count_reward/std": 0.040481379628181456, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4188.4, + "completions/max_terminated_length": 4188.4, + "completions/mean_length": 868.9817993164063, + "completions/mean_terminated_length": 868.9817993164063, + "completions/min_length": 225.8, + "completions/min_terminated_length": 225.8, + "epoch": 0.21922843405917197, + "grad_norm": 0.269629411456499, + "kl": 0.1031829833984375, + "learning_rate": 2.0724363013842837e-06, + "loss": 0.0248, + "num_tokens": 368320318.0, + "reward": 1.1896810293197633, + "reward_std": 0.14794873893260957, + "rewards/format_reward/mean": 0.990625, + "rewards/format_reward/std": 0.0943307027220726, + "rewards/mcq_accuracy_reward/mean": 0.6942708253860473, + "rewards/mcq_accuracy_reward/std": 0.4578032672405243, + "rewards/tag_count_reward/mean": 0.991015625, + "rewards/tag_count_reward/std": 0.04928223416209221, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4444.4, + "completions/max_terminated_length": 4444.4, + "completions/mean_length": 885.78388671875, + "completions/mean_terminated_length": 885.78388671875, + "completions/min_length": 239.8, + "completions/min_terminated_length": 239.8, + "epoch": 0.22054117917329877, + "grad_norm": 0.26996694120456644, + "kl": 0.10313720703125, + "learning_rate": 2.059698624247459e-06, + "loss": 0.0291, + "num_tokens": 370315655.0, + "reward": 1.2359375476837158, + "reward_std": 0.13430304378271102, + "rewards/format_reward/mean": 0.9885416746139526, + "rewards/format_reward/std": 0.10607062131166459, + "rewards/mcq_accuracy_reward/mean": 0.7411458373069764, + "rewards/mcq_accuracy_reward/std": 0.43406546115875244, + "rewards/tag_count_reward/mean": 0.990625, + "rewards/tag_count_reward/std": 0.04949425682425499, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4200.4, + "completions/max_terminated_length": 4200.4, + "completions/mean_length": 895.502099609375, + "completions/mean_terminated_length": 895.502099609375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.22185392428742554, + "grad_norm": 0.28496143728529333, + "kl": 0.10233154296875, + "learning_rate": 2.046913884477258e-06, + "loss": 0.0374, + "num_tokens": 372331371.0, + "reward": 1.2172852039337159, + "reward_std": 0.13796493113040925, + "rewards/format_reward/mean": 0.9885416626930237, + "rewards/format_reward/std": 0.10583464056253433, + "rewards/mcq_accuracy_reward/mean": 0.7229166626930237, + "rewards/mcq_accuracy_reward/std": 0.44481814503669737, + "rewards/tag_count_reward/mean": 0.9889322996139527, + "rewards/tag_count_reward/std": 0.051649411022663114, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4320.6, + "completions/max_terminated_length": 4320.6, + "completions/mean_length": 907.2797119140625, + "completions/mean_terminated_length": 907.2797119140625, + "completions/min_length": 240.2, + "completions/min_terminated_length": 240.2, + "epoch": 0.22316666940155233, + "grad_norm": 0.35896664704650666, + "kl": 0.1032470703125, + "learning_rate": 2.03408315708708e-06, + "loss": 0.0247, + "num_tokens": 374371324.0, + "reward": 1.1230794429779052, + "reward_std": 0.1360609158873558, + "rewards/format_reward/mean": 0.99375, + "rewards/format_reward/std": 0.07699977234005928, + "rewards/mcq_accuracy_reward/mean": 0.6276041567325592, + "rewards/mcq_accuracy_reward/std": 0.47708049416542053, + "rewards/tag_count_reward/mean": 0.9881510376930237, + "rewards/tag_count_reward/std": 0.05234459862112999, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5234.0, + "completions/max_terminated_length": 5234.0, + "completions/mean_length": 911.492724609375, + "completions/mean_terminated_length": 911.492724609375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.2244794145156791, + "grad_norm": 0.2838935922499586, + "kl": 0.104302978515625, + "learning_rate": 2.0212075209572234e-06, + "loss": 0.0326, + "num_tokens": 376423142.0, + "reward": 1.233333373069763, + "reward_std": 0.12426867336034775, + "rewards/format_reward/mean": 0.9973958373069763, + "rewards/format_reward/std": 0.03903600871562958, + "rewards/mcq_accuracy_reward/mean": 0.7364583373069763, + "rewards/mcq_accuracy_reward/std": 0.43884278535842897, + "rewards/tag_count_reward/mean": 0.9901041626930237, + "rewards/tag_count_reward/std": 0.04945099204778671, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4701.8, + "completions/max_terminated_length": 4701.8, + "completions/mean_length": 870.2854370117187, + "completions/mean_terminated_length": 870.2854370117187, + "completions/min_length": 223.6, + "completions/min_terminated_length": 223.6, + "epoch": 0.22579215962980587, + "grad_norm": 0.3147914477304938, + "kl": 0.10526123046875, + "learning_rate": 2.0082880587441697e-06, + "loss": 0.0298, + "num_tokens": 378392074.0, + "reward": 1.1909180164337159, + "reward_std": 0.14606537222862243, + "rewards/format_reward/mean": 0.9958333373069763, + "rewards/format_reward/std": 0.05666746199131012, + "rewards/mcq_accuracy_reward/mean": 0.6942708373069764, + "rewards/mcq_accuracy_reward/std": 0.45592130422592164, + "rewards/tag_count_reward/mean": 0.9907552123069763, + "rewards/tag_count_reward/std": 0.047022730112075806, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3898.8, + "completions/max_terminated_length": 3898.8, + "completions/mean_length": 831.5922119140625, + "completions/mean_terminated_length": 831.5922119140625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.22710490474393266, + "grad_norm": 0.2820538750549166, + "kl": 0.105938720703125, + "learning_rate": 1.9953258567895448e-06, + "loss": 0.0273, + "num_tokens": 380286059.0, + "reward": 1.2141602039337158, + "reward_std": 0.13937286287546158, + "rewards/format_reward/mean": 0.9947916746139527, + "rewards/format_reward/std": 0.06409270763397217, + "rewards/mcq_accuracy_reward/mean": 0.7182291626930237, + "rewards/mcq_accuracy_reward/std": 0.4484278976917267, + "rewards/tag_count_reward/mean": 0.9889322876930237, + "rewards/tag_count_reward/std": 0.052978326380252835, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3912.2, + "completions/max_terminated_length": 3912.2, + "completions/mean_length": 936.1541870117187, + "completions/mean_terminated_length": 936.1541870117187, + "completions/min_length": 221.6, + "completions/min_terminated_length": 221.6, + "epoch": 0.22841764985805943, + "grad_norm": 0.32072325952694847, + "kl": 0.10338134765625, + "learning_rate": 1.9823220050287734e-06, + "loss": 0.0402, + "num_tokens": 382385651.0, + "reward": 1.2158203840255737, + "reward_std": 0.14391910284757614, + "rewards/format_reward/mean": 0.9901041746139526, + "rewards/format_reward/std": 0.08504579663276672, + "rewards/mcq_accuracy_reward/mean": 0.721875, + "rewards/mcq_accuracy_reward/std": 0.4426910042762756, + "rewards/tag_count_reward/mean": 0.9856770873069763, + "rewards/tag_count_reward/std": 0.05938722267746925, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4050.6, + "completions/max_terminated_length": 4050.6, + "completions/mean_length": 938.5890869140625, + "completions/mean_terminated_length": 938.5890869140625, + "completions/min_length": 231.2, + "completions/min_terminated_length": 231.2, + "epoch": 0.22973039497218622, + "grad_norm": 0.3351443626702459, + "kl": 0.104779052734375, + "learning_rate": 1.969277596899434e-06, + "loss": 0.0246, + "num_tokens": 384484206.0, + "reward": 1.2103516101837157, + "reward_std": 0.1480497747659683, + "rewards/format_reward/mean": 0.9859375, + "rewards/format_reward/std": 0.11736389696598053, + "rewards/mcq_accuracy_reward/mean": 0.7171875, + "rewards/mcq_accuracy_reward/std": 0.44948660135269164, + "rewards/tag_count_reward/mean": 0.986718761920929, + "rewards/tag_count_reward/std": 0.05666636303067207, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3911.4, + "completions/max_terminated_length": 3911.4, + "completions/mean_length": 831.500537109375, + "completions/mean_terminated_length": 831.500537109375, + "completions/min_length": 200.2, + "completions/min_terminated_length": 200.2, + "epoch": 0.231043140086313, + "grad_norm": 0.3099672754394894, + "kl": 0.113739013671875, + "learning_rate": 1.9561937292493136e-06, + "loss": 0.0225, + "num_tokens": 386375175.0, + "reward": 1.1722005605697632, + "reward_std": 0.14943790435791016, + "rewards/format_reward/mean": 0.989062511920929, + "rewards/format_reward/std": 0.10228720307350159, + "rewards/mcq_accuracy_reward/mean": 0.6776041626930237, + "rewards/mcq_accuracy_reward/std": 0.46458404660224917, + "rewards/tag_count_reward/mean": 0.9893229126930236, + "rewards/tag_count_reward/std": 0.05291555672883987, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4023.4, + "completions/max_terminated_length": 4023.4, + "completions/mean_length": 841.591162109375, + "completions/mean_terminated_length": 841.591162109375, + "completions/min_length": 209.4, + "completions/min_terminated_length": 209.4, + "epoch": 0.23235588520043976, + "grad_norm": 0.3516105076881317, + "kl": 0.110040283203125, + "learning_rate": 1.943071502244179e-06, + "loss": 0.0417, + "num_tokens": 388286582.0, + "reward": 1.2150716543197633, + "reward_std": 0.1517113447189331, + "rewards/format_reward/mean": 0.9921874880790711, + "rewards/format_reward/std": 0.08613373786211014, + "rewards/mcq_accuracy_reward/mean": 0.7192708253860474, + "rewards/mcq_accuracy_reward/std": 0.4481381118297577, + "rewards/tag_count_reward/mean": 0.991015625, + "rewards/tag_count_reward/std": 0.046563072502613066, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4983.2, + "completions/max_terminated_length": 4983.2, + "completions/mean_length": 927.5875244140625, + "completions/mean_terminated_length": 927.5875244140625, + "completions/min_length": 214.6, + "completions/min_terminated_length": 214.6, + "epoch": 0.23366863031456656, + "grad_norm": 1.4395027773427695, + "kl": 0.110986328125, + "learning_rate": 1.9299120192752705e-06, + "loss": 0.0316, + "num_tokens": 390361406.0, + "reward": 1.1762370109558105, + "reward_std": 0.15257585942745208, + "rewards/format_reward/mean": 0.9932291746139527, + "rewards/format_reward/std": 0.0812084659934044, + "rewards/mcq_accuracy_reward/mean": 0.6807291626930236, + "rewards/mcq_accuracy_reward/std": 0.4600822150707245, + "rewards/tag_count_reward/mean": 0.9888020753860474, + "rewards/tag_count_reward/std": 0.051376774162054065, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5141.8, + "completions/max_terminated_length": 5141.8, + "completions/mean_length": 968.0713623046875, + "completions/mean_terminated_length": 968.0713623046875, + "completions/min_length": 179.6, + "completions/min_terminated_length": 179.6, + "epoch": 0.23498137542869332, + "grad_norm": 0.30440735633861715, + "kl": 0.108074951171875, + "learning_rate": 1.9167163868665187e-06, + "loss": 0.0337, + "num_tokens": 392516063.0, + "reward": 1.155664086341858, + "reward_std": 0.14373701214790344, + "rewards/format_reward/mean": 0.9859375, + "rewards/format_reward/std": 0.11736389994621277, + "rewards/mcq_accuracy_reward/mean": 0.6625, + "rewards/mcq_accuracy_reward/std": 0.4693600594997406, + "rewards/tag_count_reward/mean": 0.986718761920929, + "rewards/tag_count_reward/std": 0.05712253078818321, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4211.4, + "completions/max_terminated_length": 4211.4, + "completions/mean_length": 971.10732421875, + "completions/mean_terminated_length": 971.10732421875, + "completions/min_length": 220.2, + "completions/min_terminated_length": 220.2, + "epoch": 0.23629412054282012, + "grad_norm": 2.465257369445643, + "kl": 0.10181884765625, + "learning_rate": 1.903485714581506e-06, + "loss": 0.0399, + "num_tokens": 394678245.0, + "reward": 1.205306053161621, + "reward_std": 0.14588176757097243, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.07576627135276795, + "rewards/mcq_accuracy_reward/mean": 0.7109375, + "rewards/mcq_accuracy_reward/std": 0.45111408829689026, + "rewards/tag_count_reward/mean": 0.9858072996139526, + "rewards/tag_count_reward/std": 0.057577265053987504, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5989.2, + "completions/max_terminated_length": 5055.4, + "completions/mean_length": 1021.2781494140625, + "completions/mean_terminated_length": 1013.8339721679688, + "completions/min_length": 180.2, + "completions/min_terminated_length": 180.2, + "epoch": 0.2376068656569469, + "grad_norm": 0.28977586849668147, + "kl": 0.1012908935546875, + "learning_rate": 1.8902211149301653e-06, + "loss": 0.0423, + "num_tokens": 396938171.0, + "reward": 1.1585612535476684, + "reward_std": 0.12691444158554077, + "rewards/format_reward/mean": 0.9791666746139527, + "rewards/format_reward/std": 0.13845563530921937, + "rewards/mcq_accuracy_reward/mean": 0.6666666626930237, + "rewards/mcq_accuracy_reward/std": 0.4667339861392975, + "rewards/tag_count_reward/mean": 0.9884114623069763, + "rewards/tag_count_reward/std": 0.056493081897497174, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4916.0, + "completions/max_terminated_length": 4916.0, + "completions/mean_length": 947.591162109375, + "completions/mean_terminated_length": 947.591162109375, + "completions/min_length": 229.2, + "completions/min_terminated_length": 229.2, + "epoch": 0.23891961077107365, + "grad_norm": 0.24139879115073043, + "kl": 0.102703857421875, + "learning_rate": 1.8769237032752358e-06, + "loss": 0.0364, + "num_tokens": 399059610.0, + "reward": 1.2334635972976684, + "reward_std": 0.1511177584528923, + "rewards/format_reward/mean": 0.9838541626930237, + "rewards/format_reward/std": 0.12099267542362213, + "rewards/mcq_accuracy_reward/mean": 0.7395833253860473, + "rewards/mcq_accuracy_reward/std": 0.43673797845840456, + "rewards/tag_count_reward/mean": 0.9916666626930237, + "rewards/tag_count_reward/std": 0.046709629148244856, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4142.6, + "completions/max_terminated_length": 4142.6, + "completions/mean_length": 819.5895874023438, + "completions/mean_terminated_length": 819.5895874023438, + "completions/min_length": 210.4, + "completions/min_terminated_length": 210.4, + "epoch": 0.24023235588520045, + "grad_norm": 0.3256954528257416, + "kl": 0.11097412109375, + "learning_rate": 1.863594597738475e-06, + "loss": 0.029, + "num_tokens": 400933966.0, + "reward": 1.1769206047058105, + "reward_std": 0.14448025226593017, + "rewards/format_reward/mean": 0.9848958373069763, + "rewards/format_reward/std": 0.11708734929561615, + "rewards/mcq_accuracy_reward/mean": 0.682812511920929, + "rewards/mcq_accuracy_reward/std": 0.462482750415802, + "rewards/tag_count_reward/mean": 0.9915364623069763, + "rewards/tag_count_reward/std": 0.04615458846092224, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4604.6, + "completions/max_terminated_length": 4604.6, + "completions/mean_length": 888.7057495117188, + "completions/mean_terminated_length": 888.7057495117188, + "completions/min_length": 179.6, + "completions/min_terminated_length": 179.6, + "epoch": 0.24154510099932722, + "grad_norm": 0.3294894755583271, + "kl": 0.1090087890625, + "learning_rate": 1.850234919106641e-06, + "loss": 0.0328, + "num_tokens": 402936185.0, + "reward": 1.1775716304779054, + "reward_std": 0.1417522519826889, + "rewards/format_reward/mean": 0.9973958253860473, + "rewards/format_reward/std": 0.038043868541717527, + "rewards/mcq_accuracy_reward/mean": 0.6807291746139527, + "rewards/mcq_accuracy_reward/std": 0.46024264097213746, + "rewards/tag_count_reward/mean": 0.9899739503860474, + "rewards/tag_count_reward/std": 0.04943537563085556, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4915.0, + "completions/max_terminated_length": 4888.6, + "completions/mean_length": 923.30732421875, + "completions/mean_terminated_length": 919.541455078125, + "completions/min_length": 198.8, + "completions/min_terminated_length": 198.8, + "epoch": 0.24285784611345398, + "grad_norm": 0.25685439642191515, + "kl": 0.114288330078125, + "learning_rate": 1.8368457907372533e-06, + "loss": 0.0372, + "num_tokens": 405012391.0, + "reward": 1.1902669668197632, + "reward_std": 0.12827223390340806, + "rewards/format_reward/mean": 0.9942708253860474, + "rewards/format_reward/std": 0.0727910801768303, + "rewards/mcq_accuracy_reward/mean": 0.6942708253860473, + "rewards/mcq_accuracy_reward/std": 0.45233704447746276, + "rewards/tag_count_reward/mean": 0.9897135376930237, + "rewards/tag_count_reward/std": 0.05516553744673729, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4887.4, + "completions/max_terminated_length": 4887.4, + "completions/mean_length": 941.4432373046875, + "completions/mean_terminated_length": 941.4432373046875, + "completions/min_length": 230.6, + "completions/min_terminated_length": 230.6, + "epoch": 0.24417059122758078, + "grad_norm": 0.5222931368448083, + "kl": 0.108673095703125, + "learning_rate": 1.8234283384641303e-06, + "loss": 0.0309, + "num_tokens": 407114578.0, + "reward": 1.2192383050918578, + "reward_std": 0.14156628847122193, + "rewards/format_reward/mean": 0.9958333253860474, + "rewards/format_reward/std": 0.05515962839126587, + "rewards/mcq_accuracy_reward/mean": 0.7229166626930237, + "rewards/mcq_accuracy_reward/std": 0.4467171013355255, + "rewards/tag_count_reward/mean": 0.989453125, + "rewards/tag_count_reward/std": 0.05159546509385109, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3835.8, + "completions/max_terminated_length": 3835.8, + "completions/mean_length": 894.0604248046875, + "completions/mean_terminated_length": 894.0604248046875, + "completions/min_length": 209.4, + "completions/min_terminated_length": 209.4, + "epoch": 0.24548333634170755, + "grad_norm": 0.23749889170988014, + "kl": 0.107171630859375, + "learning_rate": 1.8099836905027259e-06, + "loss": 0.0209, + "num_tokens": 409129510.0, + "reward": 1.216731810569763, + "reward_std": 0.12658591270446778, + "rewards/format_reward/mean": 0.9927083373069763, + "rewards/format_reward/std": 0.0755966454744339, + "rewards/mcq_accuracy_reward/mean": 0.7203125, + "rewards/mcq_accuracy_reward/std": 0.4465170979499817, + "rewards/tag_count_reward/mean": 0.99296875, + "rewards/tag_count_reward/std": 0.04256304502487183, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5457.4, + "completions/max_terminated_length": 5457.4, + "completions/mean_length": 887.3687622070313, + "completions/mean_terminated_length": 887.3687622070313, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.24679608145583434, + "grad_norm": 0.2918122549792227, + "kl": 0.104058837890625, + "learning_rate": 1.7965129773552607e-06, + "loss": 0.022, + "num_tokens": 411130058.0, + "reward": 1.164257860183716, + "reward_std": 0.14749560058116912, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.0903424322605133, + "rewards/mcq_accuracy_reward/mean": 0.6682291626930237, + "rewards/mcq_accuracy_reward/std": 0.47021653652191164, + "rewards/tag_count_reward/mean": 0.9924479126930237, + "rewards/tag_count_reward/std": 0.04240243434906006, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4172.2, + "completions/max_terminated_length": 4172.2, + "completions/mean_length": 901.0458618164063, + "completions/mean_terminated_length": 901.0458618164063, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.2481088265699611, + "grad_norm": 0.2561704697853748, + "kl": 0.10447998046875, + "learning_rate": 1.783017331715665e-06, + "loss": 0.0361, + "num_tokens": 413156098.0, + "reward": 1.2151367664337158, + "reward_std": 0.12443311214447021, + "rewards/format_reward/mean": 0.9921874880790711, + "rewards/format_reward/std": 0.08613373711705208, + "rewards/mcq_accuracy_reward/mean": 0.7197916746139527, + "rewards/mcq_accuracy_reward/std": 0.44791502952575685, + "rewards/tag_count_reward/mean": 0.9891927123069764, + "rewards/tag_count_reward/std": 0.051287151873111725, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5087.4, + "completions/max_terminated_length": 5087.4, + "completions/mean_length": 859.4536743164062, + "completions/mean_terminated_length": 859.4536743164062, + "completions/min_length": 215.2, + "completions/min_terminated_length": 215.2, + "epoch": 0.24942157168408788, + "grad_norm": 0.3451013247231478, + "kl": 0.1074951171875, + "learning_rate": 1.7694978883743334e-06, + "loss": 0.031, + "num_tokens": 415104945.0, + "reward": 1.1970052242279052, + "reward_std": 0.14825678318738938, + "rewards/format_reward/mean": 0.9817708253860473, + "rewards/format_reward/std": 0.13201589286327362, + "rewards/mcq_accuracy_reward/mean": 0.7046875, + "rewards/mcq_accuracy_reward/std": 0.44837576150894165, + "rewards/tag_count_reward/mean": 0.987499988079071, + "rewards/tag_count_reward/std": 0.05466760918498039, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3939.6, + "completions/max_terminated_length": 3939.6, + "completions/mean_length": 861.2234497070312, + "completions/mean_terminated_length": 861.2234497070312, + "completions/min_length": 193.6, + "completions/min_terminated_length": 193.6, + "epoch": 0.2507343167982147, + "grad_norm": 0.26572891020354233, + "kl": 0.107391357421875, + "learning_rate": 1.7559557841227072e-06, + "loss": 0.0281, + "num_tokens": 417056054.0, + "reward": 1.174349021911621, + "reward_std": 0.1531801000237465, + "rewards/format_reward/mean": 0.9822916746139526, + "rewards/format_reward/std": 0.12736119329929352, + "rewards/mcq_accuracy_reward/mean": 0.6812500119209289, + "rewards/mcq_accuracy_reward/std": 0.4628969132900238, + "rewards/tag_count_reward/mean": 0.9901041746139526, + "rewards/tag_count_reward/std": 0.04958392456173897, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3716.8, + "completions/max_terminated_length": 3716.8, + "completions/mean_length": 874.0343994140625, + "completions/mean_terminated_length": 874.0343994140625, + "completions/min_length": 226.6, + "completions/min_terminated_length": 226.6, + "epoch": 0.25204706191234144, + "grad_norm": 0.3080657453867167, + "kl": 0.105377197265625, + "learning_rate": 1.742392157657684e-06, + "loss": 0.0245, + "num_tokens": 419034824.0, + "reward": 1.1940755367279052, + "reward_std": 0.13542111366987228, + "rewards/format_reward/mean": 0.9880208373069763, + "rewards/format_reward/std": 0.10877147763967514, + "rewards/mcq_accuracy_reward/mean": 0.7005208253860473, + "rewards/mcq_accuracy_reward/std": 0.45785903334617617, + "rewards/tag_count_reward/mean": 0.9861979126930237, + "rewards/tag_count_reward/std": 0.0575270377099514, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4316.2, + "completions/max_terminated_length": 4316.2, + "completions/mean_length": 846.4448120117188, + "completions/mean_terminated_length": 846.4448120117188, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2533598070264682, + "grad_norm": 0.2501685751539327, + "kl": 0.104644775390625, + "learning_rate": 1.728808149485873e-06, + "loss": 0.0287, + "num_tokens": 420956926.0, + "reward": 1.1921550035476685, + "reward_std": 0.1133539080619812, + "rewards/format_reward/mean": 0.9869791507720947, + "rewards/format_reward/std": 0.10895627439022064, + "rewards/mcq_accuracy_reward/mean": 0.6979166626930237, + "rewards/mcq_accuracy_reward/std": 0.45869707465171816, + "rewards/tag_count_reward/mean": 0.9899739742279052, + "rewards/tag_count_reward/std": 0.04764600470662117, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4768.0, + "completions/max_terminated_length": 4768.0, + "completions/mean_length": 924.1135620117187, + "completions/mean_terminated_length": 924.1135620117187, + "completions/min_length": 203.6, + "completions/min_terminated_length": 203.6, + "epoch": 0.254672552140595, + "grad_norm": 0.3006639326471123, + "kl": 0.101129150390625, + "learning_rate": 1.7152049018276918e-06, + "loss": 0.0267, + "num_tokens": 423026760.0, + "reward": 1.198470115661621, + "reward_std": 0.12352189719676972, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.08667475283145905, + "rewards/mcq_accuracy_reward/mean": 0.7026041746139526, + "rewards/mcq_accuracy_reward/std": 0.45497956275939944, + "rewards/tag_count_reward/mean": 0.991796863079071, + "rewards/tag_count_reward/std": 0.044075030088424685, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5318.4, + "completions/max_terminated_length": 4293.0, + "completions/mean_length": 889.917724609375, + "completions/mean_terminated_length": 886.099072265625, + "completions/min_length": 208.4, + "completions/min_terminated_length": 208.4, + "epoch": 0.2559852972547218, + "grad_norm": 0.2898870675752201, + "kl": 0.105804443359375, + "learning_rate": 1.7015835585213223e-06, + "loss": 0.0264, + "num_tokens": 425032946.0, + "reward": 1.2359049797058106, + "reward_std": 0.11990730464458466, + "rewards/format_reward/mean": 0.9937500119209289, + "rewards/format_reward/std": 0.06808097958564759, + "rewards/mcq_accuracy_reward/mean": 0.7395833373069763, + "rewards/mcq_accuracy_reward/std": 0.4357429265975952, + "rewards/tag_count_reward/mean": 0.9915364623069763, + "rewards/tag_count_reward/std": 0.04548919722437859, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4356.8, + "completions/max_terminated_length": 4356.8, + "completions/mean_length": 924.0609497070312, + "completions/mean_terminated_length": 924.0609497070312, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.25729804236884857, + "grad_norm": 0.3228428209336355, + "kl": 0.0982421875, + "learning_rate": 1.6879452649265323e-06, + "loss": 0.0279, + "num_tokens": 427105031.0, + "reward": 1.205175805091858, + "reward_std": 0.13552329540252686, + "rewards/format_reward/mean": 0.9916666746139526, + "rewards/format_reward/std": 0.08949585258960724, + "rewards/mcq_accuracy_reward/mean": 0.7098958253860473, + "rewards/mcq_accuracy_reward/std": 0.44993178844451903, + "rewards/tag_count_reward/mean": 0.989453125, + "rewards/tag_count_reward/std": 0.05053946226835251, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4094.8, + "completions/max_terminated_length": 4094.8, + "completions/mean_length": 864.8322998046875, + "completions/mean_terminated_length": 864.8322998046875, + "completions/min_length": 236.4, + "completions/min_terminated_length": 236.4, + "epoch": 0.25861078748297534, + "grad_norm": 0.30432222748603166, + "kl": 0.102020263671875, + "learning_rate": 1.6742911678283667e-06, + "loss": 0.0424, + "num_tokens": 429058901.0, + "reward": 1.238378930091858, + "reward_std": 0.12873915433883668, + "rewards/format_reward/mean": 0.989062511920929, + "rewards/format_reward/std": 0.09572007954120636, + "rewards/mcq_accuracy_reward/mean": 0.7442708373069763, + "rewards/mcq_accuracy_reward/std": 0.4367725014686584, + "rewards/tag_count_reward/mean": 0.9873697996139527, + "rewards/tag_count_reward/std": 0.05552826225757599, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 6258.8, + "completions/max_terminated_length": 5436.8, + "completions/mean_length": 932.652099609375, + "completions/mean_terminated_length": 925.1237915039062, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2599235325971021, + "grad_norm": 0.377146719414668, + "kl": 0.09991455078125, + "learning_rate": 1.6606224153407162e-06, + "loss": 0.0388, + "num_tokens": 431145689.0, + "reward": 1.174121117591858, + "reward_std": 0.1475616067647934, + "rewards/format_reward/mean": 0.995312488079071, + "rewards/format_reward/std": 0.06536583602428436, + "rewards/mcq_accuracy_reward/mean": 0.6791666626930237, + "rewards/mcq_accuracy_reward/std": 0.46490822434425355, + "rewards/tag_count_reward/mean": 0.9845052003860474, + "rewards/tag_count_reward/std": 0.062242785841226576, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4304.4, + "completions/max_terminated_length": 4304.4, + "completions/mean_length": 947.080224609375, + "completions/mean_terminated_length": 947.080224609375, + "completions/min_length": 204.8, + "completions/min_terminated_length": 204.8, + "epoch": 0.26123627771122887, + "grad_norm": 0.27125983779667057, + "kl": 0.102142333984375, + "learning_rate": 1.6469401568097813e-06, + "loss": 0.0288, + "num_tokens": 433260723.0, + "reward": 1.1469076156616211, + "reward_std": 0.13977819234132766, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.042252561450004576, + "rewards/mcq_accuracy_reward/mean": 0.6505208253860474, + "rewards/mcq_accuracy_reward/std": 0.47611605525016787, + "rewards/tag_count_reward/mean": 0.988671875, + "rewards/tag_count_reward/std": 0.052698489278554916, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3820.4, + "completions/max_terminated_length": 3820.4, + "completions/mean_length": 844.3453369140625, + "completions/mean_terminated_length": 844.3453369140625, + "completions/min_length": 221.2, + "completions/min_terminated_length": 221.2, + "epoch": 0.2625490228253557, + "grad_norm": 0.24954260942805134, + "kl": 0.109918212890625, + "learning_rate": 1.6332455427174277e-06, + "loss": 0.0312, + "num_tokens": 435177786.0, + "reward": 1.2122070789337158, + "reward_std": 0.12331400215625762, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.049242216348648074, + "rewards/mcq_accuracy_reward/mean": 0.7151041746139526, + "rewards/mcq_accuracy_reward/std": 0.4488114595413208, + "rewards/tag_count_reward/mean": 0.9915364503860473, + "rewards/tag_count_reward/std": 0.044947605580091476, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4111.4, + "completions/max_terminated_length": 4111.4, + "completions/mean_length": 853.2729370117188, + "completions/mean_terminated_length": 853.2729370117188, + "completions/min_length": 198.6, + "completions/min_terminated_length": 198.6, + "epoch": 0.26386176793948246, + "grad_norm": 0.23177151169245003, + "kl": 0.3007568359375, + "learning_rate": 1.6195397245844445e-06, + "loss": 0.0336, + "num_tokens": 437113334.0, + "reward": 1.1825195789337157, + "reward_std": 0.14098667800426484, + "rewards/format_reward/mean": 0.9979166626930237, + "rewards/format_reward/std": 0.03482731506228447, + "rewards/mcq_accuracy_reward/mean": 0.6848958373069763, + "rewards/mcq_accuracy_reward/std": 0.46221978664398194, + "rewards/tag_count_reward/mean": 0.992578125, + "rewards/tag_count_reward/std": 0.04234918504953385, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5211.6, + "completions/max_terminated_length": 4371.6, + "completions/mean_length": 896.0078491210937, + "completions/mean_terminated_length": 892.189599609375, + "completions/min_length": 215.6, + "completions/min_terminated_length": 215.6, + "epoch": 0.26517451305360923, + "grad_norm": 0.29049434039175254, + "kl": 0.107574462890625, + "learning_rate": 1.6058238548737228e-06, + "loss": 0.0257, + "num_tokens": 439131653.0, + "reward": 1.1943685293197632, + "reward_std": 0.13383956998586655, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.05936832129955292, + "rewards/mcq_accuracy_reward/mean": 0.6979166626930237, + "rewards/mcq_accuracy_reward/std": 0.4510037302970886, + "rewards/tag_count_reward/mean": 0.9904947876930237, + "rewards/tag_count_reward/std": 0.0484579287469387, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3880.0, + "completions/max_terminated_length": 3880.0, + "completions/mean_length": 873.4807495117187, + "completions/mean_terminated_length": 873.4807495117187, + "completions/min_length": 220.6, + "completions/min_terminated_length": 220.6, + "epoch": 0.266487258167736, + "grad_norm": 0.2769645988344639, + "kl": 0.10634765625, + "learning_rate": 1.5920990868933454e-06, + "loss": 0.0307, + "num_tokens": 441107384.0, + "reward": 1.183789110183716, + "reward_std": 0.11905215680599213, + "rewards/format_reward/mean": 0.9927083373069763, + "rewards/format_reward/std": 0.08494071364402771, + "rewards/mcq_accuracy_reward/mean": 0.6885416626930236, + "rewards/mcq_accuracy_reward/std": 0.45600869655609133, + "rewards/tag_count_reward/mean": 0.9882812380790711, + "rewards/tag_count_reward/std": 0.05228431895375252, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5916.8, + "completions/max_terminated_length": 5916.8, + "completions/mean_length": 912.341162109375, + "completions/mean_terminated_length": 912.341162109375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.26780000328186276, + "grad_norm": 0.30781882633491553, + "kl": 0.106732177734375, + "learning_rate": 1.5783665746996125e-06, + "loss": 0.0326, + "num_tokens": 443154503.0, + "reward": 1.238509178161621, + "reward_std": 0.12810098826885224, + "rewards/format_reward/mean": 0.9947916626930237, + "rewards/format_reward/std": 0.060746153444051744, + "rewards/mcq_accuracy_reward/mean": 0.7427083253860474, + "rewards/mcq_accuracy_reward/std": 0.4240863502025604, + "rewards/tag_count_reward/mean": 0.9884114503860474, + "rewards/tag_count_reward/std": 0.05227399542927742, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5931.2, + "completions/max_terminated_length": 5046.8, + "completions/mean_length": 919.4447998046875, + "completions/mean_terminated_length": 915.6514892578125, + "completions/min_length": 229.8, + "completions/min_terminated_length": 229.8, + "epoch": 0.2691127483959896, + "grad_norm": 0.2587146731266212, + "kl": 0.108319091796875, + "learning_rate": 1.5646274730000026e-06, + "loss": 0.047, + "num_tokens": 445219509.0, + "reward": 1.258007860183716, + "reward_std": 0.11905478984117508, + "rewards/format_reward/mean": 0.9932291626930236, + "rewards/format_reward/std": 0.07087225764989853, + "rewards/mcq_accuracy_reward/mean": 0.7630208373069763, + "rewards/mcq_accuracy_reward/std": 0.4199582636356354, + "rewards/tag_count_reward/mean": 0.9867187380790711, + "rewards/tag_count_reward/std": 0.05925486981868744, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4526.4, + "completions/max_terminated_length": 4526.4, + "completions/mean_length": 975.3312866210938, + "completions/mean_terminated_length": 975.3312866210938, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.27042549351011635, + "grad_norm": 0.29128562698392996, + "kl": 0.099261474609375, + "learning_rate": 1.550882937056076e-06, + "loss": 0.0316, + "num_tokens": 447389921.0, + "reward": 1.2530925035476685, + "reward_std": 0.13542728424072265, + "rewards/format_reward/mean": 0.9942708253860474, + "rewards/format_reward/std": 0.0727910801768303, + "rewards/mcq_accuracy_reward/mean": 0.7567708373069764, + "rewards/mcq_accuracy_reward/std": 0.426976078748703, + "rewards/tag_count_reward/mean": 0.991015625, + "rewards/tag_count_reward/std": 0.046357862651348114, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4289.4, + "completions/max_terminated_length": 4289.4, + "completions/mean_length": 885.291162109375, + "completions/mean_terminated_length": 885.291162109375, + "completions/min_length": 239.6, + "completions/min_terminated_length": 239.6, + "epoch": 0.2717382386242431, + "grad_norm": 0.2710651427961778, + "kl": 0.10098876953125, + "learning_rate": 1.5371341225863354e-06, + "loss": 0.0174, + "num_tokens": 449386976.0, + "reward": 1.2436198234558105, + "reward_std": 0.11783002018928528, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.042252561450004576, + "rewards/mcq_accuracy_reward/mean": 0.7463541626930237, + "rewards/mcq_accuracy_reward/std": 0.4305288910865784, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.04338146075606346, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4191.2, + "completions/max_terminated_length": 4191.2, + "completions/mean_length": 852.6047119140625, + "completions/mean_terminated_length": 852.6047119140625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2730509837383699, + "grad_norm": 0.3475445562780726, + "kl": 0.10545654296875, + "learning_rate": 1.5233821856690467e-06, + "loss": 0.0394, + "num_tokens": 451318281.0, + "reward": 1.211523461341858, + "reward_std": 0.12902569472789766, + "rewards/format_reward/mean": 0.9979166746139526, + "rewards/format_reward/std": 0.028829801082611083, + "rewards/mcq_accuracy_reward/mean": 0.7145833373069763, + "rewards/mcq_accuracy_reward/std": 0.45018666982650757, + "rewards/tag_count_reward/mean": 0.98984375, + "rewards/tag_count_reward/std": 0.04915663674473762, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3797.8, + "completions/max_terminated_length": 3797.8, + "completions/mean_length": 872.7442993164062, + "completions/mean_terminated_length": 872.7442993164062, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.27436372885249666, + "grad_norm": 0.2956925330835969, + "kl": 0.129901123046875, + "learning_rate": 1.5096282826450285e-06, + "loss": 0.0274, + "num_tokens": 453290582.0, + "reward": 1.1625651597976685, + "reward_std": 0.14105179607868196, + "rewards/format_reward/mean": 0.9963541626930237, + "rewards/format_reward/std": 0.05245876908302307, + "rewards/mcq_accuracy_reward/mean": 0.6661458373069763, + "rewards/mcq_accuracy_reward/std": 0.4634169816970825, + "rewards/tag_count_reward/mean": 0.9893229126930236, + "rewards/tag_count_reward/std": 0.050521744042634965, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4074.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 877.01513671875, + "completions/mean_terminated_length": 877.01513671875, + "completions/min_length": 192.2, + "completions/min_terminated_length": 192.2, + "epoch": 0.2756764739666235, + "grad_norm": 0.34489407071980144, + "kl": 0.103094482421875, + "learning_rate": 1.4958735700204207e-06, + "loss": 0.0293, + "num_tokens": 455271499.0, + "reward": 1.2530924797058105, + "reward_std": 0.1419347956776619, + "rewards/format_reward/mean": 0.9979166626930237, + "rewards/format_reward/std": 0.03482731580734253, + "rewards/mcq_accuracy_reward/mean": 0.7562499880790711, + "rewards/mcq_accuracy_reward/std": 0.42783520817756654, + "rewards/tag_count_reward/mean": 0.989453136920929, + "rewards/tag_count_reward/std": 0.04957008436322212, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5991.2, + "completions/max_terminated_length": 4366.2, + "completions/mean_length": 921.11201171875, + "completions/mean_terminated_length": 913.590478515625, + "completions/min_length": 188.2, + "completions/min_terminated_length": 188.2, + "epoch": 0.27698921908075025, + "grad_norm": 0.4061969534972234, + "kl": 0.10203857421875, + "learning_rate": 1.482119204369439e-06, + "loss": 0.04, + "num_tokens": 457339650.0, + "reward": 1.2165690660476685, + "reward_std": 0.1536157339811325, + "rewards/format_reward/mean": 0.9947916626930237, + "rewards/format_reward/std": 0.07009022235870362, + "rewards/mcq_accuracy_reward/mean": 0.7223958253860474, + "rewards/mcq_accuracy_reward/std": 0.4433733582496643, + "rewards/tag_count_reward/mean": 0.9819010496139526, + "rewards/tag_count_reward/std": 0.06520590037107468, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015624999999999778, + "completions/max_length": 6559.2, + "completions/max_terminated_length": 4498.4, + "completions/mean_length": 909.3864868164062, + "completions/mean_terminated_length": 898.0133544921875, + "completions/min_length": 219.8, + "completions/min_terminated_length": 219.8, + "epoch": 0.278301964194877, + "grad_norm": 0.3091369215584339, + "kl": 0.10927734375, + "learning_rate": 1.4683663422371244e-06, + "loss": 0.0573, + "num_tokens": 459386728.0, + "reward": 1.2358073234558105, + "reward_std": 0.14321452528238296, + "rewards/format_reward/mean": 0.9942708253860474, + "rewards/format_reward/std": 0.06528573334217072, + "rewards/mcq_accuracy_reward/mean": 0.7427083373069763, + "rewards/mcq_accuracy_reward/std": 0.4347099125385284, + "rewards/tag_count_reward/mean": 0.978125, + "rewards/tag_count_reward/std": 0.07187658026814461, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5785.2, + "completions/max_terminated_length": 5189.8, + "completions/mean_length": 1008.1125366210938, + "completions/mean_terminated_length": 1004.3799438476562, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.2796147093090038, + "grad_norm": 0.3404790488225519, + "kl": 0.095965576171875, + "learning_rate": 1.454616140042092e-06, + "loss": 0.0383, + "num_tokens": 461620056.0, + "reward": 1.2148763418197632, + "reward_std": 0.15013333261013032, + "rewards/format_reward/mean": 0.9947916507720947, + "rewards/format_reward/std": 0.06107703745365143, + "rewards/mcq_accuracy_reward/mean": 0.71875, + "rewards/mcq_accuracy_reward/std": 0.446232408285141, + "rewards/tag_count_reward/mean": 0.9897135376930237, + "rewards/tag_count_reward/std": 0.04962035268545151, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4499.6, + "completions/max_terminated_length": 4499.6, + "completions/mean_length": 885.6682495117187, + "completions/mean_terminated_length": 885.6682495117187, + "completions/min_length": 207.8, + "completions/min_terminated_length": 207.8, + "epoch": 0.28092745442313055, + "grad_norm": 0.25733702728091395, + "kl": 0.097802734375, + "learning_rate": 1.4408697539792954e-06, + "loss": 0.0289, + "num_tokens": 463619419.0, + "reward": 1.25390625, + "reward_std": 0.13167650550603865, + "rewards/format_reward/mean": 0.9984374880790711, + "rewards/format_reward/std": 0.03061862289905548, + "rewards/mcq_accuracy_reward/mean": 0.756250011920929, + "rewards/mcq_accuracy_reward/std": 0.42550272345542905, + "rewards/tag_count_reward/mean": 0.9921875, + "rewards/tag_count_reward/std": 0.042726832628250125, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4707.8, + "completions/max_terminated_length": 3936.4, + "completions/mean_length": 829.01982421875, + "completions/mean_terminated_length": 825.1350463867187, + "completions/min_length": 194.6, + "completions/min_terminated_length": 194.6, + "epoch": 0.2822401995372574, + "grad_norm": 0.24879251307010478, + "kl": 0.101123046875, + "learning_rate": 1.4271283399228065e-06, + "loss": 0.0297, + "num_tokens": 465510921.0, + "reward": 1.2486002683639525, + "reward_std": 0.10949242562055587, + "rewards/format_reward/mean": 0.995312488079071, + "rewards/format_reward/std": 0.05837618038058281, + "rewards/mcq_accuracy_reward/mean": 0.7520833373069763, + "rewards/mcq_accuracy_reward/std": 0.4282021701335907, + "rewards/tag_count_reward/mean": 0.9907552003860474, + "rewards/tag_count_reward/std": 0.047438500076532365, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4584.8, + "completions/max_terminated_length": 4584.8, + "completions/mean_length": 915.3640747070312, + "completions/mean_terminated_length": 915.3640747070312, + "completions/min_length": 211.8, + "completions/min_terminated_length": 211.8, + "epoch": 0.28355294465138414, + "grad_norm": 0.3030733618497136, + "kl": 0.0978973388671875, + "learning_rate": 1.4133930533286217e-06, + "loss": 0.0363, + "num_tokens": 467561460.0, + "reward": 1.1730794429779052, + "reward_std": 0.15443700850009917, + "rewards/format_reward/mean": 0.9927083253860474, + "rewards/format_reward/std": 0.08107846453785897, + "rewards/mcq_accuracy_reward/mean": 0.6776041746139526, + "rewards/mcq_accuracy_reward/std": 0.46696102023124697, + "rewards/tag_count_reward/mean": 0.9891927123069764, + "rewards/tag_count_reward/std": 0.05146094560623169, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4552.0, + "completions/max_terminated_length": 4491.2, + "completions/mean_length": 853.025537109375, + "completions/mean_terminated_length": 849.2734619140625, + "completions/min_length": 194.2, + "completions/min_terminated_length": 194.2, + "epoch": 0.2848656897655109, + "grad_norm": 0.6470303786029298, + "kl": 0.104852294921875, + "learning_rate": 1.3996650491375055e-06, + "loss": 0.033, + "num_tokens": 469496621.0, + "reward": 1.2207682609558106, + "reward_std": 0.12598108798265456, + "rewards/format_reward/mean": 0.99375, + "rewards/format_reward/std": 0.06817139983177185, + "rewards/mcq_accuracy_reward/mean": 0.725, + "rewards/mcq_accuracy_reward/std": 0.44501650929450987, + "rewards/tag_count_reward/mean": 0.9893229246139527, + "rewards/tag_count_reward/std": 0.05279574990272522, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3871.6, + "completions/max_terminated_length": 3871.6, + "completions/mean_length": 881.7875122070312, + "completions/mean_terminated_length": 881.7875122070312, + "completions/min_length": 221.8, + "completions/min_terminated_length": 221.8, + "epoch": 0.2861784348796377, + "grad_norm": 0.324537158637336, + "kl": 0.0988037109375, + "learning_rate": 1.3859454816778784e-06, + "loss": 0.0302, + "num_tokens": 471489741.0, + "reward": 1.176595115661621, + "reward_std": 0.1287701666355133, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.08667475283145905, + "rewards/mcq_accuracy_reward/mean": 0.6817708373069763, + "rewards/mcq_accuracy_reward/std": 0.4618507921695709, + "rewards/tag_count_reward/mean": 0.9876302003860473, + "rewards/tag_count_reward/std": 0.05513873398303985, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5293.0, + "completions/max_terminated_length": 4399.8, + "completions/mean_length": 949.5380737304688, + "completions/mean_terminated_length": 945.680908203125, + "completions/min_length": 179.4, + "completions/min_terminated_length": 179.4, + "epoch": 0.28749117999376445, + "grad_norm": 0.260870671956979, + "kl": 0.098046875, + "learning_rate": 1.372235504568751e-06, + "loss": 0.0355, + "num_tokens": 473611670.0, + "reward": 1.2236653804779052, + "reward_std": 0.152621129155159, + "rewards/format_reward/mean": 0.99375, + "rewards/format_reward/std": 0.06708883941173553, + "rewards/mcq_accuracy_reward/mean": 0.728125, + "rewards/mcq_accuracy_reward/std": 0.4447413980960846, + "rewards/tag_count_reward/mean": 0.9884114503860474, + "rewards/tag_count_reward/std": 0.053086066246032716, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4885.8, + "completions/max_terminated_length": 4885.8, + "completions/mean_length": 988.8536743164062, + "completions/mean_terminated_length": 988.8536743164062, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.2888039251078912, + "grad_norm": 0.2975137983799119, + "kl": 0.094390869140625, + "learning_rate": 1.3585362706227228e-06, + "loss": 0.0369, + "num_tokens": 475810525.0, + "reward": 1.208789086341858, + "reward_std": 0.1461799308657646, + "rewards/format_reward/mean": 0.9901041626930237, + "rewards/format_reward/std": 0.0981141209602356, + "rewards/mcq_accuracy_reward/mean": 0.7140625, + "rewards/mcq_accuracy_reward/std": 0.45147722363471987, + "rewards/tag_count_reward/mean": 0.9888020873069763, + "rewards/tag_count_reward/std": 0.05229301229119301, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4545.4, + "completions/max_terminated_length": 3545.2, + "completions/mean_length": 950.5484741210937, + "completions/mean_terminated_length": 946.7467651367188, + "completions/min_length": 207.2, + "completions/min_terminated_length": 207.2, + "epoch": 0.29011667022201804, + "grad_norm": 0.24140918352460633, + "kl": 0.09429931640625, + "learning_rate": 1.3448489317490492e-06, + "loss": 0.0374, + "num_tokens": 477932418.0, + "reward": 1.150195336341858, + "reward_std": 0.14613086134195327, + "rewards/format_reward/mean": 0.9927083373069763, + "rewards/format_reward/std": 0.08442502021789551, + "rewards/mcq_accuracy_reward/mean": 0.6541666746139526, + "rewards/mcq_accuracy_reward/std": 0.4752920091152191, + "rewards/tag_count_reward/mean": 0.99140625, + "rewards/tag_count_reward/std": 0.04609321057796478, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5258.8, + "completions/max_terminated_length": 5185.6, + "completions/mean_length": 867.9718872070313, + "completions/mean_terminated_length": 864.1667236328125, + "completions/min_length": 175.6, + "completions/min_terminated_length": 175.6, + "epoch": 0.2914294153361448, + "grad_norm": 0.26441462179572645, + "kl": 0.100274658203125, + "learning_rate": 1.331174638856778e-06, + "loss": 0.0297, + "num_tokens": 479894156.0, + "reward": 1.2019206285476685, + "reward_std": 0.12067270874977112, + "rewards/format_reward/mean": 0.9927083492279053, + "rewards/format_reward/std": 0.08357844054698944, + "rewards/mcq_accuracy_reward/mean": 0.7057291626930237, + "rewards/mcq_accuracy_reward/std": 0.45124209523200987, + "rewards/tag_count_reward/mean": 0.9920572876930237, + "rewards/tag_count_reward/std": 0.044434083998203276, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4152.6, + "completions/max_terminated_length": 4152.6, + "completions/mean_length": 904.1000244140625, + "completions/mean_terminated_length": 904.1000244140625, + "completions/min_length": 189.8, + "completions/min_terminated_length": 189.8, + "epoch": 0.29274216045027157, + "grad_norm": 0.35755153158816827, + "kl": 0.09925537109375, + "learning_rate": 1.3175145417579778e-06, + "loss": 0.0277, + "num_tokens": 481928604.0, + "reward": 1.1956706047058105, + "reward_std": 0.1424822062253952, + "rewards/format_reward/mean": 0.9911458373069764, + "rewards/format_reward/std": 0.09162984490394592, + "rewards/mcq_accuracy_reward/mean": 0.7005208253860473, + "rewards/mcq_accuracy_reward/std": 0.45600619316101076, + "rewards/tag_count_reward/mean": 0.989453125, + "rewards/tag_count_reward/std": 0.05053408965468407, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4631.4, + "completions/max_terminated_length": 3726.2, + "completions/mean_length": 862.3536743164062, + "completions/mean_terminated_length": 858.5463500976563, + "completions/min_length": 209.8, + "completions/min_terminated_length": 209.8, + "epoch": 0.29405490556439834, + "grad_norm": 0.30078468112358725, + "kl": 0.1020263671875, + "learning_rate": 1.303869789071056e-06, + "loss": 0.0279, + "num_tokens": 483886123.0, + "reward": 1.1878581285476684, + "reward_std": 0.15787606835365295, + "rewards/format_reward/mean": 0.9869791626930237, + "rewards/format_reward/std": 0.11229306608438491, + "rewards/mcq_accuracy_reward/mean": 0.6932291626930237, + "rewards/mcq_accuracy_reward/std": 0.46140028834342955, + "rewards/tag_count_reward/mean": 0.9915364503860473, + "rewards/tag_count_reward/std": 0.04553716480731964, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3956.4, + "completions/max_terminated_length": 3956.4, + "completions/mean_length": 846.9072998046875, + "completions/mean_terminated_length": 846.9072998046875, + "completions/min_length": 213.6, + "completions/min_terminated_length": 213.6, + "epoch": 0.2953676506785251, + "grad_norm": 0.33255686175964827, + "kl": 0.09791259765625, + "learning_rate": 1.2902415281241725e-06, + "loss": 0.0366, + "num_tokens": 485811977.0, + "reward": 1.164453148841858, + "reward_std": 0.14701950252056123, + "rewards/format_reward/mean": 0.9911458373069764, + "rewards/format_reward/std": 0.08979112505912781, + "rewards/mcq_accuracy_reward/mean": 0.6692708492279053, + "rewards/mcq_accuracy_reward/std": 0.46931456923484804, + "rewards/tag_count_reward/mean": 0.9895833373069763, + "rewards/tag_count_reward/std": 0.05044736191630363, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3750.0, + "completions/max_terminated_length": 3750.0, + "completions/mean_length": 870.0198120117187, + "completions/mean_terminated_length": 870.0198120117187, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.29668039579265193, + "grad_norm": 0.28920552791834225, + "kl": 0.097833251953125, + "learning_rate": 1.2766309048587689e-06, + "loss": 0.0306, + "num_tokens": 487779143.0, + "reward": 1.2136393308639526, + "reward_std": 0.1202643096446991, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.08558243215084076, + "rewards/mcq_accuracy_reward/mean": 0.7182291626930237, + "rewards/mcq_accuracy_reward/std": 0.4479684352874756, + "rewards/tag_count_reward/mean": 0.9899739623069763, + "rewards/tag_count_reward/std": 0.0489795096218586, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5059.8, + "completions/max_terminated_length": 4232.4, + "completions/mean_length": 967.6573120117188, + "completions/mean_terminated_length": 963.84794921875, + "completions/min_length": 233.2, + "completions/min_terminated_length": 233.2, + "epoch": 0.2979931409067787, + "grad_norm": 0.28966310055274447, + "kl": 0.095050048828125, + "learning_rate": 1.263039063733212e-06, + "loss": 0.0322, + "num_tokens": 489935597.0, + "reward": 1.196679711341858, + "reward_std": 0.13751293271780013, + "rewards/format_reward/mean": 0.9901041626930237, + "rewards/format_reward/std": 0.0986298143863678, + "rewards/mcq_accuracy_reward/mean": 0.7015625, + "rewards/mcq_accuracy_reward/std": 0.4548418581485748, + "rewards/tag_count_reward/mean": 0.9903645753860474, + "rewards/tag_count_reward/std": 0.048744142055511475, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3950.2, + "completions/max_terminated_length": 3950.2, + "completions/mean_length": 914.4963745117187, + "completions/mean_terminated_length": 914.4963745117187, + "completions/min_length": 234.8, + "completions/min_terminated_length": 234.8, + "epoch": 0.29930588602090547, + "grad_norm": 0.299714600368771, + "kl": 0.09405517578125, + "learning_rate": 1.249467147626558e-06, + "loss": 0.0207, + "num_tokens": 491988614.0, + "reward": 1.2069662094116211, + "reward_std": 0.12101843655109405, + "rewards/format_reward/mean": 0.9869791746139527, + "rewards/format_reward/std": 0.10958365797996521, + "rewards/mcq_accuracy_reward/mean": 0.7130208373069763, + "rewards/mcq_accuracy_reward/std": 0.44965891242027284, + "rewards/tag_count_reward/mean": 0.9888020873069763, + "rewards/tag_count_reward/std": 0.05231942683458328, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5572.0, + "completions/max_terminated_length": 4058.2, + "completions/mean_length": 926.9687744140625, + "completions/mean_terminated_length": 919.4324096679687, + "completions/min_length": 234.2, + "completions/min_terminated_length": 234.2, + "epoch": 0.30061863113503223, + "grad_norm": 0.251132111836919, + "kl": 0.09281005859375, + "learning_rate": 1.2359162977424545e-06, + "loss": 0.0397, + "num_tokens": 494065226.0, + "reward": 1.2354166984558106, + "reward_std": 0.12176204025745392, + "rewards/format_reward/mean": 0.9885416746139526, + "rewards/format_reward/std": 0.10311658084392547, + "rewards/mcq_accuracy_reward/mean": 0.740625, + "rewards/mcq_accuracy_reward/std": 0.43262937664985657, + "rewards/tag_count_reward/mean": 0.990625, + "rewards/tag_count_reward/std": 0.049148285388946535, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4279.4, + "completions/max_terminated_length": 4279.4, + "completions/mean_length": 988.9099243164062, + "completions/mean_terminated_length": 988.9099243164062, + "completions/min_length": 241.8, + "completions/min_terminated_length": 241.8, + "epoch": 0.301931376249159, + "grad_norm": 0.2874075192803983, + "kl": 0.088531494140625, + "learning_rate": 1.2223876535131837e-06, + "loss": 0.0282, + "num_tokens": 496259701.0, + "reward": 1.1604492664337158, + "reward_std": 0.1415869861841202, + "rewards/format_reward/mean": 0.995312488079071, + "rewards/format_reward/std": 0.06588152945041656, + "rewards/mcq_accuracy_reward/mean": 0.6640625119209289, + "rewards/mcq_accuracy_reward/std": 0.4699177861213684, + "rewards/tag_count_reward/mean": 0.990234375, + "rewards/tag_count_reward/std": 0.04840839058160782, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5687.2, + "completions/max_terminated_length": 4687.4, + "completions/mean_length": 941.8541748046875, + "completions/mean_terminated_length": 938.0679565429688, + "completions/min_length": 238.8, + "completions/min_terminated_length": 238.8, + "epoch": 0.3032441213632858, + "grad_norm": 0.24995204018323447, + "kl": 0.088568115234375, + "learning_rate": 1.2088823525038488e-06, + "loss": 0.0246, + "num_tokens": 498368509.0, + "reward": 1.177441453933716, + "reward_std": 0.1310253456234932, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.05988401472568512, + "rewards/mcq_accuracy_reward/mean": 0.6807291626930236, + "rewards/mcq_accuracy_reward/std": 0.4657273292541504, + "rewards/tag_count_reward/mean": 0.9915364623069763, + "rewards/tag_count_reward/std": 0.04745750948786735, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 5310.6, + "completions/max_terminated_length": 4572.4, + "completions/mean_length": 922.123974609375, + "completions/mean_terminated_length": 918.2950439453125, + "completions/min_length": 207.6, + "completions/min_terminated_length": 207.6, + "epoch": 0.3045568664774126, + "grad_norm": 0.2560795563981496, + "kl": 0.092950439453125, + "learning_rate": 1.1954015303167226e-06, + "loss": 0.0295, + "num_tokens": 500436299.0, + "reward": 1.2002279043197632, + "reward_std": 0.1334495574235916, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.08667475283145905, + "rewards/mcq_accuracy_reward/mean": 0.7041666626930236, + "rewards/mcq_accuracy_reward/std": 0.45614818334579466, + "rewards/tag_count_reward/mean": 0.992578125, + "rewards/tag_count_reward/std": 0.04358335807919502, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4696.8, + "completions/max_terminated_length": 4696.8, + "completions/mean_length": 879.1088745117188, + "completions/mean_terminated_length": 879.1088745117188, + "completions/min_length": 205.6, + "completions/min_terminated_length": 205.6, + "epoch": 0.30586961159153936, + "grad_norm": 0.31978793470045697, + "kl": 0.0942626953125, + "learning_rate": 1.1819463204957599e-06, + "loss": 0.0392, + "num_tokens": 502419660.0, + "reward": 1.195149803161621, + "reward_std": 0.13009956032037734, + "rewards/format_reward/mean": 0.9942708373069763, + "rewards/format_reward/std": 0.06679356694221497, + "rewards/mcq_accuracy_reward/mean": 0.698437511920929, + "rewards/mcq_accuracy_reward/std": 0.4529357612133026, + "rewards/tag_count_reward/mean": 0.992578125, + "rewards/tag_count_reward/std": 0.0423842042684555, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015624999999999778, + "completions/max_length": 6250.8, + "completions/max_terminated_length": 3782.8, + "completions/mean_length": 872.009912109375, + "completions/mean_terminated_length": 860.6198974609375, + "completions/min_length": 198.2, + "completions/min_terminated_length": 198.2, + "epoch": 0.3071823567056661, + "grad_norm": 0.29161469623824227, + "kl": 0.09508056640625, + "learning_rate": 1.168517854431284e-06, + "loss": 0.0387, + "num_tokens": 504388799.0, + "reward": 1.2068034410476685, + "reward_std": 0.13980658054351808, + "rewards/format_reward/mean": 0.9885416746139526, + "rewards/format_reward/std": 0.10216155499219895, + "rewards/mcq_accuracy_reward/mean": 0.7119791626930236, + "rewards/mcq_accuracy_reward/std": 0.45085117816925047, + "rewards/tag_count_reward/mean": 0.9907552123069763, + "rewards/tag_count_reward/std": 0.050213661044836044, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.000520833333333326, + "completions/max_length": 4886.8, + "completions/max_terminated_length": 4418.4, + "completions/mean_length": 877.2281494140625, + "completions/mean_terminated_length": 873.3738891601563, + "completions/min_length": 146.8, + "completions/min_terminated_length": 146.8, + "epoch": 0.3084951018197929, + "grad_norm": 0.2508499764133376, + "kl": 0.096417236328125, + "learning_rate": 1.1551172612648498e-06, + "loss": 0.0401, + "num_tokens": 506367589.0, + "reward": 1.1960612297058106, + "reward_std": 0.15048128068447114, + "rewards/format_reward/mean": 0.9880208253860474, + "rewards/format_reward/std": 0.10596014261245727, + "rewards/mcq_accuracy_reward/mean": 0.7015625, + "rewards/mcq_accuracy_reward/std": 0.4549172818660736, + "rewards/tag_count_reward/mean": 0.9899739503860474, + "rewards/tag_count_reward/std": 0.052994024753570554, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.002083333333333326, + "completions/max_length": 6358.4, + "completions/max_terminated_length": 4897.2, + "completions/mean_length": 886.8026245117187, + "completions/mean_terminated_length": 871.60263671875, + "completions/min_length": 190.2, + "completions/min_terminated_length": 190.2, + "epoch": 0.3098078469339197, + "grad_norm": 0.29417987849227795, + "kl": 0.09688720703125, + "learning_rate": 1.1417456677943009e-06, + "loss": 0.0422, + "num_tokens": 508365546.0, + "reward": 1.2189779043197633, + "reward_std": 0.13087325543165207, + "rewards/format_reward/mean": 0.9848958373069763, + "rewards/format_reward/std": 0.12030345052480698, + "rewards/mcq_accuracy_reward/mean": 0.725000011920929, + "rewards/mcq_accuracy_reward/std": 0.44554378390312194, + "rewards/tag_count_reward/mean": 0.991015625, + "rewards/tag_count_reward/std": 0.054814372956752774, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 6994.0, + "completions/max_terminated_length": 5690.8, + "completions/mean_length": 959.602099609375, + "completions/mean_terminated_length": 948.2137939453125, + "completions/min_length": 228.4, + "completions/min_terminated_length": 228.4, + "epoch": 0.3111205920480465, + "grad_norm": 0.3008724063490892, + "kl": 0.095574951171875, + "learning_rate": 1.128404198379024e-06, + "loss": 0.0466, + "num_tokens": 510506974.0, + "reward": 1.2033854722976685, + "reward_std": 0.14370514154434205, + "rewards/format_reward/mean": 0.989062488079071, + "rewards/format_reward/std": 0.1034646674990654, + "rewards/mcq_accuracy_reward/mean": 0.7088541626930237, + "rewards/mcq_accuracy_reward/std": 0.4540067434310913, + "rewards/tag_count_reward/mean": 0.9890625, + "rewards/tag_count_reward/std": 0.05478756055235863, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0026041666666666297, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4056.6, + "completions/mean_length": 941.2953369140625, + "completions/mean_terminated_length": 922.3639770507813, + "completions/min_length": 203.2, + "completions/min_terminated_length": 203.2, + "epoch": 0.31243333716217325, + "grad_norm": 0.23956552212047857, + "kl": 0.0982666015625, + "learning_rate": 1.1150939748454025e-06, + "loss": 0.0418, + "num_tokens": 512613829.0, + "reward": 1.2209961414337158, + "reward_std": 0.1391269087791443, + "rewards/format_reward/mean": 0.9859375, + "rewards/format_reward/std": 0.11754343211650849, + "rewards/mcq_accuracy_reward/mean": 0.7276041746139527, + "rewards/mcq_accuracy_reward/std": 0.44260039925575256, + "rewards/tag_count_reward/mean": 0.9876302003860473, + "rewards/tag_count_reward/std": 0.06373499035835266, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015624999999999778, + "completions/max_length": 6174.0, + "completions/max_terminated_length": 3870.2, + "completions/mean_length": 869.4922119140625, + "completions/mean_terminated_length": 858.0418334960938, + "completions/min_length": 212.6, + "completions/min_terminated_length": 212.6, + "epoch": 0.3137460822763, + "grad_norm": 0.2780316798852266, + "kl": 0.1012298583984375, + "learning_rate": 1.1018161163924888e-06, + "loss": 0.0438, + "num_tokens": 514580046.0, + "reward": 1.1597656607627869, + "reward_std": 0.14694774746894837, + "rewards/format_reward/mean": 0.9864583373069763, + "rewards/format_reward/std": 0.11466304063796998, + "rewards/mcq_accuracy_reward/mean": 0.665625, + "rewards/mcq_accuracy_reward/std": 0.4638380169868469, + "rewards/tag_count_reward/mean": 0.9901041746139526, + "rewards/tag_count_reward/std": 0.05345030203461647, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 7206.8, + "completions/max_terminated_length": 3790.0, + "completions/mean_length": 842.1812744140625, + "completions/mean_terminated_length": 819.1089477539062, + "completions/min_length": 231.2, + "completions/min_terminated_length": 231.2, + "epoch": 0.3150588273904268, + "grad_norm": 0.2374488842954211, + "kl": 0.104351806640625, + "learning_rate": 1.0885717394978982e-06, + "loss": 0.0589, + "num_tokens": 516502810.0, + "reward": 1.2016601800918578, + "reward_std": 0.1265686884522438, + "rewards/format_reward/mean": 0.9895833253860473, + "rewards/format_reward/std": 0.1007638081908226, + "rewards/mcq_accuracy_reward/mean": 0.7067708373069763, + "rewards/mcq_accuracy_reward/std": 0.4535964548587799, + "rewards/tag_count_reward/mean": 0.9899739623069763, + "rewards/tag_count_reward/std": 0.054083701223134995, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0026041666666666518, + "completions/max_length": 6578.8, + "completions/max_terminated_length": 4280.4, + "completions/mean_length": 875.673974609375, + "completions/mean_terminated_length": 856.6267211914062, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.3163715725045536, + "grad_norm": 0.3187040271359626, + "kl": 0.100946044921875, + "learning_rate": 1.0753619578239243e-06, + "loss": 0.0453, + "num_tokens": 518483096.0, + "reward": 1.175227904319763, + "reward_std": 0.1521242767572403, + "rewards/format_reward/mean": 0.9895833253860473, + "rewards/format_reward/std": 0.09950171709060669, + "rewards/mcq_accuracy_reward/mean": 0.6807291626930236, + "rewards/mcq_accuracy_reward/std": 0.4634882271289825, + "rewards/tag_count_reward/mean": 0.9884114623069763, + "rewards/tag_count_reward/std": 0.05796384140849113, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 5600.2, + "completions/max_terminated_length": 4714.0, + "completions/mean_length": 868.8469116210938, + "completions/mean_terminated_length": 857.5214599609375, + "completions/min_length": 205.8, + "completions/min_terminated_length": 205.8, + "epoch": 0.3176843176186804, + "grad_norm": 0.3427318737359656, + "kl": 0.1036865234375, + "learning_rate": 1.0621878821238985e-06, + "loss": 0.037, + "num_tokens": 520451386.0, + "reward": 1.1977539539337159, + "reward_std": 0.14127858132123947, + "rewards/format_reward/mean": 0.9916666626930237, + "rewards/format_reward/std": 0.0908581256866455, + "rewards/mcq_accuracy_reward/mean": 0.7026041626930237, + "rewards/mcq_accuracy_reward/std": 0.45714895725250243, + "rewards/tag_count_reward/mean": 0.9889322996139527, + "rewards/tag_count_reward/std": 0.0535339318215847, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001041666666666652, + "completions/max_length": 5354.4, + "completions/max_terminated_length": 4611.4, + "completions/mean_length": 868.8078247070313, + "completions/mean_terminated_length": 861.2146728515625, + "completions/min_length": 227.6, + "completions/min_terminated_length": 227.6, + "epoch": 0.31899706273280715, + "grad_norm": 0.3107578809190733, + "kl": 0.096490478515625, + "learning_rate": 1.0490506201487941e-06, + "loss": 0.0444, + "num_tokens": 522426057.0, + "reward": 1.199641966819763, + "reward_std": 0.13278701901435852, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.04303459823131561, + "rewards/mcq_accuracy_reward/mean": 0.7036458373069763, + "rewards/mcq_accuracy_reward/std": 0.4550426959991455, + "rewards/tag_count_reward/mean": 0.988671886920929, + "rewards/tag_count_reward/std": 0.05410802438855171, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 4777.0, + "completions/max_terminated_length": 4190.2, + "completions/mean_length": 879.5640869140625, + "completions/mean_terminated_length": 868.2216918945312, + "completions/min_length": 216.6, + "completions/min_terminated_length": 216.6, + "epoch": 0.3203098078469339, + "grad_norm": 0.2729092366585053, + "kl": 0.093463134765625, + "learning_rate": 1.0359512765540745e-06, + "loss": 0.0323, + "num_tokens": 524421956.0, + "reward": 1.1787109851837159, + "reward_std": 0.14528779685497284, + "rewards/format_reward/mean": 0.9880208492279052, + "rewards/format_reward/std": 0.10605341792106629, + "rewards/mcq_accuracy_reward/mean": 0.684375, + "rewards/mcq_accuracy_reward/std": 0.46437373757362366, + "rewards/tag_count_reward/mean": 0.9893229246139527, + "rewards/tag_count_reward/std": 0.05656399056315422, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 8063.0, + "completions/max_terminated_length": 6696.8, + "completions/mean_length": 946.0468872070312, + "completions/mean_terminated_length": 923.4013305664063, + "completions/min_length": 192.6, + "completions/min_terminated_length": 192.6, + "epoch": 0.3216225529610607, + "grad_norm": 0.32276750218379463, + "kl": 0.0928375244140625, + "learning_rate": 1.022890952806812e-06, + "loss": 0.0683, + "num_tokens": 526540982.0, + "reward": 1.2102864980697632, + "reward_std": 0.15063133835792542, + "rewards/format_reward/mean": 0.979687488079071, + "rewards/format_reward/std": 0.1406083106994629, + "rewards/mcq_accuracy_reward/mean": 0.7182291746139526, + "rewards/mcq_accuracy_reward/std": 0.44776550531387327, + "rewards/tag_count_reward/mean": 0.9885416626930237, + "rewards/tag_count_reward/std": 0.06407605335116387, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003645833333333326, + "completions/max_length": 7912.6, + "completions/max_terminated_length": 5848.8, + "completions/mean_length": 962.5630493164062, + "completions/mean_terminated_length": 936.0208618164063, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.32293529807518745, + "grad_norm": 0.24959467950956737, + "kl": 0.0869140625, + "learning_rate": 1.00987074709307e-06, + "loss": 0.0796, + "num_tokens": 528681007.0, + "reward": 1.1607422351837158, + "reward_std": 0.13493019640445708, + "rewards/format_reward/mean": 0.9781250119209289, + "rewards/format_reward/std": 0.1425768494606018, + "rewards/mcq_accuracy_reward/mean": 0.66875, + "rewards/mcq_accuracy_reward/std": 0.4674504935741425, + "rewards/tag_count_reward/mean": 0.98984375, + "rewards/tag_count_reward/std": 0.062356358766555785, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0046875, + "completions/max_length": 7972.4, + "completions/max_terminated_length": 4710.0, + "completions/mean_length": 892.7922119140625, + "completions/mean_terminated_length": 858.3798706054688, + "completions/min_length": 221.4, + "completions/min_terminated_length": 221.4, + "epoch": 0.3242480431893143, + "grad_norm": 0.25198159577092044, + "kl": 0.094195556640625, + "learning_rate": 9.968917542255588e-07, + "loss": 0.0769, + "num_tokens": 530695816.0, + "reward": 1.2774739980697631, + "reward_std": 0.1336188346147537, + "rewards/format_reward/mean": 0.9791666746139527, + "rewards/format_reward/std": 0.14154193848371505, + "rewards/mcq_accuracy_reward/mean": 0.7854166626930237, + "rewards/mcq_accuracy_reward/std": 0.4092969536781311, + "rewards/tag_count_reward/mean": 0.989062488079071, + "rewards/tag_count_reward/std": 0.060338548570871356, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003645833333333326, + "completions/max_length": 7312.0, + "completions/max_terminated_length": 4752.0, + "completions/mean_length": 933.7510498046875, + "completions/mean_terminated_length": 907.3432495117188, + "completions/min_length": 222.8, + "completions/min_terminated_length": 222.8, + "epoch": 0.32556078830344104, + "grad_norm": 0.29569058286986494, + "kl": 5.2160400390625, + "learning_rate": 9.839550655515791e-07, + "loss": 0.2584, + "num_tokens": 532789162.0, + "reward": 1.1652995109558106, + "reward_std": 0.15169093906879424, + "rewards/format_reward/mean": 0.98125, + "rewards/format_reward/std": 0.13063163161277772, + "rewards/mcq_accuracy_reward/mean": 0.6729166626930236, + "rewards/mcq_accuracy_reward/std": 0.46670416593551634, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.06279756501317024, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005208333333333348, + "completions/max_length": 7426.0, + "completions/max_terminated_length": 4109.6, + "completions/mean_length": 924.5833618164063, + "completions/mean_terminated_length": 886.5406372070313, + "completions/min_length": 220.6, + "completions/min_terminated_length": 220.6, + "epoch": 0.3268735334175678, + "grad_norm": 0.27586831362869235, + "kl": 0.091607666015625, + "learning_rate": 9.710617688612552e-07, + "loss": 0.0488, + "num_tokens": 534860826.0, + "reward": 1.158398485183716, + "reward_std": 0.15814161002635957, + "rewards/format_reward/mean": 0.9833333253860473, + "rewards/format_reward/std": 0.1265550136566162, + "rewards/mcq_accuracy_reward/mean": 0.6651041626930236, + "rewards/mcq_accuracy_reward/std": 0.4707404851913452, + "rewards/tag_count_reward/mean": 0.989843738079071, + "rewards/tag_count_reward/std": 0.058845020830631256, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666652, + "completions/max_length": 6798.6, + "completions/max_terminated_length": 5609.0, + "completions/mean_length": 953.72763671875, + "completions/mean_terminated_length": 923.44287109375, + "completions/min_length": 210.2, + "completions/min_terminated_length": 210.2, + "epoch": 0.3281862785316946, + "grad_norm": 0.26792583573870105, + "kl": 0.090447998046875, + "learning_rate": 9.582129482960685e-07, + "loss": 0.068, + "num_tokens": 536986111.0, + "reward": 1.2064453601837157, + "reward_std": 0.13627744168043138, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.11964005529880524, + "rewards/mcq_accuracy_reward/mean": 0.7130208373069763, + "rewards/mcq_accuracy_reward/std": 0.45226293206214907, + "rewards/tag_count_reward/mean": 0.9893229246139527, + "rewards/tag_count_reward/std": 0.060405439138412474, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5108.6, + "completions/mean_length": 949.3375244140625, + "completions/mean_terminated_length": 919.0989990234375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.32949902364582134, + "grad_norm": 0.2599652137309011, + "kl": 0.091217041015625, + "learning_rate": 9.454096842576943e-07, + "loss": 0.0575, + "num_tokens": 539105223.0, + "reward": 1.1193685054779052, + "reward_std": 0.17008641958236695, + "rewards/format_reward/mean": 0.9802083373069763, + "rewards/format_reward/std": 0.13821131885051727, + "rewards/mcq_accuracy_reward/mean": 0.6270833253860474, + "rewards/mcq_accuracy_reward/std": 0.482833993434906, + "rewards/tag_count_reward/mean": 0.9889322876930237, + "rewards/tag_count_reward/std": 0.061086802184581755, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666652, + "completions/max_length": 7338.0, + "completions/max_terminated_length": 4784.0, + "completions/mean_length": 914.6682495117187, + "completions/mean_terminated_length": 884.18369140625, + "completions/min_length": 251.8, + "completions/min_terminated_length": 251.8, + "epoch": 0.33081176875994817, + "grad_norm": 0.29105984859318607, + "kl": 0.093768310546875, + "learning_rate": 9.326530533171565e-07, + "loss": 0.0626, + "num_tokens": 541160074.0, + "reward": 1.206966185569763, + "reward_std": 0.15963056981563567, + "rewards/format_reward/mean": 0.985937488079071, + "rewards/format_reward/std": 0.11171629428863525, + "rewards/mcq_accuracy_reward/mean": 0.7130208373069763, + "rewards/mcq_accuracy_reward/std": 0.4436314463615417, + "rewards/tag_count_reward/mean": 0.98984375, + "rewards/tag_count_reward/std": 0.05682651698589325, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 5652.2, + "completions/max_terminated_length": 4907.6, + "completions/mean_length": 908.7515747070313, + "completions/mean_terminated_length": 897.3482421875, + "completions/min_length": 190.4, + "completions/min_terminated_length": 190.4, + "epoch": 0.33212451387407493, + "grad_norm": 0.24651265071049852, + "kl": 0.095489501953125, + "learning_rate": 9.199441281243063e-07, + "loss": 0.0746, + "num_tokens": 543202357.0, + "reward": 1.1495768547058105, + "reward_std": 0.1556103855371475, + "rewards/format_reward/mean": 0.979687511920929, + "rewards/format_reward/std": 0.13930579125881196, + "rewards/mcq_accuracy_reward/mean": 0.6578125, + "rewards/mcq_accuracy_reward/std": 0.47418121695518495, + "rewards/tag_count_reward/mean": 0.9873697876930236, + "rewards/tag_count_reward/std": 0.06287725642323494, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4990.4, + "completions/mean_length": 935.3724243164063, + "completions/mean_terminated_length": 905.0205688476562, + "completions/min_length": 222.6, + "completions/min_terminated_length": 222.6, + "epoch": 0.3334372589882017, + "grad_norm": 0.2772245507619063, + "kl": 0.0914154052734375, + "learning_rate": 9.072839773176228e-07, + "loss": 0.0653, + "num_tokens": 545299272.0, + "reward": 1.211848998069763, + "reward_std": 0.17000419050455093, + "rewards/format_reward/mean": 0.979687511920929, + "rewards/format_reward/std": 0.13882749080657958, + "rewards/mcq_accuracy_reward/mean": 0.7197916746139527, + "rewards/mcq_accuracy_reward/std": 0.4474457621574402, + "rewards/tag_count_reward/mean": 0.9885416865348816, + "rewards/tag_count_reward/std": 0.06225705593824386, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005729166666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5402.4, + "completions/mean_length": 964.2307495117187, + "completions/mean_terminated_length": 922.6523071289063, + "completions/min_length": 222.8, + "completions/min_terminated_length": 222.8, + "epoch": 0.33475000410232847, + "grad_norm": 0.29033593259073814, + "kl": 0.095770263671875, + "learning_rate": 8.946736654343579e-07, + "loss": 0.0739, + "num_tokens": 547452475.0, + "reward": 1.161360740661621, + "reward_std": 0.15833270251750947, + "rewards/format_reward/mean": 0.9739583373069763, + "rewards/format_reward/std": 0.15878624022006987, + "rewards/mcq_accuracy_reward/mean": 0.6708333373069764, + "rewards/mcq_accuracy_reward/std": 0.47031649947166443, + "rewards/tag_count_reward/mean": 0.9881510376930237, + "rewards/tag_count_reward/std": 0.06445005685091018, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003645833333333348, + "completions/max_length": 7484.4, + "completions/max_terminated_length": 4588.4, + "completions/mean_length": 918.8213745117188, + "completions/mean_terminated_length": 892.1643676757812, + "completions/min_length": 232.2, + "completions/min_terminated_length": 232.2, + "epoch": 0.33606274921645524, + "grad_norm": 0.3364980744646374, + "kl": 0.091497802734375, + "learning_rate": 8.821142528210268e-07, + "loss": 0.0681, + "num_tokens": 549513484.0, + "reward": 1.2004557609558106, + "reward_std": 0.1418253093957901, + "rewards/format_reward/mean": 0.9817708373069763, + "rewards/format_reward/std": 0.13287073373794556, + "rewards/mcq_accuracy_reward/mean": 0.7083333253860473, + "rewards/mcq_accuracy_reward/std": 0.446407163143158, + "rewards/tag_count_reward/mean": 0.98671875, + "rewards/tag_count_reward/std": 0.06684387996792793, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.002604166666666674, + "completions/max_length": 6607.2, + "completions/max_terminated_length": 5258.4, + "completions/mean_length": 961.4552368164062, + "completions/mean_terminated_length": 942.6665161132812, + "completions/min_length": 239.8, + "completions/min_terminated_length": 239.8, + "epoch": 0.33737549433058206, + "grad_norm": 0.28823208295015845, + "kl": 0.0906341552734375, + "learning_rate": 8.696067955442435e-07, + "loss": 0.0656, + "num_tokens": 551657926.0, + "reward": 1.233593797683716, + "reward_std": 0.15817490816116334, + "rewards/format_reward/mean": 0.978125, + "rewards/format_reward/std": 0.13853369355201722, + "rewards/mcq_accuracy_reward/mean": 0.7416666746139526, + "rewards/mcq_accuracy_reward/std": 0.4366734981536865, + "rewards/tag_count_reward/mean": 0.9895833253860473, + "rewards/tag_count_reward/std": 0.056377878040075304, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.002604166666666674, + "completions/max_length": 6340.2, + "completions/max_terminated_length": 4677.4, + "completions/mean_length": 944.58701171875, + "completions/mean_terminated_length": 925.6741088867187, + "completions/min_length": 203.4, + "completions/min_terminated_length": 203.4, + "epoch": 0.3386882394447088, + "grad_norm": 0.278642649842979, + "kl": 0.0903656005859375, + "learning_rate": 8.571523453019224e-07, + "loss": 0.055, + "num_tokens": 553766173.0, + "reward": 1.1909180164337159, + "reward_std": 0.14857401698827744, + "rewards/format_reward/mean": 0.9776041746139527, + "rewards/format_reward/std": 0.1469006657600403, + "rewards/mcq_accuracy_reward/mean": 0.6989583373069763, + "rewards/mcq_accuracy_reward/std": 0.4565257132053375, + "rewards/tag_count_reward/mean": 0.9902343630790711, + "rewards/tag_count_reward/std": 0.056921444833278656, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005729166666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5064.0, + "completions/mean_length": 938.3146240234375, + "completions/mean_terminated_length": 896.4767944335938, + "completions/min_length": 208.6, + "completions/min_terminated_length": 208.6, + "epoch": 0.3400009845588356, + "grad_norm": 0.25208648159041736, + "kl": 0.0884063720703125, + "learning_rate": 8.447519493348479e-07, + "loss": 0.0881, + "num_tokens": 555861857.0, + "reward": 1.2301107168197631, + "reward_std": 0.14678625762462616, + "rewards/format_reward/mean": 0.975, + "rewards/format_reward/std": 0.15619019865989686, + "rewards/mcq_accuracy_reward/mean": 0.7385416626930237, + "rewards/mcq_accuracy_reward/std": 0.4386380910873413, + "rewards/tag_count_reward/mean": 0.9912760376930236, + "rewards/tag_count_reward/std": 0.05794672966003418, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.007291666666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3937.0, + "completions/mean_length": 935.6437744140625, + "completions/mean_terminated_length": 882.5367553710937, + "completions/min_length": 222.6, + "completions/min_terminated_length": 222.6, + "epoch": 0.34131372967296236, + "grad_norm": 0.2702087798215732, + "kl": 0.087200927734375, + "learning_rate": 8.324066503386131e-07, + "loss": 0.0881, + "num_tokens": 557954565.0, + "reward": 1.1867513418197633, + "reward_std": 0.18401645720005036, + "rewards/format_reward/mean": 0.965625011920929, + "rewards/format_reward/std": 0.18140285909175874, + "rewards/mcq_accuracy_reward/mean": 0.6979166626930237, + "rewards/mcq_accuracy_reward/std": 0.4561771512031555, + "rewards/tag_count_reward/mean": 0.9897135496139526, + "rewards/tag_count_reward/std": 0.06069606617093086, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.006770833333333326, + "completions/max_length": 7352.8, + "completions/max_terminated_length": 5800.2, + "completions/mean_length": 909.8599243164062, + "completions/mean_terminated_length": 860.2773803710937, + "completions/min_length": 239.2, + "completions/min_terminated_length": 239.2, + "epoch": 0.34262647478708913, + "grad_norm": 0.3031766423866328, + "kl": 0.0894775390625, + "learning_rate": 8.201174863759449e-07, + "loss": 0.0868, + "num_tokens": 559998504.0, + "reward": 1.1915364742279053, + "reward_std": 0.1640061616897583, + "rewards/format_reward/mean": 0.9703125, + "rewards/format_reward/std": 0.16941283643245697, + "rewards/mcq_accuracy_reward/mean": 0.7020833373069764, + "rewards/mcq_accuracy_reward/std": 0.453474223613739, + "rewards/tag_count_reward/mean": 0.9875, + "rewards/tag_count_reward/std": 0.07210705354809761, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0057291666666666515, + "completions/max_length": 7233.0, + "completions/max_terminated_length": 4231.0, + "completions/mean_length": 870.1015747070312, + "completions/mean_terminated_length": 827.9660888671875, + "completions/min_length": 219.2, + "completions/min_terminated_length": 219.2, + "epoch": 0.34393921990121595, + "grad_norm": 0.3675226209480633, + "kl": 0.094219970703125, + "learning_rate": 8.078854907894204e-07, + "loss": 0.0892, + "num_tokens": 561964995.0, + "reward": 1.1995768547058105, + "reward_std": 0.14960525333881378, + "rewards/format_reward/mean": 0.9677083373069764, + "rewards/format_reward/std": 0.1751413345336914, + "rewards/mcq_accuracy_reward/mean": 0.7098958253860473, + "rewards/mcq_accuracy_reward/std": 0.4518738627433777, + "rewards/tag_count_reward/mean": 0.991015636920929, + "rewards/tag_count_reward/std": 0.05426904633641243, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005208333333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4718.8, + "completions/mean_length": 877.6682495117187, + "completions/mean_terminated_length": 839.4986206054688, + "completions/min_length": 207.2, + "completions/min_terminated_length": 207.2, + "epoch": 0.3452519650153427, + "grad_norm": 0.3092371466318724, + "kl": 0.091790771484375, + "learning_rate": 7.957116921145737e-07, + "loss": 0.0757, + "num_tokens": 563943550.0, + "reward": 1.1806966543197632, + "reward_std": 0.1582411915063858, + "rewards/format_reward/mean": 0.975, + "rewards/format_reward/std": 0.15468841791152954, + "rewards/mcq_accuracy_reward/mean": 0.689062488079071, + "rewards/mcq_accuracy_reward/std": 0.4607856452465057, + "rewards/tag_count_reward/mean": 0.9915364623069763, + "rewards/tag_count_reward/std": 0.05511779636144638, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666652, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5550.4, + "completions/mean_length": 931.4385620117188, + "completions/mean_terminated_length": 901.1396606445312, + "completions/min_length": 207.2, + "completions/min_terminated_length": 207.2, + "epoch": 0.3465647101294695, + "grad_norm": 0.3245285230859717, + "kl": 0.0923675537109375, + "learning_rate": 7.835971139934133e-07, + "loss": 0.0954, + "num_tokens": 566025912.0, + "reward": 1.1934896230697631, + "reward_std": 0.16996956020593643, + "rewards/format_reward/mean": 0.9692708373069763, + "rewards/format_reward/std": 0.17073639035224913, + "rewards/mcq_accuracy_reward/mean": 0.7041666746139527, + "rewards/mcq_accuracy_reward/std": 0.4530996441841125, + "rewards/tag_count_reward/mean": 0.9880208373069763, + "rewards/tag_count_reward/std": 0.0650701068341732, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.010416666666666652, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5809.2, + "completions/mean_length": 938.5276245117187, + "completions/mean_terminated_length": 862.0428344726563, + "completions/min_length": 194.6, + "completions/min_terminated_length": 194.6, + "epoch": 0.34787745524359626, + "grad_norm": 0.2797815078559725, + "kl": 0.094732666015625, + "learning_rate": 7.715427750883467e-07, + "loss": 0.1295, + "num_tokens": 568123037.0, + "reward": 1.1935872793197633, + "reward_std": 0.17868755459785463, + "rewards/format_reward/mean": 0.9666666746139526, + "rewards/format_reward/std": 0.17942690551280976, + "rewards/mcq_accuracy_reward/mean": 0.7057291746139527, + "rewards/mcq_accuracy_reward/std": 0.45257904529571535, + "rewards/tag_count_reward/mean": 0.984765636920929, + "rewards/tag_count_reward/std": 0.07510874569416046, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.007291666666666652, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5704.8, + "completions/mean_length": 965.5760498046875, + "completions/mean_terminated_length": 912.4212036132812, + "completions/min_length": 214.2, + "completions/min_terminated_length": 214.2, + "epoch": 0.349190200357723, + "grad_norm": 0.26950688780113247, + "kl": 0.0924163818359375, + "learning_rate": 7.595496889965293e-07, + "loss": 0.0957, + "num_tokens": 570272655.0, + "reward": 1.1714844226837158, + "reward_std": 0.18243164122104644, + "rewards/format_reward/mean": 0.9666666626930237, + "rewards/format_reward/std": 0.17696910202503205, + "rewards/mcq_accuracy_reward/mean": 0.684375011920929, + "rewards/mcq_accuracy_reward/std": 0.4594986379146576, + "rewards/tag_count_reward/mean": 0.9817708373069763, + "rewards/tag_count_reward/std": 0.0805608570575714, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0052083333333333036, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6230.2, + "completions/mean_length": 922.9661499023438, + "completions/mean_terminated_length": 884.9851440429687, + "completions/min_length": 174.4, + "completions/min_terminated_length": 174.4, + "epoch": 0.35050294547184985, + "grad_norm": 0.2709872892255123, + "kl": 0.0945526123046875, + "learning_rate": 7.476188641646305e-07, + "loss": 0.083, + "num_tokens": 572340830.0, + "reward": 1.221126341819763, + "reward_std": 0.133340547978878, + "rewards/format_reward/mean": 0.9713541507720947, + "rewards/format_reward/std": 0.16574167907238008, + "rewards/mcq_accuracy_reward/mean": 0.73125, + "rewards/mcq_accuracy_reward/std": 0.4414697825908661, + "rewards/tag_count_reward/mean": 0.9881510376930237, + "rewards/tag_count_reward/std": 0.062182580679655076, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0046875, + "completions/max_length": 7778.4, + "completions/max_terminated_length": 4806.8, + "completions/mean_length": 942.7468994140625, + "completions/mean_terminated_length": 908.5369262695312, + "completions/min_length": 206.2, + "completions/min_terminated_length": 206.2, + "epoch": 0.3518156905859766, + "grad_norm": 0.2895581919972125, + "kl": 0.08924560546875, + "learning_rate": 7.357513038040395e-07, + "loss": 0.092, + "num_tokens": 574448936.0, + "reward": 1.1744466543197631, + "reward_std": 0.17989281415939332, + "rewards/format_reward/mean": 0.9651041746139526, + "rewards/format_reward/std": 0.18109793365001678, + "rewards/mcq_accuracy_reward/mean": 0.6859375, + "rewards/mcq_accuracy_reward/std": 0.4625144422054291, + "rewards/tag_count_reward/mean": 0.9889322876930237, + "rewards/tag_count_reward/std": 0.0639925792813301, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005208333333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4583.6, + "completions/mean_length": 925.4781494140625, + "completions/mean_terminated_length": 887.4354248046875, + "completions/min_length": 233.8, + "completions/min_terminated_length": 233.8, + "epoch": 0.3531284357001034, + "grad_norm": 0.24107527060824302, + "kl": 0.0877349853515625, + "learning_rate": 7.239480058065125e-07, + "loss": 0.0564, + "num_tokens": 576524182.0, + "reward": 1.2178711652755738, + "reward_std": 0.15503210127353667, + "rewards/format_reward/mean": 0.975, + "rewards/format_reward/std": 0.15321291089057923, + "rewards/mcq_accuracy_reward/mean": 0.7260416626930237, + "rewards/mcq_accuracy_reward/std": 0.43772119879722593, + "rewards/tag_count_reward/mean": 0.9923177123069763, + "rewards/tag_count_reward/std": 0.05627003088593483, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.006770833333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4155.6, + "completions/mean_length": 925.711474609375, + "completions/mean_terminated_length": 876.1068481445312, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.35444118081423015, + "grad_norm": 0.27757275124193864, + "kl": 0.0880615234375, + "learning_rate": 7.122099626602601e-07, + "loss": 0.0707, + "num_tokens": 578598444.0, + "reward": 1.1669270992279053, + "reward_std": 0.157212695479393, + "rewards/format_reward/mean": 0.9744791626930237, + "rewards/format_reward/std": 0.15557833313941954, + "rewards/mcq_accuracy_reward/mean": 0.6755208253860474, + "rewards/mcq_accuracy_reward/std": 0.46677578091621397, + "rewards/tag_count_reward/mean": 0.9911458373069764, + "rewards/tag_count_reward/std": 0.05677817389369011, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6506.2, + "completions/mean_length": 924.9140869140625, + "completions/mean_terminated_length": 856.0612426757813, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.3557539259283569, + "grad_norm": 0.23654400342477042, + "kl": 0.093511962890625, + "learning_rate": 7.005381613664948e-07, + "loss": 0.1175, + "num_tokens": 580677063.0, + "reward": 1.2289713859558105, + "reward_std": 0.1496271163225174, + "rewards/format_reward/mean": 0.9671875, + "rewards/format_reward/std": 0.1767730474472046, + "rewards/mcq_accuracy_reward/mean": 0.7401041626930237, + "rewards/mcq_accuracy_reward/std": 0.4370701968669891, + "rewards/tag_count_reward/mean": 0.9882812380790711, + "rewards/tag_count_reward/std": 0.07008460760116578, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004166666666666652, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5384.6, + "completions/mean_length": 937.1140869140625, + "completions/mean_terminated_length": 906.7700439453125, + "completions/min_length": 225.6, + "completions/min_terminated_length": 225.6, + "epoch": 0.35706667104248374, + "grad_norm": 0.2723744526227269, + "kl": 0.0885101318359375, + "learning_rate": 6.889335833564404e-07, + "loss": 0.0591, + "num_tokens": 582776826.0, + "reward": 1.2099609613418578, + "reward_std": 0.15142365097999572, + "rewards/format_reward/mean": 0.9776041626930236, + "rewards/format_reward/std": 0.1459692895412445, + "rewards/mcq_accuracy_reward/mean": 0.7171875, + "rewards/mcq_accuracy_reward/std": 0.4506485879421234, + "rewards/tag_count_reward/mean": 0.9934895873069763, + "rewards/tag_count_reward/std": 0.04879186972975731, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.010416666666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4690.2, + "completions/mean_length": 1007.6791870117188, + "completions/mean_terminated_length": 932.0317016601563, + "completions/min_length": 231.2, + "completions/min_terminated_length": 231.2, + "epoch": 0.3583794161566105, + "grad_norm": 0.24727146312537124, + "kl": 0.0836212158203125, + "learning_rate": 6.773972044088041e-07, + "loss": 0.0895, + "num_tokens": 585009226.0, + "reward": 1.1719401359558106, + "reward_std": 0.1597072571516037, + "rewards/format_reward/mean": 0.9697916626930236, + "rewards/format_reward/std": 0.16928132772445678, + "rewards/mcq_accuracy_reward/mean": 0.6822916626930237, + "rewards/mcq_accuracy_reward/std": 0.4641745090484619, + "rewards/tag_count_reward/mean": 0.9888020753860474, + "rewards/tag_count_reward/std": 0.06786207780241967, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7661.2, + "completions/max_terminated_length": 6576.4, + "completions/mean_length": 987.9838745117188, + "completions/mean_terminated_length": 931.4978393554687, + "completions/min_length": 206.8, + "completions/min_terminated_length": 206.8, + "epoch": 0.3596921612707373, + "grad_norm": 0.27251153554509744, + "kl": 0.084527587890625, + "learning_rate": 6.659299945677297e-07, + "loss": 0.0972, + "num_tokens": 587206859.0, + "reward": 1.201595091819763, + "reward_std": 0.18883035480976104, + "rewards/format_reward/mean": 0.9682291626930237, + "rewards/format_reward/std": 0.17330897152423858, + "rewards/mcq_accuracy_reward/mean": 0.7119791626930236, + "rewards/mcq_accuracy_reward/std": 0.448106461763382, + "rewards/tag_count_reward/mean": 0.9902343869209289, + "rewards/tag_count_reward/std": 0.06168298721313477, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6724.2, + "completions/mean_length": 1082.7198486328125, + "completions/mean_terminated_length": 977.2488159179687, + "completions/min_length": 232.6, + "completions/min_terminated_length": 232.6, + "epoch": 0.36100490638486404, + "grad_norm": 0.24263403242843537, + "kl": 0.080609130859375, + "learning_rate": 6.545329180612315e-07, + "loss": 0.1264, + "num_tokens": 589585849.0, + "reward": 1.1980469226837158, + "reward_std": 0.1734657406806946, + "rewards/format_reward/mean": 0.9666666626930237, + "rewards/format_reward/std": 0.17868994772434235, + "rewards/mcq_accuracy_reward/mean": 0.709375, + "rewards/mcq_accuracy_reward/std": 0.45397242307662966, + "rewards/tag_count_reward/mean": 0.9880208253860474, + "rewards/tag_count_reward/std": 0.06743852570652961, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012500000000000022, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5185.2, + "completions/mean_length": 1031.4052368164062, + "completions/mean_terminated_length": 941.0228271484375, + "completions/min_length": 230.6, + "completions/min_terminated_length": 230.6, + "epoch": 0.3623176514989908, + "grad_norm": 0.25968476567234894, + "kl": 0.082025146484375, + "learning_rate": 6.432069332201143e-07, + "loss": 0.0982, + "num_tokens": 591865003.0, + "reward": 1.1749674797058105, + "reward_std": 0.1887389212846756, + "rewards/format_reward/mean": 0.9677083373069764, + "rewards/format_reward/std": 0.17375943660736085, + "rewards/mcq_accuracy_reward/mean": 0.6859374880790711, + "rewards/mcq_accuracy_reward/std": 0.4610561430454254, + "rewards/tag_count_reward/mean": 0.9884114623069763, + "rewards/tag_count_reward/std": 0.06857026070356369, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6164.8, + "completions/mean_length": 1019.397412109375, + "completions/mean_terminated_length": 951.837109375, + "completions/min_length": 197.8, + "completions/min_terminated_length": 197.8, + "epoch": 0.3636303966131176, + "grad_norm": 0.25890733924962406, + "kl": 0.0840057373046875, + "learning_rate": 6.319529923973923e-07, + "loss": 0.0888, + "num_tokens": 594123974.0, + "reward": 1.1779948472976685, + "reward_std": 0.16528845876455306, + "rewards/format_reward/mean": 0.9708333253860474, + "rewards/format_reward/std": 0.168318572640419, + "rewards/mcq_accuracy_reward/mean": 0.6880208373069763, + "rewards/mcq_accuracy_reward/std": 0.4467033386230469, + "rewards/tag_count_reward/mean": 0.9890625, + "rewards/tag_count_reward/std": 0.06641164571046829, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 7789.2, + "completions/max_terminated_length": 4618.0, + "completions/mean_length": 970.4734619140625, + "completions/mean_terminated_length": 913.4739624023438, + "completions/min_length": 202.6, + "completions/min_terminated_length": 202.6, + "epoch": 0.3649431417272444, + "grad_norm": 0.2632795572456831, + "kl": 0.0863128662109375, + "learning_rate": 6.207720418882119e-07, + "loss": 0.0813, + "num_tokens": 596285131.0, + "reward": 1.1853841543197632, + "reward_std": 0.17734076082706451, + "rewards/format_reward/mean": 0.9791666746139527, + "rewards/format_reward/std": 0.13667924106121063, + "rewards/mcq_accuracy_reward/mean": 0.6932291746139526, + "rewards/mcq_accuracy_reward/std": 0.45342081785202026, + "rewards/tag_count_reward/mean": 0.989453136920929, + "rewards/tag_count_reward/std": 0.0635631449520588, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999978, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5975.2, + "completions/mean_length": 940.6333740234375, + "completions/mean_terminated_length": 848.9737548828125, + "completions/min_length": 210.8, + "completions/min_terminated_length": 210.8, + "epoch": 0.36625588684137117, + "grad_norm": 0.2566744532329966, + "kl": 0.0884124755859375, + "learning_rate": 6.096650218502785e-07, + "loss": 0.121, + "num_tokens": 598386819.0, + "reward": 1.2008464097976685, + "reward_std": 0.1603745013475418, + "rewards/format_reward/mean": 0.9723958253860474, + "rewards/format_reward/std": 0.16392945051193236, + "rewards/mcq_accuracy_reward/mean": 0.7104166626930237, + "rewards/mcq_accuracy_reward/std": 0.4536996841430664, + "rewards/tag_count_reward/mean": 0.9893229126930236, + "rewards/tag_count_reward/std": 0.06582736894488335, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009895833333333348, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6206.4, + "completions/mean_length": 963.5880493164062, + "completions/mean_terminated_length": 891.4520141601563, + "completions/min_length": 221.8, + "completions/min_terminated_length": 221.8, + "epoch": 0.36756863195549794, + "grad_norm": 0.27284405303552195, + "kl": 0.0887359619140625, + "learning_rate": 5.986328662248048e-07, + "loss": 0.0977, + "num_tokens": 600538004.0, + "reward": 1.1674153804779053, + "reward_std": 0.15957081615924834, + "rewards/format_reward/mean": 0.9697916746139527, + "rewards/format_reward/std": 0.16952840685844422, + "rewards/mcq_accuracy_reward/mean": 0.6776041626930237, + "rewards/mcq_accuracy_reward/std": 0.4673686742782593, + "rewards/tag_count_reward/mean": 0.989453125, + "rewards/tag_count_reward/std": 0.064417564868927, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.006770833333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4781.4, + "completions/mean_length": 883.656787109375, + "completions/mean_terminated_length": 833.6979248046875, + "completions/min_length": 203.8, + "completions/min_terminated_length": 203.8, + "epoch": 0.3688813770696247, + "grad_norm": 0.27209923115253615, + "kl": 0.0899261474609375, + "learning_rate": 5.876765026579794e-07, + "loss": 0.0863, + "num_tokens": 602528193.0, + "reward": 1.2273112297058106, + "reward_std": 0.1523064225912094, + "rewards/format_reward/mean": 0.971875, + "rewards/format_reward/std": 0.16340906620025636, + "rewards/mcq_accuracy_reward/mean": 0.7364583134651184, + "rewards/mcq_accuracy_reward/std": 0.43888933062553404, + "rewards/tag_count_reward/mean": 0.9915364623069763, + "rewards/tag_count_reward/std": 0.05906555280089378, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0140625, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5987.0, + "completions/mean_length": 989.613037109375, + "completions/mean_terminated_length": 886.79697265625, + "completions/min_length": 217.6, + "completions/min_terminated_length": 217.6, + "epoch": 0.3701941221837515, + "grad_norm": 0.2365495877880457, + "kl": 0.0850433349609375, + "learning_rate": 5.767968524229656e-07, + "loss": 0.1094, + "num_tokens": 604726570.0, + "reward": 1.1916992664337158, + "reward_std": 0.18976544141769408, + "rewards/format_reward/mean": 0.9703125119209289, + "rewards/format_reward/std": 0.16923010647296904, + "rewards/mcq_accuracy_reward/mean": 0.7020833492279053, + "rewards/mcq_accuracy_reward/std": 0.4537588059902191, + "rewards/tag_count_reward/mean": 0.9881510257720947, + "rewards/tag_count_reward/std": 0.0718590758740902, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333304, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4823.2, + "completions/mean_length": 955.6828369140625, + "completions/mean_terminated_length": 848.7431884765625, + "completions/min_length": 200.8, + "completions/min_terminated_length": 200.8, + "epoch": 0.3715068672978783, + "grad_norm": 0.27768646408232467, + "kl": 0.0902099609375, + "learning_rate": 5.659948303424336e-07, + "loss": 0.0907, + "num_tokens": 606857001.0, + "reward": 1.221386766433716, + "reward_std": 0.16610985100269318, + "rewards/format_reward/mean": 0.9703125, + "rewards/format_reward/std": 0.16957048773765565, + "rewards/mcq_accuracy_reward/mean": 0.7317708492279053, + "rewards/mcq_accuracy_reward/std": 0.43518059253692626, + "rewards/tag_count_reward/mean": 0.9881510257720947, + "rewards/tag_count_reward/std": 0.07345265299081802, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6625.2, + "completions/mean_length": 1047.3901245117188, + "completions/mean_terminated_length": 911.2766235351562, + "completions/min_length": 206.8, + "completions/min_terminated_length": 206.8, + "epoch": 0.37281961241200506, + "grad_norm": 0.23021722941118053, + "kl": 0.085260009765625, + "learning_rate": 5.552713447116387e-07, + "loss": 0.1354, + "num_tokens": 609173862.0, + "reward": 1.1575846910476684, + "reward_std": 0.18594156801700593, + "rewards/format_reward/mean": 0.9583333373069763, + "rewards/format_reward/std": 0.19625433683395385, + "rewards/mcq_accuracy_reward/mean": 0.671875, + "rewards/mcq_accuracy_reward/std": 0.46608195304870603, + "rewards/tag_count_reward/mean": 0.9845052003860474, + "rewards/tag_count_reward/std": 0.08369605988264084, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5443.2, + "completions/mean_length": 976.6734619140625, + "completions/mean_terminated_length": 885.3800048828125, + "completions/min_length": 214.6, + "completions/min_terminated_length": 214.6, + "epoch": 0.37413235752613183, + "grad_norm": 0.20449587396075822, + "kl": 0.0916473388671875, + "learning_rate": 5.446272972220471e-07, + "loss": 0.1121, + "num_tokens": 611346147.0, + "reward": 1.155468797683716, + "reward_std": 0.1579383999109268, + "rewards/format_reward/mean": 0.9713541746139527, + "rewards/format_reward/std": 0.16580126881599427, + "rewards/mcq_accuracy_reward/mean": 0.665625, + "rewards/mcq_accuracy_reward/std": 0.4697365045547485, + "rewards/tag_count_reward/mean": 0.9880208253860474, + "rewards/tag_count_reward/std": 0.06857141479849815, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017187499999999977, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5585.2, + "completions/mean_length": 1027.6948486328124, + "completions/mean_terminated_length": 902.1425048828125, + "completions/min_length": 224.6, + "completions/min_terminated_length": 224.6, + "epoch": 0.3754451026402586, + "grad_norm": 0.243436771810509, + "kl": 0.0887359619140625, + "learning_rate": 5.340635828855146e-07, + "loss": 0.1202, + "num_tokens": 613619081.0, + "reward": 1.187337303161621, + "reward_std": 0.18232888281345366, + "rewards/format_reward/mean": 0.9619791626930236, + "rewards/format_reward/std": 0.19042017161846161, + "rewards/mcq_accuracy_reward/mean": 0.7005208253860473, + "rewards/mcq_accuracy_reward/std": 0.45718598961830137, + "rewards/tag_count_reward/mean": 0.9852864742279053, + "rewards/tag_count_reward/std": 0.07882689535617829, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01614583333333337, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4880.8, + "completions/mean_length": 1022.1651245117188, + "completions/mean_terminated_length": 904.30068359375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.37675784775438537, + "grad_norm": 0.24371500764602952, + "kl": 0.0864532470703125, + "learning_rate": 5.235810899590304e-07, + "loss": 0.1309, + "num_tokens": 615875966.0, + "reward": 1.1290039539337158, + "reward_std": 0.1866077810525894, + "rewards/format_reward/mean": 0.9645833373069763, + "rewards/format_reward/std": 0.18463312089443207, + "rewards/mcq_accuracy_reward/mean": 0.6411458253860474, + "rewards/mcq_accuracy_reward/std": 0.4766407489776611, + "rewards/tag_count_reward/mean": 0.9868489623069763, + "rewards/tag_count_reward/std": 0.0738982081413269, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6876.4, + "completions/mean_length": 1025.8192993164062, + "completions/mean_terminated_length": 919.61845703125, + "completions/min_length": 233.2, + "completions/min_terminated_length": 233.2, + "epoch": 0.3780705928685122, + "grad_norm": 0.26844294220804454, + "kl": 0.084637451171875, + "learning_rate": 5.131806998700279e-07, + "loss": 0.1333, + "num_tokens": 618144163.0, + "reward": 1.194140648841858, + "reward_std": 0.1884692758321762, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.19351592659950256, + "rewards/mcq_accuracy_reward/mean": 0.7072916746139526, + "rewards/mcq_accuracy_reward/std": 0.45319082736968996, + "rewards/tag_count_reward/mean": 0.9864583373069763, + "rewards/tag_count_reward/std": 0.07387047111988068, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016145833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6859.2, + "completions/mean_length": 1030.95732421875, + "completions/mean_terminated_length": 913.4848876953125, + "completions/min_length": 230.2, + "completions/min_terminated_length": 230.2, + "epoch": 0.37938333798263896, + "grad_norm": 0.2359894697786925, + "kl": 0.08385009765625, + "learning_rate": 5.028632871422673e-07, + "loss": 0.1225, + "num_tokens": 620420545.0, + "reward": 1.2027995109558105, + "reward_std": 0.17637802064418792, + "rewards/format_reward/mean": 0.9625, + "rewards/format_reward/std": 0.1882784515619278, + "rewards/mcq_accuracy_reward/mean": 0.715625011920929, + "rewards/mcq_accuracy_reward/std": 0.4508500277996063, + "rewards/tag_count_reward/mean": 0.9861979126930237, + "rewards/tag_count_reward/std": 0.07888660803437234, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016145833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5485.6, + "completions/mean_length": 1003.5338623046875, + "completions/mean_terminated_length": 885.4496826171875, + "completions/min_length": 203.2, + "completions/min_terminated_length": 203.2, + "epoch": 0.3806960830967657, + "grad_norm": 0.2977397413420194, + "kl": 0.08756103515625, + "learning_rate": 4.926297193223021e-07, + "loss": 0.1155, + "num_tokens": 622643138.0, + "reward": 1.1806641101837159, + "reward_std": 0.18380341827869415, + "rewards/format_reward/mean": 0.9630208373069763, + "rewards/format_reward/std": 0.18843259513378144, + "rewards/mcq_accuracy_reward/mean": 0.6932291626930237, + "rewards/mcq_accuracy_reward/std": 0.4607417047023773, + "rewards/tag_count_reward/mean": 0.98671875, + "rewards/tag_count_reward/std": 0.07472375854849815, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5823.8, + "completions/mean_length": 1042.9161865234375, + "completions/mean_terminated_length": 917.9686401367187, + "completions/min_length": 210.2, + "completions/min_terminated_length": 210.2, + "epoch": 0.3820088282108925, + "grad_norm": 0.3012876124023039, + "kl": 0.0844635009765625, + "learning_rate": 4.824808569065316e-07, + "loss": 0.1437, + "num_tokens": 624945185.0, + "reward": 1.1821614980697632, + "reward_std": 0.1772130995988846, + "rewards/format_reward/mean": 0.9520833492279053, + "rewards/format_reward/std": 0.21113504767417907, + "rewards/mcq_accuracy_reward/mean": 0.6979166626930237, + "rewards/mcq_accuracy_reward/std": 0.45754783153533934, + "rewards/tag_count_reward/mean": 0.9848958373069763, + "rewards/tag_count_reward/std": 0.07901396602392197, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5138.4, + "completions/mean_length": 1021.0765869140625, + "completions/mean_terminated_length": 914.6953002929688, + "completions/min_length": 203.8, + "completions/min_terminated_length": 203.8, + "epoch": 0.38332157332501926, + "grad_norm": 0.26113628893311486, + "kl": 0.085809326171875, + "learning_rate": 4.724175532688434e-07, + "loss": 0.1011, + "num_tokens": 627198748.0, + "reward": 1.1461589097976685, + "reward_std": 0.15672878324985504, + "rewards/format_reward/mean": 0.9666666746139526, + "rewards/format_reward/std": 0.1784056931734085, + "rewards/mcq_accuracy_reward/mean": 0.6572916626930236, + "rewards/mcq_accuracy_reward/std": 0.4743048846721649, + "rewards/tag_count_reward/mean": 0.9888020873069763, + "rewards/tag_count_reward/std": 0.06765694916248322, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016145833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5611.2, + "completions/mean_length": 1058.6927490234375, + "completions/mean_terminated_length": 941.4895263671875, + "completions/min_length": 230.4, + "completions/min_terminated_length": 230.4, + "epoch": 0.3846343184391461, + "grad_norm": 0.32685407352098594, + "kl": 0.085101318359375, + "learning_rate": 4.624406545888578e-07, + "loss": 0.1417, + "num_tokens": 629531558.0, + "reward": 1.1527344226837157, + "reward_std": 0.18683286905288696, + "rewards/format_reward/mean": 0.9635416746139527, + "rewards/format_reward/std": 0.18649468719959258, + "rewards/mcq_accuracy_reward/mean": 0.6651041626930236, + "rewards/mcq_accuracy_reward/std": 0.4680155038833618, + "rewards/tag_count_reward/mean": 0.9869791626930237, + "rewards/tag_count_reward/std": 0.07304810658097267, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016666666666666653, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6361.4, + "completions/mean_length": 1003.2948486328125, + "completions/mean_terminated_length": 881.2682983398438, + "completions/min_length": 207.4, + "completions/min_terminated_length": 207.4, + "epoch": 0.38594706355327285, + "grad_norm": 0.20565238492244858, + "kl": 0.0844390869140625, + "learning_rate": 4.5255099978077796e-07, + "loss": 0.1169, + "num_tokens": 631756548.0, + "reward": 1.1529622554779053, + "reward_std": 0.16282138526439666, + "rewards/format_reward/mean": 0.9666666746139526, + "rewards/format_reward/std": 0.17745698094367982, + "rewards/mcq_accuracy_reward/mean": 0.6645833373069763, + "rewards/mcq_accuracy_reward/std": 0.47092134952545167, + "rewards/tag_count_reward/mean": 0.9868489503860474, + "rewards/tag_count_reward/std": 0.07527425959706306, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5859.4, + "completions/mean_length": 987.045849609375, + "completions/mean_terminated_length": 880.5638916015625, + "completions/min_length": 225.2, + "completions/min_terminated_length": 225.2, + "epoch": 0.3872598086673996, + "grad_norm": 0.2500561543284479, + "kl": 0.08560791015625, + "learning_rate": 4.4274942042284613e-07, + "loss": 0.1156, + "num_tokens": 633951692.0, + "reward": 1.193554735183716, + "reward_std": 0.16549597978591918, + "rewards/format_reward/mean": 0.9671875, + "rewards/format_reward/std": 0.17667604386806487, + "rewards/mcq_accuracy_reward/mean": 0.7046875, + "rewards/mcq_accuracy_reward/std": 0.454887855052948, + "rewards/tag_count_reward/mean": 0.98828125, + "rewards/tag_count_reward/std": 0.06945615559816361, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012500000000000022, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6620.6, + "completions/mean_length": 1000.8067993164062, + "completions/mean_terminated_length": 909.700048828125, + "completions/min_length": 182.2, + "completions/min_terminated_length": 182.2, + "epoch": 0.3885725537815264, + "grad_norm": 0.2778360674839553, + "kl": 0.085479736328125, + "learning_rate": 4.3303674068742155e-07, + "loss": 0.1052, + "num_tokens": 636175833.0, + "reward": 1.2297200679779052, + "reward_std": 0.15796131789684295, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.1737212747335434, + "rewards/mcq_accuracy_reward/mean": 0.7401041626930237, + "rewards/mcq_accuracy_reward/std": 0.43687437772750853, + "rewards/tag_count_reward/mean": 0.9897135376930237, + "rewards/tag_count_reward/std": 0.06589329987764359, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016666666666666673, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4967.8, + "completions/mean_length": 1017.1005493164063, + "completions/mean_terminated_length": 895.3921020507812, + "completions/min_length": 218.4, + "completions/min_terminated_length": 218.4, + "epoch": 0.38988529889565315, + "grad_norm": 0.2518297010491606, + "kl": 0.0847991943359375, + "learning_rate": 4.234137772716805e-07, + "loss": 0.1309, + "num_tokens": 638420218.0, + "reward": 1.17958984375, + "reward_std": 0.1842581272125244, + "rewards/format_reward/mean": 0.9598958373069764, + "rewards/format_reward/std": 0.19621492624282838, + "rewards/mcq_accuracy_reward/mean": 0.6932291626930237, + "rewards/mcq_accuracy_reward/std": 0.4594866275787354, + "rewards/tag_count_reward/mean": 0.985546875, + "rewards/tag_count_reward/std": 0.07874632328748703, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5799.6, + "completions/mean_length": 1024.9453491210938, + "completions/mean_terminated_length": 895.6031005859375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.39119804400978, + "grad_norm": 0.34262475861466224, + "kl": 0.0867218017578125, + "learning_rate": 4.1388133932894115e-07, + "loss": 0.1161, + "num_tokens": 640688137.0, + "reward": 1.1960612297058106, + "reward_std": 0.16250043511390685, + "rewards/format_reward/mean": 0.9661458373069763, + "rewards/format_reward/std": 0.1797842115163803, + "rewards/mcq_accuracy_reward/mean": 0.7078125, + "rewards/mcq_accuracy_reward/std": 0.45415318608283994, + "rewards/tag_count_reward/mean": 0.9868489742279053, + "rewards/tag_count_reward/std": 0.07558658868074417, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.022916666666666696, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6469.2, + "completions/mean_length": 1024.105224609375, + "completions/mean_terminated_length": 856.6689086914063, + "completions/min_length": 184.6, + "completions/min_terminated_length": 184.6, + "epoch": 0.39251078912390674, + "grad_norm": 0.23388786013992377, + "kl": 0.08525390625, + "learning_rate": 4.0444022840062675e-07, + "loss": 0.1386, + "num_tokens": 642954979.0, + "reward": 1.196484422683716, + "reward_std": 0.1667303830385208, + "rewards/format_reward/mean": 0.9588541746139526, + "rewards/format_reward/std": 0.19664739072322845, + "rewards/mcq_accuracy_reward/mean": 0.7109375, + "rewards/mcq_accuracy_reward/std": 0.4497135758399963, + "rewards/tag_count_reward/mean": 0.9833333253860473, + "rewards/tag_count_reward/std": 0.08694315999746323, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4531.6, + "completions/mean_length": 994.2114868164062, + "completions/mean_terminated_length": 868.069921875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.3938235342380335, + "grad_norm": 0.254515190213171, + "kl": 0.083502197265625, + "learning_rate": 3.950912383488687e-07, + "loss": 0.1201, + "num_tokens": 645169697.0, + "reward": 1.1894531726837159, + "reward_std": 0.16043694466352462, + "rewards/format_reward/mean": 0.9677083492279053, + "rewards/format_reward/std": 0.17475406527519227, + "rewards/mcq_accuracy_reward/mean": 0.7005208134651184, + "rewards/mcq_accuracy_reward/std": 0.45792239904403687, + "rewards/tag_count_reward/mean": 0.9880208134651184, + "rewards/tag_count_reward/std": 0.07076983675360679, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.022395833333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5445.0, + "completions/mean_length": 1006.06201171875, + "completions/mean_terminated_length": 841.7529296875, + "completions/min_length": 208.4, + "completions/min_terminated_length": 208.4, + "epoch": 0.3951362793521603, + "grad_norm": 0.22818063853405307, + "kl": 0.0824188232421875, + "learning_rate": 3.8583515528975125e-07, + "loss": 0.1577, + "num_tokens": 647402496.0, + "reward": 1.1706706047058106, + "reward_std": 0.18033518493175507, + "rewards/format_reward/mean": 0.9614583373069763, + "rewards/format_reward/std": 0.19214192926883697, + "rewards/mcq_accuracy_reward/mean": 0.684374988079071, + "rewards/mcq_accuracy_reward/std": 0.4614598214626312, + "rewards/tag_count_reward/mean": 0.9837239503860473, + "rewards/tag_count_reward/std": 0.08594592958688736, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.027604166666666673, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5152.6, + "completions/mean_length": 1064.10576171875, + "completions/mean_terminated_length": 861.8424926757813, + "completions/min_length": 202.2, + "completions/min_terminated_length": 202.2, + "epoch": 0.39644902446628705, + "grad_norm": 0.18970799738987676, + "kl": 0.0820281982421875, + "learning_rate": 3.766727575272125e-07, + "loss": 0.1466, + "num_tokens": 649745291.0, + "reward": 1.1701823234558106, + "reward_std": 0.2083835333585739, + "rewards/format_reward/mean": 0.9604166746139526, + "rewards/format_reward/std": 0.19371051490306854, + "rewards/mcq_accuracy_reward/mean": 0.684375, + "rewards/mcq_accuracy_reward/std": 0.4649881660938263, + "rewards/tag_count_reward/mean": 0.9828124880790711, + "rewards/tag_count_reward/std": 0.08855385333299637, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5355.6, + "completions/mean_length": 1060.7026611328124, + "completions/mean_terminated_length": 935.8194580078125, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "epoch": 0.3977617695804138, + "grad_norm": 0.2551015641180584, + "kl": 0.0816192626953125, + "learning_rate": 3.6760481548760085e-07, + "loss": 0.1037, + "num_tokens": 652079312.0, + "reward": 1.1490885734558105, + "reward_std": 0.19925102293491365, + "rewards/format_reward/mean": 0.9671875, + "rewards/format_reward/std": 0.17774123549461365, + "rewards/mcq_accuracy_reward/mean": 0.660937488079071, + "rewards/mcq_accuracy_reward/std": 0.4675086855888367, + "rewards/tag_count_reward/mean": 0.9854166626930236, + "rewards/tag_count_reward/std": 0.08080853968858719, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01874999999999998, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6059.2, + "completions/mean_length": 1073.5588745117188, + "completions/mean_terminated_length": 937.402490234375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.39907451469454064, + "grad_norm": 0.23725198272532902, + "kl": 0.08021240234375, + "learning_rate": 3.5863209165489045e-07, + "loss": 0.1239, + "num_tokens": 654442689.0, + "reward": 1.1897786617279054, + "reward_std": 0.17656248509883882, + "rewards/format_reward/mean": 0.965625, + "rewards/format_reward/std": 0.18011027872562407, + "rewards/mcq_accuracy_reward/mean": 0.7015625, + "rewards/mcq_accuracy_reward/std": 0.45294424891471863, + "rewards/tag_count_reward/mean": 0.9872395753860473, + "rewards/tag_count_reward/std": 0.0741159550845623, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.020833333333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6858.8, + "completions/mean_length": 1121.1531494140625, + "completions/mean_terminated_length": 970.5452270507812, + "completions/min_length": 234.6, + "completions/min_terminated_length": 234.6, + "epoch": 0.4003872598086674, + "grad_norm": 0.25515935372763504, + "kl": 0.0774322509765625, + "learning_rate": 3.497553405065689e-07, + "loss": 0.1419, + "num_tokens": 656890999.0, + "reward": 1.1923828601837159, + "reward_std": 0.19383802115917206, + "rewards/format_reward/mean": 0.9583333253860473, + "rewards/format_reward/std": 0.19991600811481475, + "rewards/mcq_accuracy_reward/mean": 0.7067708373069763, + "rewards/mcq_accuracy_reward/std": 0.4554051637649536, + "rewards/tag_count_reward/mean": 0.9841145753860474, + "rewards/tag_count_reward/std": 0.08523715436458587, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.022916666666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6754.4, + "completions/mean_length": 1128.309423828125, + "completions/mean_terminated_length": 962.5262451171875, + "completions/min_length": 192.4, + "completions/min_terminated_length": 192.4, + "epoch": 0.4017000049227942, + "grad_norm": 0.24874036274269795, + "kl": 0.0803436279296875, + "learning_rate": 3.409753084501981e-07, + "loss": 0.1626, + "num_tokens": 659353729.0, + "reward": 1.182454490661621, + "reward_std": 0.17859623432159424, + "rewards/format_reward/mean": 0.95625, + "rewards/format_reward/std": 0.20463235378265382, + "rewards/mcq_accuracy_reward/mean": 0.6973958253860474, + "rewards/mcq_accuracy_reward/std": 0.4589643716812134, + "rewards/tag_count_reward/mean": 0.9839843869209289, + "rewards/tag_count_reward/std": 0.08507048487663268, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 7133.6, + "completions/mean_length": 1026.5026245117188, + "completions/mean_terminated_length": 920.8350463867188, + "completions/min_length": 207.6, + "completions/min_terminated_length": 207.6, + "epoch": 0.40301275003692094, + "grad_norm": 0.2552543729047736, + "kl": 0.0824371337890625, + "learning_rate": 3.322927337606487e-07, + "loss": 0.1328, + "num_tokens": 661624750.0, + "reward": 1.231738305091858, + "reward_std": 0.17942396700382232, + "rewards/format_reward/mean": 0.965624988079071, + "rewards/format_reward/std": 0.181215038895607, + "rewards/mcq_accuracy_reward/mean": 0.74375, + "rewards/mcq_accuracy_reward/std": 0.42808024287223817, + "rewards/tag_count_reward/mean": 0.9863281369209289, + "rewards/tag_count_reward/std": 0.0778714470565319, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5646.4, + "completions/mean_length": 1042.723486328125, + "completions/mean_terminated_length": 929.1105346679688, + "completions/min_length": 235.4, + "completions/min_terminated_length": 235.4, + "epoch": 0.4043254951510477, + "grad_norm": 0.21870236245210212, + "kl": 0.08179931640625, + "learning_rate": 3.2370834651802324e-07, + "loss": 0.1157, + "num_tokens": 663924235.0, + "reward": 1.2114909172058106, + "reward_std": 0.17583871483802796, + "rewards/format_reward/mean": 0.9651041626930237, + "rewards/format_reward/std": 0.18167175948619843, + "rewards/mcq_accuracy_reward/mean": 0.723437511920929, + "rewards/mcq_accuracy_reward/std": 0.4445623874664307, + "rewards/tag_count_reward/mean": 0.987109375, + "rewards/tag_count_reward/std": 0.07320255935192108, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015104166666666651, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5473.8, + "completions/mean_length": 1038.6088745117188, + "completions/mean_terminated_length": 928.608642578125, + "completions/min_length": 211.8, + "completions/min_terminated_length": 211.8, + "epoch": 0.40563824026517453, + "grad_norm": 0.27563614784172696, + "kl": 0.0810516357421875, + "learning_rate": 3.1522286854626877e-07, + "loss": 0.1369, + "num_tokens": 666219524.0, + "reward": 1.1748047351837159, + "reward_std": 0.18643143773078918, + "rewards/format_reward/mean": 0.9625, + "rewards/format_reward/std": 0.18872076869010926, + "rewards/mcq_accuracy_reward/mean": 0.6880208373069763, + "rewards/mcq_accuracy_reward/std": 0.45874003171920774, + "rewards/tag_count_reward/mean": 0.9846354126930237, + "rewards/tag_count_reward/std": 0.07935881465673447, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01562500000000002, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5851.4, + "completions/mean_length": 1078.4375366210938, + "completions/mean_terminated_length": 965.8440551757812, + "completions/min_length": 234.2, + "completions/min_terminated_length": 234.2, + "epoch": 0.4069509853793013, + "grad_norm": 0.21595365752895646, + "kl": 0.081439208984375, + "learning_rate": 3.068370133524788e-07, + "loss": 0.1139, + "num_tokens": 668589716.0, + "reward": 1.1895833730697631, + "reward_std": 0.1823151409626007, + "rewards/format_reward/mean": 0.9635416626930237, + "rewards/format_reward/std": 0.18685607016086578, + "rewards/mcq_accuracy_reward/mean": 0.7015625, + "rewards/mcq_accuracy_reward/std": 0.4553744554519653, + "rewards/tag_count_reward/mean": 0.9885416746139526, + "rewards/tag_count_reward/std": 0.0697978377342224, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 8135.6, + "completions/max_terminated_length": 6530.2, + "completions/mean_length": 1013.4062622070312, + "completions/mean_terminated_length": 922.4423461914063, + "completions/min_length": 191.4, + "completions/min_terminated_length": 191.4, + "epoch": 0.40826373049342807, + "grad_norm": 0.24391044563687836, + "kl": 0.08460693359375, + "learning_rate": 2.985514860668988e-07, + "loss": 0.1332, + "num_tokens": 670832040.0, + "reward": 1.2451823234558106, + "reward_std": 0.1686953365802765, + "rewards/format_reward/mean": 0.9640625, + "rewards/format_reward/std": 0.18548206984996796, + "rewards/mcq_accuracy_reward/mean": 0.7572916626930237, + "rewards/mcq_accuracy_reward/std": 0.4232532560825348, + "rewards/tag_count_reward/mean": 0.9875, + "rewards/tag_count_reward/std": 0.07444161251187324, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01562500000000002, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5588.4, + "completions/mean_length": 1064.4948120117188, + "completions/mean_terminated_length": 951.467236328125, + "completions/min_length": 209.4, + "completions/min_terminated_length": 209.4, + "epoch": 0.40957647560755484, + "grad_norm": 0.23385212743046657, + "kl": 0.0805572509765625, + "learning_rate": 2.903669833836361e-07, + "loss": 0.1301, + "num_tokens": 673170902.0, + "reward": 1.203059935569763, + "reward_std": 0.18821612000465393, + "rewards/format_reward/mean": 0.9604166626930237, + "rewards/format_reward/std": 0.19410396814346315, + "rewards/mcq_accuracy_reward/mean": 0.7161458373069763, + "rewards/mcq_accuracy_reward/std": 0.44954833984375, + "rewards/tag_count_reward/mean": 0.9872395873069764, + "rewards/tag_count_reward/std": 0.07538967058062554, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.018229166666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6027.4, + "completions/mean_length": 1064.2614990234374, + "completions/mean_terminated_length": 931.6636840820313, + "completions/min_length": 193.2, + "completions/min_terminated_length": 193.2, + "epoch": 0.4108892207216816, + "grad_norm": 0.2424681191719265, + "kl": 0.08028564453125, + "learning_rate": 2.822841935020757e-07, + "loss": 0.1346, + "num_tokens": 675514956.0, + "reward": 1.1413411855697633, + "reward_std": 0.16588305830955505, + "rewards/format_reward/mean": 0.964062511920929, + "rewards/format_reward/std": 0.18512460589408875, + "rewards/mcq_accuracy_reward/mean": 0.6541666626930237, + "rewards/mcq_accuracy_reward/std": 0.473183274269104, + "rewards/tag_count_reward/mean": 0.9846354126930237, + "rewards/tag_count_reward/std": 0.08285345733165742, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5986.2, + "completions/mean_length": 1065.0375366210938, + "completions/mean_terminated_length": 936.4407592773438, + "completions/min_length": 233.2, + "completions/min_terminated_length": 233.2, + "epoch": 0.4122019658358084, + "grad_norm": 0.22456350000295425, + "kl": 0.0832122802734375, + "learning_rate": 2.7430379606901325e-07, + "loss": 0.1393, + "num_tokens": 677859252.0, + "reward": 1.1885417222976684, + "reward_std": 0.18469297885894775, + "rewards/format_reward/mean": 0.9588541626930237, + "rewards/format_reward/std": 0.19863855242729186, + "rewards/mcq_accuracy_reward/mean": 0.703125, + "rewards/mcq_accuracy_reward/std": 0.45365396738052366, + "rewards/tag_count_reward/mean": 0.9828124880790711, + "rewards/tag_count_reward/std": 0.08710355907678605, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.010937500000000022, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5637.8, + "completions/mean_length": 992.0224243164063, + "completions/mean_terminated_length": 912.45703125, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "epoch": 0.4135147109499352, + "grad_norm": 0.3056104940729159, + "kl": 0.0854034423828125, + "learning_rate": 2.664264621215083e-07, + "loss": 0.1212, + "num_tokens": 680062623.0, + "reward": 1.1785807609558105, + "reward_std": 0.1592057764530182, + "rewards/format_reward/mean": 0.9671875, + "rewards/format_reward/std": 0.1776927351951599, + "rewards/mcq_accuracy_reward/mean": 0.6890625, + "rewards/mcq_accuracy_reward/std": 0.4568677544593811, + "rewards/tag_count_reward/mean": 0.9908854246139527, + "rewards/tag_count_reward/std": 0.06356613710522652, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5828.4, + "completions/mean_length": 972.0750244140625, + "completions/mean_terminated_length": 857.68662109375, + "completions/min_length": 208.6, + "completions/min_terminated_length": 208.6, + "epoch": 0.41482745606406196, + "grad_norm": 0.21541851445481855, + "kl": 0.0854888916015625, + "learning_rate": 2.5865285403045727e-07, + "loss": 0.1428, + "num_tokens": 682225575.0, + "reward": 1.2102214097976685, + "reward_std": 0.17691716849803923, + "rewards/format_reward/mean": 0.9604166626930237, + "rewards/format_reward/std": 0.1902270197868347, + "rewards/mcq_accuracy_reward/mean": 0.7234375, + "rewards/mcq_accuracy_reward/std": 0.44213899970054626, + "rewards/tag_count_reward/mean": 0.9867187380790711, + "rewards/tag_count_reward/std": 0.07317372038960457, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016145833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6371.0, + "completions/mean_length": 1052.9526245117188, + "completions/mean_terminated_length": 935.5980590820312, + "completions/min_length": 222.4, + "completions/min_terminated_length": 222.4, + "epoch": 0.41614020117818873, + "grad_norm": 0.18288223074007262, + "kl": 0.08072509765625, + "learning_rate": 2.509836254448982e-07, + "loss": 0.139, + "num_tokens": 684542204.0, + "reward": 1.1663737297058105, + "reward_std": 0.1614986389875412, + "rewards/format_reward/mean": 0.9614583253860474, + "rewards/format_reward/std": 0.1926332026720047, + "rewards/mcq_accuracy_reward/mean": 0.6796875119209289, + "rewards/mcq_accuracy_reward/std": 0.46414305567741393, + "rewards/tag_count_reward/mean": 0.9852864623069764, + "rewards/tag_count_reward/std": 0.08049650341272355, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014062499999999978, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5271.8, + "completions/mean_length": 981.14638671875, + "completions/mean_terminated_length": 878.263134765625, + "completions/min_length": 225.6, + "completions/min_terminated_length": 225.6, + "epoch": 0.4174529462923155, + "grad_norm": 0.21389025835774958, + "kl": 0.0817779541015625, + "learning_rate": 2.434194212370496e-07, + "loss": 0.1182, + "num_tokens": 686728157.0, + "reward": 1.2107096672058106, + "reward_std": 0.1615220218896866, + "rewards/format_reward/mean": 0.9687499880790711, + "rewards/format_reward/std": 0.17234475612640382, + "rewards/mcq_accuracy_reward/mean": 0.7213541746139527, + "rewards/mcq_accuracy_reward/std": 0.4478746592998505, + "rewards/tag_count_reward/mean": 0.988671875, + "rewards/tag_count_reward/std": 0.07017130926251411, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016666666666666673, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5804.2, + "completions/mean_length": 1034.8849243164063, + "completions/mean_terminated_length": 914.2256103515625, + "completions/min_length": 226.2, + "completions/min_terminated_length": 226.2, + "epoch": 0.4187656914064423, + "grad_norm": 0.2330047450795056, + "kl": 0.08216552734375, + "learning_rate": 2.3596087744808425e-07, + "loss": 0.1317, + "num_tokens": 689009288.0, + "reward": 1.1403971672058106, + "reward_std": 0.1611458867788315, + "rewards/format_reward/mean": 0.9661458253860473, + "rewards/format_reward/std": 0.17640395164489747, + "rewards/mcq_accuracy_reward/mean": 0.6515625, + "rewards/mcq_accuracy_reward/std": 0.4752722978591919, + "rewards/tag_count_reward/mean": 0.9891927003860473, + "rewards/tag_count_reward/std": 0.069086854159832, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5484.0, + "completions/mean_length": 1006.98076171875, + "completions/mean_terminated_length": 900.7617309570312, + "completions/min_length": 203.6, + "completions/min_terminated_length": 203.6, + "epoch": 0.4200784365205691, + "grad_norm": 0.2038830698812817, + "kl": 0.082769775390625, + "learning_rate": 2.2860862123464732e-07, + "loss": 0.1284, + "num_tokens": 691244563.0, + "reward": 1.1855794668197632, + "reward_std": 0.17056121230125426, + "rewards/format_reward/mean": 0.9635416626930237, + "rewards/format_reward/std": 0.1867744207382202, + "rewards/mcq_accuracy_reward/mean": 0.6979166626930237, + "rewards/mcq_accuracy_reward/std": 0.45866488814353945, + "rewards/tag_count_reward/mean": 0.987109375, + "rewards/tag_count_reward/std": 0.07439810931682586, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017187499999999977, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6076.2, + "completions/mean_length": 1017.7140869140625, + "completions/mean_terminated_length": 892.451025390625, + "completions/min_length": 202.8, + "completions/min_terminated_length": 202.8, + "epoch": 0.42139118163469585, + "grad_norm": 0.27289546934357695, + "kl": 0.084405517578125, + "learning_rate": 2.2136327081612378e-07, + "loss": 0.1345, + "num_tokens": 693494422.0, + "reward": 1.209016990661621, + "reward_std": 0.1846052587032318, + "rewards/format_reward/mean": 0.9677083253860473, + "rewards/format_reward/std": 0.17476534843444824, + "rewards/mcq_accuracy_reward/mean": 0.7208333373069763, + "rewards/mcq_accuracy_reward/std": 0.4458419978618622, + "rewards/tag_count_reward/mean": 0.9850260496139527, + "rewards/tag_count_reward/std": 0.07866601720452308, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.019270833333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6513.4, + "completions/mean_length": 1000.706298828125, + "completions/mean_terminated_length": 859.7481079101562, + "completions/min_length": 206.8, + "completions/min_terminated_length": 206.8, + "epoch": 0.4227039267488226, + "grad_norm": 0.2344094210716835, + "kl": 0.08548583984375, + "learning_rate": 2.1422543542265167e-07, + "loss": 0.1636, + "num_tokens": 695711186.0, + "reward": 1.210546898841858, + "reward_std": 0.1715066224336624, + "rewards/format_reward/mean": 0.9645833373069763, + "rewards/format_reward/std": 0.1841827154159546, + "rewards/mcq_accuracy_reward/mean": 0.7234375, + "rewards/mcq_accuracy_reward/std": 0.44177095890045165, + "rewards/tag_count_reward/mean": 0.9838541507720947, + "rewards/tag_count_reward/std": 0.08359743729233741, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02083333333333335, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5151.4, + "completions/mean_length": 1044.5646240234375, + "completions/mean_terminated_length": 892.2681518554688, + "completions/min_length": 206.2, + "completions/min_terminated_length": 206.2, + "epoch": 0.4240166718629494, + "grad_norm": 0.3083321099090082, + "kl": 0.082916259765625, + "learning_rate": 2.0719571524389657e-07, + "loss": 0.1507, + "num_tokens": 698017678.0, + "reward": 1.1385742664337157, + "reward_std": 0.18226715326309204, + "rewards/format_reward/mean": 0.9651041746139526, + "rewards/format_reward/std": 0.18234705924987793, + "rewards/mcq_accuracy_reward/mean": 0.6505208253860474, + "rewards/mcq_accuracy_reward/std": 0.475015252828598, + "rewards/tag_count_reward/mean": 0.987109363079071, + "rewards/tag_count_reward/std": 0.0754339836537838, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.011458333333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5239.8, + "completions/mean_length": 1018.0286743164063, + "completions/mean_terminated_length": 934.7542114257812, + "completions/min_length": 218.4, + "completions/min_terminated_length": 218.4, + "epoch": 0.4253294169770762, + "grad_norm": 0.24486957603330695, + "kl": 0.081744384765625, + "learning_rate": 2.002747013785845e-07, + "loss": 0.1269, + "num_tokens": 700275381.0, + "reward": 1.1510417222976685, + "reward_std": 0.18218718469142914, + "rewards/format_reward/mean": 0.9614583373069763, + "rewards/format_reward/std": 0.1923879623413086, + "rewards/mcq_accuracy_reward/mean": 0.6640625119209289, + "rewards/mcq_accuracy_reward/std": 0.470738285779953, + "rewards/tag_count_reward/mean": 0.9864583373069763, + "rewards/tag_count_reward/std": 0.07670341208577156, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013541666666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6158.4, + "completions/mean_length": 967.8588623046875, + "completions/mean_terminated_length": 868.4676513671875, + "completions/min_length": 211.6, + "completions/min_terminated_length": 211.6, + "epoch": 0.426642162091203, + "grad_norm": 0.25422522378253515, + "kl": 0.08609619140625, + "learning_rate": 1.9346297578479766e-07, + "loss": 0.1183, + "num_tokens": 702431070.0, + "reward": 1.2086263179779053, + "reward_std": 0.17132827937602996, + "rewards/format_reward/mean": 0.9640625, + "rewards/format_reward/std": 0.18543145358562468, + "rewards/mcq_accuracy_reward/mean": 0.7208333253860474, + "rewards/mcq_accuracy_reward/std": 0.44708632826805117, + "rewards/tag_count_reward/mean": 0.987109375, + "rewards/tag_count_reward/std": 0.07233263552188873, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5759.6, + "completions/mean_length": 985.2875366210938, + "completions/mean_terminated_length": 855.3727172851562, + "completions/min_length": 217.4, + "completions/min_terminated_length": 217.4, + "epoch": 0.42795490720532975, + "grad_norm": 0.2563241754812317, + "kl": 0.083123779296875, + "learning_rate": 1.8676111123104133e-07, + "loss": 0.1331, + "num_tokens": 704619510.0, + "reward": 1.232324242591858, + "reward_std": 0.15760177373886108, + "rewards/format_reward/mean": 0.965625, + "rewards/format_reward/std": 0.18122280836105348, + "rewards/mcq_accuracy_reward/mean": 0.7442708373069763, + "rewards/mcq_accuracy_reward/std": 0.4349357128143311, + "rewards/tag_count_reward/mean": 0.9865885376930237, + "rewards/tag_count_reward/std": 0.07669364511966706, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333305, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5083.4, + "completions/mean_length": 1048.3521118164062, + "completions/mean_terminated_length": 919.4367553710938, + "completions/min_length": 219.6, + "completions/min_terminated_length": 219.6, + "epoch": 0.4292676523194565, + "grad_norm": 0.25376851139893164, + "kl": 0.07901611328125, + "learning_rate": 1.8016967124808158e-07, + "loss": 0.1294, + "num_tokens": 706929378.0, + "reward": 1.188899779319763, + "reward_std": 0.17628267109394075, + "rewards/format_reward/mean": 0.9593749880790711, + "rewards/format_reward/std": 0.19710786044597625, + "rewards/mcq_accuracy_reward/mean": 0.7020833373069764, + "rewards/mcq_accuracy_reward/std": 0.4569210708141327, + "rewards/tag_count_reward/mean": 0.987890625, + "rewards/tag_count_reward/std": 0.07315431162714958, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5969.6, + "completions/mean_length": 965.4890991210938, + "completions/mean_terminated_length": 827.5226806640625, + "completions/min_length": 163.8, + "completions/min_terminated_length": 163.8, + "epoch": 0.4305803974335833, + "grad_norm": 0.23996912053610905, + "kl": 0.08253173828125, + "learning_rate": 1.7368921008155996e-07, + "loss": 0.1576, + "num_tokens": 709081005.0, + "reward": 1.1748698234558106, + "reward_std": 0.16301866471767426, + "rewards/format_reward/mean": 0.9651041626930237, + "rewards/format_reward/std": 0.18361350595951081, + "rewards/mcq_accuracy_reward/mean": 0.6869791626930237, + "rewards/mcq_accuracy_reward/std": 0.45726142525672914, + "rewards/tag_count_reward/mean": 0.9864583373069763, + "rewards/tag_count_reward/std": 0.079268079996109, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.018229166666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5236.4, + "completions/mean_length": 1030.1031494140625, + "completions/mean_terminated_length": 896.9246948242187, + "completions/min_length": 218.4, + "completions/min_terminated_length": 218.4, + "epoch": 0.4318931425477101, + "grad_norm": 0.2829580429719119, + "kl": 0.0818328857421875, + "learning_rate": 1.6732027264539064e-07, + "loss": 0.1297, + "num_tokens": 711358187.0, + "reward": 1.206933617591858, + "reward_std": 0.1774197429418564, + "rewards/format_reward/mean": 0.9677083373069764, + "rewards/format_reward/std": 0.17570118010044097, + "rewards/mcq_accuracy_reward/mean": 0.7182291626930237, + "rewards/mcq_accuracy_reward/std": 0.4481948554515839, + "rewards/tag_count_reward/mean": 0.987109375, + "rewards/tag_count_reward/std": 0.07491986826062202, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6988.0, + "completions/mean_length": 1111.8214111328125, + "completions/mean_terminated_length": 976.7391723632812, + "completions/min_length": 200.8, + "completions/min_terminated_length": 200.8, + "epoch": 0.4332058876618369, + "grad_norm": 0.2531523765416524, + "kl": 0.0789642333984375, + "learning_rate": 1.610633944759403e-07, + "loss": 0.1427, + "num_tokens": 713789020.0, + "reward": 1.1413411617279052, + "reward_std": 0.17559497505426408, + "rewards/format_reward/mean": 0.959375, + "rewards/format_reward/std": 0.19587102830410003, + "rewards/mcq_accuracy_reward/mean": 0.6552083492279053, + "rewards/mcq_accuracy_reward/std": 0.47282237410545347, + "rewards/tag_count_reward/mean": 0.98515625, + "rewards/tag_count_reward/std": 0.08134672194719314, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5311.6, + "completions/mean_length": 1043.8364624023438, + "completions/mean_terminated_length": 914.997314453125, + "completions/min_length": 214.8, + "completions/min_terminated_length": 214.8, + "epoch": 0.43451863277596364, + "grad_norm": 0.2709434027002295, + "kl": 0.0837310791015625, + "learning_rate": 1.549191016869969e-07, + "loss": 0.137, + "num_tokens": 716091946.0, + "reward": 1.2224283933639526, + "reward_std": 0.17213355302810668, + "rewards/format_reward/mean": 0.9651041626930237, + "rewards/format_reward/std": 0.18244988918304444, + "rewards/mcq_accuracy_reward/mean": 0.734375, + "rewards/mcq_accuracy_reward/std": 0.4379712402820587, + "rewards/tag_count_reward/mean": 0.987109386920929, + "rewards/tag_count_reward/std": 0.07394366636872292, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.018229166666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5126.4, + "completions/mean_length": 1057.6109497070313, + "completions/mean_terminated_length": 924.958544921875, + "completions/min_length": 219.6, + "completions/min_terminated_length": 219.6, + "epoch": 0.4358313778900904, + "grad_norm": 0.22668495348324813, + "kl": 0.0806610107421875, + "learning_rate": 1.488879109255311e-07, + "loss": 0.1494, + "num_tokens": 718422727.0, + "reward": 1.1942383289337157, + "reward_std": 0.16713110804557801, + "rewards/format_reward/mean": 0.9625000119209289, + "rewards/format_reward/std": 0.1887198120355606, + "rewards/mcq_accuracy_reward/mean": 0.7072916626930237, + "rewards/mcq_accuracy_reward/std": 0.45269834995269775, + "rewards/tag_count_reward/mean": 0.9852864623069764, + "rewards/tag_count_reward/std": 0.08136075884103774, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6701.8, + "completions/mean_length": 976.24326171875, + "completions/mean_terminated_length": 884.4385986328125, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "epoch": 0.4371441230042172, + "grad_norm": 0.28935254188057125, + "kl": 0.0852874755859375, + "learning_rate": 1.4297032932825488e-07, + "loss": 0.1221, + "num_tokens": 720589490.0, + "reward": 1.1969401359558105, + "reward_std": 0.1636398732662201, + "rewards/format_reward/mean": 0.9708333253860474, + "rewards/format_reward/std": 0.16557413041591645, + "rewards/mcq_accuracy_reward/mean": 0.7072916626930237, + "rewards/mcq_accuracy_reward/std": 0.4538182318210602, + "rewards/tag_count_reward/mean": 0.9877604126930237, + "rewards/tag_count_reward/std": 0.07189310193061829, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017187499999999977, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 7091.4, + "completions/mean_length": 1068.6916870117188, + "completions/mean_terminated_length": 944.7509033203125, + "completions/min_length": 223.8, + "completions/min_terminated_length": 223.8, + "epoch": 0.43845686811834395, + "grad_norm": 0.20431534366256526, + "kl": 0.08160400390625, + "learning_rate": 1.3716685447897654e-07, + "loss": 0.1241, + "num_tokens": 722943106.0, + "reward": 1.178515648841858, + "reward_std": 0.1654358074069023, + "rewards/format_reward/mean": 0.9619791626930236, + "rewards/format_reward/std": 0.18928584158420564, + "rewards/mcq_accuracy_reward/mean": 0.6916666746139526, + "rewards/mcq_accuracy_reward/std": 0.4589478552341461, + "rewards/tag_count_reward/mean": 0.9854166626930236, + "rewards/tag_count_reward/std": 0.07801044210791588, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01614583333333335, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5854.2, + "completions/mean_length": 1052.3255493164063, + "completions/mean_terminated_length": 935.2223266601562, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.43976961323247077, + "grad_norm": 0.27040501823047547, + "kl": 0.077593994140625, + "learning_rate": 1.3147797436676224e-07, + "loss": 0.1331, + "num_tokens": 725265259.0, + "reward": 1.2275391340255737, + "reward_std": 0.1955667555332184, + "rewards/format_reward/mean": 0.965624988079071, + "rewards/format_reward/std": 0.18177194595336915, + "rewards/mcq_accuracy_reward/mean": 0.7395833373069763, + "rewards/mcq_accuracy_reward/std": 0.4316529452800751, + "rewards/tag_count_reward/mean": 0.9861979126930237, + "rewards/tag_count_reward/std": 0.07777385264635087, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.019270833333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6294.2, + "completions/mean_length": 1017.6062744140625, + "completions/mean_terminated_length": 876.7208862304688, + "completions/min_length": 206.2, + "completions/min_terminated_length": 206.2, + "epoch": 0.44108235834659754, + "grad_norm": 0.27970095332735917, + "kl": 0.084344482421875, + "learning_rate": 1.2590416734490397e-07, + "loss": 0.1346, + "num_tokens": 727512583.0, + "reward": 1.1866862535476685, + "reward_std": 0.2085656315088272, + "rewards/format_reward/mean": 0.9635416626930237, + "rewards/format_reward/std": 0.18674692809581755, + "rewards/mcq_accuracy_reward/mean": 0.7, + "rewards/mcq_accuracy_reward/std": 0.4554550886154175, + "rewards/tag_count_reward/mean": 0.983203125, + "rewards/tag_count_reward/std": 0.08469946086406707, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016145833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6967.6, + "completions/mean_length": 1039.61044921875, + "completions/mean_terminated_length": 922.3169555664062, + "completions/min_length": 215.6, + "completions/min_terminated_length": 215.6, + "epoch": 0.4423951034607243, + "grad_norm": 0.2582812921602449, + "kl": 0.080157470703125, + "learning_rate": 1.2044590209069484e-07, + "loss": 0.1274, + "num_tokens": 729804027.0, + "reward": 1.1663411617279054, + "reward_std": 0.17748666256666185, + "rewards/format_reward/mean": 0.9651041626930237, + "rewards/format_reward/std": 0.18224866390228273, + "rewards/mcq_accuracy_reward/mean": 0.678125, + "rewards/mcq_accuracy_reward/std": 0.4662696778774261, + "rewards/tag_count_reward/mean": 0.9877604246139526, + "rewards/tag_count_reward/std": 0.07289739400148391, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02343750000000002, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6813.6, + "completions/mean_length": 1087.1844116210937, + "completions/mean_terminated_length": 916.1988403320313, + "completions/min_length": 203.4, + "completions/min_terminated_length": 203.4, + "epoch": 0.44370784857485107, + "grad_norm": 0.2082777635534797, + "kl": 0.080224609375, + "learning_rate": 1.1510363756602138e-07, + "loss": 0.161, + "num_tokens": 732193205.0, + "reward": 1.170019555091858, + "reward_std": 0.20227122008800508, + "rewards/format_reward/mean": 0.9593749880790711, + "rewards/format_reward/std": 0.19528466761112212, + "rewards/mcq_accuracy_reward/mean": 0.6848958253860473, + "rewards/mcq_accuracy_reward/std": 0.46139426827430724, + "rewards/tag_count_reward/mean": 0.9811197996139527, + "rewards/tag_count_reward/std": 0.09162076711654663, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.019791666666666673, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6417.4, + "completions/mean_length": 1048.46357421875, + "completions/mean_terminated_length": 904.2213256835937, + "completions/min_length": 233.2, + "completions/min_terminated_length": 233.2, + "epoch": 0.44502059368897784, + "grad_norm": 0.373236853010342, + "kl": 0.0792724609375, + "learning_rate": 1.0987782297877147e-07, + "loss": 0.1518, + "num_tokens": 734498487.0, + "reward": 1.207552146911621, + "reward_std": 0.1833156704902649, + "rewards/format_reward/mean": 0.9625, + "rewards/format_reward/std": 0.18757579922676088, + "rewards/mcq_accuracy_reward/mean": 0.7208333373069763, + "rewards/mcq_accuracy_reward/std": 0.4433580696582794, + "rewards/tag_count_reward/mean": 0.984375, + "rewards/tag_count_reward/std": 0.08315660357475281, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.018229166666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5581.8, + "completions/mean_length": 1111.49482421875, + "completions/mean_terminated_length": 980.0511596679687, + "completions/min_length": 230.4, + "completions/min_terminated_length": 230.4, + "epoch": 0.44633333880310466, + "grad_norm": 0.2022374991946529, + "kl": 0.074969482421875, + "learning_rate": 1.0476889774506176e-07, + "loss": 0.133, + "num_tokens": 736929901.0, + "reward": 1.1820964336395263, + "reward_std": 0.19433790147304536, + "rewards/format_reward/mean": 0.9661458253860473, + "rewards/format_reward/std": 0.17841885685920716, + "rewards/mcq_accuracy_reward/mean": 0.6942708373069764, + "rewards/mcq_accuracy_reward/std": 0.458478981256485, + "rewards/tag_count_reward/mean": 0.98515625, + "rewards/tag_count_reward/std": 0.0791962131857872, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017187499999999977, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5692.8, + "completions/mean_length": 1005.2672241210937, + "completions/mean_terminated_length": 879.7180541992187, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.44764608391723143, + "grad_norm": 0.23775183415854478, + "kl": 0.0818359375, + "learning_rate": 9.977729145228926e-08, + "loss": 0.1499, + "num_tokens": 739158390.0, + "reward": 1.1923828601837159, + "reward_std": 0.1815950393676758, + "rewards/format_reward/mean": 0.9614583253860474, + "rewards/format_reward/std": 0.19167436361312867, + "rewards/mcq_accuracy_reward/mean": 0.7052083373069763, + "rewards/mcq_accuracy_reward/std": 0.4539748430252075, + "rewards/tag_count_reward/mean": 0.9872395873069764, + "rewards/tag_count_reward/std": 0.07393510714173317, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.019791666666666673, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6248.2, + "completions/mean_length": 1095.76044921875, + "completions/mean_terminated_length": 952.4527954101562, + "completions/min_length": 217.2, + "completions/min_terminated_length": 217.2, + "epoch": 0.4489588290313582, + "grad_norm": 0.24998214943928376, + "kl": 0.0780517578125, + "learning_rate": 9.490342382301015e-08, + "loss": 0.1234, + "num_tokens": 741560266.0, + "reward": 1.1538411855697632, + "reward_std": 0.18679152131080629, + "rewards/format_reward/mean": 0.9635416746139527, + "rewards/format_reward/std": 0.18567490875720977, + "rewards/mcq_accuracy_reward/mean": 0.6666666626930237, + "rewards/mcq_accuracy_reward/std": 0.4683292627334595, + "rewards/tag_count_reward/mean": 0.985156261920929, + "rewards/tag_count_reward/std": 0.08130853474140168, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02083333333333335, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5381.4, + "completions/mean_length": 1026.4661743164063, + "completions/mean_terminated_length": 874.073095703125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.45027157414548497, + "grad_norm": 0.25903998449059856, + "kl": 0.08055419921875, + "learning_rate": 9.014770467964512e-08, + "loss": 0.1334, + "num_tokens": 743829689.0, + "reward": 1.1772135496139526, + "reward_std": 0.19024882912635804, + "rewards/format_reward/mean": 0.9583333373069763, + "rewards/format_reward/std": 0.1997450053691864, + "rewards/mcq_accuracy_reward/mean": 0.6911458373069763, + "rewards/mcq_accuracy_reward/std": 0.4611590027809143, + "rewards/tag_count_reward/mean": 0.9859375119209289, + "rewards/tag_count_reward/std": 0.07893149852752686, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013541666666666652, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5946.6, + "completions/mean_length": 1030.3401245117188, + "completions/mean_terminated_length": 932.21171875, + "completions/min_length": 205.6, + "completions/min_terminated_length": 205.6, + "epoch": 0.45158431925961173, + "grad_norm": 0.25551556067970904, + "kl": 0.0804931640625, + "learning_rate": 8.551053391002117e-08, + "loss": 0.1238, + "num_tokens": 746108198.0, + "reward": 1.1993815422058105, + "reward_std": 0.18771098256111146, + "rewards/format_reward/mean": 0.9640625, + "rewards/format_reward/std": 0.18314007818698883, + "rewards/mcq_accuracy_reward/mean": 0.7109375, + "rewards/mcq_accuracy_reward/std": 0.4526368260383606, + "rewards/tag_count_reward/mean": 0.9897135376930237, + "rewards/tag_count_reward/std": 0.06644659414887429, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5910.6, + "completions/mean_length": 1093.7833618164063, + "completions/mean_terminated_length": 965.8473999023438, + "completions/min_length": 215.4, + "completions/min_terminated_length": 215.4, + "epoch": 0.45289706437373856, + "grad_norm": 0.2730452252462176, + "kl": 0.07869873046875, + "learning_rate": 8.099230143374542e-08, + "loss": 0.1552, + "num_tokens": 748505870.0, + "reward": 1.1943685293197632, + "reward_std": 0.1997908353805542, + "rewards/format_reward/mean": 0.9526041746139526, + "rewards/format_reward/std": 0.212078994512558, + "rewards/mcq_accuracy_reward/mean": 0.7098958373069764, + "rewards/mcq_accuracy_reward/std": 0.4517241775989532, + "rewards/tag_count_reward/mean": 0.9852864623069764, + "rewards/tag_count_reward/std": 0.08081508204340934, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0140625, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5570.6, + "completions/mean_length": 954.6322998046875, + "completions/mean_terminated_length": 851.5086303710938, + "completions/min_length": 196.2, + "completions/min_terminated_length": 196.2, + "epoch": 0.4542098094878653, + "grad_norm": 0.2132854431894847, + "kl": 0.0851715087890625, + "learning_rate": 7.65933871694196e-08, + "loss": 0.1186, + "num_tokens": 750634556.0, + "reward": 1.1819010734558106, + "reward_std": 0.1830554336309433, + "rewards/format_reward/mean": 0.9651041626930237, + "rewards/format_reward/std": 0.18238966166973114, + "rewards/mcq_accuracy_reward/mean": 0.69375, + "rewards/mcq_accuracy_reward/std": 0.4600892424583435, + "rewards/tag_count_reward/mean": 0.987499988079071, + "rewards/tag_count_reward/std": 0.07311699837446213, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013020833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5267.2, + "completions/mean_length": 1003.7312744140625, + "completions/mean_terminated_length": 908.9691528320312, + "completions/min_length": 242.2, + "completions/min_terminated_length": 242.2, + "epoch": 0.4555225546019921, + "grad_norm": 0.3075306208778918, + "kl": 0.08009033203125, + "learning_rate": 7.231416100269334e-08, + "loss": 0.1103, + "num_tokens": 752864032.0, + "reward": 1.1773763656616212, + "reward_std": 0.18131552636623383, + "rewards/format_reward/mean": 0.959375, + "rewards/format_reward/std": 0.1970660448074341, + "rewards/mcq_accuracy_reward/mean": 0.690625, + "rewards/mcq_accuracy_reward/std": 0.46059725284576414, + "rewards/tag_count_reward/mean": 0.9876302123069763, + "rewards/tag_count_reward/std": 0.07316697388887405, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6406.4, + "completions/mean_length": 1049.5323364257813, + "completions/mean_terminated_length": 924.8327270507813, + "completions/min_length": 223.4, + "completions/min_terminated_length": 223.4, + "epoch": 0.45683529971611886, + "grad_norm": 0.2453210756028621, + "kl": 0.0803680419921875, + "learning_rate": 6.815498275516246e-08, + "loss": 0.1228, + "num_tokens": 755182622.0, + "reward": 1.1906250715255737, + "reward_std": 0.1940203696489334, + "rewards/format_reward/mean": 0.9635416746139527, + "rewards/format_reward/std": 0.18729088306427003, + "rewards/mcq_accuracy_reward/mean": 0.7031250119209289, + "rewards/mcq_accuracy_reward/std": 0.45645213723182676, + "rewards/tag_count_reward/mean": 0.9864583373069763, + "rewards/tag_count_reward/std": 0.07760375142097473, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6310.8, + "completions/mean_length": 992.0260620117188, + "completions/mean_terminated_length": 866.177392578125, + "completions/min_length": 210.8, + "completions/min_terminated_length": 210.8, + "epoch": 0.4581480448302456, + "grad_norm": 0.20598257556769775, + "kl": 0.08331298828125, + "learning_rate": 6.411620215411363e-08, + "loss": 0.1407, + "num_tokens": 757386392.0, + "reward": 1.1970052242279052, + "reward_std": 0.1418435662984848, + "rewards/format_reward/mean": 0.9661458373069763, + "rewards/format_reward/std": 0.18086549937725066, + "rewards/mcq_accuracy_reward/mean": 0.7088541626930237, + "rewards/mcq_accuracy_reward/std": 0.44930904507637026, + "rewards/tag_count_reward/mean": 0.9864583253860474, + "rewards/tag_count_reward/std": 0.07562315613031387, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.022395833333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5441.0, + "completions/mean_length": 1102.5635986328125, + "completions/mean_terminated_length": 940.0921875, + "completions/min_length": 223.2, + "completions/min_terminated_length": 223.2, + "epoch": 0.45946078994437245, + "grad_norm": 0.27186564162788, + "kl": 0.0766845703125, + "learning_rate": 6.019815880311607e-08, + "loss": 0.1499, + "num_tokens": 759801098.0, + "reward": 1.1539388418197631, + "reward_std": 0.1955434650182724, + "rewards/format_reward/mean": 0.9552083492279053, + "rewards/format_reward/std": 0.20629194676876067, + "rewards/mcq_accuracy_reward/mean": 0.6692708373069763, + "rewards/mcq_accuracy_reward/std": 0.4693849742412567, + "rewards/tag_count_reward/mean": 0.9834635496139527, + "rewards/tag_count_reward/std": 0.08597085326910019, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01562500000000002, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6187.0, + "completions/mean_length": 981.7411743164063, + "completions/mean_terminated_length": 867.06298828125, + "completions/min_length": 189.8, + "completions/min_terminated_length": 189.8, + "epoch": 0.4607735350584992, + "grad_norm": 0.25949863005390483, + "kl": 0.0835296630859375, + "learning_rate": 5.6401182153467066e-08, + "loss": 0.1378, + "num_tokens": 761981929.0, + "reward": 1.196516990661621, + "reward_std": 0.1518436536192894, + "rewards/format_reward/mean": 0.9630208253860474, + "rewards/format_reward/std": 0.188490092754364, + "rewards/mcq_accuracy_reward/mean": 0.7088541746139526, + "rewards/mcq_accuracy_reward/std": 0.44876769185066223, + "rewards/tag_count_reward/mean": 0.9876302003860473, + "rewards/tag_count_reward/std": 0.07246685847640037, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0203125, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5304.2, + "completions/mean_length": 1058.15732421875, + "completions/mean_terminated_length": 910.5197631835938, + "completions/min_length": 168.4, + "completions/min_terminated_length": 168.4, + "epoch": 0.462086280172626, + "grad_norm": 0.22247990952232205, + "kl": 0.080224609375, + "learning_rate": 5.272559147648931e-08, + "loss": 0.1429, + "num_tokens": 764310023.0, + "reward": 1.1832031726837158, + "reward_std": 0.16688547730445863, + "rewards/format_reward/mean": 0.9578125, + "rewards/format_reward/std": 0.2004141926765442, + "rewards/mcq_accuracy_reward/mean": 0.6973958253860474, + "rewards/mcq_accuracy_reward/std": 0.45610825419425965, + "rewards/tag_count_reward/mean": 0.9854166626930236, + "rewards/tag_count_reward/std": 0.08089341968297958, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013020833333333348, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5338.6, + "completions/mean_length": 973.7260620117188, + "completions/mean_terminated_length": 878.3600952148438, + "completions/min_length": 207.6, + "completions/min_terminated_length": 207.6, + "epoch": 0.46339902528675275, + "grad_norm": 0.29934373393577457, + "kl": 0.0813232421875, + "learning_rate": 4.9171695836684624e-08, + "loss": 0.0918, + "num_tokens": 766474969.0, + "reward": 1.1958659172058106, + "reward_std": 0.1812240242958069, + "rewards/format_reward/mean": 0.965625, + "rewards/format_reward/std": 0.18067629635334015, + "rewards/mcq_accuracy_reward/mean": 0.7072916626930237, + "rewards/mcq_accuracy_reward/std": 0.45433762669563293, + "rewards/tag_count_reward/mean": 0.988671875, + "rewards/tag_count_reward/std": 0.0721053034067154, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.018750000000000024, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6394.2, + "completions/mean_length": 1087.4005859375, + "completions/mean_terminated_length": 952.097998046875, + "completions/min_length": 213.4, + "completions/min_terminated_length": 213.4, + "epoch": 0.4647117704008795, + "grad_norm": 0.21814819997530685, + "kl": 0.0785919189453125, + "learning_rate": 4.573979406574591e-08, + "loss": 0.1277, + "num_tokens": 768861354.0, + "reward": 1.1765950918197632, + "reward_std": 0.20586762726306915, + "rewards/format_reward/mean": 0.9640625, + "rewards/format_reward/std": 0.18235532939434052, + "rewards/mcq_accuracy_reward/mean": 0.6890625, + "rewards/mcq_accuracy_reward/std": 0.46032426357269285, + "rewards/tag_count_reward/mean": 0.9860677123069763, + "rewards/tag_count_reward/std": 0.07786537632346154, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015104166666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4792.8, + "completions/mean_length": 984.3286865234375, + "completions/mean_terminated_length": 873.5760131835938, + "completions/min_length": 194.8, + "completions/min_terminated_length": 194.8, + "epoch": 0.46602451551500634, + "grad_norm": 0.22467894178993614, + "kl": 0.08221435546875, + "learning_rate": 4.2430174737430715e-08, + "loss": 0.1132, + "num_tokens": 771051465.0, + "reward": 1.1300130605697631, + "reward_std": 0.16092004776000976, + "rewards/format_reward/mean": 0.9682291746139526, + "rewards/format_reward/std": 0.17446247339248658, + "rewards/mcq_accuracy_reward/mean": 0.640625, + "rewards/mcq_accuracy_reward/std": 0.4793547034263611, + "rewards/tag_count_reward/mean": 0.9893229126930236, + "rewards/tag_count_reward/std": 0.06772346422076225, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.019270833333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6386.6, + "completions/mean_length": 1057.7385620117188, + "completions/mean_terminated_length": 917.490380859375, + "completions/min_length": 190.6, + "completions/min_terminated_length": 190.6, + "epoch": 0.4673372606291331, + "grad_norm": 0.19458080026376692, + "kl": 0.0784637451171875, + "learning_rate": 3.9243116143294675e-08, + "loss": 0.1246, + "num_tokens": 773378235.0, + "reward": 1.1596680164337159, + "reward_std": 0.16667491495609282, + "rewards/format_reward/mean": 0.9614583253860474, + "rewards/format_reward/std": 0.1920902758836746, + "rewards/mcq_accuracy_reward/mean": 0.6729166746139527, + "rewards/mcq_accuracy_reward/std": 0.4680816411972046, + "rewards/tag_count_reward/mean": 0.985546886920929, + "rewards/tag_count_reward/std": 0.07821824252605439, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01614583333333335, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6172.6, + "completions/mean_length": 1011.3994995117188, + "completions/mean_terminated_length": 893.7958618164063, + "completions/min_length": 235.2, + "completions/min_terminated_length": 235.2, + "epoch": 0.4686500057432599, + "grad_norm": 0.2562739835408513, + "kl": 0.0811981201171875, + "learning_rate": 3.6178886269292387e-08, + "loss": 0.1527, + "num_tokens": 775614210.0, + "reward": 1.1977214336395263, + "reward_std": 0.19775204062461854, + "rewards/format_reward/mean": 0.95625, + "rewards/format_reward/std": 0.20088295936584472, + "rewards/mcq_accuracy_reward/mean": 0.7119791626930236, + "rewards/mcq_accuracy_reward/std": 0.4491087973117828, + "rewards/tag_count_reward/mean": 0.986718761920929, + "rewards/tag_count_reward/std": 0.07675719782710075, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 7061.6, + "completions/mean_length": 1075.5474609375, + "completions/mean_terminated_length": 947.2901000976562, + "completions/min_length": 178.2, + "completions/min_terminated_length": 178.2, + "epoch": 0.46996275085738665, + "grad_norm": 0.24344219309523582, + "kl": 0.08035888671875, + "learning_rate": 3.3237742773244096e-08, + "loss": 0.1405, + "num_tokens": 777981661.0, + "reward": 1.1930338859558105, + "reward_std": 0.19193060100078582, + "rewards/format_reward/mean": 0.9604166626930237, + "rewards/format_reward/std": 0.19489842355251313, + "rewards/mcq_accuracy_reward/mean": 0.706249988079071, + "rewards/mcq_accuracy_reward/std": 0.45210697054862975, + "rewards/tag_count_reward/mean": 0.98671875, + "rewards/tag_count_reward/std": 0.07695698589086533, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5833.4, + "completions/mean_length": 1013.4265991210938, + "completions/mean_terminated_length": 876.085546875, + "completions/min_length": 182.2, + "completions/min_terminated_length": 182.2, + "epoch": 0.4712754959715134, + "grad_norm": 0.22345759332845588, + "kl": 0.08150634765625, + "learning_rate": 3.041993296316836e-08, + "loss": 0.1336, + "num_tokens": 780219176.0, + "reward": 1.1795899152755738, + "reward_std": 0.16516764014959334, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.1915453314781189, + "rewards/mcq_accuracy_reward/mean": 0.6927083373069763, + "rewards/mcq_accuracy_reward/std": 0.46063382625579835, + "rewards/tag_count_reward/mean": 0.9865885376930237, + "rewards/tag_count_reward/std": 0.07691352069377899, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015104166666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5906.4, + "completions/mean_length": 1023.9302368164062, + "completions/mean_terminated_length": 914.1335205078125, + "completions/min_length": 211.8, + "completions/min_terminated_length": 211.8, + "epoch": 0.47258824108564024, + "grad_norm": 0.27934448709876947, + "kl": 0.07825927734375, + "learning_rate": 2.7725693776488868e-08, + "loss": 0.1264, + "num_tokens": 782486506.0, + "reward": 1.1843424797058106, + "reward_std": 0.18524578511714934, + "rewards/format_reward/mean": 0.9677083373069764, + "rewards/format_reward/std": 0.1750609189271927, + "rewards/mcq_accuracy_reward/mean": 0.6953125119209289, + "rewards/mcq_accuracy_reward/std": 0.4599054217338562, + "rewards/tag_count_reward/mean": 0.9884114623069763, + "rewards/tag_count_reward/std": 0.06945529356598854, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01614583333333335, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5170.2, + "completions/mean_length": 981.7869995117187, + "completions/mean_terminated_length": 863.4468139648437, + "completions/min_length": 208.4, + "completions/min_terminated_length": 208.4, + "epoch": 0.473900986199767, + "grad_norm": 0.2334973096405833, + "kl": 0.080096435546875, + "learning_rate": 2.515525176011052e-08, + "loss": 0.132, + "num_tokens": 784668169.0, + "reward": 1.192903709411621, + "reward_std": 0.1794237494468689, + "rewards/format_reward/mean": 0.9682291746139526, + "rewards/format_reward/std": 0.17474088966846466, + "rewards/mcq_accuracy_reward/mean": 0.7036458253860474, + "rewards/mcq_accuracy_reward/std": 0.45158185362815856, + "rewards/tag_count_reward/mean": 0.9888020992279053, + "rewards/tag_count_reward/std": 0.07005045041441918, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02031250000000002, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6023.0, + "completions/mean_length": 1065.5052490234375, + "completions/mean_terminated_length": 917.9845336914062, + "completions/min_length": 225.6, + "completions/min_terminated_length": 225.6, + "epoch": 0.4752137313138938, + "grad_norm": 0.2402075880089955, + "kl": 0.0773040771484375, + "learning_rate": 2.2708823051370842e-08, + "loss": 0.1639, + "num_tokens": 787013899.0, + "reward": 1.1648112297058106, + "reward_std": 0.2108749955892563, + "rewards/format_reward/mean": 0.9598958373069764, + "rewards/format_reward/std": 0.1947837144136429, + "rewards/mcq_accuracy_reward/mean": 0.6786458253860473, + "rewards/mcq_accuracy_reward/std": 0.4658651888370514, + "rewards/tag_count_reward/mean": 0.984765625, + "rewards/tag_count_reward/std": 0.08150749653577805, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.018229166666666675, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5596.6, + "completions/mean_length": 1057.868798828125, + "completions/mean_terminated_length": 925.4756225585937, + "completions/min_length": 227.4, + "completions/min_terminated_length": 227.4, + "epoch": 0.47652647642802054, + "grad_norm": 0.2523510770393629, + "kl": 0.0785552978515625, + "learning_rate": 2.0386613359864692e-08, + "loss": 0.1316, + "num_tokens": 789342359.0, + "reward": 1.1559245109558105, + "reward_std": 0.19021188616752624, + "rewards/format_reward/mean": 0.9578125, + "rewards/format_reward/std": 0.19850789308547973, + "rewards/mcq_accuracy_reward/mean": 0.6697916626930237, + "rewards/mcq_accuracy_reward/std": 0.4686493456363678, + "rewards/tag_count_reward/mean": 0.98671875, + "rewards/tag_count_reward/std": 0.07522183880209923, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0140625, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5611.8, + "completions/mean_length": 1035.1151123046875, + "completions/mean_terminated_length": 933.0056884765625, + "completions/min_length": 192.4, + "completions/min_terminated_length": 192.4, + "epoch": 0.4778392215421473, + "grad_norm": 0.21935160098413667, + "kl": 0.078466796875, + "learning_rate": 1.818881795014793e-08, + "loss": 0.1133, + "num_tokens": 791625516.0, + "reward": 1.1901042222976685, + "reward_std": 0.18765581846237184, + "rewards/format_reward/mean": 0.9671875, + "rewards/format_reward/std": 0.173410826921463, + "rewards/mcq_accuracy_reward/mean": 0.7010416626930237, + "rewards/mcq_accuracy_reward/std": 0.4567505419254303, + "rewards/tag_count_reward/mean": 0.989062511920929, + "rewards/tag_count_reward/std": 0.06786322966217995, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02031250000000002, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5660.2, + "completions/mean_length": 1090.5672241210937, + "completions/mean_terminated_length": 943.4929321289062, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4791519666562741, + "grad_norm": 0.24870758510748905, + "kl": 0.078271484375, + "learning_rate": 1.6115621625318545e-08, + "loss": 0.1539, + "num_tokens": 794023941.0, + "reward": 1.1345052480697633, + "reward_std": 0.22315718829631806, + "rewards/format_reward/mean": 0.9546875, + "rewards/format_reward/std": 0.20684564113616943, + "rewards/mcq_accuracy_reward/mean": 0.6494791746139527, + "rewards/mcq_accuracy_reward/std": 0.47216633558273313, + "rewards/tag_count_reward/mean": 0.9854166626930236, + "rewards/tag_count_reward/std": 0.07945655509829522, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017187499999999977, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5525.8, + "completions/mean_length": 1120.8781982421874, + "completions/mean_terminated_length": 997.1640014648438, + "completions/min_length": 222.4, + "completions/min_terminated_length": 222.4, + "epoch": 0.4804647117704009, + "grad_norm": 0.20841255913304904, + "kl": 0.0766387939453125, + "learning_rate": 1.4167198711476925e-08, + "loss": 0.1207, + "num_tokens": 796474395.0, + "reward": 1.1244466304779053, + "reward_std": 0.19489411115646363, + "rewards/format_reward/mean": 0.965624988079071, + "rewards/format_reward/std": 0.18177194595336915, + "rewards/mcq_accuracy_reward/mean": 0.6364583253860474, + "rewards/mcq_accuracy_reward/std": 0.48031699657440186, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.07789578065276145, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333326, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6599.6, + "completions/mean_length": 1055.42451171875, + "completions/mean_terminated_length": 926.8253784179688, + "completions/min_length": 211.4, + "completions/min_terminated_length": 211.4, + "epoch": 0.48177745688452767, + "grad_norm": 0.2692641229789336, + "kl": 0.0811553955078125, + "learning_rate": 1.2343713043067584e-08, + "loss": 0.1348, + "num_tokens": 798801002.0, + "reward": 1.1620768547058105, + "reward_std": 0.19237086176872253, + "rewards/format_reward/mean": 0.9604166626930237, + "rewards/format_reward/std": 0.1947425276041031, + "rewards/mcq_accuracy_reward/mean": 0.6755208373069763, + "rewards/mcq_accuracy_reward/std": 0.46648314595222473, + "rewards/tag_count_reward/mean": 0.9858072757720947, + "rewards/tag_count_reward/std": 0.07648578733205795, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5421.6, + "completions/mean_length": 977.0328369140625, + "completions/mean_terminated_length": 850.7861694335937, + "completions/min_length": 212.8, + "completions/min_terminated_length": 212.8, + "epoch": 0.48309020199865443, + "grad_norm": 0.1833509211086439, + "kl": 0.082562255859375, + "learning_rate": 1.0645317949103505e-08, + "loss": 0.133, + "num_tokens": 800974529.0, + "reward": 1.2052734613418579, + "reward_std": 0.16241915971040727, + "rewards/format_reward/mean": 0.9635416746139527, + "rewards/format_reward/std": 0.18576928973197937, + "rewards/mcq_accuracy_reward/mean": 0.7177083373069764, + "rewards/mcq_accuracy_reward/std": 0.4494118273258209, + "rewards/tag_count_reward/mean": 0.98671875, + "rewards/tag_count_reward/std": 0.07476085349917412, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016666666666666653, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5726.6, + "completions/mean_length": 1053.2531494140626, + "completions/mean_terminated_length": 932.0677368164063, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "epoch": 0.4844029471127812, + "grad_norm": 0.2854531315963596, + "kl": 0.0770111083984375, + "learning_rate": 9.072156240272466e-09, + "loss": 0.1163, + "num_tokens": 803301935.0, + "reward": 1.1879883050918578, + "reward_std": 0.19440164864063264, + "rewards/format_reward/mean": 0.9588541746139526, + "rewards/format_reward/std": 0.1976425290107727, + "rewards/mcq_accuracy_reward/mean": 0.7015625, + "rewards/mcq_accuracy_reward/std": 0.4510770201683044, + "rewards/tag_count_reward/mean": 0.9868489503860474, + "rewards/tag_count_reward/std": 0.07597034126520157, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.016145833333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5052.0, + "completions/mean_length": 937.7416748046875, + "completions/mean_terminated_length": 818.3315063476563, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.48571569222690797, + "grad_norm": 0.37107805763185814, + "kl": 0.0835418701171875, + "learning_rate": 7.624360196929636e-09, + "loss": 0.1335, + "num_tokens": 805405815.0, + "reward": 1.2314128160476685, + "reward_std": 0.17356855571269988, + "rewards/format_reward/mean": 0.9671874880790711, + "rewards/format_reward/std": 0.17651339173316954, + "rewards/mcq_accuracy_reward/mean": 0.7421875119209289, + "rewards/mcq_accuracy_reward/std": 0.4368328511714935, + "rewards/tag_count_reward/mean": 0.9897135376930237, + "rewards/tag_count_reward/std": 0.06618870720267296, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013020833333333348, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 7120.4, + "completions/mean_length": 1029.1672119140626, + "completions/mean_terminated_length": 934.6678955078125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4870284373410348, + "grad_norm": 0.24210266745055134, + "kl": 0.0822906494140625, + "learning_rate": 6.3020515579741425e-09, + "loss": 0.1378, + "num_tokens": 807681592.0, + "reward": 1.1843099117279052, + "reward_std": 0.1729869067668915, + "rewards/format_reward/mean": 0.9588541746139526, + "rewards/format_reward/std": 0.19813241064548492, + "rewards/mcq_accuracy_reward/mean": 0.6973958253860474, + "rewards/mcq_accuracy_reward/std": 0.45897703766822817, + "rewards/tag_count_reward/mean": 0.9888020873069763, + "rewards/tag_count_reward/std": 0.07053646892309189, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333327, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6695.0, + "completions/mean_length": 1004.7271118164062, + "completions/mean_terminated_length": 898.6779052734375, + "completions/min_length": 194.8, + "completions/min_terminated_length": 194.8, + "epoch": 0.48834118245516156, + "grad_norm": 0.3518497673796796, + "kl": 0.080328369140625, + "learning_rate": 5.10534151061276e-09, + "loss": 0.1238, + "num_tokens": 809905828.0, + "reward": 1.1438477039337158, + "reward_std": 0.17833697199821472, + "rewards/format_reward/mean": 0.965625011920929, + "rewards/format_reward/std": 0.17968954443931578, + "rewards/mcq_accuracy_reward/mean": 0.6552083373069764, + "rewards/mcq_accuracy_reward/std": 0.46999741792678834, + "rewards/tag_count_reward/mean": 0.9889322876930237, + "rewards/tag_count_reward/std": 0.06754784062504768, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6071.8, + "completions/mean_length": 1014.6010986328125, + "completions/mean_terminated_length": 923.7098876953125, + "completions/min_length": 212.4, + "completions/min_terminated_length": 212.4, + "epoch": 0.4896539275692883, + "grad_norm": 0.21268221184051614, + "kl": 0.0804473876953125, + "learning_rate": 4.034330681010889e-09, + "loss": 0.1008, + "num_tokens": 812152398.0, + "reward": 1.1568359851837158, + "reward_std": 0.16342645436525344, + "rewards/format_reward/mean": 0.9692708373069763, + "rewards/format_reward/std": 0.17012720704078674, + "rewards/mcq_accuracy_reward/mean": 0.667187511920929, + "rewards/mcq_accuracy_reward/std": 0.4693709433078766, + "rewards/tag_count_reward/mean": 0.9893229246139527, + "rewards/tag_count_reward/std": 0.06837740167975426, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0171875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6503.4, + "completions/mean_length": 1034.4943115234375, + "completions/mean_terminated_length": 909.29326171875, + "completions/min_length": 209.6, + "completions/min_terminated_length": 209.6, + "epoch": 0.4909666726834151, + "grad_norm": 0.311800910187323, + "kl": 0.08046875, + "learning_rate": 3.089109125830658e-09, + "loss": 0.1298, + "num_tokens": 814436483.0, + "reward": 1.1655925035476684, + "reward_std": 0.19115622639656066, + "rewards/format_reward/mean": 0.9635416626930237, + "rewards/format_reward/std": 0.18627865612506866, + "rewards/mcq_accuracy_reward/mean": 0.678125, + "rewards/mcq_accuracy_reward/std": 0.45942809581756594, + "rewards/tag_count_reward/mean": 0.986328125, + "rewards/tag_count_reward/std": 0.07668826580047608, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015104166666666674, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6406.4, + "completions/mean_length": 1010.128173828125, + "completions/mean_terminated_length": 900.2413940429688, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.49227941779754186, + "grad_norm": 0.30561316860976634, + "kl": 0.0820709228515625, + "learning_rate": 2.269756324659311e-09, + "loss": 0.1243, + "num_tokens": 816672593.0, + "reward": 1.1677083730697633, + "reward_std": 0.17404088973999024, + "rewards/format_reward/mean": 0.9588541626930237, + "rewards/format_reward/std": 0.1984285056591034, + "rewards/mcq_accuracy_reward/mean": 0.68125, + "rewards/mcq_accuracy_reward/std": 0.4629057466983795, + "rewards/tag_count_reward/mean": 0.9869791626930237, + "rewards/tag_count_reward/std": 0.07526598200201988, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.020833333333333325, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6272.8, + "completions/mean_length": 1080.9489990234374, + "completions/mean_terminated_length": 929.7738891601563, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.4935921629116687, + "grad_norm": 0.24056769460021862, + "kl": 0.078076171875, + "learning_rate": 1.5763411733250576e-09, + "loss": 0.1417, + "num_tokens": 819045711.0, + "reward": 1.177278709411621, + "reward_std": 0.21043918132781983, + "rewards/format_reward/mean": 0.9619791507720947, + "rewards/format_reward/std": 0.1895926982164383, + "rewards/mcq_accuracy_reward/mean": 0.690625011920929, + "rewards/mcq_accuracy_reward/std": 0.46058908104896545, + "rewards/tag_count_reward/mean": 0.9846354246139526, + "rewards/tag_count_reward/std": 0.08321737423539162, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6771.0, + "completions/mean_length": 1021.1484741210937, + "completions/mean_terminated_length": 884.1717529296875, + "completions/min_length": 216.6, + "completions/min_terminated_length": 216.6, + "epoch": 0.49490490802579545, + "grad_norm": 0.3199598811195582, + "kl": 0.0810272216796875, + "learning_rate": 1.0089219781053705e-09, + "loss": 0.1483, + "num_tokens": 821302572.0, + "reward": 1.1499349355697632, + "reward_std": 0.1887631893157959, + "rewards/format_reward/mean": 0.9619791626930236, + "rewards/format_reward/std": 0.19118059873580934, + "rewards/mcq_accuracy_reward/mean": 0.6630208373069764, + "rewards/mcq_accuracy_reward/std": 0.470464414358139, + "rewards/tag_count_reward/mean": 0.9856770753860473, + "rewards/tag_count_reward/std": 0.07939509749412536, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017708333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6107.0, + "completions/mean_length": 1051.2187744140624, + "completions/mean_terminated_length": 922.9707763671875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.4962176531399222, + "grad_norm": 0.19028630887082554, + "kl": 0.0782012939453125, + "learning_rate": 5.675464508227423e-10, + "loss": 0.1324, + "num_tokens": 823617080.0, + "reward": 1.162500023841858, + "reward_std": 0.17356816232204436, + "rewards/format_reward/mean": 0.965624988079071, + "rewards/format_reward/std": 0.18135603666305541, + "rewards/mcq_accuracy_reward/mean": 0.6744791626930237, + "rewards/mcq_accuracy_reward/std": 0.4685880124568939, + "rewards/tag_count_reward/mean": 0.9864583373069763, + "rewards/tag_count_reward/std": 0.07429665103554725, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015624999999999977, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4837.2, + "completions/mean_length": 1002.3323120117187, + "completions/mean_terminated_length": 888.3731811523437, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "epoch": 0.497530398254049, + "grad_norm": 0.213697658200741, + "kl": 0.0796661376953125, + "learning_rate": 2.522517048338946e-10, + "loss": 0.1228, + "num_tokens": 825839622.0, + "reward": 1.178157591819763, + "reward_std": 0.16585054397583007, + "rewards/format_reward/mean": 0.9625, + "rewards/format_reward/std": 0.18972494304180146, + "rewards/mcq_accuracy_reward/mean": 0.690625011920929, + "rewards/mcq_accuracy_reward/std": 0.4586210668087006, + "rewards/tag_count_reward/mean": 0.9876302003860473, + "rewards/tag_count_reward/std": 0.0735657349228859, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.014583333333333347, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 6924.4, + "completions/mean_length": 1013.5021240234375, + "completions/mean_terminated_length": 907.4473266601562, + "completions/min_length": 215.6, + "completions/min_terminated_length": 215.6, + "epoch": 0.49884314336817576, + "grad_norm": 0.2418392792418562, + "kl": 0.082781982421875, + "learning_rate": 6.306425190844011e-11, + "loss": 0.1056, + "num_tokens": 828084378.0, + "reward": 1.1822591543197631, + "reward_std": 0.16198524236679077, + "rewards/format_reward/mean": 0.9666666746139526, + "rewards/format_reward/std": 0.17960063517093658, + "rewards/mcq_accuracy_reward/mean": 0.69375, + "rewards/mcq_accuracy_reward/std": 0.4611159682273865, + "rewards/tag_count_reward/mean": 0.9873697876930236, + "rewards/tag_count_reward/std": 0.07599816620349883, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013020833333333343, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 5126.5, + "completions/mean_length": 1012.0169677734375, + "completions/mean_terminated_length": 917.6627349853516, + "completions/min_length": 223.75, + "completions/min_terminated_length": 223.75, + "epoch": 0.4998933394594772, + "kl": 0.07724761962890625, + "num_tokens": 829881068.0, + "reward": 1.2089030146598816, + "reward_std": 0.1814947985112667, + "rewards/format_reward/mean": 0.96484375, + "rewards/format_reward/std": 0.1830342337489128, + "rewards/mcq_accuracy_reward/mean": 0.720703125, + "rewards/mcq_accuracy_reward/std": 0.44706109911203384, + "rewards/tag_count_reward/mean": 0.9879557192325592, + "rewards/tag_count_reward/std": 0.07010491285473108, + "step": 1904, + "total_flos": 0.0, + "train_loss": 0.0036022694108365965, + "train_runtime": 15331.4788, + "train_samples_per_second": 5.962, + "train_steps_per_second": 0.124 + } + ], + "logging_steps": 5, + "max_steps": 1904, + "num_input_tokens_seen": 829881068, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}