{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0012106537530266344, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3274.0, "completions/max_terminated_length": 3274.0, "completions/mean_length": 1305.6875, "completions/mean_terminated_length": 1305.6875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 6.053268765133172e-06, "grad_norm": 0.7806517512814045, "kl": 0.0191497802734375, "learning_rate": 0.0, "loss": -0.0018, "num_tokens": 28431.0, "reward": 0.10032547265291214, "reward_std": 0.21521395444869995, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 5.08148193359375, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.1875, "rewards/tool_execution_reward_func": 0.9375, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.2106537530266343e-05, "grad_norm": 0.779905050857545, "kl": 0.0191497802734375, "learning_rate": 1.5000000000000002e-07, "loss": -0.0018, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.8159806295399516e-05, "grad_norm": 0.775895943887529, "kl": 0.0205230712890625, "learning_rate": 3.0000000000000004e-07, "loss": -0.0018, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.4213075060532686e-05, "grad_norm": 0.7054717897002856, "kl": 0.0189208984375, "learning_rate": 4.5e-07, "loss": -0.0018, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3197.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 1387.5625, "completions/mean_terminated_length": 1480.0666666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 301.0, "epoch": 3.026634382566586e-05, "grad_norm": 2.8398488181877535, "kl": 0.0939483642578125, "learning_rate": 6.000000000000001e-07, "loss": 0.0005, "num_tokens": 62348.0, "reward": 0.0669252872467041, "reward_std": 0.18236754834651947, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 4.3189802169799805, "rewards/format_reward_func": 0.9474999904632568, "rewards/num_xml_reward_func": 1.0, "rewards/tool_execution_reward_func": 0.875, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.631961259079903e-05, "grad_norm": 0.6955667618263449, "kl": 0.013031005859375, "learning_rate": 7.5e-07, "loss": 0.0002, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.2372881355932206e-05, "grad_norm": 0.6839471972703409, "kl": 0.0131378173828125, "learning_rate": 9e-07, "loss": 0.0003, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.842615012106537e-05, "grad_norm": 0.6825085562302083, "kl": 0.0136871337890625, "learning_rate": 1.05e-06, "loss": 0.0003, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1288.375, "completions/mean_terminated_length": 1288.375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 5.4479418886198546e-05, "grad_norm": 2.1585776860330728, "kl": 0.01605224609375, "learning_rate": 1.2000000000000002e-06, "loss": 0.0002, "num_tokens": 90542.0, "reward": 0.004091441631317139, "reward_std": 0.4142349064350128, "rewards/correct_answer_reward_func": 0.4375, "rewards/efficient_thinking_reward_func": 4.875295639038086, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.15625, "rewards/tool_execution_reward_func": 0.9375, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.053268765133172e-05, "grad_norm": 2.086709029627379, "kl": 0.030364990234375, "learning_rate": 1.35e-06, "loss": 0.0003, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.658595641646489e-05, "grad_norm": 2.055694134933805, "kl": 0.035125732421875, "learning_rate": 1.5e-06, "loss": 0.0003, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.263922518159807e-05, "grad_norm": 5.214306021603971, "kl": 0.24627685546875, "learning_rate": 1.65e-06, "loss": 0.0009, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3169.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 1584.5625, "completions/mean_terminated_length": 1584.5625, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 7.869249394673124e-05, "grad_norm": 3.0602846246870006, "kl": 0.2955322265625, "learning_rate": 1.8e-06, "loss": 0.0012, "num_tokens": 123515.0, "reward": 0.00020614694221876562, "reward_std": 0.00014354230370372534, "rewards/correct_answer_reward_func": 0.0, "rewards/efficient_thinking_reward_func": 4.435483932495117, "rewards/format_reward_func": 0.9916666746139526, "rewards/num_xml_reward_func": 1.03125, "rewards/tool_execution_reward_func": 1.125, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.474576271186441e-05, "grad_norm": 3.24773890393597, "kl": 0.262451171875, "learning_rate": 1.95e-06, "loss": 0.0012, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.079903147699757e-05, "grad_norm": 1.0320633234812377, "kl": 0.21142578125, "learning_rate": 2.1e-06, "loss": 0.0009, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.685230024213075e-05, "grad_norm": 0.6955273697626486, "kl": 0.19775390625, "learning_rate": 2.25e-06, "loss": 0.0008, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1081.8125, "completions/mean_terminated_length": 1153.9333333333334, "completions/min_length": 0.0, "completions/min_terminated_length": 279.0, "epoch": 0.00010290556900726392, "grad_norm": 1.383354124641088, "kl": 0.18017578125, "learning_rate": 2.4000000000000003e-06, "loss": 0.0065, "num_tokens": 152500.0, "reward": 0.14100381731987, "reward_std": 0.44302019476890564, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 5.360107898712158, "rewards/format_reward_func": 0.9816666841506958, "rewards/num_xml_reward_func": 1.0458333492279053, "rewards/tool_execution_reward_func": 0.75, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00010895883777239709, "grad_norm": 1.370303518112428, "kl": 0.256591796875, "learning_rate": 2.55e-06, "loss": 0.0067, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00011501210653753027, "grad_norm": 1.3030395065430471, "kl": 0.32666015625, "learning_rate": 2.7e-06, "loss": 0.0069, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00012106537530266344, "grad_norm": 1.5799075174968453, "kl": 0.5322265625, "learning_rate": 2.85e-06, "loss": 0.0075, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 1281.9375, "completions/mean_terminated_length": 1367.4, "completions/min_length": 0.0, "completions/min_terminated_length": 420.0, "epoch": 0.0001271186440677966, "grad_norm": 38.55336802862083, "kl": 2.150390625, "learning_rate": 3e-06, "loss": 0.0073, "num_tokens": 184727.0, "reward": 0.03352511301636696, "reward_std": 0.1334875077009201, "rewards/correct_answer_reward_func": 0.0625, "rewards/efficient_thinking_reward_func": 3.6303672790527344, "rewards/format_reward_func": 0.9537500143051147, "rewards/num_xml_reward_func": 0.8125, "rewards/tool_execution_reward_func": 0.9375, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00013317191283292979, "grad_norm": 10.045688461429338, "kl": 0.865234375, "learning_rate": 3e-06, "loss": 0.0025, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00013922518159806296, "grad_norm": 2.9644172349588827, "kl": 0.8720703125, "learning_rate": 3e-06, "loss": 0.0022, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00014527845036319613, "grad_norm": 2.3601465047763033, "kl": 0.6787109375, "learning_rate": 3e-06, "loss": 0.0017, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1091.0625, "completions/mean_terminated_length": 1091.0625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.0001513317191283293, "grad_norm": 9.206773673324696, "kl": 1.31640625, "learning_rate": 3e-06, "loss": 0.0148, "num_tokens": 209764.0, "reward": -0.007718075066804886, "reward_std": 0.3225381076335907, "rewards/correct_answer_reward_func": 0.3125, "rewards/efficient_thinking_reward_func": 4.085506439208984, "rewards/format_reward_func": 0.9916666746139526, "rewards/num_xml_reward_func": 0.9270833730697632, "rewards/tool_execution_reward_func": 0.6875, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00015738498789346248, "grad_norm": 241.05237266393604, "kl": 1.84228515625, "learning_rate": 3e-06, "loss": 0.0166, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00016343825665859565, "grad_norm": 1.2667482118640077, "kl": 0.6767578125, "learning_rate": 3e-06, "loss": 0.0123, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00016949152542372882, "grad_norm": 1.066626868681045, "kl": 0.572265625, "learning_rate": 3e-06, "loss": 0.0119, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1022.5625, "completions/mean_terminated_length": 1022.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.000175544794188862, "grad_norm": 3.376000196555004, "kl": 0.90673828125, "learning_rate": 3e-06, "loss": -0.0045, "num_tokens": 233745.0, "reward": 0.06693361699581146, "reward_std": 0.18236428499221802, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 4.435483932495117, "rewards/format_reward_func": 0.8999999761581421, "rewards/num_xml_reward_func": 1.0416666269302368, "rewards/tool_execution_reward_func": 0.6875, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00018159806295399514, "grad_norm": 0.3430397854754984, "kl": 0.67919921875, "learning_rate": 3e-06, "loss": -0.0051, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00018765133171912832, "grad_norm": 0.2377290664264501, "kl": 0.70751953125, "learning_rate": 3e-06, "loss": -0.0051, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0001937046004842615, "grad_norm": 0.2344163560775934, "kl": 0.751953125, "learning_rate": 3e-06, "loss": -0.0051, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 1032.0, "completions/mean_terminated_length": 1032.0, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.00019975786924939466, "grad_norm": 9.098443242784603, "kl": 1.21142578125, "learning_rate": 3e-06, "loss": 0.0035, "num_tokens": 257837.0, "reward": 0.13366317749023438, "reward_std": 0.23877617716789246, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 4.384251594543457, "rewards/format_reward_func": 0.987500011920929, "rewards/num_xml_reward_func": 1.0625, "rewards/tool_execution_reward_func": 0.625, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00020581113801452784, "grad_norm": 2.3617131203251613, "kl": 0.775390625, "learning_rate": 3e-06, "loss": 0.0014, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000211864406779661, "grad_norm": 0.47446261418389957, "kl": 0.59765625, "learning_rate": 3e-06, "loss": 0.0006, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00021791767554479418, "grad_norm": 0.34925057861597, "kl": 0.564453125, "learning_rate": 3e-06, "loss": 0.0004, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2844.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 1202.9375, "completions/mean_terminated_length": 1374.7857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.00022397094430992736, "grad_norm": 7.330956289365437, "kl": 1.2607421875, "learning_rate": 3e-06, "loss": 0.0044, "num_tokens": 292896.0, "reward": 0.03360215947031975, "reward_std": 0.1334669440984726, "rewards/correct_answer_reward_func": 0.0625, "rewards/efficient_thinking_reward_func": 5.241935729980469, "rewards/format_reward_func": 0.9662500023841858, "rewards/num_xml_reward_func": 1.1979166269302368, "rewards/tool_execution_reward_func": 0.9375, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00023002421307506053, "grad_norm": 0.549579766985337, "kl": 0.7529296875, "learning_rate": 3e-06, "loss": 0.0029, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0002360774818401937, "grad_norm": 0.16634021782187236, "kl": 0.7275390625, "learning_rate": 3e-06, "loss": 0.0028, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00024213075060532688, "grad_norm": 0.1711020261656612, "kl": 0.716796875, "learning_rate": 3e-06, "loss": 0.0027, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1266.6875, "completions/mean_terminated_length": 1266.6875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.00024818401937046, "grad_norm": 0.6184470607386672, "kl": 0.796875, "learning_rate": 3e-06, "loss": -0.0074, "num_tokens": 320743.0, "reward": 0.18172809481620789, "reward_std": 0.28321510553359985, "rewards/correct_answer_reward_func": 0.3125, "rewards/efficient_thinking_reward_func": 6.04838752746582, "rewards/format_reward_func": 0.987500011920929, "rewards/num_xml_reward_func": 1.3541666269302368, "rewards/tool_execution_reward_func": 0.875, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0002542372881355932, "grad_norm": 0.4662817750620152, "kl": 0.775390625, "learning_rate": 3e-06, "loss": -0.0075, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00026029055690072637, "grad_norm": 0.48985485884822744, "kl": 0.7978515625, "learning_rate": 3e-06, "loss": -0.0074, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00026634382566585957, "grad_norm": 0.5614378811652494, "kl": 0.83984375, "learning_rate": 3e-06, "loss": -0.0073, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3370.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 1417.3125, "completions/mean_terminated_length": 1619.7857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 1053.0, "epoch": 0.0002723970944309927, "grad_norm": 57880.63241282136, "kl": 1502.919921875, "learning_rate": 3e-06, "loss": 10.1932, "num_tokens": 359232.0, "reward": 0.06695549190044403, "reward_std": 0.18235576152801514, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 5.443548679351807, "rewards/format_reward_func": 0.987500011920929, "rewards/num_xml_reward_func": 1.1510416269302368, "rewards/tool_execution_reward_func": 1.1875, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0002784503631961259, "grad_norm": 2894.0307559416137, "kl": 157.4794921875, "learning_rate": 3e-06, "loss": 0.3165, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00028450363196125906, "grad_norm": 37.029243795631096, "kl": 3.7021484375, "learning_rate": 3e-06, "loss": 0.0144, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00029055690072639226, "grad_norm": 12.662485260083166, "kl": 2.4423828125, "learning_rate": 3e-06, "loss": 0.0108, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3108.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 1516.75, "completions/mean_terminated_length": 1516.75, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 0.0002966101694915254, "grad_norm": 7845.857745835188, "kl": 527.46875, "learning_rate": 3e-06, "loss": 1.7351, "num_tokens": 391080.0, "reward": 0.1431797295808792, "reward_std": 0.3620518743991852, "rewards/correct_answer_reward_func": 0.375, "rewards/efficient_thinking_reward_func": 6.04838752746582, "rewards/format_reward_func": 0.9916666746139526, "rewards/num_xml_reward_func": 1.34375, "rewards/tool_execution_reward_func": 1.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003026634382566586, "grad_norm": 705.0569148996761, "kl": 47.4765625, "learning_rate": 3e-06, "loss": 0.1643, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00030871670702179176, "grad_norm": 19.469174009613884, "kl": 5.75390625, "learning_rate": 3e-06, "loss": 0.0042, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00031476997578692496, "grad_norm": 4.187483088072952, "kl": 2.0078125, "learning_rate": 3e-06, "loss": -0.0101, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3277.0, "completions/max_terminated_length": 3277.0, "completions/mean_length": 1857.125, "completions/mean_terminated_length": 1857.125, "completions/min_length": 1220.0, "completions/min_terminated_length": 1220.0, "epoch": 0.0003208232445520581, "grad_norm": 15.598878965568742, "kl": 1.259765625, "learning_rate": 3e-06, "loss": 0.0128, "num_tokens": 428414.0, "reward": 0.06696485728025436, "reward_std": 0.18235208094120026, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 5.241935729980469, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.1979166269302368, "rewards/tool_execution_reward_func": 1.0625, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003268765133171913, "grad_norm": 0.46962880382282596, "kl": 0.845703125, "learning_rate": 3e-06, "loss": 0.0115, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00033292978208232445, "grad_norm": 0.3371767290491545, "kl": 0.734375, "learning_rate": 3e-06, "loss": 0.0111, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00033898305084745765, "grad_norm": 0.35169159593818344, "kl": 0.69140625, "learning_rate": 3e-06, "loss": 0.011, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 1546.0625, "completions/mean_terminated_length": 1546.0625, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 0.0003450363196125908, "grad_norm": 157.35744089074626, "kl": 16.322265625, "learning_rate": 3e-06, "loss": 0.0504, "num_tokens": 460731.0, "reward": 0.19058775901794434, "reward_std": 0.35528117418289185, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 5.443548202514648, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.2291666269302368, "rewards/tool_execution_reward_func": 1.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000351089588377724, "grad_norm": 190.19152750617562, "kl": 7.6640625, "learning_rate": 3e-06, "loss": 0.0339, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00035714285714285714, "grad_norm": 6.638740120478302, "kl": 1.9609375, "learning_rate": 3e-06, "loss": 0.0123, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003631961259079903, "grad_norm": 0.9919110476899615, "kl": 1.177734375, "learning_rate": 3e-06, "loss": 0.0097, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1400.375, "completions/mean_terminated_length": 1723.5384615384614, "completions/min_length": 0.0, "completions/min_terminated_length": 1168.0, "epoch": 0.0003692493946731235, "grad_norm": 13723.1434300017, "kl": 499.875, "learning_rate": 3e-06, "loss": 1.6878, "num_tokens": 503045.0, "reward": -0.17267484962940216, "reward_std": 0.3091583847999573, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 2.4193549156188965, "rewards/format_reward_func": 0.9662500023841858, "rewards/num_xml_reward_func": 0.5625, "rewards/tool_execution_reward_func": 1.125, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00037530266343825664, "grad_norm": 583.6315212975338, "kl": 59.34375, "learning_rate": 3e-06, "loss": 0.1944, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00038135593220338984, "grad_norm": 6.259462206668412, "kl": 1.6533203125, "learning_rate": 3e-06, "loss": 0.0194, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000387409200968523, "grad_norm": 1.0697409446207862, "kl": 0.7666015625, "learning_rate": 3e-06, "loss": 0.0167, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 1332.75, "completions/mean_terminated_length": 1421.6, "completions/min_length": 0.0, "completions/min_terminated_length": 664.0, "epoch": 0.0003934624697336562, "grad_norm": 0.8884534962640673, "kl": 1.0087890625, "learning_rate": 3e-06, "loss": -0.0275, "num_tokens": 536045.0, "reward": 0.12867216765880585, "reward_std": 0.4248371720314026, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 5.460975170135498, "rewards/format_reward_func": 0.9925000071525574, "rewards/num_xml_reward_func": 1.2291666269302368, "rewards/tool_execution_reward_func": 1.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00039951573849878933, "grad_norm": 0.8129440217914238, "kl": 0.91796875, "learning_rate": 3e-06, "loss": -0.0278, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00040556900726392253, "grad_norm": 2074.331071701581, "kl": 95.181640625, "learning_rate": 3e-06, "loss": 0.275, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0004116222760290557, "grad_norm": 25.42750541497437, "kl": 2.0732421875, "learning_rate": 3e-06, "loss": -0.0241, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1171.125, "completions/mean_terminated_length": 1441.3846153846155, "completions/min_length": 0.0, "completions/min_terminated_length": 800.0, "epoch": 0.0004176755447941889, "grad_norm": 3132.1136439947386, "kl": 258.822265625, "learning_rate": 3e-06, "loss": 0.993, "num_tokens": 574691.0, "reward": -0.17273107171058655, "reward_std": 0.3091248571872711, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 1.2329180240631104, "rewards/format_reward_func": 0.90625, "rewards/num_xml_reward_func": 0.28125, "rewards/tool_execution_reward_func": 0.9375, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000423728813559322, "grad_norm": 83.706286199725, "kl": 14.453125, "learning_rate": 3e-06, "loss": 0.0731, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0004297820823244552, "grad_norm": 13.647323217615291, "kl": 3.970703125, "learning_rate": 3e-06, "loss": 0.0323, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00043583535108958837, "grad_norm": 1.7482065120313235, "kl": 1.4990234375, "learning_rate": 3e-06, "loss": 0.0241, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 1056.0, "completions/mean_terminated_length": 1536.0, "completions/min_length": 0.0, "completions/min_terminated_length": 1002.0, "epoch": 0.00044188861985472157, "grad_norm": 0.8589483061045617, "kl": 1.400390625, "learning_rate": 3e-06, "loss": -0.0032, "num_tokens": 619647.0, "reward": 0.12374991923570633, "reward_std": 0.3325650691986084, "rewards/correct_answer_reward_func": 0.375, "rewards/efficient_thinking_reward_func": 3.02419376373291, "rewards/format_reward_func": 0.875, "rewards/num_xml_reward_func": 0.6666666269302368, "rewards/tool_execution_reward_func": 1.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0004479418886198547, "grad_norm": 0.6265176251259559, "kl": 1.064453125, "learning_rate": 3e-06, "loss": -0.004, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0004539951573849879, "grad_norm": 0.6501523287860633, "kl": 1.0078125, "learning_rate": 3e-06, "loss": -0.0042, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00046004842615012106, "grad_norm": 0.6557107326779114, "kl": 1.041015625, "learning_rate": 3e-06, "loss": -0.0041, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3914.0, "completions/max_terminated_length": 3914.0, "completions/mean_length": 1885.25, "completions/mean_terminated_length": 2154.5714285714284, "completions/min_length": 0.0, "completions/min_terminated_length": 1259.0, "epoch": 0.00046610169491525426, "grad_norm": 1426.697250240864, "kl": 71.76171875, "learning_rate": 3e-06, "loss": 0.3336, "num_tokens": 665623.0, "reward": -0.08629994839429855, "reward_std": 0.23610900342464447, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 2.0161290168762207, "rewards/format_reward_func": 0.9462500214576721, "rewards/num_xml_reward_func": 0.46875, "rewards/tool_execution_reward_func": 1.0625, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0004721549636803874, "grad_norm": 12.49919870676222, "kl": 1.5146484375, "learning_rate": 3e-06, "loss": 0.0013, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00047820823244552055, "grad_norm": 1.8191523325467, "kl": 0.966796875, "learning_rate": 3e-06, "loss": -0.0021, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00048426150121065375, "grad_norm": 0.8614783265852023, "kl": 0.779296875, "learning_rate": 3e-06, "loss": -0.003, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3240.0, "completions/max_terminated_length": 3240.0, "completions/mean_length": 1146.0625, "completions/mean_terminated_length": 1410.5384615384614, "completions/min_length": 0.0, "completions/min_terminated_length": 954.0, "epoch": 0.000490314769975787, "grad_norm": 1343141904879.566, "kl": 132607115273.9043, "learning_rate": 3e-06, "loss": 187695104.0, "num_tokens": 703828.0, "reward": -0.19236184656620026, "reward_std": 0.43547719717025757, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 2.1309096813201904, "rewards/format_reward_func": 0.9219642877578735, "rewards/num_xml_reward_func": 0.46875, "rewards/tool_execution_reward_func": 0.9375, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00049636803874092, "grad_norm": 7965471.976798021, "kl": 1048591.578125, "learning_rate": 3e-06, "loss": 1488.7067, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005024213075060532, "grad_norm": 16601.817266727197, "kl": 1132.98046875, "learning_rate": 3e-06, "loss": 2.5826, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005084745762711864, "grad_norm": 3.849730508254259, "kl": 1.3916015625, "learning_rate": 3e-06, "loss": 0.0188, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 1372.625, "completions/mean_terminated_length": 1568.7142857142858, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.0005145278450363196, "grad_norm": 75.5463129972844, "kl": 41.2880859375, "learning_rate": 3e-06, "loss": 0.0314, "num_tokens": 741602.0, "reward": -0.09613406658172607, "reward_std": 0.323569655418396, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 2.0161290168762207, "rewards/format_reward_func": 0.9337499737739563, "rewards/num_xml_reward_func": 0.46875, "rewards/tool_execution_reward_func": 1.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005205811138014527, "grad_norm": 8.361466445717037, "kl": 11.2041015625, "learning_rate": 3e-06, "loss": 0.0104, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005266343825665859, "grad_norm": 0.4858997056210773, "kl": 3.3115234375, "learning_rate": 3e-06, "loss": 0.0073, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005326876513317191, "grad_norm": 0.48251870840305755, "kl": 1.1220703125, "learning_rate": 3e-06, "loss": 0.0071, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 1475.9375, "completions/mean_terminated_length": 1475.9375, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.0005387409200968523, "grad_norm": 817.4364276672566, "kl": 11.91796875, "learning_rate": 3e-06, "loss": 0.0879, "num_tokens": 772797.0, "reward": -0.029312893748283386, "reward_std": 0.3894536793231964, "rewards/correct_answer_reward_func": 0.375, "rewards/efficient_thinking_reward_func": 4.233870983123779, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 0.9479166269302368, "rewards/tool_execution_reward_func": 1.125, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005447941888619854, "grad_norm": 3.3298619813321695, "kl": 1.474609375, "learning_rate": 3e-06, "loss": 0.0212, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005508474576271186, "grad_norm": 1.1274332258815973, "kl": 1.1123046875, "learning_rate": 3e-06, "loss": 0.0196, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005569007263922518, "grad_norm": 0.8850067409872743, "kl": 1.123046875, "learning_rate": 3e-06, "loss": 0.0196, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1189.5625, "completions/mean_terminated_length": 1359.5, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.000562953995157385, "grad_norm": 1.3395550909317473, "kl": 1.9501953125, "learning_rate": 3e-06, "loss": 0.0077, "num_tokens": 807642.0, "reward": -0.009627980180084705, "reward_std": 0.2253102958202362, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 4.435483932495117, "rewards/format_reward_func": 0.8824999928474426, "rewards/num_xml_reward_func": 1.03125, "rewards/tool_execution_reward_func": 0.9375, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005690072639225181, "grad_norm": 0.5682146184631082, "kl": 1.765625, "learning_rate": 3e-06, "loss": 0.0072, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005750605326876513, "grad_norm": 0.28055748126607594, "kl": 1.4423828125, "learning_rate": 3e-06, "loss": 0.0067, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005811138014527845, "grad_norm": 0.2679104384917441, "kl": 1.2880859375, "learning_rate": 3e-06, "loss": 0.0067, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 1182.8125, "completions/mean_terminated_length": 1351.7857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 942.0, "epoch": 0.0005871670702179177, "grad_norm": 74.74923285644503, "kl": 5.83984375, "learning_rate": 3e-06, "loss": 0.0258, "num_tokens": 842339.0, "reward": 0.15201856195926666, "reward_std": 0.4216999411582947, "rewards/correct_answer_reward_func": 0.5625, "rewards/efficient_thinking_reward_func": 5.040322303771973, "rewards/format_reward_func": 0.9916666746139526, "rewards/num_xml_reward_func": 1.1145833730697632, "rewards/tool_execution_reward_func": 1.1875, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005932203389830508, "grad_norm": 6.21548581436623, "kl": 2.208984375, "learning_rate": 3e-06, "loss": 0.0066, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000599273607748184, "grad_norm": 0.9670438333546869, "kl": 1.333984375, "learning_rate": 3e-06, "loss": 0.0034, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006053268765133172, "grad_norm": 0.9705895262826928, "kl": 1.177734375, "learning_rate": 3e-06, "loss": 0.0029, "step": 100 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "completions/clipped_ratio": 0.3125, "completions/max_length": 2850.0, "completions/max_terminated_length": 2850.0, "completions/mean_length": 1026.1875, "completions/mean_terminated_length": 1492.6363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 1078.0, "epoch": 0.0006113801452784503, "grad_norm": 0.34914402483189466, "kl": NaN, "learning_rate": 3e-06, "loss": 0.0038, "num_tokens": 886858.0, "reward": -0.009644638746976852, "reward_std": 0.22530952095985413, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 5.01161003112793, "rewards/format_reward_func": 0.949999988079071, "rewards/num_xml_reward_func": 0.9479166269302368, "rewards/tool_execution_reward_func": 1.125, "step": 101 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "epoch": 0.0006174334140435835, "grad_norm": 0.409729728396783, "kl": NaN, "learning_rate": 3e-06, "loss": 0.0039, "step": 102 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "epoch": 0.0006234866828087167, "grad_norm": 34.71662901286587, "kl": NaN, "learning_rate": 3e-06, "loss": 0.016, "step": 103 }, { "clip_ratio/high_max": NaN, "clip_ratio/high_mean": NaN, "clip_ratio/low_mean": NaN, "clip_ratio/low_min": NaN, "clip_ratio/region_mean": NaN, "epoch": 0.0006295399515738499, "grad_norm": 178.12705241502368, "kl": NaN, "learning_rate": 3e-06, "loss": 0.0662, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 1197.0625, "completions/mean_terminated_length": 1473.3076923076924, "completions/min_length": 0.0, "completions/min_terminated_length": 1011.0, "epoch": 0.000635593220338983, "grad_norm": 113656977492.3838, "kl": 4060086272.1398926, "learning_rate": 3e-06, "loss": 26552720.0, "num_tokens": 925879.0, "reward": 0.12848791480064392, "reward_std": 0.3391018807888031, "rewards/correct_answer_reward_func": 0.375, "rewards/efficient_thinking_reward_func": 6.138603687286377, "rewards/format_reward_func": 0.9183332920074463, "rewards/num_xml_reward_func": 1.1041667461395264, "rewards/tool_execution_reward_func": 1.0625, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006416464891041162, "grad_norm": 862090008.796202, "kl": 29097984.143554688, "learning_rate": 3e-06, "loss": 190154.0312, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006476997578692494, "grad_norm": 5971.671602768365, "kl": 225.146484375, "learning_rate": 3e-06, "loss": 1.4726, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006537530266343826, "grad_norm": 197.81456021441926, "kl": 10.714111328125, "learning_rate": 3e-06, "loss": 0.071, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 1018.3125, "completions/mean_terminated_length": 1481.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 920.0, "epoch": 0.0006598062953995157, "grad_norm": 211.32333218858383, "kl": 29.7578125, "learning_rate": 3e-06, "loss": 0.0762, "num_tokens": 970272.0, "reward": -0.05291781574487686, "reward_std": 0.282351553440094, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 4.623542785644531, "rewards/format_reward_func": 0.9093055725097656, "rewards/num_xml_reward_func": 0.5659722089767456, "rewards/tool_execution_reward_func": 1.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006658595641646489, "grad_norm": 47.690518846315975, "kl": 5.3515625, "learning_rate": 3e-06, "loss": 0.0217, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006719128329297821, "grad_norm": 3.1184302707951286, "kl": 1.0947265625, "learning_rate": 3e-06, "loss": 0.0071, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006779661016949153, "grad_norm": 0.5838887307984975, "kl": 0.3798828125, "learning_rate": 3e-06, "loss": 0.0055, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2388.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 976.4375, "completions/mean_terminated_length": 1301.9166666666667, "completions/min_length": 0.0, "completions/min_terminated_length": 522.0, "epoch": 0.0006840193704600484, "grad_norm": 2.279673569786722, "kl": 0.669921875, "learning_rate": 3e-06, "loss": 0.0027, "num_tokens": 1009859.0, "reward": -0.08624996989965439, "reward_std": 0.2361285239458084, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 5.102198123931885, "rewards/format_reward_func": 0.9054166674613953, "rewards/num_xml_reward_func": 0.71875, "rewards/tool_execution_reward_func": 0.90625, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006900726392251816, "grad_norm": 0.29363313071510083, "kl": 0.310546875, "learning_rate": 3e-06, "loss": 0.0017, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006961259079903148, "grad_norm": 0.25416521021234684, "kl": 0.2568359375, "learning_rate": 3e-06, "loss": 0.0015, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000702179176755448, "grad_norm": 0.2532709696883375, "kl": 0.25048828125, "learning_rate": 3e-06, "loss": 0.0015, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 825.1875, "completions/mean_terminated_length": 1320.3, "completions/min_length": 0.0, "completions/min_terminated_length": 265.0, "epoch": 0.0007082324455205811, "grad_norm": 0.06499596334850685, "kl": 0.195556640625, "learning_rate": 3e-06, "loss": 0.0004, "num_tokens": 1055258.0, "reward": 0.00015617192548234016, "reward_std": 0.000146350241266191, "rewards/correct_answer_reward_func": 0.0, "rewards/efficient_thinking_reward_func": 5.059846878051758, "rewards/format_reward_func": 0.8541666865348816, "rewards/num_xml_reward_func": 0.78125, "rewards/tool_execution_reward_func": 1.0625, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007142857142857143, "grad_norm": 0.12561554187034915, "kl": 0.206298828125, "learning_rate": 3e-06, "loss": 0.0004, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007203389830508475, "grad_norm": 0.03335194268511418, "kl": 0.185546875, "learning_rate": 3e-06, "loss": 0.0004, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007263922518159806, "grad_norm": 0.016335498310956653, "kl": 0.1824951171875, "learning_rate": 3e-06, "loss": 0.0004, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1208.6875, "completions/mean_terminated_length": 1758.090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 237.0, "epoch": 0.0007324455205811138, "grad_norm": 3074.215567652929, "kl": 87.255126953125, "learning_rate": 3e-06, "loss": 0.4728, "num_tokens": 1102657.0, "reward": 0.09965705871582031, "reward_std": 0.3296266794204712, "rewards/correct_answer_reward_func": 0.3125, "rewards/efficient_thinking_reward_func": 3.1649727821350098, "rewards/format_reward_func": 0.9187500476837158, "rewards/num_xml_reward_func": 0.5104166865348816, "rewards/tool_execution_reward_func": 1.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000738498789346247, "grad_norm": 179.50807324392667, "kl": 30.089599609375, "learning_rate": 3e-06, "loss": 0.1486, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007445520581113802, "grad_norm": 20.649024610652848, "kl": 3.32763671875, "learning_rate": 3e-06, "loss": 0.0155, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007506053268765133, "grad_norm": 2.0418855369195485, "kl": 0.590087890625, "learning_rate": 3e-06, "loss": -0.0023, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 844.125, "completions/mean_terminated_length": 1227.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 919.0, "epoch": 0.0007566585956416465, "grad_norm": 1.2107206967986066, "kl": 0.489501953125, "learning_rate": 3e-06, "loss": 0.0052, "num_tokens": 1144263.0, "reward": -0.043053146451711655, "reward_std": 0.1728256642818451, "rewards/correct_answer_reward_func": 0.0625, "rewards/efficient_thinking_reward_func": 3.937290668487549, "rewards/format_reward_func": 0.877500057220459, "rewards/num_xml_reward_func": 0.71875, "rewards/tool_execution_reward_func": 0.9166666865348816, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007627118644067797, "grad_norm": 0.3153399141642928, "kl": 0.267333984375, "learning_rate": 3e-06, "loss": 0.0046, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007687651331719129, "grad_norm": 0.30940853761801884, "kl": 0.25048828125, "learning_rate": 3e-06, "loss": 0.0046, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000774818401937046, "grad_norm": 0.41896360385565806, "kl": 0.291259765625, "learning_rate": 3e-06, "loss": 0.0048, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1019.8125, "completions/mean_terminated_length": 1255.1538461538462, "completions/min_length": 0.0, "completions/min_terminated_length": 915.0, "epoch": 0.0007808716707021792, "grad_norm": 3.046991653902829, "kl": 1.09619140625, "learning_rate": 3e-06, "loss": 0.0034, "num_tokens": 1180448.0, "reward": 0.023746376857161522, "reward_std": 0.263210654258728, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 5.699418067932129, "rewards/format_reward_func": 0.9104167222976685, "rewards/num_xml_reward_func": 1.089583396911621, "rewards/tool_execution_reward_func": 0.8541666865348816, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007869249394673124, "grad_norm": 0.6371118649367347, "kl": 0.56591796875, "learning_rate": 3e-06, "loss": 0.0022, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007929782082324456, "grad_norm": 0.7977652642661696, "kl": 0.475341796875, "learning_rate": 3e-06, "loss": 0.002, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007990314769975787, "grad_norm": 0.3954173262150925, "kl": 0.418212890625, "learning_rate": 3e-06, "loss": 0.0019, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2829.0, "completions/max_terminated_length": 2829.0, "completions/mean_length": 1284.6875, "completions/mean_terminated_length": 1581.1538461538462, "completions/min_length": 0.0, "completions/min_terminated_length": 946.0, "epoch": 0.0008050847457627119, "grad_norm": 0.16569380056333147, "kl": 0.203369140625, "learning_rate": 3e-06, "loss": 0.0001, "num_tokens": 1220839.0, "reward": 0.06701274961233139, "reward_std": 0.18233336508274078, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 8.0274658203125, "rewards/format_reward_func": 0.96833336353302, "rewards/num_xml_reward_func": 1.4375, "rewards/tool_execution_reward_func": 1.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008111380145278451, "grad_norm": 0.16664121027946213, "kl": 0.20068359375, "learning_rate": 3e-06, "loss": 0.0001, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008171912832929783, "grad_norm": 0.16072020293930506, "kl": 0.194091796875, "learning_rate": 3e-06, "loss": 0.0001, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008232445520581114, "grad_norm": 41.76166861124815, "kl": 1.373046875, "learning_rate": 3e-06, "loss": 0.0071, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3684.0, "completions/max_terminated_length": 3684.0, "completions/mean_length": 1599.125, "completions/mean_terminated_length": 1599.125, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.0008292978208232446, "grad_norm": 646644.6699956892, "kl": 49842.197265625, "learning_rate": 3e-06, "loss": 229.4684, "num_tokens": 1254025.0, "reward": 0.013887053355574608, "reward_std": 0.34718748927116394, "rewards/correct_answer_reward_func": 0.3125, "rewards/efficient_thinking_reward_func": 5.241935729980469, "rewards/format_reward_func": 0.9479166269302368, "rewards/num_xml_reward_func": 0.9635417461395264, "rewards/tool_execution_reward_func": 0.875, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008353510895883778, "grad_norm": 27876.102822783414, "kl": 2136.07666015625, "learning_rate": 3e-06, "loss": 12.8243, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008414043583535108, "grad_norm": 790.2979277928656, "kl": 118.16162109375, "learning_rate": 3e-06, "loss": 0.4985, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000847457627118644, "grad_norm": 47.030265324687, "kl": 7.62548828125, "learning_rate": 3e-06, "loss": 0.0346, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1098.375, "completions/mean_terminated_length": 1255.2857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 325.0, "epoch": 0.0008535108958837772, "grad_norm": 27163.40903493807, "kl": 1403.2744140625, "learning_rate": 3e-06, "loss": 7.8279, "num_tokens": 1287339.0, "reward": -0.00962173379957676, "reward_std": 0.2253105789422989, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 4.435483932495117, "rewards/format_reward_func": 0.9901785850524902, "rewards/num_xml_reward_func": 1.0625, "rewards/tool_execution_reward_func": 0.875, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008595641646489104, "grad_norm": 104.2213070575419, "kl": 5.6044921875, "learning_rate": 3e-06, "loss": 0.0309, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008656174334140435, "grad_norm": 5.517418949203676, "kl": 1.123046875, "learning_rate": 3e-06, "loss": 0.0054, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008716707021791767, "grad_norm": 0.5745631831412419, "kl": 0.42724609375, "learning_rate": 3e-06, "loss": 0.0033, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 1254.25, "completions/mean_terminated_length": 1337.8666666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 377.0, "epoch": 0.0008777239709443099, "grad_norm": 11.985045430350514, "kl": 1.40576171875, "learning_rate": 3e-06, "loss": -0.0054, "num_tokens": 1319103.0, "reward": 0.18167813122272491, "reward_std": 0.28324925899505615, "rewards/correct_answer_reward_func": 0.3125, "rewards/efficient_thinking_reward_func": 5.79470682144165, "rewards/format_reward_func": 0.9524999856948853, "rewards/num_xml_reward_func": 1.1041666269302368, "rewards/tool_execution_reward_func": 0.8125, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008837772397094431, "grad_norm": 0.7668036294329545, "kl": 0.444580078125, "learning_rate": 3e-06, "loss": -0.0087, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008898305084745762, "grad_norm": 0.3955664525228031, "kl": 0.291259765625, "learning_rate": 3e-06, "loss": -0.0093, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008958837772397094, "grad_norm": 0.3609851056156834, "kl": 0.26123046875, "learning_rate": 3e-06, "loss": -0.0094, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 717.125, "completions/mean_terminated_length": 1274.888888888889, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.0009019370460048426, "grad_norm": 0.10668119884135797, "kl": 0.25, "learning_rate": 3e-06, "loss": -0.0054, "num_tokens": 1366797.0, "reward": 0.03348555043339729, "reward_std": 0.13349804282188416, "rewards/correct_answer_reward_func": 0.0625, "rewards/efficient_thinking_reward_func": 4.775701522827148, "rewards/format_reward_func": 0.8970834016799927, "rewards/num_xml_reward_func": 0.6145833730697632, "rewards/tool_execution_reward_func": 0.800000011920929, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009079903147699758, "grad_norm": 0.07581171591865946, "kl": 0.21337890625, "learning_rate": 3e-06, "loss": -0.0055, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009140435835351089, "grad_norm": 0.07368344797048941, "kl": 0.2066650390625, "learning_rate": 3e-06, "loss": -0.0055, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009200968523002421, "grad_norm": 0.06383190574866472, "kl": 0.2005615234375, "learning_rate": 3e-06, "loss": -0.0055, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 1244.6875, "completions/mean_terminated_length": 1422.5, "completions/min_length": 0.0, "completions/min_terminated_length": 381.0, "epoch": 0.0009261501210653753, "grad_norm": 0.4120015190644857, "kl": 0.250732421875, "learning_rate": 3e-06, "loss": -0.0169, "num_tokens": 1402504.0, "reward": 0.07177050411701202, "reward_std": 0.3221079111099243, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 5.607255458831787, "rewards/format_reward_func": 0.9775000214576721, "rewards/num_xml_reward_func": 1.1770832538604736, "rewards/tool_execution_reward_func": 0.875, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009322033898305085, "grad_norm": 0.4391974629991036, "kl": 0.245361328125, "learning_rate": 3e-06, "loss": -0.0169, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009382566585956416, "grad_norm": 0.3734307496972298, "kl": 0.372314453125, "learning_rate": 3e-06, "loss": -0.0166, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009443099273607748, "grad_norm": 5.409050230753026, "kl": 1.83984375, "learning_rate": 3e-06, "loss": -0.0136, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2702.0, "completions/max_terminated_length": 2702.0, "completions/mean_length": 975.8125, "completions/mean_terminated_length": 1201.0, "completions/min_length": 0.0, "completions/min_terminated_length": 349.0, "epoch": 0.000950363196125908, "grad_norm": 0.7067985121705038, "kl": 0.344482421875, "learning_rate": 3e-06, "loss": -0.0048, "num_tokens": 1437953.0, "reward": 0.06698047369718552, "reward_std": 0.18234598636627197, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 5.846774578094482, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.2760416269302368, "rewards/tool_execution_reward_func": 0.9375, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009564164648910411, "grad_norm": 0.11769454310604478, "kl": 0.2841796875, "learning_rate": 3e-06, "loss": -0.005, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009624697336561743, "grad_norm": 0.10329568036332133, "kl": 0.286865234375, "learning_rate": 3e-06, "loss": -0.005, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009685230024213075, "grad_norm": 4.24103180505241, "kl": 0.876953125, "learning_rate": 3e-06, "loss": -0.0042, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 1132.625, "completions/mean_terminated_length": 1394.0, "completions/min_length": 0.0, "completions/min_terminated_length": 481.0, "epoch": 0.0009745762711864407, "grad_norm": 30399.081494084556, "kl": 3248.51220703125, "learning_rate": 3e-06, "loss": 11.4413, "num_tokens": 1475963.0, "reward": 0.0669461190700531, "reward_std": 0.18235941231250763, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 5.576177597045898, "rewards/format_reward_func": 0.9516667127609253, "rewards/num_xml_reward_func": 1.1041666269302368, "rewards/tool_execution_reward_func": 1.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000980629539951574, "grad_norm": 301.320864829815, "kl": 42.82861328125, "learning_rate": 3e-06, "loss": 0.1518, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009866828087167071, "grad_norm": 7.516991078245809, "kl": 2.22998046875, "learning_rate": 3e-06, "loss": 0.0084, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00099273607748184, "grad_norm": 0.9933703066829184, "kl": 0.6552734375, "learning_rate": 3e-06, "loss": 0.0029, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 1132.6875, "completions/mean_terminated_length": 1394.076923076923, "completions/min_length": 0.0, "completions/min_terminated_length": 1039.0, "epoch": 0.0009987893462469733, "grad_norm": 216588358.58769044, "kl": 14798848.96875, "learning_rate": 3e-06, "loss": 39560.0, "num_tokens": 1513922.0, "reward": 0.06703699380159378, "reward_std": 0.1823238879442215, "rewards/correct_answer_reward_func": 0.125, "rewards/efficient_thinking_reward_func": 8.870967864990234, "rewards/format_reward_func": 0.9958333373069763, "rewards/num_xml_reward_func": 1.5587797164916992, "rewards/tool_execution_reward_func": 1.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010048426150121065, "grad_norm": 20.66026628810014, "kl": 2.27978515625, "learning_rate": 3e-06, "loss": 0.0026, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010108958837772397, "grad_norm": 0.5852549123131363, "kl": 0.49462890625, "learning_rate": 3e-06, "loss": -0.0021, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001016949152542373, "grad_norm": 10.985658110563065, "kl": 1.380859375, "learning_rate": 3e-06, "loss": 0.0001, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2760.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 1299.5625, "completions/mean_terminated_length": 1599.4615384615386, "completions/min_length": 0.0, "completions/min_terminated_length": 995.0, "epoch": 0.001023002421307506, "grad_norm": 3.1833824255736856, "kl": 0.49853515625, "learning_rate": 3e-06, "loss": -0.0108, "num_tokens": 1554603.0, "reward": 0.1483658403158188, "reward_std": 0.27007395029067993, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 8.19877815246582, "rewards/format_reward_func": 0.9624999761581421, "rewards/num_xml_reward_func": 1.3562500476837158, "rewards/tool_execution_reward_func": 1.0625, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010290556900726393, "grad_norm": 0.43961018475161184, "kl": 0.41162109375, "learning_rate": 3e-06, "loss": -0.0113, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010351089588377725, "grad_norm": 1.4869647567424702, "kl": 0.330078125, "learning_rate": 3e-06, "loss": -0.0114, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010411622276029055, "grad_norm": 1.5593896451086025, "kl": 0.34423828125, "learning_rate": 3e-06, "loss": -0.0114, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 1120.4375, "completions/mean_terminated_length": 1493.9166666666667, "completions/min_length": 0.0, "completions/min_terminated_length": 1001.0, "epoch": 0.0010472154963680387, "grad_norm": 9.153186896120884, "kl": 1.18212890625, "learning_rate": 3e-06, "loss": -0.0004, "num_tokens": 1596462.0, "reward": 0.10031714290380478, "reward_std": 0.21521808207035065, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 5.926279067993164, "rewards/format_reward_func": 0.9650000333786011, "rewards/num_xml_reward_func": 1.1458332538604736, "rewards/tool_execution_reward_func": 1.0625, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010532687651331719, "grad_norm": 0.526258103973936, "kl": 0.560546875, "learning_rate": 3e-06, "loss": -0.0026, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001059322033898305, "grad_norm": 0.14211436005078248, "kl": 0.45556640625, "learning_rate": 3e-06, "loss": -0.0029, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010653753026634383, "grad_norm": 0.14305060184518628, "kl": 0.46484375, "learning_rate": 3e-06, "loss": -0.0028, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3333.0, "completions/max_terminated_length": 3333.0, "completions/mean_length": 1780.0, "completions/mean_terminated_length": 1780.0, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "epoch": 0.0010714285714285715, "grad_norm": 0.7121759134327387, "kl": 0.4970703125, "learning_rate": 3e-06, "loss": 0.0185, "num_tokens": 1632542.0, "reward": 0.05717029049992561, "reward_std": 0.29225456714630127, "rewards/correct_answer_reward_func": 0.25, "rewards/efficient_thinking_reward_func": 7.056451797485352, "rewards/format_reward_func": 0.9625000357627869, "rewards/num_xml_reward_func": 1.3958333730697632, "rewards/tool_execution_reward_func": 1.0625, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010774818401937047, "grad_norm": 0.511035937325882, "kl": 0.3818359375, "learning_rate": 3e-06, "loss": 0.0179, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010835351089588377, "grad_norm": 118.18849489599667, "kl": 6.0556640625, "learning_rate": 3e-06, "loss": 0.0429, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010895883777239709, "grad_norm": 123.64288191458726, "kl": 3.88916015625, "learning_rate": 3e-06, "loss": 0.0334, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1067.25, "completions/mean_terminated_length": 1423.0, "completions/min_length": 0.0, "completions/min_terminated_length": 1119.0, "epoch": 0.001095641646489104, "grad_norm": 0.44662061313508766, "kl": 0.8115234375, "learning_rate": 3e-06, "loss": -0.0038, "num_tokens": 1673550.0, "reward": 0.10039953887462616, "reward_std": 0.21517714858055115, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 9.27419376373291, "rewards/format_reward_func": 0.9892857074737549, "rewards/num_xml_reward_func": 1.5580357313156128, "rewards/tool_execution_reward_func": 0.9791666865348816, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011016949152542373, "grad_norm": 0.19064814713811837, "kl": 0.673828125, "learning_rate": 3e-06, "loss": -0.0041, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011077481840193705, "grad_norm": 0.12530575938029517, "kl": 0.578125, "learning_rate": 3e-06, "loss": -0.0044, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011138014527845037, "grad_norm": 0.14053997598107024, "kl": 0.53515625, "learning_rate": 3e-06, "loss": -0.0045, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 1584.5625, "completions/mean_terminated_length": 1690.2, "completions/min_length": 0.0, "completions/min_terminated_length": 1083.0, "epoch": 0.0011198547215496369, "grad_norm": 0.36922212655466874, "kl": 0.328125, "learning_rate": 3e-06, "loss": -0.0082, "num_tokens": 1710599.0, "reward": 0.26312950253486633, "reward_std": 0.31617864966392517, "rewards/correct_answer_reward_func": 0.4375, "rewards/efficient_thinking_reward_func": 8.467741966247559, "rewards/format_reward_func": 0.9862499833106995, "rewards/num_xml_reward_func": 1.5145833492279053, "rewards/tool_execution_reward_func": 1.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00112590799031477, "grad_norm": 0.35037508840939274, "kl": 0.3310546875, "learning_rate": 3e-06, "loss": -0.0082, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001131961259079903, "grad_norm": 0.33082477958916895, "kl": 0.35009765625, "learning_rate": 3e-06, "loss": -0.0081, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011380145278450363, "grad_norm": 0.345834479974622, "kl": 0.3896484375, "learning_rate": 3e-06, "loss": -0.0079, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1041.8125, "completions/mean_terminated_length": 1389.0833333333333, "completions/min_length": 0.0, "completions/min_terminated_length": 1031.0, "epoch": 0.0011440677966101695, "grad_norm": 35.33227872117956, "kl": 6.01953125, "learning_rate": 3e-06, "loss": 0.0108, "num_tokens": 1751200.0, "reward": 0.10040043294429779, "reward_std": 0.21517671644687653, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 8.064516067504883, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.5625, "rewards/tool_execution_reward_func": 1.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011501210653753027, "grad_norm": 0.835012686734699, "kl": 0.9775390625, "learning_rate": 3e-06, "loss": -0.0043, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011561743341404359, "grad_norm": 0.2586421230474391, "kl": 0.8486328125, "learning_rate": 3e-06, "loss": -0.0047, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001162227602905569, "grad_norm": 0.1651807846836529, "kl": 0.7734375, "learning_rate": 3e-06, "loss": -0.0049, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4084.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1537.375, "completions/mean_terminated_length": 1639.8666666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 1045.0, "epoch": 0.0011682808716707023, "grad_norm": 243.49244406063949, "kl": 5.927734375, "learning_rate": 3e-06, "loss": 0.0565, "num_tokens": 1787494.0, "reward": 0.23383872210979462, "reward_std": 0.2734927237033844, "rewards/correct_answer_reward_func": 0.4375, "rewards/efficient_thinking_reward_func": 6.451612949371338, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.5, "rewards/tool_execution_reward_func": 1.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011743341404358355, "grad_norm": 0.6933472630429098, "kl": 0.685546875, "learning_rate": 3e-06, "loss": 0.0218, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011803874092009684, "grad_norm": 0.38422473996938145, "kl": 0.61328125, "learning_rate": 3e-06, "loss": 0.0216, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011864406779661016, "grad_norm": 0.33054590530199923, "kl": 0.6220703125, "learning_rate": 3e-06, "loss": 0.0216, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3909.0, "completions/max_terminated_length": 3909.0, "completions/mean_length": 1388.375, "completions/mean_terminated_length": 1708.7692307692307, "completions/min_length": 0.0, "completions/min_terminated_length": 1148.0, "epoch": 0.0011924939467312348, "grad_norm": 188.57473992181284, "kl": 14.59765625, "learning_rate": 3e-06, "loss": 0.0927, "num_tokens": 1829544.0, "reward": 0.10040343552827835, "reward_std": 0.21517521142959595, "rewards/correct_answer_reward_func": 0.1875, "rewards/efficient_thinking_reward_func": 8.870967864990234, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.5775296688079834, "rewards/tool_execution_reward_func": 1.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001198547215496368, "grad_norm": 1.773591133545115, "kl": 1.154296875, "learning_rate": 3e-06, "loss": 0.0063, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012046004842615012, "grad_norm": 0.360837942568022, "kl": 0.9296875, "learning_rate": 3e-06, "loss": 0.0049, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012106537530266344, "grad_norm": 0.21716087780595142, "kl": 0.8203125, "learning_rate": 3e-06, "loss": 0.0044, "step": 200 } ], "logging_steps": 1, "max_steps": 2000, "num_input_tokens_seen": 1829544, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }