|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.0012106537530266344, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 3274.0, |
|
"completions/max_terminated_length": 3274.0, |
|
"completions/mean_length": 1305.6875, |
|
"completions/mean_terminated_length": 1305.6875, |
|
"completions/min_length": 359.0, |
|
"completions/min_terminated_length": 359.0, |
|
"epoch": 6.053268765133172e-06, |
|
"grad_norm": 0.7806517512814045, |
|
"kl": 0.0191497802734375, |
|
"learning_rate": 0.0, |
|
"loss": -0.0018, |
|
"num_tokens": 28431.0, |
|
"reward": 0.10032547265291214, |
|
"reward_std": 0.21521395444869995, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 5.08148193359375, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.1875, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 1.2106537530266343e-05, |
|
"grad_norm": 0.779905050857545, |
|
"kl": 0.0191497802734375, |
|
"learning_rate": 1.5000000000000002e-07, |
|
"loss": -0.0018, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 1.8159806295399516e-05, |
|
"grad_norm": 0.775895943887529, |
|
"kl": 0.0205230712890625, |
|
"learning_rate": 3.0000000000000004e-07, |
|
"loss": -0.0018, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 2.4213075060532686e-05, |
|
"grad_norm": 0.7054717897002856, |
|
"kl": 0.0189208984375, |
|
"learning_rate": 4.5e-07, |
|
"loss": -0.0018, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 3197.0, |
|
"completions/max_terminated_length": 3197.0, |
|
"completions/mean_length": 1387.5625, |
|
"completions/mean_terminated_length": 1480.0666666666666, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 301.0, |
|
"epoch": 3.026634382566586e-05, |
|
"grad_norm": 2.8398488181877535, |
|
"kl": 0.0939483642578125, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 0.0005, |
|
"num_tokens": 62348.0, |
|
"reward": 0.0669252872467041, |
|
"reward_std": 0.18236754834651947, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 4.3189802169799805, |
|
"rewards/format_reward_func": 0.9474999904632568, |
|
"rewards/num_xml_reward_func": 1.0, |
|
"rewards/tool_execution_reward_func": 0.875, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 3.631961259079903e-05, |
|
"grad_norm": 0.6955667618263449, |
|
"kl": 0.013031005859375, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.0002, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 4.2372881355932206e-05, |
|
"grad_norm": 0.6839471972703409, |
|
"kl": 0.0131378173828125, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0003, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 4.842615012106537e-05, |
|
"grad_norm": 0.6825085562302083, |
|
"kl": 0.0136871337890625, |
|
"learning_rate": 1.05e-06, |
|
"loss": 0.0003, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 1535.0, |
|
"completions/max_terminated_length": 1535.0, |
|
"completions/mean_length": 1288.375, |
|
"completions/mean_terminated_length": 1288.375, |
|
"completions/min_length": 501.0, |
|
"completions/min_terminated_length": 501.0, |
|
"epoch": 5.4479418886198546e-05, |
|
"grad_norm": 2.1585776860330728, |
|
"kl": 0.01605224609375, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.0002, |
|
"num_tokens": 90542.0, |
|
"reward": 0.004091441631317139, |
|
"reward_std": 0.4142349064350128, |
|
"rewards/correct_answer_reward_func": 0.4375, |
|
"rewards/efficient_thinking_reward_func": 4.875295639038086, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.15625, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 6.053268765133172e-05, |
|
"grad_norm": 2.086709029627379, |
|
"kl": 0.030364990234375, |
|
"learning_rate": 1.35e-06, |
|
"loss": 0.0003, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 6.658595641646489e-05, |
|
"grad_norm": 2.055694134933805, |
|
"kl": 0.035125732421875, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0003, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 7.263922518159807e-05, |
|
"grad_norm": 5.214306021603971, |
|
"kl": 0.24627685546875, |
|
"learning_rate": 1.65e-06, |
|
"loss": 0.0009, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 3169.0, |
|
"completions/max_terminated_length": 3169.0, |
|
"completions/mean_length": 1584.5625, |
|
"completions/mean_terminated_length": 1584.5625, |
|
"completions/min_length": 1048.0, |
|
"completions/min_terminated_length": 1048.0, |
|
"epoch": 7.869249394673124e-05, |
|
"grad_norm": 3.0602846246870006, |
|
"kl": 0.2955322265625, |
|
"learning_rate": 1.8e-06, |
|
"loss": 0.0012, |
|
"num_tokens": 123515.0, |
|
"reward": 0.00020614694221876562, |
|
"reward_std": 0.00014354230370372534, |
|
"rewards/correct_answer_reward_func": 0.0, |
|
"rewards/efficient_thinking_reward_func": 4.435483932495117, |
|
"rewards/format_reward_func": 0.9916666746139526, |
|
"rewards/num_xml_reward_func": 1.03125, |
|
"rewards/tool_execution_reward_func": 1.125, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 8.474576271186441e-05, |
|
"grad_norm": 3.24773890393597, |
|
"kl": 0.262451171875, |
|
"learning_rate": 1.95e-06, |
|
"loss": 0.0012, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 9.079903147699757e-05, |
|
"grad_norm": 1.0320633234812377, |
|
"kl": 0.21142578125, |
|
"learning_rate": 2.1e-06, |
|
"loss": 0.0009, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 9.685230024213075e-05, |
|
"grad_norm": 0.6955273697626486, |
|
"kl": 0.19775390625, |
|
"learning_rate": 2.25e-06, |
|
"loss": 0.0008, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2131.0, |
|
"completions/max_terminated_length": 2131.0, |
|
"completions/mean_length": 1081.8125, |
|
"completions/mean_terminated_length": 1153.9333333333334, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 279.0, |
|
"epoch": 0.00010290556900726392, |
|
"grad_norm": 1.383354124641088, |
|
"kl": 0.18017578125, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.0065, |
|
"num_tokens": 152500.0, |
|
"reward": 0.14100381731987, |
|
"reward_std": 0.44302019476890564, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 5.360107898712158, |
|
"rewards/format_reward_func": 0.9816666841506958, |
|
"rewards/num_xml_reward_func": 1.0458333492279053, |
|
"rewards/tool_execution_reward_func": 0.75, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00010895883777239709, |
|
"grad_norm": 1.370303518112428, |
|
"kl": 0.256591796875, |
|
"learning_rate": 2.55e-06, |
|
"loss": 0.0067, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00011501210653753027, |
|
"grad_norm": 1.3030395065430471, |
|
"kl": 0.32666015625, |
|
"learning_rate": 2.7e-06, |
|
"loss": 0.0069, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00012106537530266344, |
|
"grad_norm": 1.5799075174968453, |
|
"kl": 0.5322265625, |
|
"learning_rate": 2.85e-06, |
|
"loss": 0.0075, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2074.0, |
|
"completions/max_terminated_length": 2074.0, |
|
"completions/mean_length": 1281.9375, |
|
"completions/mean_terminated_length": 1367.4, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 420.0, |
|
"epoch": 0.0001271186440677966, |
|
"grad_norm": 38.55336802862083, |
|
"kl": 2.150390625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0073, |
|
"num_tokens": 184727.0, |
|
"reward": 0.03352511301636696, |
|
"reward_std": 0.1334875077009201, |
|
"rewards/correct_answer_reward_func": 0.0625, |
|
"rewards/efficient_thinking_reward_func": 3.6303672790527344, |
|
"rewards/format_reward_func": 0.9537500143051147, |
|
"rewards/num_xml_reward_func": 0.8125, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00013317191283292979, |
|
"grad_norm": 10.045688461429338, |
|
"kl": 0.865234375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0025, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00013922518159806296, |
|
"grad_norm": 2.9644172349588827, |
|
"kl": 0.8720703125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0022, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00014527845036319613, |
|
"grad_norm": 2.3601465047763033, |
|
"kl": 0.6787109375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0017, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 1914.0, |
|
"completions/max_terminated_length": 1914.0, |
|
"completions/mean_length": 1091.0625, |
|
"completions/mean_terminated_length": 1091.0625, |
|
"completions/min_length": 227.0, |
|
"completions/min_terminated_length": 227.0, |
|
"epoch": 0.0001513317191283293, |
|
"grad_norm": 9.206773673324696, |
|
"kl": 1.31640625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0148, |
|
"num_tokens": 209764.0, |
|
"reward": -0.007718075066804886, |
|
"reward_std": 0.3225381076335907, |
|
"rewards/correct_answer_reward_func": 0.3125, |
|
"rewards/efficient_thinking_reward_func": 4.085506439208984, |
|
"rewards/format_reward_func": 0.9916666746139526, |
|
"rewards/num_xml_reward_func": 0.9270833730697632, |
|
"rewards/tool_execution_reward_func": 0.6875, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00015738498789346248, |
|
"grad_norm": 241.05237266393604, |
|
"kl": 1.84228515625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0166, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00016343825665859565, |
|
"grad_norm": 1.2667482118640077, |
|
"kl": 0.6767578125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0123, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00016949152542372882, |
|
"grad_norm": 1.066626868681045, |
|
"kl": 0.572265625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0119, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 2413.0, |
|
"completions/max_terminated_length": 2413.0, |
|
"completions/mean_length": 1022.5625, |
|
"completions/mean_terminated_length": 1022.5625, |
|
"completions/min_length": 13.0, |
|
"completions/min_terminated_length": 13.0, |
|
"epoch": 0.000175544794188862, |
|
"grad_norm": 3.376000196555004, |
|
"kl": 0.90673828125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0045, |
|
"num_tokens": 233745.0, |
|
"reward": 0.06693361699581146, |
|
"reward_std": 0.18236428499221802, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 4.435483932495117, |
|
"rewards/format_reward_func": 0.8999999761581421, |
|
"rewards/num_xml_reward_func": 1.0416666269302368, |
|
"rewards/tool_execution_reward_func": 0.6875, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00018159806295399514, |
|
"grad_norm": 0.3430397854754984, |
|
"kl": 0.67919921875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0051, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00018765133171912832, |
|
"grad_norm": 0.2377290664264501, |
|
"kl": 0.70751953125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0051, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0001937046004842615, |
|
"grad_norm": 0.2344163560775934, |
|
"kl": 0.751953125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0051, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 2072.0, |
|
"completions/max_terminated_length": 2072.0, |
|
"completions/mean_length": 1032.0, |
|
"completions/mean_terminated_length": 1032.0, |
|
"completions/min_length": 355.0, |
|
"completions/min_terminated_length": 355.0, |
|
"epoch": 0.00019975786924939466, |
|
"grad_norm": 9.098443242784603, |
|
"kl": 1.21142578125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0035, |
|
"num_tokens": 257837.0, |
|
"reward": 0.13366317749023438, |
|
"reward_std": 0.23877617716789246, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 4.384251594543457, |
|
"rewards/format_reward_func": 0.987500011920929, |
|
"rewards/num_xml_reward_func": 1.0625, |
|
"rewards/tool_execution_reward_func": 0.625, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00020581113801452784, |
|
"grad_norm": 2.3617131203251613, |
|
"kl": 0.775390625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0014, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000211864406779661, |
|
"grad_norm": 0.47446261418389957, |
|
"kl": 0.59765625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0006, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00021791767554479418, |
|
"grad_norm": 0.34925057861597, |
|
"kl": 0.564453125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0004, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2844.0, |
|
"completions/max_terminated_length": 2844.0, |
|
"completions/mean_length": 1202.9375, |
|
"completions/mean_terminated_length": 1374.7857142857142, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 377.0, |
|
"epoch": 0.00022397094430992736, |
|
"grad_norm": 7.330956289365437, |
|
"kl": 1.2607421875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0044, |
|
"num_tokens": 292896.0, |
|
"reward": 0.03360215947031975, |
|
"reward_std": 0.1334669440984726, |
|
"rewards/correct_answer_reward_func": 0.0625, |
|
"rewards/efficient_thinking_reward_func": 5.241935729980469, |
|
"rewards/format_reward_func": 0.9662500023841858, |
|
"rewards/num_xml_reward_func": 1.1979166269302368, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00023002421307506053, |
|
"grad_norm": 0.549579766985337, |
|
"kl": 0.7529296875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0029, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0002360774818401937, |
|
"grad_norm": 0.16634021782187236, |
|
"kl": 0.7275390625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0028, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00024213075060532688, |
|
"grad_norm": 0.1711020261656612, |
|
"kl": 0.716796875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0027, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 2655.0, |
|
"completions/max_terminated_length": 2655.0, |
|
"completions/mean_length": 1266.6875, |
|
"completions/mean_terminated_length": 1266.6875, |
|
"completions/min_length": 403.0, |
|
"completions/min_terminated_length": 403.0, |
|
"epoch": 0.00024818401937046, |
|
"grad_norm": 0.6184470607386672, |
|
"kl": 0.796875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0074, |
|
"num_tokens": 320743.0, |
|
"reward": 0.18172809481620789, |
|
"reward_std": 0.28321510553359985, |
|
"rewards/correct_answer_reward_func": 0.3125, |
|
"rewards/efficient_thinking_reward_func": 6.04838752746582, |
|
"rewards/format_reward_func": 0.987500011920929, |
|
"rewards/num_xml_reward_func": 1.3541666269302368, |
|
"rewards/tool_execution_reward_func": 0.875, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0002542372881355932, |
|
"grad_norm": 0.4662817750620152, |
|
"kl": 0.775390625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0075, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00026029055690072637, |
|
"grad_norm": 0.48985485884822744, |
|
"kl": 0.7978515625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0074, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00026634382566585957, |
|
"grad_norm": 0.5614378811652494, |
|
"kl": 0.83984375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0073, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 3370.0, |
|
"completions/max_terminated_length": 3370.0, |
|
"completions/mean_length": 1417.3125, |
|
"completions/mean_terminated_length": 1619.7857142857142, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1053.0, |
|
"epoch": 0.0002723970944309927, |
|
"grad_norm": 57880.63241282136, |
|
"kl": 1502.919921875, |
|
"learning_rate": 3e-06, |
|
"loss": 10.1932, |
|
"num_tokens": 359232.0, |
|
"reward": 0.06695549190044403, |
|
"reward_std": 0.18235576152801514, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 5.443548679351807, |
|
"rewards/format_reward_func": 0.987500011920929, |
|
"rewards/num_xml_reward_func": 1.1510416269302368, |
|
"rewards/tool_execution_reward_func": 1.1875, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0002784503631961259, |
|
"grad_norm": 2894.0307559416137, |
|
"kl": 157.4794921875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.3165, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00028450363196125906, |
|
"grad_norm": 37.029243795631096, |
|
"kl": 3.7021484375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0144, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00029055690072639226, |
|
"grad_norm": 12.662485260083166, |
|
"kl": 2.4423828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0108, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 3108.0, |
|
"completions/max_terminated_length": 3108.0, |
|
"completions/mean_length": 1516.75, |
|
"completions/mean_terminated_length": 1516.75, |
|
"completions/min_length": 1020.0, |
|
"completions/min_terminated_length": 1020.0, |
|
"epoch": 0.0002966101694915254, |
|
"grad_norm": 7845.857745835188, |
|
"kl": 527.46875, |
|
"learning_rate": 3e-06, |
|
"loss": 1.7351, |
|
"num_tokens": 391080.0, |
|
"reward": 0.1431797295808792, |
|
"reward_std": 0.3620518743991852, |
|
"rewards/correct_answer_reward_func": 0.375, |
|
"rewards/efficient_thinking_reward_func": 6.04838752746582, |
|
"rewards/format_reward_func": 0.9916666746139526, |
|
"rewards/num_xml_reward_func": 1.34375, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003026634382566586, |
|
"grad_norm": 705.0569148996761, |
|
"kl": 47.4765625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.1643, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00030871670702179176, |
|
"grad_norm": 19.469174009613884, |
|
"kl": 5.75390625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0042, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00031476997578692496, |
|
"grad_norm": 4.187483088072952, |
|
"kl": 2.0078125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0101, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 3277.0, |
|
"completions/max_terminated_length": 3277.0, |
|
"completions/mean_length": 1857.125, |
|
"completions/mean_terminated_length": 1857.125, |
|
"completions/min_length": 1220.0, |
|
"completions/min_terminated_length": 1220.0, |
|
"epoch": 0.0003208232445520581, |
|
"grad_norm": 15.598878965568742, |
|
"kl": 1.259765625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0128, |
|
"num_tokens": 428414.0, |
|
"reward": 0.06696485728025436, |
|
"reward_std": 0.18235208094120026, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 5.241935729980469, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.1979166269302368, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003268765133171913, |
|
"grad_norm": 0.46962880382282596, |
|
"kl": 0.845703125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0115, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00033292978208232445, |
|
"grad_norm": 0.3371767290491545, |
|
"kl": 0.734375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0111, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00033898305084745765, |
|
"grad_norm": 0.35169159593818344, |
|
"kl": 0.69140625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.011, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 2908.0, |
|
"completions/max_terminated_length": 2908.0, |
|
"completions/mean_length": 1546.0625, |
|
"completions/mean_terminated_length": 1546.0625, |
|
"completions/min_length": 1002.0, |
|
"completions/min_terminated_length": 1002.0, |
|
"epoch": 0.0003450363196125908, |
|
"grad_norm": 157.35744089074626, |
|
"kl": 16.322265625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0504, |
|
"num_tokens": 460731.0, |
|
"reward": 0.19058775901794434, |
|
"reward_std": 0.35528117418289185, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 5.443548202514648, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.2291666269302368, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000351089588377724, |
|
"grad_norm": 190.19152750617562, |
|
"kl": 7.6640625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0339, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00035714285714285714, |
|
"grad_norm": 6.638740120478302, |
|
"kl": 1.9609375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0123, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003631961259079903, |
|
"grad_norm": 0.9919110476899615, |
|
"kl": 1.177734375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0097, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 3020.0, |
|
"completions/max_terminated_length": 3020.0, |
|
"completions/mean_length": 1400.375, |
|
"completions/mean_terminated_length": 1723.5384615384614, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1168.0, |
|
"epoch": 0.0003692493946731235, |
|
"grad_norm": 13723.1434300017, |
|
"kl": 499.875, |
|
"learning_rate": 3e-06, |
|
"loss": 1.6878, |
|
"num_tokens": 503045.0, |
|
"reward": -0.17267484962940216, |
|
"reward_std": 0.3091583847999573, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 2.4193549156188965, |
|
"rewards/format_reward_func": 0.9662500023841858, |
|
"rewards/num_xml_reward_func": 0.5625, |
|
"rewards/tool_execution_reward_func": 1.125, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00037530266343825664, |
|
"grad_norm": 583.6315212975338, |
|
"kl": 59.34375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.1944, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00038135593220338984, |
|
"grad_norm": 6.259462206668412, |
|
"kl": 1.6533203125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0194, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000387409200968523, |
|
"grad_norm": 1.0697409446207862, |
|
"kl": 0.7666015625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0167, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2703.0, |
|
"completions/max_terminated_length": 2703.0, |
|
"completions/mean_length": 1332.75, |
|
"completions/mean_terminated_length": 1421.6, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 664.0, |
|
"epoch": 0.0003934624697336562, |
|
"grad_norm": 0.8884534962640673, |
|
"kl": 1.0087890625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0275, |
|
"num_tokens": 536045.0, |
|
"reward": 0.12867216765880585, |
|
"reward_std": 0.4248371720314026, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 5.460975170135498, |
|
"rewards/format_reward_func": 0.9925000071525574, |
|
"rewards/num_xml_reward_func": 1.2291666269302368, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00039951573849878933, |
|
"grad_norm": 0.8129440217914238, |
|
"kl": 0.91796875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0278, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00040556900726392253, |
|
"grad_norm": 2074.331071701581, |
|
"kl": 95.181640625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.275, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0004116222760290557, |
|
"grad_norm": 25.42750541497437, |
|
"kl": 2.0732421875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0241, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 1718.0, |
|
"completions/max_terminated_length": 1718.0, |
|
"completions/mean_length": 1171.125, |
|
"completions/mean_terminated_length": 1441.3846153846155, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 800.0, |
|
"epoch": 0.0004176755447941889, |
|
"grad_norm": 3132.1136439947386, |
|
"kl": 258.822265625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.993, |
|
"num_tokens": 574691.0, |
|
"reward": -0.17273107171058655, |
|
"reward_std": 0.3091248571872711, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 1.2329180240631104, |
|
"rewards/format_reward_func": 0.90625, |
|
"rewards/num_xml_reward_func": 0.28125, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000423728813559322, |
|
"grad_norm": 83.706286199725, |
|
"kl": 14.453125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0731, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0004297820823244552, |
|
"grad_norm": 13.647323217615291, |
|
"kl": 3.970703125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0323, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00043583535108958837, |
|
"grad_norm": 1.7482065120313235, |
|
"kl": 1.4990234375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0241, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 2964.0, |
|
"completions/max_terminated_length": 2964.0, |
|
"completions/mean_length": 1056.0, |
|
"completions/mean_terminated_length": 1536.0, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1002.0, |
|
"epoch": 0.00044188861985472157, |
|
"grad_norm": 0.8589483061045617, |
|
"kl": 1.400390625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0032, |
|
"num_tokens": 619647.0, |
|
"reward": 0.12374991923570633, |
|
"reward_std": 0.3325650691986084, |
|
"rewards/correct_answer_reward_func": 0.375, |
|
"rewards/efficient_thinking_reward_func": 3.02419376373291, |
|
"rewards/format_reward_func": 0.875, |
|
"rewards/num_xml_reward_func": 0.6666666269302368, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0004479418886198547, |
|
"grad_norm": 0.6265176251259559, |
|
"kl": 1.064453125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.004, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0004539951573849879, |
|
"grad_norm": 0.6501523287860633, |
|
"kl": 1.0078125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0042, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00046004842615012106, |
|
"grad_norm": 0.6557107326779114, |
|
"kl": 1.041015625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0041, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 3914.0, |
|
"completions/max_terminated_length": 3914.0, |
|
"completions/mean_length": 1885.25, |
|
"completions/mean_terminated_length": 2154.5714285714284, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1259.0, |
|
"epoch": 0.00046610169491525426, |
|
"grad_norm": 1426.697250240864, |
|
"kl": 71.76171875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.3336, |
|
"num_tokens": 665623.0, |
|
"reward": -0.08629994839429855, |
|
"reward_std": 0.23610900342464447, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 2.0161290168762207, |
|
"rewards/format_reward_func": 0.9462500214576721, |
|
"rewards/num_xml_reward_func": 0.46875, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0004721549636803874, |
|
"grad_norm": 12.49919870676222, |
|
"kl": 1.5146484375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0013, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00047820823244552055, |
|
"grad_norm": 1.8191523325467, |
|
"kl": 0.966796875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0021, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00048426150121065375, |
|
"grad_norm": 0.8614783265852023, |
|
"kl": 0.779296875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.003, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 3240.0, |
|
"completions/max_terminated_length": 3240.0, |
|
"completions/mean_length": 1146.0625, |
|
"completions/mean_terminated_length": 1410.5384615384614, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 954.0, |
|
"epoch": 0.000490314769975787, |
|
"grad_norm": 1343141904879.566, |
|
"kl": 132607115273.9043, |
|
"learning_rate": 3e-06, |
|
"loss": 187695104.0, |
|
"num_tokens": 703828.0, |
|
"reward": -0.19236184656620026, |
|
"reward_std": 0.43547719717025757, |
|
"rewards/correct_answer_reward_func": 0.5, |
|
"rewards/efficient_thinking_reward_func": 2.1309096813201904, |
|
"rewards/format_reward_func": 0.9219642877578735, |
|
"rewards/num_xml_reward_func": 0.46875, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00049636803874092, |
|
"grad_norm": 7965471.976798021, |
|
"kl": 1048591.578125, |
|
"learning_rate": 3e-06, |
|
"loss": 1488.7067, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005024213075060532, |
|
"grad_norm": 16601.817266727197, |
|
"kl": 1132.98046875, |
|
"learning_rate": 3e-06, |
|
"loss": 2.5826, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005084745762711864, |
|
"grad_norm": 3.849730508254259, |
|
"kl": 1.3916015625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0188, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2706.0, |
|
"completions/max_terminated_length": 2706.0, |
|
"completions/mean_length": 1372.625, |
|
"completions/mean_terminated_length": 1568.7142857142858, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 13.0, |
|
"epoch": 0.0005145278450363196, |
|
"grad_norm": 75.5463129972844, |
|
"kl": 41.2880859375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0314, |
|
"num_tokens": 741602.0, |
|
"reward": -0.09613406658172607, |
|
"reward_std": 0.323569655418396, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 2.0161290168762207, |
|
"rewards/format_reward_func": 0.9337499737739563, |
|
"rewards/num_xml_reward_func": 0.46875, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005205811138014527, |
|
"grad_norm": 8.361466445717037, |
|
"kl": 11.2041015625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0104, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005266343825665859, |
|
"grad_norm": 0.4858997056210773, |
|
"kl": 3.3115234375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0073, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005326876513317191, |
|
"grad_norm": 0.48251870840305755, |
|
"kl": 1.1220703125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0071, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 2879.0, |
|
"completions/max_terminated_length": 2879.0, |
|
"completions/mean_length": 1475.9375, |
|
"completions/mean_terminated_length": 1475.9375, |
|
"completions/min_length": 949.0, |
|
"completions/min_terminated_length": 949.0, |
|
"epoch": 0.0005387409200968523, |
|
"grad_norm": 817.4364276672566, |
|
"kl": 11.91796875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0879, |
|
"num_tokens": 772797.0, |
|
"reward": -0.029312893748283386, |
|
"reward_std": 0.3894536793231964, |
|
"rewards/correct_answer_reward_func": 0.375, |
|
"rewards/efficient_thinking_reward_func": 4.233870983123779, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 0.9479166269302368, |
|
"rewards/tool_execution_reward_func": 1.125, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005447941888619854, |
|
"grad_norm": 3.3298619813321695, |
|
"kl": 1.474609375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0212, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005508474576271186, |
|
"grad_norm": 1.1274332258815973, |
|
"kl": 1.1123046875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0196, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005569007263922518, |
|
"grad_norm": 0.8850067409872743, |
|
"kl": 1.123046875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0196, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2178.0, |
|
"completions/max_terminated_length": 2178.0, |
|
"completions/mean_length": 1189.5625, |
|
"completions/mean_terminated_length": 1359.5, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 13.0, |
|
"epoch": 0.000562953995157385, |
|
"grad_norm": 1.3395550909317473, |
|
"kl": 1.9501953125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0077, |
|
"num_tokens": 807642.0, |
|
"reward": -0.009627980180084705, |
|
"reward_std": 0.2253102958202362, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 4.435483932495117, |
|
"rewards/format_reward_func": 0.8824999928474426, |
|
"rewards/num_xml_reward_func": 1.03125, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005690072639225181, |
|
"grad_norm": 0.5682146184631082, |
|
"kl": 1.765625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0072, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005750605326876513, |
|
"grad_norm": 0.28055748126607594, |
|
"kl": 1.4423828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0067, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005811138014527845, |
|
"grad_norm": 0.2679104384917441, |
|
"kl": 1.2880859375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0067, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2370.0, |
|
"completions/max_terminated_length": 2370.0, |
|
"completions/mean_length": 1182.8125, |
|
"completions/mean_terminated_length": 1351.7857142857142, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 942.0, |
|
"epoch": 0.0005871670702179177, |
|
"grad_norm": 74.74923285644503, |
|
"kl": 5.83984375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0258, |
|
"num_tokens": 842339.0, |
|
"reward": 0.15201856195926666, |
|
"reward_std": 0.4216999411582947, |
|
"rewards/correct_answer_reward_func": 0.5625, |
|
"rewards/efficient_thinking_reward_func": 5.040322303771973, |
|
"rewards/format_reward_func": 0.9916666746139526, |
|
"rewards/num_xml_reward_func": 1.1145833730697632, |
|
"rewards/tool_execution_reward_func": 1.1875, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005932203389830508, |
|
"grad_norm": 6.21548581436623, |
|
"kl": 2.208984375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0066, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000599273607748184, |
|
"grad_norm": 0.9670438333546869, |
|
"kl": 1.333984375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0034, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006053268765133172, |
|
"grad_norm": 0.9705895262826928, |
|
"kl": 1.177734375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0029, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio/high_max": NaN, |
|
"clip_ratio/high_mean": NaN, |
|
"clip_ratio/low_mean": NaN, |
|
"clip_ratio/low_min": NaN, |
|
"clip_ratio/region_mean": NaN, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 2850.0, |
|
"completions/max_terminated_length": 2850.0, |
|
"completions/mean_length": 1026.1875, |
|
"completions/mean_terminated_length": 1492.6363636363637, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1078.0, |
|
"epoch": 0.0006113801452784503, |
|
"grad_norm": 0.34914402483189466, |
|
"kl": NaN, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0038, |
|
"num_tokens": 886858.0, |
|
"reward": -0.009644638746976852, |
|
"reward_std": 0.22530952095985413, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 5.01161003112793, |
|
"rewards/format_reward_func": 0.949999988079071, |
|
"rewards/num_xml_reward_func": 0.9479166269302368, |
|
"rewards/tool_execution_reward_func": 1.125, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio/high_max": NaN, |
|
"clip_ratio/high_mean": NaN, |
|
"clip_ratio/low_mean": NaN, |
|
"clip_ratio/low_min": NaN, |
|
"clip_ratio/region_mean": NaN, |
|
"epoch": 0.0006174334140435835, |
|
"grad_norm": 0.409729728396783, |
|
"kl": NaN, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0039, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio/high_max": NaN, |
|
"clip_ratio/high_mean": NaN, |
|
"clip_ratio/low_mean": NaN, |
|
"clip_ratio/low_min": NaN, |
|
"clip_ratio/region_mean": NaN, |
|
"epoch": 0.0006234866828087167, |
|
"grad_norm": 34.71662901286587, |
|
"kl": NaN, |
|
"learning_rate": 3e-06, |
|
"loss": 0.016, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio/high_max": NaN, |
|
"clip_ratio/high_mean": NaN, |
|
"clip_ratio/low_mean": NaN, |
|
"clip_ratio/low_min": NaN, |
|
"clip_ratio/region_mean": NaN, |
|
"epoch": 0.0006295399515738499, |
|
"grad_norm": 178.12705241502368, |
|
"kl": NaN, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0662, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2669.0, |
|
"completions/max_terminated_length": 2669.0, |
|
"completions/mean_length": 1197.0625, |
|
"completions/mean_terminated_length": 1473.3076923076924, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1011.0, |
|
"epoch": 0.000635593220338983, |
|
"grad_norm": 113656977492.3838, |
|
"kl": 4060086272.1398926, |
|
"learning_rate": 3e-06, |
|
"loss": 26552720.0, |
|
"num_tokens": 925879.0, |
|
"reward": 0.12848791480064392, |
|
"reward_std": 0.3391018807888031, |
|
"rewards/correct_answer_reward_func": 0.375, |
|
"rewards/efficient_thinking_reward_func": 6.138603687286377, |
|
"rewards/format_reward_func": 0.9183332920074463, |
|
"rewards/num_xml_reward_func": 1.1041667461395264, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006416464891041162, |
|
"grad_norm": 862090008.796202, |
|
"kl": 29097984.143554688, |
|
"learning_rate": 3e-06, |
|
"loss": 190154.0312, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006476997578692494, |
|
"grad_norm": 5971.671602768365, |
|
"kl": 225.146484375, |
|
"learning_rate": 3e-06, |
|
"loss": 1.4726, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006537530266343826, |
|
"grad_norm": 197.81456021441926, |
|
"kl": 10.714111328125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.071, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 2507.0, |
|
"completions/max_terminated_length": 2507.0, |
|
"completions/mean_length": 1018.3125, |
|
"completions/mean_terminated_length": 1481.1818181818182, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 920.0, |
|
"epoch": 0.0006598062953995157, |
|
"grad_norm": 211.32333218858383, |
|
"kl": 29.7578125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0762, |
|
"num_tokens": 970272.0, |
|
"reward": -0.05291781574487686, |
|
"reward_std": 0.282351553440094, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 4.623542785644531, |
|
"rewards/format_reward_func": 0.9093055725097656, |
|
"rewards/num_xml_reward_func": 0.5659722089767456, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006658595641646489, |
|
"grad_norm": 47.690518846315975, |
|
"kl": 5.3515625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0217, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006719128329297821, |
|
"grad_norm": 3.1184302707951286, |
|
"kl": 1.0947265625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0071, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006779661016949153, |
|
"grad_norm": 0.5838887307984975, |
|
"kl": 0.3798828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0055, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2388.0, |
|
"completions/max_terminated_length": 2388.0, |
|
"completions/mean_length": 976.4375, |
|
"completions/mean_terminated_length": 1301.9166666666667, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 522.0, |
|
"epoch": 0.0006840193704600484, |
|
"grad_norm": 2.279673569786722, |
|
"kl": 0.669921875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0027, |
|
"num_tokens": 1009859.0, |
|
"reward": -0.08624996989965439, |
|
"reward_std": 0.2361285239458084, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 5.102198123931885, |
|
"rewards/format_reward_func": 0.9054166674613953, |
|
"rewards/num_xml_reward_func": 0.71875, |
|
"rewards/tool_execution_reward_func": 0.90625, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006900726392251816, |
|
"grad_norm": 0.29363313071510083, |
|
"kl": 0.310546875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0017, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006961259079903148, |
|
"grad_norm": 0.25416521021234684, |
|
"kl": 0.2568359375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0015, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000702179176755448, |
|
"grad_norm": 0.2532709696883375, |
|
"kl": 0.25048828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0015, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 2938.0, |
|
"completions/max_terminated_length": 2938.0, |
|
"completions/mean_length": 825.1875, |
|
"completions/mean_terminated_length": 1320.3, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 265.0, |
|
"epoch": 0.0007082324455205811, |
|
"grad_norm": 0.06499596334850685, |
|
"kl": 0.195556640625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0004, |
|
"num_tokens": 1055258.0, |
|
"reward": 0.00015617192548234016, |
|
"reward_std": 0.000146350241266191, |
|
"rewards/correct_answer_reward_func": 0.0, |
|
"rewards/efficient_thinking_reward_func": 5.059846878051758, |
|
"rewards/format_reward_func": 0.8541666865348816, |
|
"rewards/num_xml_reward_func": 0.78125, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007142857142857143, |
|
"grad_norm": 0.12561554187034915, |
|
"kl": 0.206298828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0004, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007203389830508475, |
|
"grad_norm": 0.03335194268511418, |
|
"kl": 0.185546875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0004, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007263922518159806, |
|
"grad_norm": 0.016335498310956653, |
|
"kl": 0.1824951171875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0004, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 3041.0, |
|
"completions/max_terminated_length": 3041.0, |
|
"completions/mean_length": 1208.6875, |
|
"completions/mean_terminated_length": 1758.090909090909, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 237.0, |
|
"epoch": 0.0007324455205811138, |
|
"grad_norm": 3074.215567652929, |
|
"kl": 87.255126953125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.4728, |
|
"num_tokens": 1102657.0, |
|
"reward": 0.09965705871582031, |
|
"reward_std": 0.3296266794204712, |
|
"rewards/correct_answer_reward_func": 0.3125, |
|
"rewards/efficient_thinking_reward_func": 3.1649727821350098, |
|
"rewards/format_reward_func": 0.9187500476837158, |
|
"rewards/num_xml_reward_func": 0.5104166865348816, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000738498789346247, |
|
"grad_norm": 179.50807324392667, |
|
"kl": 30.089599609375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.1486, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007445520581113802, |
|
"grad_norm": 20.649024610652848, |
|
"kl": 3.32763671875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0155, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007506053268765133, |
|
"grad_norm": 2.0418855369195485, |
|
"kl": 0.590087890625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0023, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1461.0, |
|
"completions/max_terminated_length": 1461.0, |
|
"completions/mean_length": 844.125, |
|
"completions/mean_terminated_length": 1227.8181818181818, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 919.0, |
|
"epoch": 0.0007566585956416465, |
|
"grad_norm": 1.2107206967986066, |
|
"kl": 0.489501953125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0052, |
|
"num_tokens": 1144263.0, |
|
"reward": -0.043053146451711655, |
|
"reward_std": 0.1728256642818451, |
|
"rewards/correct_answer_reward_func": 0.0625, |
|
"rewards/efficient_thinking_reward_func": 3.937290668487549, |
|
"rewards/format_reward_func": 0.877500057220459, |
|
"rewards/num_xml_reward_func": 0.71875, |
|
"rewards/tool_execution_reward_func": 0.9166666865348816, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007627118644067797, |
|
"grad_norm": 0.3153399141642928, |
|
"kl": 0.267333984375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0046, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007687651331719129, |
|
"grad_norm": 0.30940853761801884, |
|
"kl": 0.25048828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0046, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000774818401937046, |
|
"grad_norm": 0.41896360385565806, |
|
"kl": 0.291259765625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0048, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2040.0, |
|
"completions/max_terminated_length": 2040.0, |
|
"completions/mean_length": 1019.8125, |
|
"completions/mean_terminated_length": 1255.1538461538462, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 915.0, |
|
"epoch": 0.0007808716707021792, |
|
"grad_norm": 3.046991653902829, |
|
"kl": 1.09619140625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0034, |
|
"num_tokens": 1180448.0, |
|
"reward": 0.023746376857161522, |
|
"reward_std": 0.263210654258728, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 5.699418067932129, |
|
"rewards/format_reward_func": 0.9104167222976685, |
|
"rewards/num_xml_reward_func": 1.089583396911621, |
|
"rewards/tool_execution_reward_func": 0.8541666865348816, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007869249394673124, |
|
"grad_norm": 0.6371118649367347, |
|
"kl": 0.56591796875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0022, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007929782082324456, |
|
"grad_norm": 0.7977652642661696, |
|
"kl": 0.475341796875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.002, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007990314769975787, |
|
"grad_norm": 0.3954173262150925, |
|
"kl": 0.418212890625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0019, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2829.0, |
|
"completions/max_terminated_length": 2829.0, |
|
"completions/mean_length": 1284.6875, |
|
"completions/mean_terminated_length": 1581.1538461538462, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 946.0, |
|
"epoch": 0.0008050847457627119, |
|
"grad_norm": 0.16569380056333147, |
|
"kl": 0.203369140625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 1220839.0, |
|
"reward": 0.06701274961233139, |
|
"reward_std": 0.18233336508274078, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 8.0274658203125, |
|
"rewards/format_reward_func": 0.96833336353302, |
|
"rewards/num_xml_reward_func": 1.4375, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008111380145278451, |
|
"grad_norm": 0.16664121027946213, |
|
"kl": 0.20068359375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0001, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008171912832929783, |
|
"grad_norm": 0.16072020293930506, |
|
"kl": 0.194091796875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0001, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008232445520581114, |
|
"grad_norm": 41.76166861124815, |
|
"kl": 1.373046875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0071, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 3684.0, |
|
"completions/max_terminated_length": 3684.0, |
|
"completions/mean_length": 1599.125, |
|
"completions/mean_terminated_length": 1599.125, |
|
"completions/min_length": 589.0, |
|
"completions/min_terminated_length": 589.0, |
|
"epoch": 0.0008292978208232446, |
|
"grad_norm": 646644.6699956892, |
|
"kl": 49842.197265625, |
|
"learning_rate": 3e-06, |
|
"loss": 229.4684, |
|
"num_tokens": 1254025.0, |
|
"reward": 0.013887053355574608, |
|
"reward_std": 0.34718748927116394, |
|
"rewards/correct_answer_reward_func": 0.3125, |
|
"rewards/efficient_thinking_reward_func": 5.241935729980469, |
|
"rewards/format_reward_func": 0.9479166269302368, |
|
"rewards/num_xml_reward_func": 0.9635417461395264, |
|
"rewards/tool_execution_reward_func": 0.875, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008353510895883778, |
|
"grad_norm": 27876.102822783414, |
|
"kl": 2136.07666015625, |
|
"learning_rate": 3e-06, |
|
"loss": 12.8243, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008414043583535108, |
|
"grad_norm": 790.2979277928656, |
|
"kl": 118.16162109375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.4985, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000847457627118644, |
|
"grad_norm": 47.030265324687, |
|
"kl": 7.62548828125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0346, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 3041.0, |
|
"completions/max_terminated_length": 3041.0, |
|
"completions/mean_length": 1098.375, |
|
"completions/mean_terminated_length": 1255.2857142857142, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 325.0, |
|
"epoch": 0.0008535108958837772, |
|
"grad_norm": 27163.40903493807, |
|
"kl": 1403.2744140625, |
|
"learning_rate": 3e-06, |
|
"loss": 7.8279, |
|
"num_tokens": 1287339.0, |
|
"reward": -0.00962173379957676, |
|
"reward_std": 0.2253105789422989, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 4.435483932495117, |
|
"rewards/format_reward_func": 0.9901785850524902, |
|
"rewards/num_xml_reward_func": 1.0625, |
|
"rewards/tool_execution_reward_func": 0.875, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008595641646489104, |
|
"grad_norm": 104.2213070575419, |
|
"kl": 5.6044921875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0309, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008656174334140435, |
|
"grad_norm": 5.517418949203676, |
|
"kl": 1.123046875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0054, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008716707021791767, |
|
"grad_norm": 0.5745631831412419, |
|
"kl": 0.42724609375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0033, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2668.0, |
|
"completions/max_terminated_length": 2668.0, |
|
"completions/mean_length": 1254.25, |
|
"completions/mean_terminated_length": 1337.8666666666666, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 377.0, |
|
"epoch": 0.0008777239709443099, |
|
"grad_norm": 11.985045430350514, |
|
"kl": 1.40576171875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0054, |
|
"num_tokens": 1319103.0, |
|
"reward": 0.18167813122272491, |
|
"reward_std": 0.28324925899505615, |
|
"rewards/correct_answer_reward_func": 0.3125, |
|
"rewards/efficient_thinking_reward_func": 5.79470682144165, |
|
"rewards/format_reward_func": 0.9524999856948853, |
|
"rewards/num_xml_reward_func": 1.1041666269302368, |
|
"rewards/tool_execution_reward_func": 0.8125, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008837772397094431, |
|
"grad_norm": 0.7668036294329545, |
|
"kl": 0.444580078125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0087, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008898305084745762, |
|
"grad_norm": 0.3955664525228031, |
|
"kl": 0.291259765625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0093, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008958837772397094, |
|
"grad_norm": 0.3609851056156834, |
|
"kl": 0.26123046875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0094, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 2821.0, |
|
"completions/max_terminated_length": 2821.0, |
|
"completions/mean_length": 717.125, |
|
"completions/mean_terminated_length": 1274.888888888889, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 228.0, |
|
"epoch": 0.0009019370460048426, |
|
"grad_norm": 0.10668119884135797, |
|
"kl": 0.25, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0054, |
|
"num_tokens": 1366797.0, |
|
"reward": 0.03348555043339729, |
|
"reward_std": 0.13349804282188416, |
|
"rewards/correct_answer_reward_func": 0.0625, |
|
"rewards/efficient_thinking_reward_func": 4.775701522827148, |
|
"rewards/format_reward_func": 0.8970834016799927, |
|
"rewards/num_xml_reward_func": 0.6145833730697632, |
|
"rewards/tool_execution_reward_func": 0.800000011920929, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009079903147699758, |
|
"grad_norm": 0.07581171591865946, |
|
"kl": 0.21337890625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0055, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009140435835351089, |
|
"grad_norm": 0.07368344797048941, |
|
"kl": 0.2066650390625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0055, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009200968523002421, |
|
"grad_norm": 0.06383190574866472, |
|
"kl": 0.2005615234375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0055, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 2775.0, |
|
"completions/max_terminated_length": 2775.0, |
|
"completions/mean_length": 1244.6875, |
|
"completions/mean_terminated_length": 1422.5, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 381.0, |
|
"epoch": 0.0009261501210653753, |
|
"grad_norm": 0.4120015190644857, |
|
"kl": 0.250732421875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0169, |
|
"num_tokens": 1402504.0, |
|
"reward": 0.07177050411701202, |
|
"reward_std": 0.3221079111099243, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 5.607255458831787, |
|
"rewards/format_reward_func": 0.9775000214576721, |
|
"rewards/num_xml_reward_func": 1.1770832538604736, |
|
"rewards/tool_execution_reward_func": 0.875, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009322033898305085, |
|
"grad_norm": 0.4391974629991036, |
|
"kl": 0.245361328125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0169, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009382566585956416, |
|
"grad_norm": 0.3734307496972298, |
|
"kl": 0.372314453125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0166, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009443099273607748, |
|
"grad_norm": 5.409050230753026, |
|
"kl": 1.83984375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0136, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2702.0, |
|
"completions/max_terminated_length": 2702.0, |
|
"completions/mean_length": 975.8125, |
|
"completions/mean_terminated_length": 1201.0, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 349.0, |
|
"epoch": 0.000950363196125908, |
|
"grad_norm": 0.7067985121705038, |
|
"kl": 0.344482421875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0048, |
|
"num_tokens": 1437953.0, |
|
"reward": 0.06698047369718552, |
|
"reward_std": 0.18234598636627197, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 5.846774578094482, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.2760416269302368, |
|
"rewards/tool_execution_reward_func": 0.9375, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009564164648910411, |
|
"grad_norm": 0.11769454310604478, |
|
"kl": 0.2841796875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.005, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009624697336561743, |
|
"grad_norm": 0.10329568036332133, |
|
"kl": 0.286865234375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.005, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009685230024213075, |
|
"grad_norm": 4.24103180505241, |
|
"kl": 0.876953125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0042, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2661.0, |
|
"completions/max_terminated_length": 2661.0, |
|
"completions/mean_length": 1132.625, |
|
"completions/mean_terminated_length": 1394.0, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 481.0, |
|
"epoch": 0.0009745762711864407, |
|
"grad_norm": 30399.081494084556, |
|
"kl": 3248.51220703125, |
|
"learning_rate": 3e-06, |
|
"loss": 11.4413, |
|
"num_tokens": 1475963.0, |
|
"reward": 0.0669461190700531, |
|
"reward_std": 0.18235941231250763, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 5.576177597045898, |
|
"rewards/format_reward_func": 0.9516667127609253, |
|
"rewards/num_xml_reward_func": 1.1041666269302368, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000980629539951574, |
|
"grad_norm": 301.320864829815, |
|
"kl": 42.82861328125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.1518, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009866828087167071, |
|
"grad_norm": 7.516991078245809, |
|
"kl": 2.22998046875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0084, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00099273607748184, |
|
"grad_norm": 0.9933703066829184, |
|
"kl": 0.6552734375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0029, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2659.0, |
|
"completions/max_terminated_length": 2659.0, |
|
"completions/mean_length": 1132.6875, |
|
"completions/mean_terminated_length": 1394.076923076923, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1039.0, |
|
"epoch": 0.0009987893462469733, |
|
"grad_norm": 216588358.58769044, |
|
"kl": 14798848.96875, |
|
"learning_rate": 3e-06, |
|
"loss": 39560.0, |
|
"num_tokens": 1513922.0, |
|
"reward": 0.06703699380159378, |
|
"reward_std": 0.1823238879442215, |
|
"rewards/correct_answer_reward_func": 0.125, |
|
"rewards/efficient_thinking_reward_func": 8.870967864990234, |
|
"rewards/format_reward_func": 0.9958333373069763, |
|
"rewards/num_xml_reward_func": 1.5587797164916992, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010048426150121065, |
|
"grad_norm": 20.66026628810014, |
|
"kl": 2.27978515625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0026, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010108958837772397, |
|
"grad_norm": 0.5852549123131363, |
|
"kl": 0.49462890625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0021, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001016949152542373, |
|
"grad_norm": 10.985658110563065, |
|
"kl": 1.380859375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0001, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 2760.0, |
|
"completions/max_terminated_length": 2760.0, |
|
"completions/mean_length": 1299.5625, |
|
"completions/mean_terminated_length": 1599.4615384615386, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 995.0, |
|
"epoch": 0.001023002421307506, |
|
"grad_norm": 3.1833824255736856, |
|
"kl": 0.49853515625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0108, |
|
"num_tokens": 1554603.0, |
|
"reward": 0.1483658403158188, |
|
"reward_std": 0.27007395029067993, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 8.19877815246582, |
|
"rewards/format_reward_func": 0.9624999761581421, |
|
"rewards/num_xml_reward_func": 1.3562500476837158, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010290556900726393, |
|
"grad_norm": 0.43961018475161184, |
|
"kl": 0.41162109375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0113, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010351089588377725, |
|
"grad_norm": 1.4869647567424702, |
|
"kl": 0.330078125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0114, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010411622276029055, |
|
"grad_norm": 1.5593896451086025, |
|
"kl": 0.34423828125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0114, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2658.0, |
|
"completions/max_terminated_length": 2658.0, |
|
"completions/mean_length": 1120.4375, |
|
"completions/mean_terminated_length": 1493.9166666666667, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1001.0, |
|
"epoch": 0.0010472154963680387, |
|
"grad_norm": 9.153186896120884, |
|
"kl": 1.18212890625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0004, |
|
"num_tokens": 1596462.0, |
|
"reward": 0.10031714290380478, |
|
"reward_std": 0.21521808207035065, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 5.926279067993164, |
|
"rewards/format_reward_func": 0.9650000333786011, |
|
"rewards/num_xml_reward_func": 1.1458332538604736, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010532687651331719, |
|
"grad_norm": 0.526258103973936, |
|
"kl": 0.560546875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0026, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001059322033898305, |
|
"grad_norm": 0.14211436005078248, |
|
"kl": 0.45556640625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0029, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010653753026634383, |
|
"grad_norm": 0.14305060184518628, |
|
"kl": 0.46484375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0028, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 3333.0, |
|
"completions/max_terminated_length": 3333.0, |
|
"completions/mean_length": 1780.0, |
|
"completions/mean_terminated_length": 1780.0, |
|
"completions/min_length": 1066.0, |
|
"completions/min_terminated_length": 1066.0, |
|
"epoch": 0.0010714285714285715, |
|
"grad_norm": 0.7121759134327387, |
|
"kl": 0.4970703125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0185, |
|
"num_tokens": 1632542.0, |
|
"reward": 0.05717029049992561, |
|
"reward_std": 0.29225456714630127, |
|
"rewards/correct_answer_reward_func": 0.25, |
|
"rewards/efficient_thinking_reward_func": 7.056451797485352, |
|
"rewards/format_reward_func": 0.9625000357627869, |
|
"rewards/num_xml_reward_func": 1.3958333730697632, |
|
"rewards/tool_execution_reward_func": 1.0625, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010774818401937047, |
|
"grad_norm": 0.511035937325882, |
|
"kl": 0.3818359375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0179, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010835351089588377, |
|
"grad_norm": 118.18849489599667, |
|
"kl": 6.0556640625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0429, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010895883777239709, |
|
"grad_norm": 123.64288191458726, |
|
"kl": 3.88916015625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0334, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 2638.0, |
|
"completions/max_terminated_length": 2638.0, |
|
"completions/mean_length": 1067.25, |
|
"completions/mean_terminated_length": 1423.0, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1119.0, |
|
"epoch": 0.001095641646489104, |
|
"grad_norm": 0.44662061313508766, |
|
"kl": 0.8115234375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0038, |
|
"num_tokens": 1673550.0, |
|
"reward": 0.10039953887462616, |
|
"reward_std": 0.21517714858055115, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 9.27419376373291, |
|
"rewards/format_reward_func": 0.9892857074737549, |
|
"rewards/num_xml_reward_func": 1.5580357313156128, |
|
"rewards/tool_execution_reward_func": 0.9791666865348816, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011016949152542373, |
|
"grad_norm": 0.19064814713811837, |
|
"kl": 0.673828125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0041, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011077481840193705, |
|
"grad_norm": 0.12530575938029517, |
|
"kl": 0.578125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0044, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011138014527845037, |
|
"grad_norm": 0.14053997598107024, |
|
"kl": 0.53515625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0045, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 2780.0, |
|
"completions/max_terminated_length": 2780.0, |
|
"completions/mean_length": 1584.5625, |
|
"completions/mean_terminated_length": 1690.2, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1083.0, |
|
"epoch": 0.0011198547215496369, |
|
"grad_norm": 0.36922212655466874, |
|
"kl": 0.328125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0082, |
|
"num_tokens": 1710599.0, |
|
"reward": 0.26312950253486633, |
|
"reward_std": 0.31617864966392517, |
|
"rewards/correct_answer_reward_func": 0.4375, |
|
"rewards/efficient_thinking_reward_func": 8.467741966247559, |
|
"rewards/format_reward_func": 0.9862499833106995, |
|
"rewards/num_xml_reward_func": 1.5145833492279053, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00112590799031477, |
|
"grad_norm": 0.35037508840939274, |
|
"kl": 0.3310546875, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0082, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001131961259079903, |
|
"grad_norm": 0.33082477958916895, |
|
"kl": 0.35009765625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0081, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011380145278450363, |
|
"grad_norm": 0.345834479974622, |
|
"kl": 0.3896484375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0079, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 1811.0, |
|
"completions/max_terminated_length": 1811.0, |
|
"completions/mean_length": 1041.8125, |
|
"completions/mean_terminated_length": 1389.0833333333333, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1031.0, |
|
"epoch": 0.0011440677966101695, |
|
"grad_norm": 35.33227872117956, |
|
"kl": 6.01953125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0108, |
|
"num_tokens": 1751200.0, |
|
"reward": 0.10040043294429779, |
|
"reward_std": 0.21517671644687653, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 8.064516067504883, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.5625, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011501210653753027, |
|
"grad_norm": 0.835012686734699, |
|
"kl": 0.9775390625, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0043, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011561743341404359, |
|
"grad_norm": 0.2586421230474391, |
|
"kl": 0.8486328125, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0047, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001162227602905569, |
|
"grad_norm": 0.1651807846836529, |
|
"kl": 0.7734375, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0049, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 4084.0, |
|
"completions/max_terminated_length": 4084.0, |
|
"completions/mean_length": 1537.375, |
|
"completions/mean_terminated_length": 1639.8666666666666, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1045.0, |
|
"epoch": 0.0011682808716707023, |
|
"grad_norm": 243.49244406063949, |
|
"kl": 5.927734375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0565, |
|
"num_tokens": 1787494.0, |
|
"reward": 0.23383872210979462, |
|
"reward_std": 0.2734927237033844, |
|
"rewards/correct_answer_reward_func": 0.4375, |
|
"rewards/efficient_thinking_reward_func": 6.451612949371338, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.5, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011743341404358355, |
|
"grad_norm": 0.6933472630429098, |
|
"kl": 0.685546875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0218, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011803874092009684, |
|
"grad_norm": 0.38422473996938145, |
|
"kl": 0.61328125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0216, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011864406779661016, |
|
"grad_norm": 0.33054590530199923, |
|
"kl": 0.6220703125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0216, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 3909.0, |
|
"completions/max_terminated_length": 3909.0, |
|
"completions/mean_length": 1388.375, |
|
"completions/mean_terminated_length": 1708.7692307692307, |
|
"completions/min_length": 0.0, |
|
"completions/min_terminated_length": 1148.0, |
|
"epoch": 0.0011924939467312348, |
|
"grad_norm": 188.57473992181284, |
|
"kl": 14.59765625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0927, |
|
"num_tokens": 1829544.0, |
|
"reward": 0.10040343552827835, |
|
"reward_std": 0.21517521142959595, |
|
"rewards/correct_answer_reward_func": 0.1875, |
|
"rewards/efficient_thinking_reward_func": 8.870967864990234, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.5775296688079834, |
|
"rewards/tool_execution_reward_func": 1.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001198547215496368, |
|
"grad_norm": 1.773591133545115, |
|
"kl": 1.154296875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0063, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012046004842615012, |
|
"grad_norm": 0.360837942568022, |
|
"kl": 0.9296875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0049, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012106537530266344, |
|
"grad_norm": 0.21716087780595142, |
|
"kl": 0.8203125, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0044, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 2000, |
|
"num_input_tokens_seen": 1829544, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|