{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 100,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 598.697940826416,
      "epoch": 0.0032,
      "grad_norm": 0.051513671875,
      "kl": 0.0,
      "learning_rate": 9.375e-08,
      "loss": 0.029,
      "reward": 0.8125000149011612,
      "reward_std": 0.17311252653598785,
      "rewards/accuracy_reward": 0.8125000149011612,
      "rewards/format_reward": 0.0,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 577.0130367279053,
      "epoch": 0.016,
      "grad_norm": 0.054931640625,
      "kl": 8.213493288167228e-05,
      "learning_rate": 4.6875e-07,
      "loss": 0.033,
      "reward": 0.7552083497866988,
      "reward_std": 0.15550211956724524,
      "rewards/accuracy_reward": 0.7552083497866988,
      "rewards/format_reward": 0.0,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 538.3833518981934,
      "epoch": 0.032,
      "grad_norm": 0.0791015625,
      "kl": 0.00010595652129268274,
      "learning_rate": 9.375e-07,
      "loss": 0.0076,
      "reward": 0.7458333462476731,
      "reward_std": 0.11220084913074971,
      "rewards/accuracy_reward": 0.7458333462476731,
      "rewards/format_reward": 0.0,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 572.8646041870118,
      "epoch": 0.048,
      "grad_norm": 0.07373046875,
      "kl": 0.00011188817652509896,
      "learning_rate": 1.40625e-06,
      "loss": 0.0166,
      "reward": 0.7125000156462192,
      "reward_std": 0.14682335481047631,
      "rewards/accuracy_reward": 0.7125000156462192,
      "rewards/format_reward": 0.0,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 579.118766784668,
      "epoch": 0.064,
      "grad_norm": 0.0693359375,
      "kl": 0.00010264360735163791,
      "learning_rate": 1.875e-06,
      "loss": 0.0194,
      "reward": 0.7750000149011612,
      "reward_std": 0.12534543462097644,
      "rewards/accuracy_reward": 0.7750000149011612,
      "rewards/format_reward": 0.0,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 535.8145973205567,
      "epoch": 0.08,
      "grad_norm": 0.06298828125,
      "kl": 0.00010584766669126111,
      "learning_rate": 2.3437500000000002e-06,
      "loss": 0.0068,
      "reward": 0.704166678339243,
      "reward_std": 0.11830127276480198,
      "rewards/accuracy_reward": 0.704166678339243,
      "rewards/format_reward": 0.0,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 545.3708541870117,
      "epoch": 0.096,
      "grad_norm": 0.08740234375,
      "kl": 0.00010152916220249609,
      "learning_rate": 2.8125e-06,
      "loss": 0.0204,
      "reward": 0.7583333522081375,
      "reward_std": 0.13496793992817402,
      "rewards/accuracy_reward": 0.7583333522081375,
      "rewards/format_reward": 0.0,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 550.9000144958496,
      "epoch": 0.112,
      "grad_norm": 0.08984375,
      "kl": 0.00010129613328899723,
      "learning_rate": 2.9991503375003e-06,
      "loss": 0.0164,
      "reward": 0.7979166805744171,
      "reward_std": 0.12246793955564499,
      "rewards/accuracy_reward": 0.7979166805744171,
      "rewards/format_reward": 0.0,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 557.5000205993653,
      "epoch": 0.128,
      "grad_norm": 0.0673828125,
      "kl": 0.00010441835365782026,
      "learning_rate": 2.993961440992859e-06,
      "loss": 0.0179,
      "reward": 0.7791666835546494,
      "reward_std": 0.14330127350986005,
      "rewards/accuracy_reward": 0.7791666835546494,
      "rewards/format_reward": 0.0,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 547.3437675476074,
      "epoch": 0.144,
      "grad_norm": 0.07763671875,
      "kl": 0.00010777117104225909,
      "learning_rate": 2.984071989079555e-06,
      "loss": 0.0222,
      "reward": 0.7750000141561031,
      "reward_std": 0.15867876969277858,
      "rewards/accuracy_reward": 0.7750000141561031,
      "rewards/format_reward": 0.0,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 541.658351135254,
      "epoch": 0.16,
      "grad_norm": 0.0634765625,
      "kl": 0.00010541909759922418,
      "learning_rate": 2.9695130976348534e-06,
      "loss": 0.0155,
      "reward": 0.7854166805744172,
      "reward_std": 0.11988959684967995,
      "rewards/accuracy_reward": 0.7854166805744172,
      "rewards/format_reward": 0.0,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 553.2916847229004,
      "epoch": 0.176,
      "grad_norm": 0.10693359375,
      "kl": 0.00011168952341904515,
      "learning_rate": 2.9503305743175096e-06,
      "loss": 0.0131,
      "reward": 0.7145833455026149,
      "reward_std": 0.16448003873229028,
      "rewards/accuracy_reward": 0.7145833455026149,
      "rewards/format_reward": 0.0,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 575.1771041870118,
      "epoch": 0.192,
      "grad_norm": 0.044189453125,
      "kl": 9.437939497729531e-05,
      "learning_rate": 2.9265847744427307e-06,
      "loss": 0.0192,
      "reward": 0.7625000149011611,
      "reward_std": 0.14587961547076703,
      "rewards/accuracy_reward": 0.7625000149011611,
      "rewards/format_reward": 0.0,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 519.7854316711425,
      "epoch": 0.208,
      "grad_norm": 0.042724609375,
      "kl": 0.00010650227795849787,
      "learning_rate": 2.8983504110820214e-06,
      "loss": 0.0206,
      "reward": 0.7937500163912773,
      "reward_std": 0.11636751592159271,
      "rewards/accuracy_reward": 0.7937500163912773,
      "rewards/format_reward": 0.0,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 517.5646003723144,
      "epoch": 0.224,
      "grad_norm": 0.0439453125,
      "kl": 0.00010277715955453459,
      "learning_rate": 2.865716319988224e-06,
      "loss": 0.0221,
      "reward": 0.7770833492279052,
      "reward_std": 0.12727919183671474,
      "rewards/accuracy_reward": 0.7770833492279052,
      "rewards/format_reward": 0.0,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 571.9395950317382,
      "epoch": 0.24,
      "grad_norm": 0.1064453125,
      "kl": 0.00011016046828444814,
      "learning_rate": 2.82878518008537e-06,
      "loss": 0.0138,
      "reward": 0.7729166805744171,
      "reward_std": 0.14394585900008677,
      "rewards/accuracy_reward": 0.7729166805744171,
      "rewards/format_reward": 0.0,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 547.0708488464355,
      "epoch": 0.256,
      "grad_norm": 0.07421875,
      "kl": 0.00010552951316640246,
      "learning_rate": 2.7876731904027993e-06,
      "loss": 0.0043,
      "reward": 0.7833333518356085,
      "reward_std": 0.1494016967713833,
      "rewards/accuracy_reward": 0.7833333518356085,
      "rewards/format_reward": 0.0,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 528.5791786193847,
      "epoch": 0.272,
      "grad_norm": 0.07373046875,
      "kl": 0.00010855200152946055,
      "learning_rate": 2.7425097044700246e-06,
      "loss": 0.0083,
      "reward": 0.8020833432674408,
      "reward_std": 0.14265668764710426,
      "rewards/accuracy_reward": 0.8020833432674408,
      "rewards/format_reward": 0.0,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 578.5750167846679,
      "epoch": 0.288,
      "grad_norm": 0.09765625,
      "kl": 0.00011546339583219378,
      "learning_rate": 2.6934368233226715e-06,
      "loss": 0.0211,
      "reward": 0.7250000156462193,
      "reward_std": 0.16125711128115655,
      "rewards/accuracy_reward": 0.7250000156462193,
      "rewards/format_reward": 0.0,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 548.235432434082,
      "epoch": 0.304,
      "grad_norm": 0.1123046875,
      "kl": 0.00010799223509820877,
      "learning_rate": 2.6406089484000465e-06,
      "loss": 0.0284,
      "reward": 0.8083333469927311,
      "reward_std": 0.16830127350986004,
      "rewards/accuracy_reward": 0.8083333469927311,
      "rewards/format_reward": 0.0,
      "step": 95
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.07568359375,
      "learning_rate": 2.584192295741087e-06,
      "loss": 0.0186,
      "step": 100
    },
    {
      "epoch": 0.32,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 464.36827672170307,
      "eval_kl": 8.895910598325799e-05,
      "eval_loss": 0.004563388880342245,
      "eval_reward": 0.8485529082799386,
      "eval_reward_std": 0.13733045984349565,
      "eval_rewards/accuracy_reward": 0.8485529082799386,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 1240.0045,
      "eval_samples_per_second": 0.806,
      "eval_steps_per_second": 0.068,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 528.855224609375,
      "epoch": 0.336,
      "grad_norm": 0.05615234375,
      "kl": 0.00010595467788334644,
      "learning_rate": 2.5243643730072105e-06,
      "loss": 0.0102,
      "reward": 0.7875000132247806,
      "reward_std": 0.12053418289870024,
      "rewards/accuracy_reward": 0.7875000132247806,
      "rewards/format_reward": 0.0,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 569.627101135254,
      "epoch": 0.352,
      "grad_norm": 0.06982421875,
      "kl": 0.00010751374265964842,
      "learning_rate": 2.461313420977536e-06,
      "loss": 0.0214,
      "reward": 0.7270833492279053,
      "reward_std": 0.13432335443794727,
      "rewards/accuracy_reward": 0.7270833492279053,
      "rewards/format_reward": 0.0,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 546.0083457946778,
      "epoch": 0.368,
      "grad_norm": 0.059326171875,
      "kl": 9.366698777739657e-05,
      "learning_rate": 2.3952378212737554e-06,
      "loss": 0.0181,
      "reward": 0.8020833432674408,
      "reward_std": 0.13432335443794727,
      "rewards/accuracy_reward": 0.8020833432674408,
      "rewards/format_reward": 0.0,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 554.027099609375,
      "epoch": 0.384,
      "grad_norm": 0.08642578125,
      "kl": 0.00010431327682454139,
      "learning_rate": 2.3263454721781537e-06,
      "loss": 0.0112,
      "reward": 0.8020833477377891,
      "reward_std": 0.10803418271243573,
      "rewards/accuracy_reward": 0.8020833477377891,
      "rewards/format_reward": 0.0,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 559.7708488464356,
      "epoch": 0.4,
      "grad_norm": 0.07421875,
      "kl": 0.00010436618849780643,
      "learning_rate": 2.2548531345087003e-06,
      "loss": 0.0268,
      "reward": 0.7500000167638063,
      "reward_std": 0.115722930803895,
      "rewards/accuracy_reward": 0.7500000167638063,
      "rewards/format_reward": 0.0,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 514.7854347229004,
      "epoch": 0.416,
      "grad_norm": 0.07470703125,
      "kl": 0.00010219837422482669,
      "learning_rate": 2.18098574960932e-06,
      "loss": 0.0252,
      "reward": 0.8145833492279053,
      "reward_std": 0.1509900216013193,
      "rewards/accuracy_reward": 0.8145833492279053,
      "rewards/format_reward": 0.0,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 581.3895988464355,
      "epoch": 0.432,
      "grad_norm": 0.057861328125,
      "kl": 0.0001146518128734897,
      "learning_rate": 2.104975731601208e-06,
      "loss": 0.0239,
      "reward": 0.6833333477377892,
      "reward_std": 0.154212948307395,
      "rewards/accuracy_reward": 0.6833333477377892,
      "rewards/format_reward": 0.0,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 565.6645988464355,
      "epoch": 0.448,
      "grad_norm": 0.04931640625,
      "kl": 0.00011206413109903224,
      "learning_rate": 2.027062236122014e-06,
      "loss": 0.0109,
      "reward": 0.768750012293458,
      "reward_std": 0.15227919220924377,
      "rewards/accuracy_reward": 0.768750012293458,
      "rewards/format_reward": 0.0,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 545.1396034240722,
      "epoch": 0.464,
      "grad_norm": 0.0732421875,
      "kl": 0.00010298133001924725,
      "learning_rate": 1.9474904078537343e-06,
      "loss": 0.0186,
      "reward": 0.7250000163912773,
      "reward_std": 0.14940169639885426,
      "rewards/accuracy_reward": 0.7250000163912773,
      "rewards/format_reward": 0.0,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 541.8229347229004,
      "epoch": 0.48,
      "grad_norm": 0.06591796875,
      "kl": 9.864068970273365e-05,
      "learning_rate": 1.866510609206841e-06,
      "loss": 0.0136,
      "reward": 0.7854166857898235,
      "reward_std": 0.16319086775183678,
      "rewards/accuracy_reward": 0.7854166857898235,
      "rewards/format_reward": 0.0,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 571.9604377746582,
      "epoch": 0.496,
      "grad_norm": 0.057861328125,
      "kl": 0.00010758389435068238,
      "learning_rate": 1.784377632587518e-06,
      "loss": 0.0095,
      "reward": 0.7041666833683848,
      "reward_std": 0.18754628263413906,
      "rewards/accuracy_reward": 0.7041666833683848,
      "rewards/format_reward": 0.0,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 602.4896034240722,
      "epoch": 0.512,
      "grad_norm": 0.05908203125,
      "kl": 0.0001044012439706421,
      "learning_rate": 1.7013498987264833e-06,
      "loss": 0.022,
      "reward": 0.6875000156462192,
      "reward_std": 0.1670121032744646,
      "rewards/accuracy_reward": 0.6875000156462192,
      "rewards/format_reward": 0.0,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 537.7500205993653,
      "epoch": 0.528,
      "grad_norm": 0.10498046875,
      "kl": 0.00010827416845131666,
      "learning_rate": 1.6176886435917677e-06,
      "loss": 0.0067,
      "reward": 0.7562500171363353,
      "reward_std": 0.14875711016356946,
      "rewards/accuracy_reward": 0.7562500171363353,
      "rewards/format_reward": 0.0,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 561.8604354858398,
      "epoch": 0.544,
      "grad_norm": 0.0634765625,
      "kl": 0.00010869488978642039,
      "learning_rate": 1.5336570964437077e-06,
      "loss": 0.0288,
      "reward": 0.722916679829359,
      "reward_std": 0.18080127388238906,
      "rewards/accuracy_reward": 0.722916679829359,
      "rewards/format_reward": 0.0,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 553.4791847229004,
      "epoch": 0.56,
      "grad_norm": 0.046142578125,
      "kl": 0.00011422934894653736,
      "learning_rate": 1.4495196516183096e-06,
      "loss": 0.0057,
      "reward": 0.7166666770353913,
      "reward_std": 0.13144585788249968,
      "rewards/accuracy_reward": 0.7166666770353913,
      "rewards/format_reward": 0.0,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 546.4104309082031,
      "epoch": 0.576,
      "grad_norm": 0.1083984375,
      "kl": 0.00010774649126688018,
      "learning_rate": 1.3655410366448499e-06,
      "loss": 0.0158,
      "reward": 0.7833333492279053,
      "reward_std": 0.159967939928174,
      "rewards/accuracy_reward": 0.7833333492279053,
      "rewards/format_reward": 0.0,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 602.3583526611328,
      "epoch": 0.592,
      "grad_norm": 0.0673828125,
      "kl": 0.00010071156602862175,
      "learning_rate": 1.2819854793151313e-06,
      "loss": 0.0106,
      "reward": 0.6895833477377892,
      "reward_std": 0.139134606346488,
      "rewards/accuracy_reward": 0.6895833477377892,
      "rewards/format_reward": 0.0,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.5500137329102,
      "epoch": 0.608,
      "grad_norm": 0.06640625,
      "kl": 0.00011235472247790312,
      "learning_rate": 1.199115876325091e-06,
      "loss": 0.0011,
      "reward": 0.7354166826233268,
      "reward_std": 0.17727919295430183,
      "rewards/accuracy_reward": 0.7354166826233268,
      "rewards/format_reward": 0.0,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 556.4875118255616,
      "epoch": 0.624,
      "grad_norm": 0.04736328125,
      "kl": 0.00011359924974385649,
      "learning_rate": 1.1171929661045361e-06,
      "loss": 0.0149,
      "reward": 0.7500000115483999,
      "reward_std": 0.12405626438558101,
      "rewards/accuracy_reward": 0.7500000115483999,
      "rewards/format_reward": 0.0,
      "step": 195
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.044189453125,
      "learning_rate": 1.036474508437579e-06,
      "loss": 0.0226,
      "step": 200
    },
    {
      "epoch": 0.64,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 465.14322685196015,
      "eval_kl": 8.934064043752367e-05,
      "eval_loss": 0.003860337659716606,
      "eval_reward": 0.8478044058212977,
      "eval_reward_std": 0.1438588550883139,
      "eval_rewards/accuracy_reward": 0.8478044058212977,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 1207.4697,
      "eval_samples_per_second": 0.828,
      "eval_steps_per_second": 0.07,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 535.7416812896729,
      "epoch": 0.656,
      "grad_norm": 0.06982421875,
      "kl": 0.00011015057557415276,
      "learning_rate": 9.57214473454992e-07,
      "loss": 0.0301,
      "reward": 0.7635416811332106,
      "reward_std": 0.14010148495435715,
      "rewards/accuracy_reward": 0.7635416811332106,
      "rewards/format_reward": 0.0,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 556.1916847229004,
      "epoch": 0.672,
      "grad_norm": 0.04296875,
      "kl": 0.00010989538504873053,
      "learning_rate": 8.796622425502193e-07,
      "loss": 0.0141,
      "reward": 0.735416679084301,
      "reward_std": 0.130801273137331,
      "rewards/accuracy_reward": 0.735416679084301,
      "rewards/format_reward": 0.0,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 600.8562660217285,
      "epoch": 0.688,
      "grad_norm": 0.060791015625,
      "kl": 0.00011158880770381074,
      "learning_rate": 8.040618237332491e-07,
      "loss": 0.0276,
      "reward": 0.6958333497866989,
      "reward_std": 0.1936467058956623,
      "rewards/accuracy_reward": 0.6958333497866989,
      "rewards/format_reward": 0.0,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.8437644958497,
      "epoch": 0.704,
      "grad_norm": 0.05029296875,
      "kl": 0.00010739189892774447,
      "learning_rate": 7.30651083891141e-07,
      "loss": 0.0285,
      "reward": 0.7666666835546494,
      "reward_std": 0.12534543462097644,
      "rewards/accuracy_reward": 0.7666666835546494,
      "rewards/format_reward": 0.0,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 580.1146057128906,
      "epoch": 0.72,
      "grad_norm": 0.0537109375,
      "kl": 0.00010475763965587249,
      "learning_rate": 6.596610003707959e-07,
      "loss": 0.0188,
      "reward": 0.7625000156462193,
      "reward_std": 0.14330127313733101,
      "rewards/accuracy_reward": 0.7625000156462193,
      "rewards/format_reward": 0.0,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 600.7646011352539,
      "epoch": 0.736,
      "grad_norm": 0.07958984375,
      "kl": 0.00010898288783209865,
      "learning_rate": 5.913149342387704e-07,
      "loss": 0.0155,
      "reward": 0.7000000149011611,
      "reward_std": 0.1673575345426798,
      "rewards/accuracy_reward": 0.7000000149011611,
      "rewards/format_reward": 0.0,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 541.1729309082032,
      "epoch": 0.752,
      "grad_norm": 0.07421875,
      "kl": 0.00011250958050368354,
      "learning_rate": 5.258279275047247e-07,
      "loss": 0.0154,
      "reward": 0.7770833522081375,
      "reward_std": 0.12504628151655198,
      "rewards/accuracy_reward": 0.7770833522081375,
      "rewards/format_reward": 0.0,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 552.583349609375,
      "epoch": 0.768,
      "grad_norm": 0.0751953125,
      "kl": 0.00011499359970912337,
      "learning_rate": 4.63406026519703e-07,
      "loss": 0.0189,
      "reward": 0.7812500141561032,
      "reward_std": 0.11284543462097645,
      "rewards/accuracy_reward": 0.7812500141561032,
      "rewards/format_reward": 0.0,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 573.3916809082032,
      "epoch": 0.784,
      "grad_norm": 0.080078125,
      "kl": 0.00011338020558468998,
      "learning_rate": 4.042456336780838e-07,
      "loss": 0.0157,
      "reward": 0.7458333499729634,
      "reward_std": 0.14106836169958115,
      "rewards/accuracy_reward": 0.7458333499729634,
      "rewards/format_reward": 0.0,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 570.3416870117187,
      "epoch": 0.8,
      "grad_norm": 0.07666015625,
      "kl": 0.00010969964605465066,
      "learning_rate": 3.4853288946298335e-07,
      "loss": 0.0117,
      "reward": 0.7062500147148967,
      "reward_std": 0.16413460709154606,
      "rewards/accuracy_reward": 0.7062500147148967,
      "rewards/format_reward": 0.0,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 555.5708526611328,
      "epoch": 0.816,
      "grad_norm": 0.0859375,
      "kl": 0.00011466140349511989,
      "learning_rate": 2.9644308677943315e-07,
      "loss": 0.0341,
      "reward": 0.7625000171363354,
      "reward_std": 0.1901246253401041,
      "rewards/accuracy_reward": 0.7625000171363354,
      "rewards/format_reward": 0.0,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 559.4687660217285,
      "epoch": 0.832,
      "grad_norm": 0.09326171875,
      "kl": 0.00011700030408974271,
      "learning_rate": 2.48140119418046e-07,
      "loss": 0.0038,
      "reward": 0.7437500134110451,
      "reward_std": 0.16988959908485413,
      "rewards/accuracy_reward": 0.7437500134110451,
      "rewards/format_reward": 0.0,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 586.2000167846679,
      "epoch": 0.848,
      "grad_norm": 0.0673828125,
      "kl": 0.00010750685323728249,
      "learning_rate": 2.0377596638451812e-07,
      "loss": 0.0151,
      "reward": 0.7458333522081375,
      "reward_std": 0.14459044374525548,
      "rewards/accuracy_reward": 0.7458333522081375,
      "rewards/format_reward": 0.0,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 559.5666870117187,
      "epoch": 0.864,
      "grad_norm": 0.040283203125,
      "kl": 0.00010784281039377674,
      "learning_rate": 1.634902137174483e-07,
      "loss": 0.0087,
      "reward": 0.7375000141561031,
      "reward_std": 0.11349002048373222,
      "rewards/accuracy_reward": 0.7375000141561031,
      "rewards/format_reward": 0.0,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 559.8062644958496,
      "epoch": 0.88,
      "grad_norm": 0.0296630859375,
      "kl": 0.00010248722046526381,
      "learning_rate": 1.274096152990203e-07,
      "loss": 0.0123,
      "reward": 0.7833333499729633,
      "reward_std": 0.12182335332036018,
      "rewards/accuracy_reward": 0.7833333499729633,
      "rewards/format_reward": 0.0,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.8646018981933,
      "epoch": 0.896,
      "grad_norm": 0.08349609375,
      "kl": 0.00010658866949597723,
      "learning_rate": 9.564769404039419e-08,
      "loss": 0.024,
      "reward": 0.7583333499729633,
      "reward_std": 0.14459044449031352,
      "rewards/accuracy_reward": 0.7583333499729633,
      "rewards/format_reward": 0.0,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 572.097932434082,
      "epoch": 0.912,
      "grad_norm": 0.0947265625,
      "kl": 0.00011029940260414151,
      "learning_rate": 6.830438469662892e-08,
      "loss": 0.0198,
      "reward": 0.7250000156462193,
      "reward_std": 0.14845795668661593,
      "rewards/accuracy_reward": 0.7250000156462193,
      "rewards/format_reward": 0.0,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 540.3437705993653,
      "epoch": 0.928,
      "grad_norm": 0.0517578125,
      "kl": 0.00010695143555494724,
      "learning_rate": 4.546571943496969e-08,
      "loss": 0.0248,
      "reward": 0.818750013411045,
      "reward_std": 0.13784543573856353,
      "rewards/accuracy_reward": 0.818750013411045,
      "rewards/format_reward": 0.0,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 581.6541847229004,
      "epoch": 0.944,
      "grad_norm": 0.068359375,
      "kl": 0.00010272065319441026,
      "learning_rate": 2.72035571458224e-08,
      "loss": 0.0252,
      "reward": 0.7541666850447655,
      "reward_std": 0.16477919220924378,
      "rewards/accuracy_reward": 0.7541666850447655,
      "rewards/format_reward": 0.0,
      "step": 295
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.111328125,
      "learning_rate": 1.357535734809795e-08,
      "loss": 0.0261,
      "step": 300
    },
    {
      "epoch": 0.96,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 464.5865904756649,
      "eval_kl": 8.974588041372066e-05,
      "eval_loss": 0.004327021539211273,
      "eval_reward": 0.8458083985808367,
      "eval_reward_std": 0.14224328386212537,
      "eval_rewards/accuracy_reward": 0.8458083985808367,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 1260.0209,
      "eval_samples_per_second": 0.794,
      "eval_steps_per_second": 0.067,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 563.8583469390869,
      "epoch": 0.976,
      "grad_norm": 0.279296875,
      "kl": 0.00011092701979578123,
      "learning_rate": 4.623999400308054e-09,
      "loss": 0.0154,
      "reward": 0.7604166809469461,
      "reward_std": 0.14394585862755777,
      "rewards/accuracy_reward": 0.7604166809469461,
      "rewards/format_reward": 0.0,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 554.9770988464355,
      "epoch": 0.992,
      "grad_norm": 0.07373046875,
      "kl": 0.00010652115543052787,
      "learning_rate": 3.77647586240204e-10,
      "loss": 0.0119,
      "reward": 0.7562500186264515,
      "reward_std": 0.1330341823399067,
      "rewards/accuracy_reward": 0.7562500186264515,
      "rewards/format_reward": 0.0,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 575.7395992279053,
      "epoch": 0.9984,
      "kl": 0.00010837154786713654,
      "reward": 0.7760416828095913,
      "reward_std": 0.17472399026155472,
      "rewards/accuracy_reward": 0.7760416828095913,
      "rewards/format_reward": 0.0,
      "step": 312,
      "total_flos": 0.0,
      "train_loss": 0.017263087375352208,
      "train_runtime": 14335.1534,
      "train_samples_per_second": 0.523,
      "train_steps_per_second": 0.022
    }
  ],
  "logging_steps": 5,
  "max_steps": 312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}