Qwen-2.5-7B-Simple-RL / trainer_state.json
HaichuanWang's picture
Model save
f089127 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994666666666666,
"eval_steps": 100,
"global_step": 937,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 721.3750152587891,
"epoch": 0.0010666666666666667,
"grad_norm": 0.7148946902118419,
"kl": 0.0,
"learning_rate": 3.191489361702128e-08,
"loss": 0.0,
"reward": 0.7500000111758709,
"reward_std": 0.3608439117670059,
"rewards/accuracy_reward": 0.7500000111758709,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 560.6666808128357,
"epoch": 0.005333333333333333,
"grad_norm": 1.4615535350113826,
"kl": 9.819865226745605e-05,
"learning_rate": 1.5957446808510638e-07,
"loss": 0.0,
"reward": 0.6041666744276881,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.6041666744276881,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 588.8250148773193,
"epoch": 0.010666666666666666,
"grad_norm": 53.213496097955556,
"kl": 0.00019417405128479003,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0,
"reward": 0.6833333380520343,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.6833333380520343,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 552.5083473205566,
"epoch": 0.016,
"grad_norm": 2.4855773258124825,
"kl": 0.0002583146095275879,
"learning_rate": 4.787234042553192e-07,
"loss": 0.0,
"reward": 0.6416666753590107,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6416666753590107,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 586.7083511352539,
"epoch": 0.021333333333333333,
"grad_norm": 25.07614059798895,
"kl": 0.00021169781684875487,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0,
"reward": 0.6083333410322667,
"reward_std": 0.2742413729429245,
"rewards/accuracy_reward": 0.6083333410322667,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 559.4833463668823,
"epoch": 0.02666666666666667,
"grad_norm": 1.6163354066965443,
"kl": 0.0003616809844970703,
"learning_rate": 7.978723404255319e-07,
"loss": 0.0,
"reward": 0.6416666738688945,
"reward_std": 0.2742413729429245,
"rewards/accuracy_reward": 0.6416666738688945,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 595.7666809082032,
"epoch": 0.032,
"grad_norm": 6.599361807218988,
"kl": 0.001885068416595459,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0001,
"reward": 0.5916666761040688,
"reward_std": 0.33197639882564545,
"rewards/accuracy_reward": 0.5916666761040688,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 586.0833473205566,
"epoch": 0.037333333333333336,
"grad_norm": 0.9958435532731883,
"kl": 0.003298938274383545,
"learning_rate": 1.1170212765957447e-06,
"loss": 0.0001,
"reward": 0.6250000081956386,
"reward_std": 0.303108885884285,
"rewards/accuracy_reward": 0.6250000081956386,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 666.2250175476074,
"epoch": 0.042666666666666665,
"grad_norm": 0.9670224736847026,
"kl": 0.0009075284004211425,
"learning_rate": 1.276595744680851e-06,
"loss": 0.0,
"reward": 0.6333333410322666,
"reward_std": 0.2742413729429245,
"rewards/accuracy_reward": 0.6333333410322666,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 606.2083518981933,
"epoch": 0.048,
"grad_norm": 4.8736022254558735,
"kl": 0.005312430858612061,
"learning_rate": 1.4361702127659576e-06,
"loss": 0.0002,
"reward": 0.6416666753590107,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.6416666753590107,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 658.9083461761475,
"epoch": 0.05333333333333334,
"grad_norm": 10.804243004291305,
"kl": 0.002775442600250244,
"learning_rate": 1.5957446808510639e-06,
"loss": 0.0001,
"reward": 0.7000000059604645,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7000000059604645,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 646.8000122070313,
"epoch": 0.058666666666666666,
"grad_norm": 0.26620562599712166,
"kl": 0.003197479248046875,
"learning_rate": 1.7553191489361702e-06,
"loss": 0.0001,
"reward": 0.6750000067055225,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6750000067055225,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 692.2500122070312,
"epoch": 0.064,
"grad_norm": 0.6575814481909016,
"kl": 0.002325701713562012,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.0001,
"reward": 0.6583333395421505,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6583333395421505,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 580.9916805267334,
"epoch": 0.06933333333333333,
"grad_norm": 11.574621122437701,
"kl": 0.0074417352676391605,
"learning_rate": 2.074468085106383e-06,
"loss": 0.0003,
"reward": 0.6083333373069764,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.6083333373069764,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 672.7583469390869,
"epoch": 0.07466666666666667,
"grad_norm": 0.6559939371701894,
"kl": 0.0022436141967773437,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.0001,
"reward": 0.7083333387970925,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7083333387970925,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 618.2250152587891,
"epoch": 0.08,
"grad_norm": 0.911060689959152,
"kl": 0.00368959903717041,
"learning_rate": 2.3936170212765957e-06,
"loss": 0.0001,
"reward": 0.7166666708886623,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7166666708886623,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 660.7583503723145,
"epoch": 0.08533333333333333,
"grad_norm": 0.7301371123450753,
"kl": 0.003092670440673828,
"learning_rate": 2.553191489361702e-06,
"loss": 0.0001,
"reward": 0.7416666708886623,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7416666708886623,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 598.4500175476074,
"epoch": 0.09066666666666667,
"grad_norm": 0.3060921026957072,
"kl": 0.00237274169921875,
"learning_rate": 2.7127659574468088e-06,
"loss": 0.0001,
"reward": 0.6500000044703483,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.6500000044703483,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 538.7833473205567,
"epoch": 0.096,
"grad_norm": 0.31714494836666324,
"kl": 0.0028713226318359377,
"learning_rate": 2.872340425531915e-06,
"loss": 0.0001,
"reward": 0.7916666693985462,
"reward_std": 0.08660253882408142,
"rewards/accuracy_reward": 0.7916666693985462,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 615.441683959961,
"epoch": 0.10133333333333333,
"grad_norm": 0.9408680917110538,
"kl": 1.2848326683044433,
"learning_rate": 2.9999895838948146e-06,
"loss": 0.0515,
"reward": 0.6666666738688946,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.6666666738688946,
"rewards/format_reward": 0.0,
"step": 95
},
{
"epoch": 0.10666666666666667,
"grad_norm": 0.5241475522990928,
"learning_rate": 2.9996250354024346e-06,
"loss": 0.0004,
"step": 100
},
{
"epoch": 0.10666666666666667,
"eval_clip_ratio": 0.0,
"eval_completion_length": 612.3530849609375,
"eval_kl": 181.0886371582031,
"eval_loss": 12.212464332580566,
"eval_reward": 0.6065333513915538,
"eval_reward_std": 0.24468103866577148,
"eval_rewards/accuracy_reward": 0.6065333513915538,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6080.0652,
"eval_samples_per_second": 0.822,
"eval_steps_per_second": 0.034,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 613.6458488464356,
"epoch": 0.112,
"grad_norm": 0.3463297768491564,
"kl": 0.006574392318725586,
"learning_rate": 2.9987398263020837e-06,
"loss": 0.0002,
"reward": 0.7250000052154064,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7250000052154064,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 645.7916801452636,
"epoch": 0.11733333333333333,
"grad_norm": 0.8886259095075897,
"kl": 0.0029788970947265624,
"learning_rate": 2.997334263932927e-06,
"loss": 0.0001,
"reward": 0.7000000067055225,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7000000067055225,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 614.0666831970215,
"epoch": 0.12266666666666666,
"grad_norm": 1.1198898648840077,
"kl": 0.006386947631835937,
"learning_rate": 2.9954088362975936e-06,
"loss": 0.0003,
"reward": 0.7333333387970924,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7333333387970924,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 537.3333446502686,
"epoch": 0.128,
"grad_norm": 40.94205497488467,
"kl": 0.012101554870605468,
"learning_rate": 2.99296421189274e-06,
"loss": 0.0005,
"reward": 0.7750000044703483,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7750000044703483,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 614.6333480834961,
"epoch": 0.13333333333333333,
"grad_norm": 0.32034952885628715,
"kl": 0.006664371490478516,
"learning_rate": 2.9900012394769546e-06,
"loss": 0.0003,
"reward": 0.6833333410322666,
"reward_std": 0.2742413729429245,
"rewards/accuracy_reward": 0.6833333410322666,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 584.5750144958496,
"epoch": 0.13866666666666666,
"grad_norm": 0.752973255526379,
"kl": 0.005373382568359375,
"learning_rate": 2.986520947776075e-06,
"loss": 0.0002,
"reward": 0.7666666723787785,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7666666723787785,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 548.7833450317382,
"epoch": 0.144,
"grad_norm": 0.2532104242029052,
"kl": 0.004761695861816406,
"learning_rate": 2.982524545126018e-06,
"loss": 0.0002,
"reward": 0.8333333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.8333333380520344,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 549.5000114440918,
"epoch": 0.14933333333333335,
"grad_norm": 0.09291231140770723,
"kl": 0.004460525512695312,
"learning_rate": 2.9780134190532553e-06,
"loss": 0.0002,
"reward": 0.8583333365619182,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.8583333365619182,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 608.5416820526123,
"epoch": 0.15466666666666667,
"grad_norm": 0.32170364852520245,
"kl": 0.004180049896240235,
"learning_rate": 2.972989135793071e-06,
"loss": 0.0002,
"reward": 0.6166666731238365,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.6166666731238365,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 618.9750152587891,
"epoch": 0.16,
"grad_norm": 0.2025206773963469,
"kl": 0.005657100677490234,
"learning_rate": 2.967453439745775e-06,
"loss": 0.0002,
"reward": 0.7416666716337204,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7416666716337204,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 611.0750167846679,
"epoch": 0.16533333333333333,
"grad_norm": 0.184411613214241,
"kl": 0.0198455810546875,
"learning_rate": 2.961408252871058e-06,
"loss": 0.0008,
"reward": 0.7666666731238365,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7666666731238365,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 569.2500148773194,
"epoch": 0.17066666666666666,
"grad_norm": 0.19077032525046195,
"kl": 0.005090141296386718,
"learning_rate": 2.9548556740207e-06,
"loss": 0.0002,
"reward": 0.6750000052154064,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.6750000052154064,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 589.1916816711425,
"epoch": 0.176,
"grad_norm": 0.4638160803395305,
"kl": 0.005456733703613281,
"learning_rate": 2.9477979782098592e-06,
"loss": 0.0002,
"reward": 0.7416666708886623,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7416666708886623,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 614.9333473205567,
"epoch": 0.18133333333333335,
"grad_norm": 0.45443884391008643,
"kl": 0.005468559265136719,
"learning_rate": 2.9402376158272022e-06,
"loss": 0.0002,
"reward": 0.7833333373069763,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7833333373069763,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 610.7750144958496,
"epoch": 0.18666666666666668,
"grad_norm": 0.40031783341085664,
"kl": 0.004351997375488281,
"learning_rate": 2.9321772117841463e-06,
"loss": 0.0002,
"reward": 0.7000000052154064,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7000000052154064,
"rewards/format_reward": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 544.2250152587891,
"epoch": 0.192,
"grad_norm": 0.21967968438769525,
"kl": 0.0079864501953125,
"learning_rate": 2.923619564603501e-06,
"loss": 0.0003,
"reward": 0.8250000037252903,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.8250000037252903,
"rewards/format_reward": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 550.1250141143798,
"epoch": 0.19733333333333333,
"grad_norm": 0.44685209999243625,
"kl": 0.007423973083496094,
"learning_rate": 2.9145676454478435e-06,
"loss": 0.0003,
"reward": 0.7166666716337204,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7166666716337204,
"rewards/format_reward": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 549.8583499908448,
"epoch": 0.20266666666666666,
"grad_norm": 0.7466901778117272,
"kl": 0.00619049072265625,
"learning_rate": 2.9050245970879456e-06,
"loss": 0.0002,
"reward": 0.8250000044703484,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.8250000044703484,
"rewards/format_reward": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 528.4500106811523,
"epoch": 0.208,
"grad_norm": 0.4099308077828794,
"kl": 0.005546188354492188,
"learning_rate": 2.8949937328116252e-06,
"loss": 0.0002,
"reward": 0.8500000037252903,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.8500000037252903,
"rewards/format_reward": 0.0,
"step": 195
},
{
"epoch": 0.21333333333333335,
"grad_norm": 0.32264634496767935,
"learning_rate": 2.884478535273393e-06,
"loss": 0.0003,
"step": 200
},
{
"epoch": 0.21333333333333335,
"eval_clip_ratio": 0.0,
"eval_completion_length": 585.7144854492187,
"eval_kl": 0.01230205078125,
"eval_loss": 0.014915091916918755,
"eval_reward": 0.6674666836738586,
"eval_reward_std": 0.17562994873523713,
"eval_rewards/accuracy_reward": 0.6674666836738586,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6022.9643,
"eval_samples_per_second": 0.83,
"eval_steps_per_second": 0.035,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 544.2750169754029,
"epoch": 0.21866666666666668,
"grad_norm": 0.5023905552183052,
"kl": 0.007254695892333985,
"learning_rate": 2.8734826552852934e-06,
"loss": 0.0003,
"reward": 0.8041666708886623,
"reward_std": 0.13712068647146225,
"rewards/accuracy_reward": 0.8041666708886623,
"rewards/format_reward": 0.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 561.0333488464355,
"epoch": 0.224,
"grad_norm": 0.5434607871254864,
"kl": 0.01640605926513672,
"learning_rate": 2.86200991054937e-06,
"loss": 0.0007,
"reward": 0.7750000044703483,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7750000044703483,
"rewards/format_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 555.3666790008544,
"epoch": 0.22933333333333333,
"grad_norm": 0.024550981610644177,
"kl": 0.006365013122558594,
"learning_rate": 2.850064284332176e-06,
"loss": 0.0003,
"reward": 0.8166666693985463,
"reward_std": 0.08660253882408142,
"rewards/accuracy_reward": 0.8166666693985463,
"rewards/format_reward": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 586.9333484649658,
"epoch": 0.23466666666666666,
"grad_norm": 0.41077115319026264,
"kl": 0.02978935241699219,
"learning_rate": 2.8376499240818166e-06,
"loss": 0.0012,
"reward": 0.6833333395421505,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6833333395421505,
"rewards/format_reward": 0.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 596.8916847229004,
"epoch": 0.24,
"grad_norm": 2.182794026264333,
"kl": 0.006468582153320313,
"learning_rate": 2.8247711399879734e-06,
"loss": 0.0003,
"reward": 0.7750000022351742,
"reward_std": 0.08660253882408142,
"rewards/accuracy_reward": 0.7750000022351742,
"rewards/format_reward": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 565.9666809082031,
"epoch": 0.24533333333333332,
"grad_norm": 0.27585581380583146,
"kl": 0.00873394012451172,
"learning_rate": 2.8114324034854378e-06,
"loss": 0.0003,
"reward": 0.8833333365619183,
"reward_std": 0.10103629529476166,
"rewards/accuracy_reward": 0.8833333365619183,
"rewards/format_reward": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 536.3666790008544,
"epoch": 0.25066666666666665,
"grad_norm": 0.23729950863573765,
"kl": 0.0073108673095703125,
"learning_rate": 2.7976383457016535e-06,
"loss": 0.0003,
"reward": 0.7666666686534882,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.7666666686534882,
"rewards/format_reward": 0.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 637.5833518981933,
"epoch": 0.256,
"grad_norm": 0.17208585244977956,
"kl": 0.007070159912109375,
"learning_rate": 2.7833937558488187e-06,
"loss": 0.0003,
"reward": 0.7000000037252903,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7000000037252903,
"rewards/format_reward": 0.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 507.2166820526123,
"epoch": 0.2613333333333333,
"grad_norm": 0.3031116441751077,
"kl": 0.007193946838378906,
"learning_rate": 2.7687035795611003e-06,
"loss": 0.0003,
"reward": 0.8416666708886623,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.8416666708886623,
"rewards/format_reward": 0.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 558.5666793823242,
"epoch": 0.26666666666666666,
"grad_norm": 13.547581195880626,
"kl": 0.035246658325195315,
"learning_rate": 2.7535729171775408e-06,
"loss": 0.0014,
"reward": 0.7333333380520344,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7333333380520344,
"rewards/format_reward": 0.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 581.6166820526123,
"epoch": 0.272,
"grad_norm": 0.5656864832529094,
"kl": 0.007228469848632813,
"learning_rate": 2.7380070219712514e-06,
"loss": 0.0003,
"reward": 0.8583333373069764,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.8583333373069764,
"rewards/format_reward": 0.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 593.0583480834961,
"epoch": 0.2773333333333333,
"grad_norm": 0.6148965285931733,
"kl": 0.00813121795654297,
"learning_rate": 2.722011298325509e-06,
"loss": 0.0003,
"reward": 0.7750000044703483,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7750000044703483,
"rewards/format_reward": 0.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 559.4750144958496,
"epoch": 0.2826666666666667,
"grad_norm": 0.5111668603735469,
"kl": 0.006645774841308594,
"learning_rate": 2.705591299857385e-06,
"loss": 0.0003,
"reward": 0.7250000044703484,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7250000044703484,
"rewards/format_reward": 0.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 606.691682434082,
"epoch": 0.288,
"grad_norm": 0.42461394853936735,
"kl": 0.00601654052734375,
"learning_rate": 2.6887527274895657e-06,
"loss": 0.0002,
"reward": 0.7916666716337204,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7916666716337204,
"rewards/format_reward": 0.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 521.008349609375,
"epoch": 0.29333333333333333,
"grad_norm": 0.02587523894842855,
"kl": 0.15564346313476562,
"learning_rate": 2.6715014274710265e-06,
"loss": 0.0062,
"reward": 0.8583333373069764,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.8583333373069764,
"rewards/format_reward": 0.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 614.7916824340821,
"epoch": 0.2986666666666667,
"grad_norm": 0.014189638152861336,
"kl": 0.040869140625,
"learning_rate": 2.65384338934725e-06,
"loss": 0.0016,
"reward": 0.7583333373069763,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7583333373069763,
"rewards/format_reward": 0.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 576.7083457946777,
"epoch": 0.304,
"grad_norm": 0.84554441688506,
"kl": 0.0066776275634765625,
"learning_rate": 2.6357847438806916e-06,
"loss": 0.0003,
"reward": 0.7916666701436043,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7916666701436043,
"rewards/format_reward": 0.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 618.333345413208,
"epoch": 0.30933333333333335,
"grad_norm": 0.3105742502280914,
"kl": 0.008008956909179688,
"learning_rate": 2.617331760922218e-06,
"loss": 0.0003,
"reward": 0.7250000029802323,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.7250000029802323,
"rewards/format_reward": 0.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 641.9583492279053,
"epoch": 0.31466666666666665,
"grad_norm": 0.7728079687941386,
"kl": 0.008575248718261718,
"learning_rate": 2.598490847234253e-06,
"loss": 0.0003,
"reward": 0.6833333373069763,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.6833333373069763,
"rewards/format_reward": 0.0,
"step": 295
},
{
"epoch": 0.32,
"grad_norm": 0.18440635331679817,
"learning_rate": 2.5792685442663883e-06,
"loss": 0.0002,
"step": 300
},
{
"epoch": 0.32,
"eval_clip_ratio": 0.0,
"eval_completion_length": 576.1156858398438,
"eval_kl": 0.0124282958984375,
"eval_loss": 0.015103225596249104,
"eval_reward": 0.665000018286705,
"eval_reward_std": 0.1802487503528595,
"eval_rewards/accuracy_reward": 0.665000018286705,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 5982.2541,
"eval_samples_per_second": 0.836,
"eval_steps_per_second": 0.035,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 556.7458492279053,
"epoch": 0.3253333333333333,
"grad_norm": 0.5403363319648429,
"kl": 0.0075927734375,
"learning_rate": 2.559671525884232e-06,
"loss": 0.0004,
"reward": 0.7541666712611914,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7541666712611914,
"rewards/format_reward": 0.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 574.9333492279053,
"epoch": 0.33066666666666666,
"grad_norm": 0.3724847648939602,
"kl": 0.006891632080078125,
"learning_rate": 2.539706596052286e-06,
"loss": 0.0003,
"reward": 0.7583333410322666,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.7583333410322666,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 474.5166786193848,
"epoch": 0.336,
"grad_norm": 0.36557996981307944,
"kl": 0.011857986450195312,
"learning_rate": 2.5193806864716466e-06,
"loss": 0.0005,
"reward": 0.8333333365619182,
"reward_std": 0.10103629529476166,
"rewards/accuracy_reward": 0.8333333365619182,
"rewards/format_reward": 0.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 553.7416828155517,
"epoch": 0.3413333333333333,
"grad_norm": 0.2438346381823879,
"kl": 0.008208465576171876,
"learning_rate": 2.4987008541733663e-06,
"loss": 0.0003,
"reward": 0.7833333365619183,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.7833333365619183,
"rewards/format_reward": 0.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 615.3500122070312,
"epoch": 0.3466666666666667,
"grad_norm": 0.340457263416433,
"kl": 0.00625,
"learning_rate": 2.477674279068291e-06,
"loss": 0.0003,
"reward": 0.7250000037252903,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7250000037252903,
"rewards/format_reward": 0.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 603.6416778564453,
"epoch": 0.352,
"grad_norm": 0.16359424999897557,
"kl": 0.0068511962890625,
"learning_rate": 2.4563082614542412e-06,
"loss": 0.0003,
"reward": 0.8333333358168602,
"reward_std": 0.08660253882408142,
"rewards/accuracy_reward": 0.8333333358168602,
"rewards/format_reward": 0.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 599.6083488464355,
"epoch": 0.35733333333333334,
"grad_norm": 0.25279559853298,
"kl": 0.00695953369140625,
"learning_rate": 2.4346102194813937e-06,
"loss": 0.0003,
"reward": 0.8250000029802322,
"reward_std": 0.10103629529476166,
"rewards/accuracy_reward": 0.8250000029802322,
"rewards/format_reward": 0.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 526.5416801452636,
"epoch": 0.3626666666666667,
"grad_norm": 0.6628547412135258,
"kl": 0.009409332275390625,
"learning_rate": 2.4125876865767443e-06,
"loss": 0.0004,
"reward": 0.8250000037252903,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.8250000037252903,
"rewards/format_reward": 0.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 622.2250129699707,
"epoch": 0.368,
"grad_norm": 0.12252643555238332,
"kl": 0.009972000122070312,
"learning_rate": 2.390248308828548e-06,
"loss": 0.0004,
"reward": 0.7750000037252903,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.7750000037252903,
"rewards/format_reward": 0.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 561.5333473205567,
"epoch": 0.37333333333333335,
"grad_norm": 0.25234822752428143,
"kl": 0.008985519409179688,
"learning_rate": 2.367599842331646e-06,
"loss": 0.0004,
"reward": 0.8583333373069764,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.8583333373069764,
"rewards/format_reward": 0.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 615.158349609375,
"epoch": 0.37866666666666665,
"grad_norm": 0.4811409561182482,
"kl": 0.0074596405029296875,
"learning_rate": 2.344650150494596e-06,
"loss": 0.0003,
"reward": 0.8083333380520343,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.8083333380520343,
"rewards/format_reward": 0.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 585.7250133514405,
"epoch": 0.384,
"grad_norm": 0.24324280894262207,
"kl": 0.010303878784179687,
"learning_rate": 2.3214072013095436e-06,
"loss": 0.0004,
"reward": 0.7666666701436042,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.7666666701436042,
"rewards/format_reward": 0.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 592.5333507537841,
"epoch": 0.3893333333333333,
"grad_norm": 0.6103293670855143,
"kl": 0.0204681396484375,
"learning_rate": 2.2978790645857867e-06,
"loss": 0.0008,
"reward": 0.7750000074505806,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7750000074505806,
"rewards/format_reward": 0.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 628.2000106811523,
"epoch": 0.39466666666666667,
"grad_norm": 0.46834964962728914,
"kl": 0.03272857666015625,
"learning_rate": 2.274073909147986e-06,
"loss": 0.0013,
"reward": 0.6916666716337204,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.6916666716337204,
"rewards/format_reward": 0.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 590.9000160217286,
"epoch": 0.4,
"grad_norm": 2.0549829286804853,
"kl": 0.06357192993164062,
"learning_rate": 2.25e-06,
"loss": 0.0025,
"reward": 0.7083333373069763,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.7083333373069763,
"rewards/format_reward": 0.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 585.6750099182129,
"epoch": 0.4053333333333333,
"grad_norm": 12.164472239999887,
"kl": 0.3590213775634766,
"learning_rate": 2.225665695455325e-06,
"loss": 0.0143,
"reward": 0.7000000052154064,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7000000052154064,
"rewards/format_reward": 0.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 550.1000122070312,
"epoch": 0.4106666666666667,
"grad_norm": 1.366348076215836,
"kl": 0.05910415649414062,
"learning_rate": 2.20107944423514e-06,
"loss": 0.0024,
"reward": 0.8000000067055225,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.8000000067055225,
"rewards/format_reward": 0.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 548.5416797637939,
"epoch": 0.416,
"grad_norm": 0.8962462379303796,
"kl": 0.33824996948242186,
"learning_rate": 2.1762497825349665e-06,
"loss": 0.0135,
"reward": 0.7416666708886623,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7416666708886623,
"rewards/format_reward": 0.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 578.9750118255615,
"epoch": 0.42133333333333334,
"grad_norm": 1.8407040143574982,
"kl": 1.4021547317504883,
"learning_rate": 2.1511853310609467e-06,
"loss": 0.0558,
"reward": 0.6583333373069763,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.6583333373069763,
"rewards/format_reward": 0.0,
"step": 395
},
{
"epoch": 0.4266666666666667,
"grad_norm": 1.2731052300370491,
"learning_rate": 2.1258947920367943e-06,
"loss": 0.0066,
"step": 400
},
{
"epoch": 0.4266666666666667,
"eval_clip_ratio": 0.0,
"eval_completion_length": 569.7938848144531,
"eval_kl": 13424.193332373046,
"eval_loss": 583.3372192382812,
"eval_reward": 0.6630666847705841,
"eval_reward_std": 0.18186533155441284,
"eval_rewards/accuracy_reward": 0.6630666847705841,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 5981.5778,
"eval_samples_per_second": 0.836,
"eval_steps_per_second": 0.035,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 569.2000112533569,
"epoch": 0.432,
"grad_norm": 0.38484188447051687,
"kl": 0.11343555450439453,
"learning_rate": 2.100386946182431e-06,
"loss": 0.0025,
"reward": 0.7250000055879354,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7250000055879354,
"rewards/format_reward": 0.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 554.5666835784912,
"epoch": 0.43733333333333335,
"grad_norm": 1.003802162008014,
"kl": 0.1323028564453125,
"learning_rate": 2.0746706496653765e-06,
"loss": 0.0053,
"reward": 0.7500000044703483,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7500000044703483,
"rewards/format_reward": 0.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 586.441682434082,
"epoch": 0.44266666666666665,
"grad_norm": 1.3927518076538108,
"kl": 0.047705459594726565,
"learning_rate": 2.048754831025942e-06,
"loss": 0.0019,
"reward": 0.7666666738688945,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7666666738688945,
"rewards/format_reward": 0.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 558.3500160217285,
"epoch": 0.448,
"grad_norm": 2.893427001373877,
"kl": 0.1003173828125,
"learning_rate": 2.0226484880772943e-06,
"loss": 0.004,
"reward": 0.8250000014901161,
"reward_std": 0.05773502588272095,
"rewards/accuracy_reward": 0.8250000014901161,
"rewards/format_reward": 0.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 590.9166778564453,
"epoch": 0.4533333333333333,
"grad_norm": 0.36645522567137545,
"kl": 0.13446083068847656,
"learning_rate": 1.9963606847814702e-06,
"loss": 0.0054,
"reward": 0.7416666716337204,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7416666716337204,
"rewards/format_reward": 0.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 537.2083503723145,
"epoch": 0.45866666666666667,
"grad_norm": 0.3671458640275,
"kl": 0.054613494873046876,
"learning_rate": 1.9699005481024273e-06,
"loss": 0.0022,
"reward": 0.7750000044703483,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7750000044703483,
"rewards/format_reward": 0.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 601.3833442687989,
"epoch": 0.464,
"grad_norm": 1.046879617710471,
"kl": 0.020965576171875,
"learning_rate": 1.943277264837214e-06,
"loss": 0.0008,
"reward": 0.7333333358168602,
"reward_std": 0.07216878235340118,
"rewards/accuracy_reward": 0.7333333358168602,
"rewards/format_reward": 0.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 547.3333511352539,
"epoch": 0.4693333333333333,
"grad_norm": 0.21409956983248427,
"kl": 0.018289947509765626,
"learning_rate": 1.9165000784263734e-06,
"loss": 0.0007,
"reward": 0.7666666716337204,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7666666716337204,
"rewards/format_reward": 0.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 566.5416786193848,
"epoch": 0.4746666666666667,
"grad_norm": 0.3456348995650795,
"kl": 0.03510856628417969,
"learning_rate": 1.8895782857446754e-06,
"loss": 0.0014,
"reward": 0.7916666723787784,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7916666723787784,
"rewards/format_reward": 0.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 580.8000129699707,
"epoch": 0.48,
"grad_norm": 0.9792448320575614,
"kl": 0.030012130737304688,
"learning_rate": 1.8625212338733005e-06,
"loss": 0.0012,
"reward": 0.7500000044703483,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7500000044703483,
"rewards/format_reward": 0.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 612.6583484649658,
"epoch": 0.48533333333333334,
"grad_norm": 1.9811858834651732,
"kl": 0.045981216430664065,
"learning_rate": 1.835338316854588e-06,
"loss": 0.0018,
"reward": 0.7083333395421505,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7083333395421505,
"rewards/format_reward": 0.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 580.3916793823242,
"epoch": 0.49066666666666664,
"grad_norm": 0.22045853934983553,
"kl": 0.022677230834960937,
"learning_rate": 1.8080389724304863e-06,
"loss": 0.0009,
"reward": 0.7833333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7833333380520344,
"rewards/format_reward": 0.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 614.6166839599609,
"epoch": 0.496,
"grad_norm": 0.29012755281009805,
"kl": 0.03320999145507812,
"learning_rate": 1.7806326787658219e-06,
"loss": 0.0013,
"reward": 0.7000000067055225,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7000000067055225,
"rewards/format_reward": 0.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 616.9083450317382,
"epoch": 0.5013333333333333,
"grad_norm": 0.45521762259008347,
"kl": 0.05942230224609375,
"learning_rate": 1.7531289511575427e-06,
"loss": 0.0024,
"reward": 0.6833333410322666,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.6833333410322666,
"rewards/format_reward": 0.0,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 669.9333499908447,
"epoch": 0.5066666666666667,
"grad_norm": 1.2238798671126319,
"kl": 0.04663505554199219,
"learning_rate": 1.7255373387310633e-06,
"loss": 0.0019,
"reward": 0.6833333395421505,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6833333395421505,
"rewards/format_reward": 0.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 598.641682434082,
"epoch": 0.512,
"grad_norm": 0.6120301813056643,
"kl": 0.09711380004882812,
"learning_rate": 1.6978674211248676e-06,
"loss": 0.0039,
"reward": 0.7500000037252903,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7500000037252903,
"rewards/format_reward": 0.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 608.883345413208,
"epoch": 0.5173333333333333,
"grad_norm": 26.043777462496045,
"kl": 0.18498878479003905,
"learning_rate": 1.6701288051645182e-06,
"loss": 0.0074,
"reward": 0.6750000044703484,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.6750000044703484,
"rewards/format_reward": 0.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 597.6416805267334,
"epoch": 0.5226666666666666,
"grad_norm": 1.3300653593847538,
"kl": 0.08308296203613282,
"learning_rate": 1.642331121527223e-06,
"loss": 0.0033,
"reward": 0.7750000029802322,
"reward_std": 0.12990380823612213,
"rewards/accuracy_reward": 0.7750000029802322,
"rewards/format_reward": 0.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 557.4333480834961,
"epoch": 0.528,
"grad_norm": 2.4699038123399015,
"kl": 0.07253971099853515,
"learning_rate": 1.6144840213981257e-06,
"loss": 0.0029,
"reward": 0.7583333380520344,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7583333380520344,
"rewards/format_reward": 0.0,
"step": 495
},
{
"epoch": 0.5333333333333333,
"grad_norm": 1.2756243338289226,
"learning_rate": 1.5865971731194738e-06,
"loss": 0.0097,
"step": 500
},
{
"epoch": 0.5333333333333333,
"eval_clip_ratio": 0.0,
"eval_completion_length": 600.5289517089843,
"eval_kl": 0.145334228515625,
"eval_loss": 0.03381989896297455,
"eval_reward": 0.6379333512067795,
"eval_reward_std": 0.20980908365249634,
"eval_rewards/accuracy_reward": 0.6379333512067795,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6091.2544,
"eval_samples_per_second": 0.821,
"eval_steps_per_second": 0.034,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 585.5958461761475,
"epoch": 0.5386666666666666,
"grad_norm": 0.9549348299070431,
"kl": 0.15351810455322265,
"learning_rate": 1.5586802588338262e-06,
"loss": 0.0026,
"reward": 0.7125000048428773,
"reward_std": 0.18042195588350296,
"rewards/accuracy_reward": 0.7125000048428773,
"rewards/format_reward": 0.0,
"step": 505
},
{
"clip_ratio": 0.0,
"completion_length": 578.0666816711425,
"epoch": 0.544,
"grad_norm": 1.2238718249429645,
"kl": 0.026650619506835938,
"learning_rate": 1.5307429711224756e-06,
"loss": 0.0011,
"reward": 0.7583333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7583333380520344,
"rewards/format_reward": 0.0,
"step": 510
},
{
"clip_ratio": 0.0,
"completion_length": 592.3000137329102,
"epoch": 0.5493333333333333,
"grad_norm": 9.727501760877075,
"kl": 0.07199592590332031,
"learning_rate": 1.5027950096402447e-06,
"loss": 0.0029,
"reward": 0.7416666731238365,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7416666731238365,
"rewards/format_reward": 0.0,
"step": 515
},
{
"clip_ratio": 0.0,
"completion_length": 616.6750164031982,
"epoch": 0.5546666666666666,
"grad_norm": 15.439896164361146,
"kl": 0.05631599426269531,
"learning_rate": 1.474846077747821e-06,
"loss": 0.0023,
"reward": 0.7500000059604645,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7500000059604645,
"rewards/format_reward": 0.0,
"step": 520
},
{
"clip_ratio": 0.0,
"completion_length": 613.5250183105469,
"epoch": 0.56,
"grad_norm": 0.9960427525825399,
"kl": 0.05883331298828125,
"learning_rate": 1.4469058791428154e-06,
"loss": 0.0024,
"reward": 0.6916666723787784,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6916666723787784,
"rewards/format_reward": 0.0,
"step": 525
},
{
"clip_ratio": 0.0,
"completion_length": 591.8166831970215,
"epoch": 0.5653333333333334,
"grad_norm": 3.046337395450479,
"kl": 0.07379722595214844,
"learning_rate": 1.4189841144906928e-06,
"loss": 0.0029,
"reward": 0.7583333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7583333380520344,
"rewards/format_reward": 0.0,
"step": 530
},
{
"clip_ratio": 0.0,
"completion_length": 542.066683959961,
"epoch": 0.5706666666666667,
"grad_norm": 1.340816244971557,
"kl": 0.04628486633300781,
"learning_rate": 1.3910904780567642e-06,
"loss": 0.0019,
"reward": 0.8250000052154064,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.8250000052154064,
"rewards/format_reward": 0.0,
"step": 535
},
{
"clip_ratio": 0.0,
"completion_length": 587.141682434082,
"epoch": 0.576,
"grad_norm": 2.6344014025933356,
"kl": 0.041788482666015626,
"learning_rate": 1.3632346543403946e-06,
"loss": 0.0017,
"reward": 0.6916666738688946,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.6916666738688946,
"rewards/format_reward": 0.0,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 641.9750152587891,
"epoch": 0.5813333333333334,
"grad_norm": 4.6049111419007165,
"kl": 0.12230720520019531,
"learning_rate": 1.335426314712607e-06,
"loss": 0.0049,
"reward": 0.6833333387970925,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6833333387970925,
"rewards/format_reward": 0.0,
"step": 545
},
{
"clip_ratio": 0.0,
"completion_length": 688.2666839599609,
"epoch": 0.5866666666666667,
"grad_norm": 3.521827139400347,
"kl": 0.06717300415039062,
"learning_rate": 1.3076751140582396e-06,
"loss": 0.0027,
"reward": 0.6833333402872086,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.6833333402872086,
"rewards/format_reward": 0.0,
"step": 550
},
{
"clip_ratio": 0.0,
"completion_length": 577.8250099182129,
"epoch": 0.592,
"grad_norm": 3.564377427961792,
"kl": 0.1139495849609375,
"learning_rate": 1.2799906874238297e-06,
"loss": 0.0045,
"reward": 0.7250000029802323,
"reward_std": 0.10103629529476166,
"rewards/accuracy_reward": 0.7250000029802323,
"rewards/format_reward": 0.0,
"step": 555
},
{
"clip_ratio": 0.0,
"completion_length": 645.683349609375,
"epoch": 0.5973333333333334,
"grad_norm": 0.9903458969279051,
"kl": 0.12611656188964843,
"learning_rate": 1.2523826466723843e-06,
"loss": 0.005,
"reward": 0.6833333387970925,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.6833333387970925,
"rewards/format_reward": 0.0,
"step": 560
},
{
"clip_ratio": 0.0,
"completion_length": 582.9333503723144,
"epoch": 0.6026666666666667,
"grad_norm": 2.225108665952508,
"kl": 0.07784194946289062,
"learning_rate": 1.2248605771462016e-06,
"loss": 0.0031,
"reward": 0.6833333387970925,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.6833333387970925,
"rewards/format_reward": 0.0,
"step": 565
},
{
"clip_ratio": 0.0,
"completion_length": 566.6333442687989,
"epoch": 0.608,
"grad_norm": 3.291993199224936,
"kl": 0.1331024169921875,
"learning_rate": 1.1974340343388974e-06,
"loss": 0.0053,
"reward": 0.7083333395421505,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7083333395421505,
"rewards/format_reward": 0.0,
"step": 570
},
{
"clip_ratio": 0.0,
"completion_length": 574.4333473205567,
"epoch": 0.6133333333333333,
"grad_norm": 3.912966689921013,
"kl": 0.22335777282714844,
"learning_rate": 1.1701125405777965e-06,
"loss": 0.0089,
"reward": 0.7500000044703483,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7500000044703483,
"rewards/format_reward": 0.0,
"step": 575
},
{
"clip_ratio": 0.0,
"completion_length": 618.2916778564453,
"epoch": 0.6186666666666667,
"grad_norm": 21.294549926589646,
"kl": 1.0905929565429688,
"learning_rate": 1.142905581717841e-06,
"loss": 0.0436,
"reward": 0.7500000067055226,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.7500000067055226,
"rewards/format_reward": 0.0,
"step": 580
},
{
"clip_ratio": 0.0,
"completion_length": 533.8166828155518,
"epoch": 0.624,
"grad_norm": 6.403461046316986,
"kl": 0.11275444030761719,
"learning_rate": 1.1158226038481584e-06,
"loss": 0.0045,
"reward": 0.8750000037252903,
"reward_std": 0.1154700517654419,
"rewards/accuracy_reward": 0.8750000037252903,
"rewards/format_reward": 0.0,
"step": 585
},
{
"clip_ratio": 0.0,
"completion_length": 638.766682434082,
"epoch": 0.6293333333333333,
"grad_norm": 0.9273470799146647,
"kl": 0.7552982330322265,
"learning_rate": 1.0888730100124355e-06,
"loss": 0.0302,
"reward": 0.7083333402872085,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7083333402872085,
"rewards/format_reward": 0.0,
"step": 590
},
{
"clip_ratio": 0.0,
"completion_length": 529.8416812896728,
"epoch": 0.6346666666666667,
"grad_norm": 15.858090968775276,
"kl": 0.2991436004638672,
"learning_rate": 1.062066156944242e-06,
"loss": 0.012,
"reward": 0.8333333373069763,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.8333333373069763,
"rewards/format_reward": 0.0,
"step": 595
},
{
"epoch": 0.64,
"grad_norm": 17.119879704647552,
"learning_rate": 1.0354113518184304e-06,
"loss": 0.0045,
"step": 600
},
{
"epoch": 0.64,
"eval_clip_ratio": 0.0,
"eval_completion_length": 598.1299530761719,
"eval_kl": 0.886337255859375,
"eval_loss": 0.07811599224805832,
"eval_reward": 0.6349333519935608,
"eval_reward_std": 0.20807703318595885,
"eval_rewards/accuracy_reward": 0.6349333519935608,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6102.8046,
"eval_samples_per_second": 0.819,
"eval_steps_per_second": 0.034,
"step": 600
},
{
"clip_ratio": 0.0,
"completion_length": 618.4583503723145,
"epoch": 0.6453333333333333,
"grad_norm": 237.8237245765829,
"kl": 0.350294303894043,
"learning_rate": 1.008917849019739e-06,
"loss": 0.0234,
"reward": 0.7333333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7333333380520344,
"rewards/format_reward": 0.0,
"step": 605
},
{
"clip_ratio": 0.0,
"completion_length": 583.5416801452636,
"epoch": 0.6506666666666666,
"grad_norm": 7.9821732038803805,
"kl": 0.12260932922363281,
"learning_rate": 9.825948469297303e-07,
"loss": 0.0049,
"reward": 0.7750000044703483,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7750000044703483,
"rewards/format_reward": 0.0,
"step": 610
},
{
"clip_ratio": 0.0,
"completion_length": 609.900015258789,
"epoch": 0.656,
"grad_norm": 94865.97576667031,
"kl": 173.92265777587892,
"learning_rate": 9.564514847331647e-07,
"loss": 6.9811,
"reward": 0.7166666716337204,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7166666716337204,
"rewards/format_reward": 0.0,
"step": 615
},
{
"clip_ratio": 0.0,
"completion_length": 563.7083461761474,
"epoch": 0.6613333333333333,
"grad_norm": 1.7355610026067,
"kl": 0.3627006530761719,
"learning_rate": 9.304968392449361e-07,
"loss": 0.0145,
"reward": 0.7083333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7083333380520344,
"rewards/format_reward": 0.0,
"step": 620
},
{
"clip_ratio": 0.0,
"completion_length": 650.9000160217286,
"epoch": 0.6666666666666666,
"grad_norm": 4.055117700318325,
"kl": 1.220144271850586,
"learning_rate": 9.047399217586552e-07,
"loss": 0.0488,
"reward": 0.7333333402872085,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.7333333402872085,
"rewards/format_reward": 0.0,
"step": 625
},
{
"clip_ratio": 0.0,
"completion_length": 596.6833438873291,
"epoch": 0.672,
"grad_norm": 3.2951177269303034,
"kl": 0.484442138671875,
"learning_rate": 8.791896749179831e-07,
"loss": 0.0194,
"reward": 0.6666666716337204,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.6666666716337204,
"rewards/format_reward": 0.0,
"step": 630
},
{
"clip_ratio": 0.0,
"completion_length": 688.1416854858398,
"epoch": 0.6773333333333333,
"grad_norm": 6.185421881006396,
"kl": 0.8364780426025391,
"learning_rate": 8.538549696118023e-07,
"loss": 0.0335,
"reward": 0.7083333365619182,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7083333365619182,
"rewards/format_reward": 0.0,
"step": 635
},
{
"clip_ratio": 0.0,
"completion_length": 614.8666843414306,
"epoch": 0.6826666666666666,
"grad_norm": 3.6901707366378296,
"kl": 0.5915939331054687,
"learning_rate": 8.287446018942973e-07,
"loss": 0.0236,
"reward": 0.7416666716337204,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.7416666716337204,
"rewards/format_reward": 0.0,
"step": 640
},
{
"clip_ratio": 0.0,
"completion_length": 602.1500183105469,
"epoch": 0.688,
"grad_norm": 2.298300879666136,
"kl": 0.34840736389160154,
"learning_rate": 8.038672899310176e-07,
"loss": 0.014,
"reward": 0.6833333380520343,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6833333380520343,
"rewards/format_reward": 0.0,
"step": 645
},
{
"clip_ratio": 0.0,
"completion_length": 615.6250175476074,
"epoch": 0.6933333333333334,
"grad_norm": 11.20824930878958,
"kl": 0.19427947998046874,
"learning_rate": 7.792316709719875e-07,
"loss": 0.0078,
"reward": 0.6583333395421505,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6583333395421505,
"rewards/format_reward": 0.0,
"step": 650
},
{
"clip_ratio": 0.0,
"completion_length": 594.0416820526123,
"epoch": 0.6986666666666667,
"grad_norm": 11.717220592320137,
"kl": 2.438280487060547,
"learning_rate": 7.548462983529016e-07,
"loss": 0.0976,
"reward": 0.7583333425223827,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.7583333425223827,
"rewards/format_reward": 0.0,
"step": 655
},
{
"clip_ratio": 0.0,
"completion_length": 607.308352279663,
"epoch": 0.704,
"grad_norm": 1.5198877689737873,
"kl": 0.19807205200195313,
"learning_rate": 7.307196385254621e-07,
"loss": 0.0079,
"reward": 0.7416666723787785,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7416666723787785,
"rewards/format_reward": 0.0,
"step": 660
},
{
"clip_ratio": 0.0,
"completion_length": 612.7500148773194,
"epoch": 0.7093333333333334,
"grad_norm": 1.2366765264367552,
"kl": 0.3959392547607422,
"learning_rate": 7.068600681178772e-07,
"loss": 0.0158,
"reward": 0.8000000052154064,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.8000000052154064,
"rewards/format_reward": 0.0,
"step": 665
},
{
"clip_ratio": 0.0,
"completion_length": 627.3750144958497,
"epoch": 0.7146666666666667,
"grad_norm": 1.26819472034522,
"kl": 0.4778144836425781,
"learning_rate": 6.832758710265492e-07,
"loss": 0.0191,
"reward": 0.6666666716337204,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.6666666716337204,
"rewards/format_reward": 0.0,
"step": 670
},
{
"clip_ratio": 0.0,
"completion_length": 619.6833473205567,
"epoch": 0.72,
"grad_norm": 4.274976105654847,
"kl": 0.11606101989746094,
"learning_rate": 6.599752355399538e-07,
"loss": 0.0046,
"reward": 0.7000000052154064,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.7000000052154064,
"rewards/format_reward": 0.0,
"step": 675
},
{
"clip_ratio": 0.0,
"completion_length": 638.4916831970215,
"epoch": 0.7253333333333334,
"grad_norm": 0.36858027198187077,
"kl": 0.18754081726074218,
"learning_rate": 6.369662514957191e-07,
"loss": 0.0075,
"reward": 0.7166666738688946,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7166666738688946,
"rewards/format_reward": 0.0,
"step": 680
},
{
"clip_ratio": 0.0,
"completion_length": 608.3666835784912,
"epoch": 0.7306666666666667,
"grad_norm": 3.047545343694443,
"kl": 0.05216217041015625,
"learning_rate": 6.142569074718818e-07,
"loss": 0.0021,
"reward": 0.7500000059604645,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7500000059604645,
"rewards/format_reward": 0.0,
"step": 685
},
{
"clip_ratio": 0.0,
"completion_length": 662.0166839599609,
"epoch": 0.736,
"grad_norm": 0.45904383752278927,
"kl": 0.04991302490234375,
"learning_rate": 5.918550880133018e-07,
"loss": 0.002,
"reward": 0.5833333417773247,
"reward_std": 0.28867512941360474,
"rewards/accuracy_reward": 0.5833333417773247,
"rewards/format_reward": 0.0,
"step": 690
},
{
"clip_ratio": 0.0,
"completion_length": 565.958348083496,
"epoch": 0.7413333333333333,
"grad_norm": 0.616371610316275,
"kl": 1.7408302307128907,
"learning_rate": 5.697685708941996e-07,
"loss": 0.0696,
"reward": 0.7166666708886623,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7166666708886623,
"rewards/format_reward": 0.0,
"step": 695
},
{
"epoch": 0.7466666666666667,
"grad_norm": 1.148258838235239,
"learning_rate": 5.480050244177573e-07,
"loss": 0.0044,
"step": 700
},
{
"epoch": 0.7466666666666667,
"eval_clip_ratio": 0.0,
"eval_completion_length": 599.234618359375,
"eval_kl": 155189.75018701173,
"eval_loss": 8703.8037109375,
"eval_reward": 0.6249333506584167,
"eval_reward_std": 0.2200859182357788,
"eval_rewards/accuracy_reward": 0.6249333506584167,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6119.4517,
"eval_samples_per_second": 0.817,
"eval_steps_per_second": 0.034,
"step": 700
},
{
"clip_ratio": 0.0,
"completion_length": 594.833348083496,
"epoch": 0.752,
"grad_norm": 33.42924134136288,
"kl": 0.2813268661499023,
"learning_rate": 5.265720047537318e-07,
"loss": 0.0181,
"reward": 0.6875000059604645,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6875000059604645,
"rewards/format_reward": 0.0,
"step": 705
},
{
"clip_ratio": 0.0,
"completion_length": 599.6500148773193,
"epoch": 0.7573333333333333,
"grad_norm": 31.602691481649085,
"kl": 0.2062408447265625,
"learning_rate": 5.054769533149999e-07,
"loss": 0.0083,
"reward": 0.8000000044703484,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.8000000044703484,
"rewards/format_reward": 0.0,
"step": 710
},
{
"clip_ratio": 0.0,
"completion_length": 563.0916854858399,
"epoch": 0.7626666666666667,
"grad_norm": 14.824445401418682,
"kl": 0.5208763122558594,
"learning_rate": 4.847271941739458e-07,
"loss": 0.0209,
"reward": 0.6583333395421505,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.6583333395421505,
"rewards/format_reward": 0.0,
"step": 715
},
{
"clip_ratio": 0.0,
"completion_length": 575.7666816711426,
"epoch": 0.768,
"grad_norm": 3.361014733027139,
"kl": 0.14543685913085938,
"learning_rate": 4.643299315195855e-07,
"loss": 0.0058,
"reward": 0.7083333395421505,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7083333395421505,
"rewards/format_reward": 0.0,
"step": 720
},
{
"clip_ratio": 0.0,
"completion_length": 593.8666786193847,
"epoch": 0.7733333333333333,
"grad_norm": 0.49783824002899163,
"kl": 0.14712867736816407,
"learning_rate": 4.442922471563205e-07,
"loss": 0.0059,
"reward": 0.7250000067055226,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7250000067055226,
"rewards/format_reward": 0.0,
"step": 725
},
{
"clip_ratio": 0.0,
"completion_length": 660.7500198364257,
"epoch": 0.7786666666666666,
"grad_norm": 1.3968867213585785,
"kl": 11.599226379394532,
"learning_rate": 4.24621098045175e-07,
"loss": 0.4664,
"reward": 0.6833333380520343,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.6833333380520343,
"rewards/format_reward": 0.0,
"step": 730
},
{
"clip_ratio": 0.0,
"completion_length": 633.0583499908447,
"epoch": 0.784,
"grad_norm": 1.7330543134117584,
"kl": 0.2003498077392578,
"learning_rate": 4.053233138883835e-07,
"loss": 0.008,
"reward": 0.6333333395421505,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.6333333395421505,
"rewards/format_reward": 0.0,
"step": 735
},
{
"clip_ratio": 0.0,
"completion_length": 607.7166835784913,
"epoch": 0.7893333333333333,
"grad_norm": 2.1665020676583495,
"kl": 0.12012977600097656,
"learning_rate": 3.864055947581605e-07,
"loss": 0.0048,
"reward": 0.6583333402872086,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.6583333402872086,
"rewards/format_reward": 0.0,
"step": 740
},
{
"clip_ratio": 0.0,
"completion_length": 669.0750186920166,
"epoch": 0.7946666666666666,
"grad_norm": 1.1622261596005705,
"kl": 0.07356147766113282,
"learning_rate": 3.6787450877047543e-07,
"loss": 0.0029,
"reward": 0.6333333387970924,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.6333333387970924,
"rewards/format_reward": 0.0,
"step": 745
},
{
"clip_ratio": 0.0,
"completion_length": 578.5916860580444,
"epoch": 0.8,
"grad_norm": 42.60513773493881,
"kl": 0.33280181884765625,
"learning_rate": 3.4973648980464454e-07,
"loss": 0.0133,
"reward": 0.6833333395421505,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6833333395421505,
"rewards/format_reward": 0.0,
"step": 750
},
{
"clip_ratio": 0.0,
"completion_length": 572.9250129699707,
"epoch": 0.8053333333333333,
"grad_norm": 0.5348584171724263,
"kl": 0.2312854766845703,
"learning_rate": 3.3199783526952656e-07,
"loss": 0.0092,
"reward": 0.7000000059604645,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.7000000059604645,
"rewards/format_reward": 0.0,
"step": 755
},
{
"clip_ratio": 0.0,
"completion_length": 618.1166831970215,
"epoch": 0.8106666666666666,
"grad_norm": 1.0994184328283247,
"kl": 0.5087726593017579,
"learning_rate": 3.146647039171002e-07,
"loss": 0.0203,
"reward": 0.6500000089406968,
"reward_std": 0.303108885884285,
"rewards/accuracy_reward": 0.6500000089406968,
"rewards/format_reward": 0.0,
"step": 760
},
{
"clip_ratio": 0.0,
"completion_length": 580.0750160217285,
"epoch": 0.816,
"grad_norm": 0.4374571849706074,
"kl": 0.1738201141357422,
"learning_rate": 2.977431137041848e-07,
"loss": 0.0069,
"reward": 0.7666666731238365,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7666666731238365,
"rewards/format_reward": 0.0,
"step": 765
},
{
"clip_ratio": 0.0,
"completion_length": 605.3416835784913,
"epoch": 0.8213333333333334,
"grad_norm": 4.308504111793084,
"kl": 0.3406654357910156,
"learning_rate": 2.8123893970304154e-07,
"loss": 0.0136,
"reward": 0.7416666723787785,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7416666723787785,
"rewards/format_reward": 0.0,
"step": 770
},
{
"clip_ratio": 0.0,
"completion_length": 605.7750175476074,
"epoch": 0.8266666666666667,
"grad_norm": 4.226173869297776,
"kl": 0.3512901306152344,
"learning_rate": 2.651579120615855e-07,
"loss": 0.014,
"reward": 0.7333333395421505,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7333333395421505,
"rewards/format_reward": 0.0,
"step": 775
},
{
"clip_ratio": 0.0,
"completion_length": 630.9083503723144,
"epoch": 0.832,
"grad_norm": 2.7537487444245845,
"kl": 0.2918212890625,
"learning_rate": 2.495056140139119e-07,
"loss": 0.0117,
"reward": 0.6833333402872086,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.6833333402872086,
"rewards/format_reward": 0.0,
"step": 780
},
{
"clip_ratio": 0.0,
"completion_length": 699.4166809082031,
"epoch": 0.8373333333333334,
"grad_norm": 2.403342890055738,
"kl": 0.37193450927734373,
"learning_rate": 2.3428747994183364e-07,
"loss": 0.0149,
"reward": 0.6416666746139527,
"reward_std": 0.2742413729429245,
"rewards/accuracy_reward": 0.6416666746139527,
"rewards/format_reward": 0.0,
"step": 785
},
{
"clip_ratio": 0.0,
"completion_length": 585.433345413208,
"epoch": 0.8426666666666667,
"grad_norm": 2.286592494028236,
"kl": 0.18648300170898438,
"learning_rate": 2.1950879348809548e-07,
"loss": 0.0074,
"reward": 0.6416666723787785,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6416666723787785,
"rewards/format_reward": 0.0,
"step": 790
},
{
"clip_ratio": 0.0,
"completion_length": 657.300016784668,
"epoch": 0.848,
"grad_norm": 16.438686897747974,
"kl": 0.27127418518066404,
"learning_rate": 2.0517468572192632e-07,
"loss": 0.0109,
"reward": 0.7416666731238365,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7416666731238365,
"rewards/format_reward": 0.0,
"step": 795
},
{
"epoch": 0.8533333333333334,
"grad_norm": 0.8259215940647905,
"learning_rate": 1.9129013335756317e-07,
"loss": 0.0058,
"step": 800
},
{
"epoch": 0.8533333333333334,
"eval_clip_ratio": 0.0,
"eval_completion_length": 595.2233535644531,
"eval_kl": 0.593589892578125,
"eval_loss": 0.06349216401576996,
"eval_reward": 0.6285333512663841,
"eval_reward_std": 0.22216437902450561,
"eval_rewards/accuracy_reward": 0.6285333512663841,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6111.5827,
"eval_samples_per_second": 0.818,
"eval_steps_per_second": 0.034,
"step": 800
},
{
"clip_ratio": 0.0,
"completion_length": 593.562516784668,
"epoch": 0.8586666666666667,
"grad_norm": 0.4214347379103132,
"kl": 0.13609142303466798,
"learning_rate": 1.7785995702636698e-07,
"loss": 0.0051,
"reward": 0.7166666727513075,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7166666727513075,
"rewards/format_reward": 0.0,
"step": 805
},
{
"clip_ratio": 0.0,
"completion_length": 633.9166858673095,
"epoch": 0.864,
"grad_norm": 55.27072438894511,
"kl": 0.23999671936035155,
"learning_rate": 1.64888819603129e-07,
"loss": 0.0096,
"reward": 0.7333333410322667,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7333333410322667,
"rewards/format_reward": 0.0,
"step": 810
},
{
"clip_ratio": 0.0,
"completion_length": 553.4083469390869,
"epoch": 0.8693333333333333,
"grad_norm": 0.33509715227443615,
"kl": 0.4740461349487305,
"learning_rate": 1.5238122458714925e-07,
"loss": 0.019,
"reward": 0.8166666723787784,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.8166666723787784,
"rewards/format_reward": 0.0,
"step": 815
},
{
"clip_ratio": 0.0,
"completion_length": 622.3166831970215,
"epoch": 0.8746666666666667,
"grad_norm": 5.11412403288406,
"kl": 0.2153860092163086,
"learning_rate": 1.4034151453864846e-07,
"loss": 0.0086,
"reward": 0.7000000074505806,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7000000074505806,
"rewards/format_reward": 0.0,
"step": 820
},
{
"clip_ratio": 0.0,
"completion_length": 605.7666816711426,
"epoch": 0.88,
"grad_norm": 0.4543276386621551,
"kl": 0.3385311126708984,
"learning_rate": 1.287738695710592e-07,
"loss": 0.0136,
"reward": 0.7416666746139526,
"reward_std": 0.2742413729429245,
"rewards/accuracy_reward": 0.7416666746139526,
"rewards/format_reward": 0.0,
"step": 825
},
{
"clip_ratio": 0.0,
"completion_length": 645.6416862487793,
"epoch": 0.8853333333333333,
"grad_norm": 4.187663668914122,
"kl": 0.4212055206298828,
"learning_rate": 1.1768230589971457e-07,
"loss": 0.0168,
"reward": 0.7083333395421505,
"reward_std": 0.24537386000156403,
"rewards/accuracy_reward": 0.7083333395421505,
"rewards/format_reward": 0.0,
"step": 830
},
{
"clip_ratio": 0.0,
"completion_length": 557.1166831970215,
"epoch": 0.8906666666666667,
"grad_norm": 0.7419623861693787,
"kl": 0.22484512329101564,
"learning_rate": 1.0707067444744439e-07,
"loss": 0.009,
"reward": 0.8250000052154064,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.8250000052154064,
"rewards/format_reward": 0.0,
"step": 835
},
{
"clip_ratio": 0.0,
"completion_length": 576.6416793823242,
"epoch": 0.896,
"grad_norm": 7.202147799954427,
"kl": 0.22338714599609374,
"learning_rate": 9.69426595075566e-08,
"loss": 0.0089,
"reward": 0.6833333387970925,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.6833333387970925,
"rewards/format_reward": 0.0,
"step": 840
},
{
"clip_ratio": 0.0,
"completion_length": 652.1083518981934,
"epoch": 0.9013333333333333,
"grad_norm": 0.604632511176017,
"kl": 0.1972179412841797,
"learning_rate": 8.730177746467616e-08,
"loss": 0.0079,
"reward": 0.6500000074505806,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.6500000074505806,
"rewards/format_reward": 0.0,
"step": 845
},
{
"clip_ratio": 0.0,
"completion_length": 594.641679763794,
"epoch": 0.9066666666666666,
"grad_norm": 5.532928736058938,
"kl": 0.34342117309570314,
"learning_rate": 7.81513755738742e-08,
"loss": 0.0137,
"reward": 0.6916666723787784,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.6916666723787784,
"rewards/format_reward": 0.0,
"step": 850
},
{
"clip_ratio": 0.0,
"completion_length": 591.6250190734863,
"epoch": 0.912,
"grad_norm": 2.930096657150403,
"kl": 0.7959453582763671,
"learning_rate": 6.949463079852491e-08,
"loss": 0.0319,
"reward": 0.7666666738688945,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7666666738688945,
"rewards/format_reward": 0.0,
"step": 855
},
{
"clip_ratio": 0.0,
"completion_length": 583.5083526611328,
"epoch": 0.9173333333333333,
"grad_norm": 1.0029555883021177,
"kl": 0.09391098022460938,
"learning_rate": 6.133454870728111e-08,
"loss": 0.0038,
"reward": 0.8333333395421505,
"reward_std": 0.18763883411884308,
"rewards/accuracy_reward": 0.8333333395421505,
"rewards/format_reward": 0.0,
"step": 860
},
{
"clip_ratio": 0.0,
"completion_length": 619.716682434082,
"epoch": 0.9226666666666666,
"grad_norm": 6.476959419244453,
"kl": 1.3393241882324218,
"learning_rate": 5.367396243056022e-08,
"loss": 0.0536,
"reward": 0.6666666761040687,
"reward_std": 0.303108885884285,
"rewards/accuracy_reward": 0.6666666761040687,
"rewards/format_reward": 0.0,
"step": 865
},
{
"clip_ratio": 0.0,
"completion_length": 649.6083534240722,
"epoch": 0.928,
"grad_norm": 10.21710384011178,
"kl": 0.2799686431884766,
"learning_rate": 4.6515531676899316e-08,
"loss": 0.0112,
"reward": 0.7250000067055226,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7250000067055226,
"rewards/format_reward": 0.0,
"step": 870
},
{
"clip_ratio": 0.0,
"completion_length": 642.5583480834961,
"epoch": 0.9333333333333333,
"grad_norm": 2.2223793495325506,
"kl": 0.41688003540039065,
"learning_rate": 3.986174180951896e-08,
"loss": 0.0167,
"reward": 0.7000000059604645,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7000000059604645,
"rewards/format_reward": 0.0,
"step": 875
},
{
"clip_ratio": 0.0,
"completion_length": 627.0166820526123,
"epoch": 0.9386666666666666,
"grad_norm": 0.9639105805168805,
"kl": 0.21553115844726561,
"learning_rate": 3.3714902983421944e-08,
"loss": 0.0086,
"reward": 0.8083333402872086,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.8083333402872086,
"rewards/format_reward": 0.0,
"step": 880
},
{
"clip_ratio": 0.0,
"completion_length": 612.1166828155517,
"epoch": 0.944,
"grad_norm": 2.9700295445501137,
"kl": 0.363385009765625,
"learning_rate": 2.807714934332073e-08,
"loss": 0.0145,
"reward": 0.6666666738688946,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.6666666738688946,
"rewards/format_reward": 0.0,
"step": 885
},
{
"clip_ratio": 0.0,
"completion_length": 607.191682434082,
"epoch": 0.9493333333333334,
"grad_norm": 1.5280325145682865,
"kl": 0.478509521484375,
"learning_rate": 2.2950438282676455e-08,
"loss": 0.0191,
"reward": 0.7500000059604645,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7500000059604645,
"rewards/format_reward": 0.0,
"step": 890
},
{
"clip_ratio": 0.0,
"completion_length": 595.3250190734864,
"epoch": 0.9546666666666667,
"grad_norm": 1.3220444396009707,
"kl": 0.19265098571777345,
"learning_rate": 1.8336549764102594e-08,
"loss": 0.0077,
"reward": 0.7833333380520344,
"reward_std": 0.1587713211774826,
"rewards/accuracy_reward": 0.7833333380520344,
"rewards/format_reward": 0.0,
"step": 895
},
{
"epoch": 0.96,
"grad_norm": 2.367573715833922,
"learning_rate": 1.4237085701374109e-08,
"loss": 0.0039,
"step": 900
},
{
"epoch": 0.96,
"eval_clip_ratio": 0.0,
"eval_completion_length": 597.8314856933594,
"eval_kl": 0.409832080078125,
"eval_loss": 0.04817482829093933,
"eval_reward": 0.633200018286705,
"eval_reward_std": 0.20888532328605652,
"eval_rewards/accuracy_reward": 0.633200018286705,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 6071.9836,
"eval_samples_per_second": 0.823,
"eval_steps_per_second": 0.034,
"step": 900
},
{
"clip_ratio": 0.0,
"completion_length": 574.0916843414307,
"epoch": 0.9653333333333334,
"grad_norm": 5.414436257998155,
"kl": 0.3074363708496094,
"learning_rate": 1.0653469403252015e-08,
"loss": 0.0207,
"reward": 0.7250000070780516,
"reward_std": 0.2381569817662239,
"rewards/accuracy_reward": 0.7250000070780516,
"rewards/format_reward": 0.0,
"step": 905
},
{
"clip_ratio": 0.0,
"completion_length": 623.0000164031983,
"epoch": 0.9706666666666667,
"grad_norm": 4.617084054859332,
"kl": 0.23618698120117188,
"learning_rate": 7.586945079319673e-09,
"loss": 0.0094,
"reward": 0.6833333410322666,
"reward_std": 0.25980761647224426,
"rewards/accuracy_reward": 0.6833333410322666,
"rewards/format_reward": 0.0,
"step": 910
},
{
"clip_ratio": 0.0,
"completion_length": 635.7666885375977,
"epoch": 0.976,
"grad_norm": 4.6029934543244755,
"kl": 0.3018619537353516,
"learning_rate": 5.038577408000844e-09,
"loss": 0.0121,
"reward": 0.7166666731238365,
"reward_std": 0.2309401035308838,
"rewards/accuracy_reward": 0.7166666731238365,
"rewards/format_reward": 0.0,
"step": 915
},
{
"clip_ratio": 0.0,
"completion_length": 551.4250095367431,
"epoch": 0.9813333333333333,
"grad_norm": 3.163010566375475,
"kl": 0.25188522338867186,
"learning_rate": 3.009251166909699e-09,
"loss": 0.0101,
"reward": 0.7416666708886623,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward": 0.7416666708886623,
"rewards/format_reward": 0.0,
"step": 920
},
{
"clip_ratio": 0.0,
"completion_length": 610.1583431243896,
"epoch": 0.9866666666666667,
"grad_norm": 0.16665497610442914,
"kl": 0.0295257568359375,
"learning_rate": 1.4996709256617225e-09,
"loss": 0.0012,
"reward": 0.6750000052154064,
"reward_std": 0.17320507764816284,
"rewards/accuracy_reward": 0.6750000052154064,
"rewards/format_reward": 0.0,
"step": 925
},
{
"clip_ratio": 0.0,
"completion_length": 609.3500144958496,
"epoch": 0.992,
"grad_norm": 0.8617998481481968,
"kl": 0.2718994140625,
"learning_rate": 5.103608012512195e-10,
"loss": 0.0109,
"reward": 0.7166666723787785,
"reward_std": 0.20207259058952332,
"rewards/accuracy_reward": 0.7166666723787785,
"rewards/format_reward": 0.0,
"step": 930
},
{
"clip_ratio": 0.0,
"completion_length": 632.933349609375,
"epoch": 0.9973333333333333,
"grad_norm": 5.070012098805089,
"kl": 0.15453033447265624,
"learning_rate": 4.1664276081376796e-11,
"loss": 0.0062,
"reward": 0.7333333387970924,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.7333333387970924,
"rewards/format_reward": 0.0,
"step": 935
},
{
"clip_ratio": 0.0,
"completion_length": 568.4375152587891,
"epoch": 0.9994666666666666,
"kl": 0.20409584045410156,
"reward": 0.7916666734963655,
"reward_std": 0.25259073823690414,
"rewards/accuracy_reward": 0.7916666734963655,
"rewards/format_reward": 0.0,
"step": 937,
"total_flos": 0.0,
"train_loss": 0.04672712115020638,
"train_runtime": 102271.8167,
"train_samples_per_second": 0.073,
"train_steps_per_second": 0.009
}
],
"logging_steps": 5,
"max_steps": 937,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}