{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 100, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 721.3750152587891, "epoch": 0.0010666666666666667, "grad_norm": 0.7148946902118419, "kl": 0.0, "learning_rate": 3.191489361702128e-08, "loss": 0.0, "reward": 0.7500000111758709, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.7500000111758709, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 560.6666808128357, "epoch": 0.005333333333333333, "grad_norm": 1.4615535350113826, "kl": 9.819865226745605e-05, "learning_rate": 1.5957446808510638e-07, "loss": 0.0, "reward": 0.6041666744276881, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.6041666744276881, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 588.8250148773193, "epoch": 0.010666666666666666, "grad_norm": 53.213496097955556, "kl": 0.00019417405128479003, "learning_rate": 3.1914893617021275e-07, "loss": 0.0, "reward": 0.6833333380520343, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.6833333380520343, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 552.5083473205566, "epoch": 0.016, "grad_norm": 2.4855773258124825, "kl": 0.0002583146095275879, "learning_rate": 4.787234042553192e-07, "loss": 0.0, "reward": 0.6416666753590107, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6416666753590107, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 586.7083511352539, "epoch": 0.021333333333333333, "grad_norm": 25.07614059798895, "kl": 0.00021169781684875487, "learning_rate": 6.382978723404255e-07, "loss": 0.0, "reward": 0.6083333410322667, "reward_std": 0.2742413729429245, "rewards/accuracy_reward": 0.6083333410322667, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 559.4833463668823, "epoch": 0.02666666666666667, "grad_norm": 1.6163354066965443, "kl": 0.0003616809844970703, "learning_rate": 7.978723404255319e-07, "loss": 0.0, "reward": 0.6416666738688945, "reward_std": 0.2742413729429245, "rewards/accuracy_reward": 0.6416666738688945, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 595.7666809082032, "epoch": 0.032, "grad_norm": 6.599361807218988, "kl": 0.001885068416595459, "learning_rate": 9.574468085106384e-07, "loss": 0.0001, "reward": 0.5916666761040688, "reward_std": 0.33197639882564545, "rewards/accuracy_reward": 0.5916666761040688, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 586.0833473205566, "epoch": 0.037333333333333336, "grad_norm": 0.9958435532731883, "kl": 0.003298938274383545, "learning_rate": 1.1170212765957447e-06, "loss": 0.0001, "reward": 0.6250000081956386, "reward_std": 0.303108885884285, "rewards/accuracy_reward": 0.6250000081956386, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 666.2250175476074, "epoch": 0.042666666666666665, "grad_norm": 0.9670224736847026, "kl": 0.0009075284004211425, "learning_rate": 1.276595744680851e-06, "loss": 0.0, "reward": 0.6333333410322666, "reward_std": 0.2742413729429245, "rewards/accuracy_reward": 0.6333333410322666, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 606.2083518981933, "epoch": 0.048, "grad_norm": 4.8736022254558735, "kl": 0.005312430858612061, "learning_rate": 1.4361702127659576e-06, "loss": 0.0002, "reward": 0.6416666753590107, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.6416666753590107, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 658.9083461761475, "epoch": 0.05333333333333334, "grad_norm": 10.804243004291305, "kl": 0.002775442600250244, "learning_rate": 1.5957446808510639e-06, "loss": 0.0001, "reward": 0.7000000059604645, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7000000059604645, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 646.8000122070313, "epoch": 0.058666666666666666, "grad_norm": 0.26620562599712166, "kl": 0.003197479248046875, "learning_rate": 1.7553191489361702e-06, "loss": 0.0001, "reward": 0.6750000067055225, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6750000067055225, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 692.2500122070312, "epoch": 0.064, "grad_norm": 0.6575814481909016, "kl": 0.002325701713562012, "learning_rate": 1.9148936170212767e-06, "loss": 0.0001, "reward": 0.6583333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6583333395421505, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 580.9916805267334, "epoch": 0.06933333333333333, "grad_norm": 11.574621122437701, "kl": 0.0074417352676391605, "learning_rate": 2.074468085106383e-06, "loss": 0.0003, "reward": 0.6083333373069764, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.6083333373069764, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 672.7583469390869, "epoch": 0.07466666666666667, "grad_norm": 0.6559939371701894, "kl": 0.0022436141967773437, "learning_rate": 2.2340425531914894e-06, "loss": 0.0001, "reward": 0.7083333387970925, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7083333387970925, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 618.2250152587891, "epoch": 0.08, "grad_norm": 0.911060689959152, "kl": 0.00368959903717041, "learning_rate": 2.3936170212765957e-06, "loss": 0.0001, "reward": 0.7166666708886623, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7166666708886623, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 660.7583503723145, "epoch": 0.08533333333333333, "grad_norm": 0.7301371123450753, "kl": 0.003092670440673828, "learning_rate": 2.553191489361702e-06, "loss": 0.0001, "reward": 0.7416666708886623, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7416666708886623, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 598.4500175476074, "epoch": 0.09066666666666667, "grad_norm": 0.3060921026957072, "kl": 0.00237274169921875, "learning_rate": 2.7127659574468088e-06, "loss": 0.0001, "reward": 0.6500000044703483, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.6500000044703483, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 538.7833473205567, "epoch": 0.096, "grad_norm": 0.31714494836666324, "kl": 0.0028713226318359377, "learning_rate": 2.872340425531915e-06, "loss": 0.0001, "reward": 0.7916666693985462, "reward_std": 0.08660253882408142, "rewards/accuracy_reward": 0.7916666693985462, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 615.441683959961, "epoch": 0.10133333333333333, "grad_norm": 0.9408680917110538, "kl": 1.2848326683044433, "learning_rate": 2.9999895838948146e-06, "loss": 0.0515, "reward": 0.6666666738688946, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.6666666738688946, "rewards/format_reward": 0.0, "step": 95 }, { "epoch": 0.10666666666666667, "grad_norm": 0.5241475522990928, "learning_rate": 2.9996250354024346e-06, "loss": 0.0004, "step": 100 }, { "epoch": 0.10666666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 612.3530849609375, "eval_kl": 181.0886371582031, "eval_loss": 12.212464332580566, "eval_reward": 0.6065333513915538, "eval_reward_std": 0.24468103866577148, "eval_rewards/accuracy_reward": 0.6065333513915538, "eval_rewards/format_reward": 0.0, "eval_runtime": 6080.0652, "eval_samples_per_second": 0.822, "eval_steps_per_second": 0.034, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 613.6458488464356, "epoch": 0.112, "grad_norm": 0.3463297768491564, "kl": 0.006574392318725586, "learning_rate": 2.9987398263020837e-06, "loss": 0.0002, "reward": 0.7250000052154064, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7250000052154064, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 645.7916801452636, "epoch": 0.11733333333333333, "grad_norm": 0.8886259095075897, "kl": 0.0029788970947265624, "learning_rate": 2.997334263932927e-06, "loss": 0.0001, "reward": 0.7000000067055225, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7000000067055225, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 614.0666831970215, "epoch": 0.12266666666666666, "grad_norm": 1.1198898648840077, "kl": 0.006386947631835937, "learning_rate": 2.9954088362975936e-06, "loss": 0.0003, "reward": 0.7333333387970924, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7333333387970924, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 537.3333446502686, "epoch": 0.128, "grad_norm": 40.94205497488467, "kl": 0.012101554870605468, "learning_rate": 2.99296421189274e-06, "loss": 0.0005, "reward": 0.7750000044703483, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7750000044703483, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 614.6333480834961, "epoch": 0.13333333333333333, "grad_norm": 0.32034952885628715, "kl": 0.006664371490478516, "learning_rate": 2.9900012394769546e-06, "loss": 0.0003, "reward": 0.6833333410322666, "reward_std": 0.2742413729429245, "rewards/accuracy_reward": 0.6833333410322666, "rewards/format_reward": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 584.5750144958496, "epoch": 0.13866666666666666, "grad_norm": 0.752973255526379, "kl": 0.005373382568359375, "learning_rate": 2.986520947776075e-06, "loss": 0.0002, "reward": 0.7666666723787785, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7666666723787785, "rewards/format_reward": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 548.7833450317382, "epoch": 0.144, "grad_norm": 0.2532104242029052, "kl": 0.004761695861816406, "learning_rate": 2.982524545126018e-06, "loss": 0.0002, "reward": 0.8333333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.8333333380520344, "rewards/format_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 549.5000114440918, "epoch": 0.14933333333333335, "grad_norm": 0.09291231140770723, "kl": 0.004460525512695312, "learning_rate": 2.9780134190532553e-06, "loss": 0.0002, "reward": 0.8583333365619182, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.8583333365619182, "rewards/format_reward": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 608.5416820526123, "epoch": 0.15466666666666667, "grad_norm": 0.32170364852520245, "kl": 0.004180049896240235, "learning_rate": 2.972989135793071e-06, "loss": 0.0002, "reward": 0.6166666731238365, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.6166666731238365, "rewards/format_reward": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 618.9750152587891, "epoch": 0.16, "grad_norm": 0.2025206773963469, "kl": 0.005657100677490234, "learning_rate": 2.967453439745775e-06, "loss": 0.0002, "reward": 0.7416666716337204, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7416666716337204, "rewards/format_reward": 0.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 611.0750167846679, "epoch": 0.16533333333333333, "grad_norm": 0.184411613214241, "kl": 0.0198455810546875, "learning_rate": 2.961408252871058e-06, "loss": 0.0008, "reward": 0.7666666731238365, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7666666731238365, "rewards/format_reward": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 569.2500148773194, "epoch": 0.17066666666666666, "grad_norm": 0.19077032525046195, "kl": 0.005090141296386718, "learning_rate": 2.9548556740207e-06, "loss": 0.0002, "reward": 0.6750000052154064, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.6750000052154064, "rewards/format_reward": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 589.1916816711425, "epoch": 0.176, "grad_norm": 0.4638160803395305, "kl": 0.005456733703613281, "learning_rate": 2.9477979782098592e-06, "loss": 0.0002, "reward": 0.7416666708886623, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7416666708886623, "rewards/format_reward": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 614.9333473205567, "epoch": 0.18133333333333335, "grad_norm": 0.45443884391008643, "kl": 0.005468559265136719, "learning_rate": 2.9402376158272022e-06, "loss": 0.0002, "reward": 0.7833333373069763, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7833333373069763, "rewards/format_reward": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 610.7750144958496, "epoch": 0.18666666666666668, "grad_norm": 0.40031783341085664, "kl": 0.004351997375488281, "learning_rate": 2.9321772117841463e-06, "loss": 0.0002, "reward": 0.7000000052154064, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7000000052154064, "rewards/format_reward": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 544.2250152587891, "epoch": 0.192, "grad_norm": 0.21967968438769525, "kl": 0.0079864501953125, "learning_rate": 2.923619564603501e-06, "loss": 0.0003, "reward": 0.8250000037252903, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.8250000037252903, "rewards/format_reward": 0.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 550.1250141143798, "epoch": 0.19733333333333333, "grad_norm": 0.44685209999243625, "kl": 0.007423973083496094, "learning_rate": 2.9145676454478435e-06, "loss": 0.0003, "reward": 0.7166666716337204, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7166666716337204, "rewards/format_reward": 0.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 549.8583499908448, "epoch": 0.20266666666666666, "grad_norm": 0.7466901778117272, "kl": 0.00619049072265625, "learning_rate": 2.9050245970879456e-06, "loss": 0.0002, "reward": 0.8250000044703484, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.8250000044703484, "rewards/format_reward": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 528.4500106811523, "epoch": 0.208, "grad_norm": 0.4099308077828794, "kl": 0.005546188354492188, "learning_rate": 2.8949937328116252e-06, "loss": 0.0002, "reward": 0.8500000037252903, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.8500000037252903, "rewards/format_reward": 0.0, "step": 195 }, { "epoch": 0.21333333333333335, "grad_norm": 0.32264634496767935, "learning_rate": 2.884478535273393e-06, "loss": 0.0003, "step": 200 }, { "epoch": 0.21333333333333335, "eval_clip_ratio": 0.0, "eval_completion_length": 585.7144854492187, "eval_kl": 0.01230205078125, "eval_loss": 0.014915091916918755, "eval_reward": 0.6674666836738586, "eval_reward_std": 0.17562994873523713, "eval_rewards/accuracy_reward": 0.6674666836738586, "eval_rewards/format_reward": 0.0, "eval_runtime": 6022.9643, "eval_samples_per_second": 0.83, "eval_steps_per_second": 0.035, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 544.2750169754029, "epoch": 0.21866666666666668, "grad_norm": 0.5023905552183052, "kl": 0.007254695892333985, "learning_rate": 2.8734826552852934e-06, "loss": 0.0003, "reward": 0.8041666708886623, "reward_std": 0.13712068647146225, "rewards/accuracy_reward": 0.8041666708886623, "rewards/format_reward": 0.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 561.0333488464355, "epoch": 0.224, "grad_norm": 0.5434607871254864, "kl": 0.01640605926513672, "learning_rate": 2.86200991054937e-06, "loss": 0.0007, "reward": 0.7750000044703483, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7750000044703483, "rewards/format_reward": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 555.3666790008544, "epoch": 0.22933333333333333, "grad_norm": 0.024550981610644177, "kl": 0.006365013122558594, "learning_rate": 2.850064284332176e-06, "loss": 0.0003, "reward": 0.8166666693985463, "reward_std": 0.08660253882408142, "rewards/accuracy_reward": 0.8166666693985463, "rewards/format_reward": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 586.9333484649658, "epoch": 0.23466666666666666, "grad_norm": 0.41077115319026264, "kl": 0.02978935241699219, "learning_rate": 2.8376499240818166e-06, "loss": 0.0012, "reward": 0.6833333395421505, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6833333395421505, "rewards/format_reward": 0.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 596.8916847229004, "epoch": 0.24, "grad_norm": 2.182794026264333, "kl": 0.006468582153320313, "learning_rate": 2.8247711399879734e-06, "loss": 0.0003, "reward": 0.7750000022351742, "reward_std": 0.08660253882408142, "rewards/accuracy_reward": 0.7750000022351742, "rewards/format_reward": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 565.9666809082031, "epoch": 0.24533333333333332, "grad_norm": 0.27585581380583146, "kl": 0.00873394012451172, "learning_rate": 2.8114324034854378e-06, "loss": 0.0003, "reward": 0.8833333365619183, "reward_std": 0.10103629529476166, "rewards/accuracy_reward": 0.8833333365619183, "rewards/format_reward": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 536.3666790008544, "epoch": 0.25066666666666665, "grad_norm": 0.23729950863573765, "kl": 0.0073108673095703125, "learning_rate": 2.7976383457016535e-06, "loss": 0.0003, "reward": 0.7666666686534882, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.7666666686534882, "rewards/format_reward": 0.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 637.5833518981933, "epoch": 0.256, "grad_norm": 0.17208585244977956, "kl": 0.007070159912109375, "learning_rate": 2.7833937558488187e-06, "loss": 0.0003, "reward": 0.7000000037252903, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7000000037252903, "rewards/format_reward": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 507.2166820526123, "epoch": 0.2613333333333333, "grad_norm": 0.3031116441751077, "kl": 0.007193946838378906, "learning_rate": 2.7687035795611003e-06, "loss": 0.0003, "reward": 0.8416666708886623, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.8416666708886623, "rewards/format_reward": 0.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 558.5666793823242, "epoch": 0.26666666666666666, "grad_norm": 13.547581195880626, "kl": 0.035246658325195315, "learning_rate": 2.7535729171775408e-06, "loss": 0.0014, "reward": 0.7333333380520344, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7333333380520344, "rewards/format_reward": 0.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 581.6166820526123, "epoch": 0.272, "grad_norm": 0.5656864832529094, "kl": 0.007228469848632813, "learning_rate": 2.7380070219712514e-06, "loss": 0.0003, "reward": 0.8583333373069764, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.8583333373069764, "rewards/format_reward": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 593.0583480834961, "epoch": 0.2773333333333333, "grad_norm": 0.6148965285931733, "kl": 0.00813121795654297, "learning_rate": 2.722011298325509e-06, "loss": 0.0003, "reward": 0.7750000044703483, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7750000044703483, "rewards/format_reward": 0.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 559.4750144958496, "epoch": 0.2826666666666667, "grad_norm": 0.5111668603735469, "kl": 0.006645774841308594, "learning_rate": 2.705591299857385e-06, "loss": 0.0003, "reward": 0.7250000044703484, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7250000044703484, "rewards/format_reward": 0.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 606.691682434082, "epoch": 0.288, "grad_norm": 0.42461394853936735, "kl": 0.00601654052734375, "learning_rate": 2.6887527274895657e-06, "loss": 0.0002, "reward": 0.7916666716337204, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7916666716337204, "rewards/format_reward": 0.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 521.008349609375, "epoch": 0.29333333333333333, "grad_norm": 0.02587523894842855, "kl": 0.15564346313476562, "learning_rate": 2.6715014274710265e-06, "loss": 0.0062, "reward": 0.8583333373069764, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.8583333373069764, "rewards/format_reward": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 614.7916824340821, "epoch": 0.2986666666666667, "grad_norm": 0.014189638152861336, "kl": 0.040869140625, "learning_rate": 2.65384338934725e-06, "loss": 0.0016, "reward": 0.7583333373069763, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7583333373069763, "rewards/format_reward": 0.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 576.7083457946777, "epoch": 0.304, "grad_norm": 0.84554441688506, "kl": 0.0066776275634765625, "learning_rate": 2.6357847438806916e-06, "loss": 0.0003, "reward": 0.7916666701436043, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7916666701436043, "rewards/format_reward": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 618.333345413208, "epoch": 0.30933333333333335, "grad_norm": 0.3105742502280914, "kl": 0.008008956909179688, "learning_rate": 2.617331760922218e-06, "loss": 0.0003, "reward": 0.7250000029802323, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.7250000029802323, "rewards/format_reward": 0.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 641.9583492279053, "epoch": 0.31466666666666665, "grad_norm": 0.7728079687941386, "kl": 0.008575248718261718, "learning_rate": 2.598490847234253e-06, "loss": 0.0003, "reward": 0.6833333373069763, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.6833333373069763, "rewards/format_reward": 0.0, "step": 295 }, { "epoch": 0.32, "grad_norm": 0.18440635331679817, "learning_rate": 2.5792685442663883e-06, "loss": 0.0002, "step": 300 }, { "epoch": 0.32, "eval_clip_ratio": 0.0, "eval_completion_length": 576.1156858398438, "eval_kl": 0.0124282958984375, "eval_loss": 0.015103225596249104, "eval_reward": 0.665000018286705, "eval_reward_std": 0.1802487503528595, "eval_rewards/accuracy_reward": 0.665000018286705, "eval_rewards/format_reward": 0.0, "eval_runtime": 5982.2541, "eval_samples_per_second": 0.836, "eval_steps_per_second": 0.035, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 556.7458492279053, "epoch": 0.3253333333333333, "grad_norm": 0.5403363319648429, "kl": 0.0075927734375, "learning_rate": 2.559671525884232e-06, "loss": 0.0004, "reward": 0.7541666712611914, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7541666712611914, "rewards/format_reward": 0.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 574.9333492279053, "epoch": 0.33066666666666666, "grad_norm": 0.3724847648939602, "kl": 0.006891632080078125, "learning_rate": 2.539706596052286e-06, "loss": 0.0003, "reward": 0.7583333410322666, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.7583333410322666, "rewards/format_reward": 0.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 474.5166786193848, "epoch": 0.336, "grad_norm": 0.36557996981307944, "kl": 0.011857986450195312, "learning_rate": 2.5193806864716466e-06, "loss": 0.0005, "reward": 0.8333333365619182, "reward_std": 0.10103629529476166, "rewards/accuracy_reward": 0.8333333365619182, "rewards/format_reward": 0.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 553.7416828155517, "epoch": 0.3413333333333333, "grad_norm": 0.2438346381823879, "kl": 0.008208465576171876, "learning_rate": 2.4987008541733663e-06, "loss": 0.0003, "reward": 0.7833333365619183, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.7833333365619183, "rewards/format_reward": 0.0, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 615.3500122070312, "epoch": 0.3466666666666667, "grad_norm": 0.340457263416433, "kl": 0.00625, "learning_rate": 2.477674279068291e-06, "loss": 0.0003, "reward": 0.7250000037252903, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7250000037252903, "rewards/format_reward": 0.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 603.6416778564453, "epoch": 0.352, "grad_norm": 0.16359424999897557, "kl": 0.0068511962890625, "learning_rate": 2.4563082614542412e-06, "loss": 0.0003, "reward": 0.8333333358168602, "reward_std": 0.08660253882408142, "rewards/accuracy_reward": 0.8333333358168602, "rewards/format_reward": 0.0, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 599.6083488464355, "epoch": 0.35733333333333334, "grad_norm": 0.25279559853298, "kl": 0.00695953369140625, "learning_rate": 2.4346102194813937e-06, "loss": 0.0003, "reward": 0.8250000029802322, "reward_std": 0.10103629529476166, "rewards/accuracy_reward": 0.8250000029802322, "rewards/format_reward": 0.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 526.5416801452636, "epoch": 0.3626666666666667, "grad_norm": 0.6628547412135258, "kl": 0.009409332275390625, "learning_rate": 2.4125876865767443e-06, "loss": 0.0004, "reward": 0.8250000037252903, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.8250000037252903, "rewards/format_reward": 0.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 622.2250129699707, "epoch": 0.368, "grad_norm": 0.12252643555238332, "kl": 0.009972000122070312, "learning_rate": 2.390248308828548e-06, "loss": 0.0004, "reward": 0.7750000037252903, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.7750000037252903, "rewards/format_reward": 0.0, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 561.5333473205567, "epoch": 0.37333333333333335, "grad_norm": 0.25234822752428143, "kl": 0.008985519409179688, "learning_rate": 2.367599842331646e-06, "loss": 0.0004, "reward": 0.8583333373069764, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.8583333373069764, "rewards/format_reward": 0.0, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 615.158349609375, "epoch": 0.37866666666666665, "grad_norm": 0.4811409561182482, "kl": 0.0074596405029296875, "learning_rate": 2.344650150494596e-06, "loss": 0.0003, "reward": 0.8083333380520343, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.8083333380520343, "rewards/format_reward": 0.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 585.7250133514405, "epoch": 0.384, "grad_norm": 0.24324280894262207, "kl": 0.010303878784179687, "learning_rate": 2.3214072013095436e-06, "loss": 0.0004, "reward": 0.7666666701436042, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.7666666701436042, "rewards/format_reward": 0.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 592.5333507537841, "epoch": 0.3893333333333333, "grad_norm": 0.6103293670855143, "kl": 0.0204681396484375, "learning_rate": 2.2978790645857867e-06, "loss": 0.0008, "reward": 0.7750000074505806, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7750000074505806, "rewards/format_reward": 0.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 628.2000106811523, "epoch": 0.39466666666666667, "grad_norm": 0.46834964962728914, "kl": 0.03272857666015625, "learning_rate": 2.274073909147986e-06, "loss": 0.0013, "reward": 0.6916666716337204, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.6916666716337204, "rewards/format_reward": 0.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 590.9000160217286, "epoch": 0.4, "grad_norm": 2.0549829286804853, "kl": 0.06357192993164062, "learning_rate": 2.25e-06, "loss": 0.0025, "reward": 0.7083333373069763, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.7083333373069763, "rewards/format_reward": 0.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 585.6750099182129, "epoch": 0.4053333333333333, "grad_norm": 12.164472239999887, "kl": 0.3590213775634766, "learning_rate": 2.225665695455325e-06, "loss": 0.0143, "reward": 0.7000000052154064, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7000000052154064, "rewards/format_reward": 0.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 550.1000122070312, "epoch": 0.4106666666666667, "grad_norm": 1.366348076215836, "kl": 0.05910415649414062, "learning_rate": 2.20107944423514e-06, "loss": 0.0024, "reward": 0.8000000067055225, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.8000000067055225, "rewards/format_reward": 0.0, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 548.5416797637939, "epoch": 0.416, "grad_norm": 0.8962462379303796, "kl": 0.33824996948242186, "learning_rate": 2.1762497825349665e-06, "loss": 0.0135, "reward": 0.7416666708886623, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7416666708886623, "rewards/format_reward": 0.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 578.9750118255615, "epoch": 0.42133333333333334, "grad_norm": 1.8407040143574982, "kl": 1.4021547317504883, "learning_rate": 2.1511853310609467e-06, "loss": 0.0558, "reward": 0.6583333373069763, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.6583333373069763, "rewards/format_reward": 0.0, "step": 395 }, { "epoch": 0.4266666666666667, "grad_norm": 1.2731052300370491, "learning_rate": 2.1258947920367943e-06, "loss": 0.0066, "step": 400 }, { "epoch": 0.4266666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 569.7938848144531, "eval_kl": 13424.193332373046, "eval_loss": 583.3372192382812, "eval_reward": 0.6630666847705841, "eval_reward_std": 0.18186533155441284, "eval_rewards/accuracy_reward": 0.6630666847705841, "eval_rewards/format_reward": 0.0, "eval_runtime": 5981.5778, "eval_samples_per_second": 0.836, "eval_steps_per_second": 0.035, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 569.2000112533569, "epoch": 0.432, "grad_norm": 0.38484188447051687, "kl": 0.11343555450439453, "learning_rate": 2.100386946182431e-06, "loss": 0.0025, "reward": 0.7250000055879354, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7250000055879354, "rewards/format_reward": 0.0, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 554.5666835784912, "epoch": 0.43733333333333335, "grad_norm": 1.003802162008014, "kl": 0.1323028564453125, "learning_rate": 2.0746706496653765e-06, "loss": 0.0053, "reward": 0.7500000044703483, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7500000044703483, "rewards/format_reward": 0.0, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 586.441682434082, "epoch": 0.44266666666666665, "grad_norm": 1.3927518076538108, "kl": 0.047705459594726565, "learning_rate": 2.048754831025942e-06, "loss": 0.0019, "reward": 0.7666666738688945, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7666666738688945, "rewards/format_reward": 0.0, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 558.3500160217285, "epoch": 0.448, "grad_norm": 2.893427001373877, "kl": 0.1003173828125, "learning_rate": 2.0226484880772943e-06, "loss": 0.004, "reward": 0.8250000014901161, "reward_std": 0.05773502588272095, "rewards/accuracy_reward": 0.8250000014901161, "rewards/format_reward": 0.0, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 590.9166778564453, "epoch": 0.4533333333333333, "grad_norm": 0.36645522567137545, "kl": 0.13446083068847656, "learning_rate": 1.9963606847814702e-06, "loss": 0.0054, "reward": 0.7416666716337204, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7416666716337204, "rewards/format_reward": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 537.2083503723145, "epoch": 0.45866666666666667, "grad_norm": 0.3671458640275, "kl": 0.054613494873046876, "learning_rate": 1.9699005481024273e-06, "loss": 0.0022, "reward": 0.7750000044703483, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7750000044703483, "rewards/format_reward": 0.0, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 601.3833442687989, "epoch": 0.464, "grad_norm": 1.046879617710471, "kl": 0.020965576171875, "learning_rate": 1.943277264837214e-06, "loss": 0.0008, "reward": 0.7333333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.7333333358168602, "rewards/format_reward": 0.0, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 547.3333511352539, "epoch": 0.4693333333333333, "grad_norm": 0.21409956983248427, "kl": 0.018289947509765626, "learning_rate": 1.9165000784263734e-06, "loss": 0.0007, "reward": 0.7666666716337204, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7666666716337204, "rewards/format_reward": 0.0, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 566.5416786193848, "epoch": 0.4746666666666667, "grad_norm": 0.3456348995650795, "kl": 0.03510856628417969, "learning_rate": 1.8895782857446754e-06, "loss": 0.0014, "reward": 0.7916666723787784, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7916666723787784, "rewards/format_reward": 0.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 580.8000129699707, "epoch": 0.48, "grad_norm": 0.9792448320575614, "kl": 0.030012130737304688, "learning_rate": 1.8625212338733005e-06, "loss": 0.0012, "reward": 0.7500000044703483, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7500000044703483, "rewards/format_reward": 0.0, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 612.6583484649658, "epoch": 0.48533333333333334, "grad_norm": 1.9811858834651732, "kl": 0.045981216430664065, "learning_rate": 1.835338316854588e-06, "loss": 0.0018, "reward": 0.7083333395421505, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7083333395421505, "rewards/format_reward": 0.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 580.3916793823242, "epoch": 0.49066666666666664, "grad_norm": 0.22045853934983553, "kl": 0.022677230834960937, "learning_rate": 1.8080389724304863e-06, "loss": 0.0009, "reward": 0.7833333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7833333380520344, "rewards/format_reward": 0.0, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 614.6166839599609, "epoch": 0.496, "grad_norm": 0.29012755281009805, "kl": 0.03320999145507812, "learning_rate": 1.7806326787658219e-06, "loss": 0.0013, "reward": 0.7000000067055225, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7000000067055225, "rewards/format_reward": 0.0, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 616.9083450317382, "epoch": 0.5013333333333333, "grad_norm": 0.45521762259008347, "kl": 0.05942230224609375, "learning_rate": 1.7531289511575427e-06, "loss": 0.0024, "reward": 0.6833333410322666, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.6833333410322666, "rewards/format_reward": 0.0, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 669.9333499908447, "epoch": 0.5066666666666667, "grad_norm": 1.2238798671126319, "kl": 0.04663505554199219, "learning_rate": 1.7255373387310633e-06, "loss": 0.0019, "reward": 0.6833333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6833333395421505, "rewards/format_reward": 0.0, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 598.641682434082, "epoch": 0.512, "grad_norm": 0.6120301813056643, "kl": 0.09711380004882812, "learning_rate": 1.6978674211248676e-06, "loss": 0.0039, "reward": 0.7500000037252903, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7500000037252903, "rewards/format_reward": 0.0, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 608.883345413208, "epoch": 0.5173333333333333, "grad_norm": 26.043777462496045, "kl": 0.18498878479003905, "learning_rate": 1.6701288051645182e-06, "loss": 0.0074, "reward": 0.6750000044703484, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.6750000044703484, "rewards/format_reward": 0.0, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 597.6416805267334, "epoch": 0.5226666666666666, "grad_norm": 1.3300653593847538, "kl": 0.08308296203613282, "learning_rate": 1.642331121527223e-06, "loss": 0.0033, "reward": 0.7750000029802322, "reward_std": 0.12990380823612213, "rewards/accuracy_reward": 0.7750000029802322, "rewards/format_reward": 0.0, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 557.4333480834961, "epoch": 0.528, "grad_norm": 2.4699038123399015, "kl": 0.07253971099853515, "learning_rate": 1.6144840213981257e-06, "loss": 0.0029, "reward": 0.7583333380520344, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7583333380520344, "rewards/format_reward": 0.0, "step": 495 }, { "epoch": 0.5333333333333333, "grad_norm": 1.2756243338289226, "learning_rate": 1.5865971731194738e-06, "loss": 0.0097, "step": 500 }, { "epoch": 0.5333333333333333, "eval_clip_ratio": 0.0, "eval_completion_length": 600.5289517089843, "eval_kl": 0.145334228515625, "eval_loss": 0.03381989896297455, "eval_reward": 0.6379333512067795, "eval_reward_std": 0.20980908365249634, "eval_rewards/accuracy_reward": 0.6379333512067795, "eval_rewards/format_reward": 0.0, "eval_runtime": 6091.2544, "eval_samples_per_second": 0.821, "eval_steps_per_second": 0.034, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 585.5958461761475, "epoch": 0.5386666666666666, "grad_norm": 0.9549348299070431, "kl": 0.15351810455322265, "learning_rate": 1.5586802588338262e-06, "loss": 0.0026, "reward": 0.7125000048428773, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.7125000048428773, "rewards/format_reward": 0.0, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 578.0666816711425, "epoch": 0.544, "grad_norm": 1.2238718249429645, "kl": 0.026650619506835938, "learning_rate": 1.5307429711224756e-06, "loss": 0.0011, "reward": 0.7583333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7583333380520344, "rewards/format_reward": 0.0, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 592.3000137329102, "epoch": 0.5493333333333333, "grad_norm": 9.727501760877075, "kl": 0.07199592590332031, "learning_rate": 1.5027950096402447e-06, "loss": 0.0029, "reward": 0.7416666731238365, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7416666731238365, "rewards/format_reward": 0.0, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 616.6750164031982, "epoch": 0.5546666666666666, "grad_norm": 15.439896164361146, "kl": 0.05631599426269531, "learning_rate": 1.474846077747821e-06, "loss": 0.0023, "reward": 0.7500000059604645, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7500000059604645, "rewards/format_reward": 0.0, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 613.5250183105469, "epoch": 0.56, "grad_norm": 0.9960427525825399, "kl": 0.05883331298828125, "learning_rate": 1.4469058791428154e-06, "loss": 0.0024, "reward": 0.6916666723787784, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6916666723787784, "rewards/format_reward": 0.0, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 591.8166831970215, "epoch": 0.5653333333333334, "grad_norm": 3.046337395450479, "kl": 0.07379722595214844, "learning_rate": 1.4189841144906928e-06, "loss": 0.0029, "reward": 0.7583333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7583333380520344, "rewards/format_reward": 0.0, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 542.066683959961, "epoch": 0.5706666666666667, "grad_norm": 1.340816244971557, "kl": 0.04628486633300781, "learning_rate": 1.3910904780567642e-06, "loss": 0.0019, "reward": 0.8250000052154064, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.8250000052154064, "rewards/format_reward": 0.0, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 587.141682434082, "epoch": 0.576, "grad_norm": 2.6344014025933356, "kl": 0.041788482666015626, "learning_rate": 1.3632346543403946e-06, "loss": 0.0017, "reward": 0.6916666738688946, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.6916666738688946, "rewards/format_reward": 0.0, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 641.9750152587891, "epoch": 0.5813333333333334, "grad_norm": 4.6049111419007165, "kl": 0.12230720520019531, "learning_rate": 1.335426314712607e-06, "loss": 0.0049, "reward": 0.6833333387970925, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6833333387970925, "rewards/format_reward": 0.0, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 688.2666839599609, "epoch": 0.5866666666666667, "grad_norm": 3.521827139400347, "kl": 0.06717300415039062, "learning_rate": 1.3076751140582396e-06, "loss": 0.0027, "reward": 0.6833333402872086, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.6833333402872086, "rewards/format_reward": 0.0, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 577.8250099182129, "epoch": 0.592, "grad_norm": 3.564377427961792, "kl": 0.1139495849609375, "learning_rate": 1.2799906874238297e-06, "loss": 0.0045, "reward": 0.7250000029802323, "reward_std": 0.10103629529476166, "rewards/accuracy_reward": 0.7250000029802323, "rewards/format_reward": 0.0, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 645.683349609375, "epoch": 0.5973333333333334, "grad_norm": 0.9903458969279051, "kl": 0.12611656188964843, "learning_rate": 1.2523826466723843e-06, "loss": 0.005, "reward": 0.6833333387970925, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.6833333387970925, "rewards/format_reward": 0.0, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 582.9333503723144, "epoch": 0.6026666666666667, "grad_norm": 2.225108665952508, "kl": 0.07784194946289062, "learning_rate": 1.2248605771462016e-06, "loss": 0.0031, "reward": 0.6833333387970925, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.6833333387970925, "rewards/format_reward": 0.0, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 566.6333442687989, "epoch": 0.608, "grad_norm": 3.291993199224936, "kl": 0.1331024169921875, "learning_rate": 1.1974340343388974e-06, "loss": 0.0053, "reward": 0.7083333395421505, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7083333395421505, "rewards/format_reward": 0.0, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 574.4333473205567, "epoch": 0.6133333333333333, "grad_norm": 3.912966689921013, "kl": 0.22335777282714844, "learning_rate": 1.1701125405777965e-06, "loss": 0.0089, "reward": 0.7500000044703483, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7500000044703483, "rewards/format_reward": 0.0, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 618.2916778564453, "epoch": 0.6186666666666667, "grad_norm": 21.294549926589646, "kl": 1.0905929565429688, "learning_rate": 1.142905581717841e-06, "loss": 0.0436, "reward": 0.7500000067055226, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.7500000067055226, "rewards/format_reward": 0.0, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 533.8166828155518, "epoch": 0.624, "grad_norm": 6.403461046316986, "kl": 0.11275444030761719, "learning_rate": 1.1158226038481584e-06, "loss": 0.0045, "reward": 0.8750000037252903, "reward_std": 0.1154700517654419, "rewards/accuracy_reward": 0.8750000037252903, "rewards/format_reward": 0.0, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 638.766682434082, "epoch": 0.6293333333333333, "grad_norm": 0.9273470799146647, "kl": 0.7552982330322265, "learning_rate": 1.0888730100124355e-06, "loss": 0.0302, "reward": 0.7083333402872085, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7083333402872085, "rewards/format_reward": 0.0, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 529.8416812896728, "epoch": 0.6346666666666667, "grad_norm": 15.858090968775276, "kl": 0.2991436004638672, "learning_rate": 1.062066156944242e-06, "loss": 0.012, "reward": 0.8333333373069763, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.8333333373069763, "rewards/format_reward": 0.0, "step": 595 }, { "epoch": 0.64, "grad_norm": 17.119879704647552, "learning_rate": 1.0354113518184304e-06, "loss": 0.0045, "step": 600 }, { "epoch": 0.64, "eval_clip_ratio": 0.0, "eval_completion_length": 598.1299530761719, "eval_kl": 0.886337255859375, "eval_loss": 0.07811599224805832, "eval_reward": 0.6349333519935608, "eval_reward_std": 0.20807703318595885, "eval_rewards/accuracy_reward": 0.6349333519935608, "eval_rewards/format_reward": 0.0, "eval_runtime": 6102.8046, "eval_samples_per_second": 0.819, "eval_steps_per_second": 0.034, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 618.4583503723145, "epoch": 0.6453333333333333, "grad_norm": 237.8237245765829, "kl": 0.350294303894043, "learning_rate": 1.008917849019739e-06, "loss": 0.0234, "reward": 0.7333333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7333333380520344, "rewards/format_reward": 0.0, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 583.5416801452636, "epoch": 0.6506666666666666, "grad_norm": 7.9821732038803805, "kl": 0.12260932922363281, "learning_rate": 9.825948469297303e-07, "loss": 0.0049, "reward": 0.7750000044703483, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7750000044703483, "rewards/format_reward": 0.0, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 609.900015258789, "epoch": 0.656, "grad_norm": 94865.97576667031, "kl": 173.92265777587892, "learning_rate": 9.564514847331647e-07, "loss": 6.9811, "reward": 0.7166666716337204, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7166666716337204, "rewards/format_reward": 0.0, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 563.7083461761474, "epoch": 0.6613333333333333, "grad_norm": 1.7355610026067, "kl": 0.3627006530761719, "learning_rate": 9.304968392449361e-07, "loss": 0.0145, "reward": 0.7083333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7083333380520344, "rewards/format_reward": 0.0, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 650.9000160217286, "epoch": 0.6666666666666666, "grad_norm": 4.055117700318325, "kl": 1.220144271850586, "learning_rate": 9.047399217586552e-07, "loss": 0.0488, "reward": 0.7333333402872085, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.7333333402872085, "rewards/format_reward": 0.0, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 596.6833438873291, "epoch": 0.672, "grad_norm": 3.2951177269303034, "kl": 0.484442138671875, "learning_rate": 8.791896749179831e-07, "loss": 0.0194, "reward": 0.6666666716337204, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 688.1416854858398, "epoch": 0.6773333333333333, "grad_norm": 6.185421881006396, "kl": 0.8364780426025391, "learning_rate": 8.538549696118023e-07, "loss": 0.0335, "reward": 0.7083333365619182, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7083333365619182, "rewards/format_reward": 0.0, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 614.8666843414306, "epoch": 0.6826666666666666, "grad_norm": 3.6901707366378296, "kl": 0.5915939331054687, "learning_rate": 8.287446018942973e-07, "loss": 0.0236, "reward": 0.7416666716337204, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.7416666716337204, "rewards/format_reward": 0.0, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 602.1500183105469, "epoch": 0.688, "grad_norm": 2.298300879666136, "kl": 0.34840736389160154, "learning_rate": 8.038672899310176e-07, "loss": 0.014, "reward": 0.6833333380520343, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6833333380520343, "rewards/format_reward": 0.0, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 615.6250175476074, "epoch": 0.6933333333333334, "grad_norm": 11.20824930878958, "kl": 0.19427947998046874, "learning_rate": 7.792316709719875e-07, "loss": 0.0078, "reward": 0.6583333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6583333395421505, "rewards/format_reward": 0.0, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 594.0416820526123, "epoch": 0.6986666666666667, "grad_norm": 11.717220592320137, "kl": 2.438280487060547, "learning_rate": 7.548462983529016e-07, "loss": 0.0976, "reward": 0.7583333425223827, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.7583333425223827, "rewards/format_reward": 0.0, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 607.308352279663, "epoch": 0.704, "grad_norm": 1.5198877689737873, "kl": 0.19807205200195313, "learning_rate": 7.307196385254621e-07, "loss": 0.0079, "reward": 0.7416666723787785, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7416666723787785, "rewards/format_reward": 0.0, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 612.7500148773194, "epoch": 0.7093333333333334, "grad_norm": 1.2366765264367552, "kl": 0.3959392547607422, "learning_rate": 7.068600681178772e-07, "loss": 0.0158, "reward": 0.8000000052154064, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.8000000052154064, "rewards/format_reward": 0.0, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 627.3750144958497, "epoch": 0.7146666666666667, "grad_norm": 1.26819472034522, "kl": 0.4778144836425781, "learning_rate": 6.832758710265492e-07, "loss": 0.0191, "reward": 0.6666666716337204, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.6666666716337204, "rewards/format_reward": 0.0, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 619.6833473205567, "epoch": 0.72, "grad_norm": 4.274976105654847, "kl": 0.11606101989746094, "learning_rate": 6.599752355399538e-07, "loss": 0.0046, "reward": 0.7000000052154064, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.7000000052154064, "rewards/format_reward": 0.0, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 638.4916831970215, "epoch": 0.7253333333333334, "grad_norm": 0.36858027198187077, "kl": 0.18754081726074218, "learning_rate": 6.369662514957191e-07, "loss": 0.0075, "reward": 0.7166666738688946, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7166666738688946, "rewards/format_reward": 0.0, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 608.3666835784912, "epoch": 0.7306666666666667, "grad_norm": 3.047545343694443, "kl": 0.05216217041015625, "learning_rate": 6.142569074718818e-07, "loss": 0.0021, "reward": 0.7500000059604645, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7500000059604645, "rewards/format_reward": 0.0, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 662.0166839599609, "epoch": 0.736, "grad_norm": 0.45904383752278927, "kl": 0.04991302490234375, "learning_rate": 5.918550880133018e-07, "loss": 0.002, "reward": 0.5833333417773247, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5833333417773247, "rewards/format_reward": 0.0, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 565.958348083496, "epoch": 0.7413333333333333, "grad_norm": 0.616371610316275, "kl": 1.7408302307128907, "learning_rate": 5.697685708941996e-07, "loss": 0.0696, "reward": 0.7166666708886623, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7166666708886623, "rewards/format_reward": 0.0, "step": 695 }, { "epoch": 0.7466666666666667, "grad_norm": 1.148258838235239, "learning_rate": 5.480050244177573e-07, "loss": 0.0044, "step": 700 }, { "epoch": 0.7466666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 599.234618359375, "eval_kl": 155189.75018701173, "eval_loss": 8703.8037109375, "eval_reward": 0.6249333506584167, "eval_reward_std": 0.2200859182357788, "eval_rewards/accuracy_reward": 0.6249333506584167, "eval_rewards/format_reward": 0.0, "eval_runtime": 6119.4517, "eval_samples_per_second": 0.817, "eval_steps_per_second": 0.034, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 594.833348083496, "epoch": 0.752, "grad_norm": 33.42924134136288, "kl": 0.2813268661499023, "learning_rate": 5.265720047537318e-07, "loss": 0.0181, "reward": 0.6875000059604645, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6875000059604645, "rewards/format_reward": 0.0, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 599.6500148773193, "epoch": 0.7573333333333333, "grad_norm": 31.602691481649085, "kl": 0.2062408447265625, "learning_rate": 5.054769533149999e-07, "loss": 0.0083, "reward": 0.8000000044703484, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.8000000044703484, "rewards/format_reward": 0.0, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 563.0916854858399, "epoch": 0.7626666666666667, "grad_norm": 14.824445401418682, "kl": 0.5208763122558594, "learning_rate": 4.847271941739458e-07, "loss": 0.0209, "reward": 0.6583333395421505, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.6583333395421505, "rewards/format_reward": 0.0, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 575.7666816711426, "epoch": 0.768, "grad_norm": 3.361014733027139, "kl": 0.14543685913085938, "learning_rate": 4.643299315195855e-07, "loss": 0.0058, "reward": 0.7083333395421505, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7083333395421505, "rewards/format_reward": 0.0, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 593.8666786193847, "epoch": 0.7733333333333333, "grad_norm": 0.49783824002899163, "kl": 0.14712867736816407, "learning_rate": 4.442922471563205e-07, "loss": 0.0059, "reward": 0.7250000067055226, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7250000067055226, "rewards/format_reward": 0.0, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 660.7500198364257, "epoch": 0.7786666666666666, "grad_norm": 1.3968867213585785, "kl": 11.599226379394532, "learning_rate": 4.24621098045175e-07, "loss": 0.4664, "reward": 0.6833333380520343, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.6833333380520343, "rewards/format_reward": 0.0, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 633.0583499908447, "epoch": 0.784, "grad_norm": 1.7330543134117584, "kl": 0.2003498077392578, "learning_rate": 4.053233138883835e-07, "loss": 0.008, "reward": 0.6333333395421505, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.6333333395421505, "rewards/format_reward": 0.0, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 607.7166835784913, "epoch": 0.7893333333333333, "grad_norm": 2.1665020676583495, "kl": 0.12012977600097656, "learning_rate": 3.864055947581605e-07, "loss": 0.0048, "reward": 0.6583333402872086, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.6583333402872086, "rewards/format_reward": 0.0, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 669.0750186920166, "epoch": 0.7946666666666666, "grad_norm": 1.1622261596005705, "kl": 0.07356147766113282, "learning_rate": 3.6787450877047543e-07, "loss": 0.0029, "reward": 0.6333333387970924, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.6333333387970924, "rewards/format_reward": 0.0, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 578.5916860580444, "epoch": 0.8, "grad_norm": 42.60513773493881, "kl": 0.33280181884765625, "learning_rate": 3.4973648980464454e-07, "loss": 0.0133, "reward": 0.6833333395421505, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6833333395421505, "rewards/format_reward": 0.0, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 572.9250129699707, "epoch": 0.8053333333333333, "grad_norm": 0.5348584171724263, "kl": 0.2312854766845703, "learning_rate": 3.3199783526952656e-07, "loss": 0.0092, "reward": 0.7000000059604645, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.7000000059604645, "rewards/format_reward": 0.0, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 618.1166831970215, "epoch": 0.8106666666666666, "grad_norm": 1.0994184328283247, "kl": 0.5087726593017579, "learning_rate": 3.146647039171002e-07, "loss": 0.0203, "reward": 0.6500000089406968, "reward_std": 0.303108885884285, "rewards/accuracy_reward": 0.6500000089406968, "rewards/format_reward": 0.0, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 580.0750160217285, "epoch": 0.816, "grad_norm": 0.4374571849706074, "kl": 0.1738201141357422, "learning_rate": 2.977431137041848e-07, "loss": 0.0069, "reward": 0.7666666731238365, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7666666731238365, "rewards/format_reward": 0.0, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 605.3416835784913, "epoch": 0.8213333333333334, "grad_norm": 4.308504111793084, "kl": 0.3406654357910156, "learning_rate": 2.8123893970304154e-07, "loss": 0.0136, "reward": 0.7416666723787785, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7416666723787785, "rewards/format_reward": 0.0, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 605.7750175476074, "epoch": 0.8266666666666667, "grad_norm": 4.226173869297776, "kl": 0.3512901306152344, "learning_rate": 2.651579120615855e-07, "loss": 0.014, "reward": 0.7333333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7333333395421505, "rewards/format_reward": 0.0, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 630.9083503723144, "epoch": 0.832, "grad_norm": 2.7537487444245845, "kl": 0.2918212890625, "learning_rate": 2.495056140139119e-07, "loss": 0.0117, "reward": 0.6833333402872086, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6833333402872086, "rewards/format_reward": 0.0, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 699.4166809082031, "epoch": 0.8373333333333334, "grad_norm": 2.403342890055738, "kl": 0.37193450927734373, "learning_rate": 2.3428747994183364e-07, "loss": 0.0149, "reward": 0.6416666746139527, "reward_std": 0.2742413729429245, "rewards/accuracy_reward": 0.6416666746139527, "rewards/format_reward": 0.0, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 585.433345413208, "epoch": 0.8426666666666667, "grad_norm": 2.286592494028236, "kl": 0.18648300170898438, "learning_rate": 2.1950879348809548e-07, "loss": 0.0074, "reward": 0.6416666723787785, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6416666723787785, "rewards/format_reward": 0.0, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 657.300016784668, "epoch": 0.848, "grad_norm": 16.438686897747974, "kl": 0.27127418518066404, "learning_rate": 2.0517468572192632e-07, "loss": 0.0109, "reward": 0.7416666731238365, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7416666731238365, "rewards/format_reward": 0.0, "step": 795 }, { "epoch": 0.8533333333333334, "grad_norm": 0.8259215940647905, "learning_rate": 1.9129013335756317e-07, "loss": 0.0058, "step": 800 }, { "epoch": 0.8533333333333334, "eval_clip_ratio": 0.0, "eval_completion_length": 595.2233535644531, "eval_kl": 0.593589892578125, "eval_loss": 0.06349216401576996, "eval_reward": 0.6285333512663841, "eval_reward_std": 0.22216437902450561, "eval_rewards/accuracy_reward": 0.6285333512663841, "eval_rewards/format_reward": 0.0, "eval_runtime": 6111.5827, "eval_samples_per_second": 0.818, "eval_steps_per_second": 0.034, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 593.562516784668, "epoch": 0.8586666666666667, "grad_norm": 0.4214347379103132, "kl": 0.13609142303466798, "learning_rate": 1.7785995702636698e-07, "loss": 0.0051, "reward": 0.7166666727513075, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7166666727513075, "rewards/format_reward": 0.0, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 633.9166858673095, "epoch": 0.864, "grad_norm": 55.27072438894511, "kl": 0.23999671936035155, "learning_rate": 1.64888819603129e-07, "loss": 0.0096, "reward": 0.7333333410322667, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7333333410322667, "rewards/format_reward": 0.0, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 553.4083469390869, "epoch": 0.8693333333333333, "grad_norm": 0.33509715227443615, "kl": 0.4740461349487305, "learning_rate": 1.5238122458714925e-07, "loss": 0.019, "reward": 0.8166666723787784, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.8166666723787784, "rewards/format_reward": 0.0, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 622.3166831970215, "epoch": 0.8746666666666667, "grad_norm": 5.11412403288406, "kl": 0.2153860092163086, "learning_rate": 1.4034151453864846e-07, "loss": 0.0086, "reward": 0.7000000074505806, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7000000074505806, "rewards/format_reward": 0.0, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 605.7666816711426, "epoch": 0.88, "grad_norm": 0.4543276386621551, "kl": 0.3385311126708984, "learning_rate": 1.287738695710592e-07, "loss": 0.0136, "reward": 0.7416666746139526, "reward_std": 0.2742413729429245, "rewards/accuracy_reward": 0.7416666746139526, "rewards/format_reward": 0.0, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 645.6416862487793, "epoch": 0.8853333333333333, "grad_norm": 4.187663668914122, "kl": 0.4212055206298828, "learning_rate": 1.1768230589971457e-07, "loss": 0.0168, "reward": 0.7083333395421505, "reward_std": 0.24537386000156403, "rewards/accuracy_reward": 0.7083333395421505, "rewards/format_reward": 0.0, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 557.1166831970215, "epoch": 0.8906666666666667, "grad_norm": 0.7419623861693787, "kl": 0.22484512329101564, "learning_rate": 1.0707067444744439e-07, "loss": 0.009, "reward": 0.8250000052154064, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.8250000052154064, "rewards/format_reward": 0.0, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 576.6416793823242, "epoch": 0.896, "grad_norm": 7.202147799954427, "kl": 0.22338714599609374, "learning_rate": 9.69426595075566e-08, "loss": 0.0089, "reward": 0.6833333387970925, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.6833333387970925, "rewards/format_reward": 0.0, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 652.1083518981934, "epoch": 0.9013333333333333, "grad_norm": 0.604632511176017, "kl": 0.1972179412841797, "learning_rate": 8.730177746467616e-08, "loss": 0.0079, "reward": 0.6500000074505806, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.6500000074505806, "rewards/format_reward": 0.0, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 594.641679763794, "epoch": 0.9066666666666666, "grad_norm": 5.532928736058938, "kl": 0.34342117309570314, "learning_rate": 7.81513755738742e-08, "loss": 0.0137, "reward": 0.6916666723787784, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.6916666723787784, "rewards/format_reward": 0.0, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 591.6250190734863, "epoch": 0.912, "grad_norm": 2.930096657150403, "kl": 0.7959453582763671, "learning_rate": 6.949463079852491e-08, "loss": 0.0319, "reward": 0.7666666738688945, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7666666738688945, "rewards/format_reward": 0.0, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 583.5083526611328, "epoch": 0.9173333333333333, "grad_norm": 1.0029555883021177, "kl": 0.09391098022460938, "learning_rate": 6.133454870728111e-08, "loss": 0.0038, "reward": 0.8333333395421505, "reward_std": 0.18763883411884308, "rewards/accuracy_reward": 0.8333333395421505, "rewards/format_reward": 0.0, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 619.716682434082, "epoch": 0.9226666666666666, "grad_norm": 6.476959419244453, "kl": 1.3393241882324218, "learning_rate": 5.367396243056022e-08, "loss": 0.0536, "reward": 0.6666666761040687, "reward_std": 0.303108885884285, "rewards/accuracy_reward": 0.6666666761040687, "rewards/format_reward": 0.0, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 649.6083534240722, "epoch": 0.928, "grad_norm": 10.21710384011178, "kl": 0.2799686431884766, "learning_rate": 4.6515531676899316e-08, "loss": 0.0112, "reward": 0.7250000067055226, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7250000067055226, "rewards/format_reward": 0.0, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 642.5583480834961, "epoch": 0.9333333333333333, "grad_norm": 2.2223793495325506, "kl": 0.41688003540039065, "learning_rate": 3.986174180951896e-08, "loss": 0.0167, "reward": 0.7000000059604645, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7000000059604645, "rewards/format_reward": 0.0, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 627.0166820526123, "epoch": 0.9386666666666666, "grad_norm": 0.9639105805168805, "kl": 0.21553115844726561, "learning_rate": 3.3714902983421944e-08, "loss": 0.0086, "reward": 0.8083333402872086, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.8083333402872086, "rewards/format_reward": 0.0, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 612.1166828155517, "epoch": 0.944, "grad_norm": 2.9700295445501137, "kl": 0.363385009765625, "learning_rate": 2.807714934332073e-08, "loss": 0.0145, "reward": 0.6666666738688946, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.6666666738688946, "rewards/format_reward": 0.0, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 607.191682434082, "epoch": 0.9493333333333334, "grad_norm": 1.5280325145682865, "kl": 0.478509521484375, "learning_rate": 2.2950438282676455e-08, "loss": 0.0191, "reward": 0.7500000059604645, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7500000059604645, "rewards/format_reward": 0.0, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 595.3250190734864, "epoch": 0.9546666666666667, "grad_norm": 1.3220444396009707, "kl": 0.19265098571777345, "learning_rate": 1.8336549764102594e-08, "loss": 0.0077, "reward": 0.7833333380520344, "reward_std": 0.1587713211774826, "rewards/accuracy_reward": 0.7833333380520344, "rewards/format_reward": 0.0, "step": 895 }, { "epoch": 0.96, "grad_norm": 2.367573715833922, "learning_rate": 1.4237085701374109e-08, "loss": 0.0039, "step": 900 }, { "epoch": 0.96, "eval_clip_ratio": 0.0, "eval_completion_length": 597.8314856933594, "eval_kl": 0.409832080078125, "eval_loss": 0.04817482829093933, "eval_reward": 0.633200018286705, "eval_reward_std": 0.20888532328605652, "eval_rewards/accuracy_reward": 0.633200018286705, "eval_rewards/format_reward": 0.0, "eval_runtime": 6071.9836, "eval_samples_per_second": 0.823, "eval_steps_per_second": 0.034, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 574.0916843414307, "epoch": 0.9653333333333334, "grad_norm": 5.414436257998155, "kl": 0.3074363708496094, "learning_rate": 1.0653469403252015e-08, "loss": 0.0207, "reward": 0.7250000070780516, "reward_std": 0.2381569817662239, "rewards/accuracy_reward": 0.7250000070780516, "rewards/format_reward": 0.0, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 623.0000164031983, "epoch": 0.9706666666666667, "grad_norm": 4.617084054859332, "kl": 0.23618698120117188, "learning_rate": 7.586945079319673e-09, "loss": 0.0094, "reward": 0.6833333410322666, "reward_std": 0.25980761647224426, "rewards/accuracy_reward": 0.6833333410322666, "rewards/format_reward": 0.0, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 635.7666885375977, "epoch": 0.976, "grad_norm": 4.6029934543244755, "kl": 0.3018619537353516, "learning_rate": 5.038577408000844e-09, "loss": 0.0121, "reward": 0.7166666731238365, "reward_std": 0.2309401035308838, "rewards/accuracy_reward": 0.7166666731238365, "rewards/format_reward": 0.0, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 551.4250095367431, "epoch": 0.9813333333333333, "grad_norm": 3.163010566375475, "kl": 0.25188522338867186, "learning_rate": 3.009251166909699e-09, "loss": 0.0101, "reward": 0.7416666708886623, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.7416666708886623, "rewards/format_reward": 0.0, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 610.1583431243896, "epoch": 0.9866666666666667, "grad_norm": 0.16665497610442914, "kl": 0.0295257568359375, "learning_rate": 1.4996709256617225e-09, "loss": 0.0012, "reward": 0.6750000052154064, "reward_std": 0.17320507764816284, "rewards/accuracy_reward": 0.6750000052154064, "rewards/format_reward": 0.0, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 609.3500144958496, "epoch": 0.992, "grad_norm": 0.8617998481481968, "kl": 0.2718994140625, "learning_rate": 5.103608012512195e-10, "loss": 0.0109, "reward": 0.7166666723787785, "reward_std": 0.20207259058952332, "rewards/accuracy_reward": 0.7166666723787785, "rewards/format_reward": 0.0, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 632.933349609375, "epoch": 0.9973333333333333, "grad_norm": 5.070012098805089, "kl": 0.15453033447265624, "learning_rate": 4.1664276081376796e-11, "loss": 0.0062, "reward": 0.7333333387970924, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.7333333387970924, "rewards/format_reward": 0.0, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 568.4375152587891, "epoch": 0.9994666666666666, "kl": 0.20409584045410156, "reward": 0.7916666734963655, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.7916666734963655, "rewards/format_reward": 0.0, "step": 937, "total_flos": 0.0, "train_loss": 0.04672712115020638, "train_runtime": 102271.8167, "train_samples_per_second": 0.073, "train_steps_per_second": 0.009 } ], "logging_steps": 5, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }