{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17142857142857143, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2693.6875610351562, "entropy": 0.3662109375, "epoch": 0.001142857142857143, "grad_norm": 0.12395373731851578, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.7708333535119891, "reward_std": 0.4629540964961052, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.5208333386108279, "step": 1 }, { "completion_length": 3127.3958435058594, "entropy": 0.353515625, "epoch": 0.002285714285714286, "grad_norm": 0.14846429228782654, "kl": 0.0, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.6458333637565374, "reward_std": 0.4249730706214905, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.3645833386108279, "step": 2 }, { "completion_length": 3685.041748046875, "entropy": 0.4443359375, "epoch": 0.0034285714285714284, "grad_norm": 0.10399040579795837, "kl": 4.1425228118896484e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.23958333674818277, "reward_std": 0.3668827787041664, "rewards/accuracy_reward": 0.0729166679084301, "rewards/format_reward": 0.16666667256504297, "step": 3 }, { "completion_length": 2380.291778564453, "entropy": 0.40478515625, "epoch": 0.004571428571428572, "grad_norm": 0.16352659463882446, "kl": 3.409385681152344e-05, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.8229166865348816, "reward_std": 0.507609948515892, "rewards/accuracy_reward": 0.19791667722165585, "rewards/format_reward": 0.6250000223517418, "step": 4 }, { "completion_length": 3441.2188720703125, "entropy": 0.45458984375, "epoch": 0.005714285714285714, "grad_norm": 0.15812984108924866, "kl": 4.1961669921875e-05, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.42708334885537624, "reward_std": 0.5058739930391312, "rewards/accuracy_reward": 0.07291666697710752, "rewards/format_reward": 0.35416667722165585, "step": 5 }, { "completion_length": 3382.3438110351562, "entropy": 0.45166015625, "epoch": 0.006857142857142857, "grad_norm": 0.15454305708408356, "kl": 4.26173210144043e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.40625000558793545, "reward_std": 0.5202516540884972, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.3229166744276881, "step": 6 }, { "completion_length": 3277.291748046875, "entropy": 0.39404296875, "epoch": 0.008, "grad_norm": 0.13690507411956787, "kl": 2.562999725341797e-05, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.8854166865348816, "reward_std": 0.6845719665288925, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.6145833432674408, "step": 7 }, { "completion_length": 2841.916748046875, "entropy": 0.36083984375, "epoch": 0.009142857142857144, "grad_norm": 0.1767321527004242, "kl": 2.4050474166870117e-05, "learning_rate": 5.333333333333333e-07, "loss": 0.0, "reward": 0.8854166967794299, "reward_std": 0.3672378845512867, "rewards/accuracy_reward": 0.3958333535119891, "rewards/format_reward": 0.4895833460614085, "step": 8 }, { "completion_length": 3480.6563110351562, "entropy": 0.4384765625, "epoch": 0.010285714285714285, "grad_norm": 0.15406936407089233, "kl": 3.796815872192383e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.5520833432674408, "reward_std": 0.6496799141168594, "rewards/accuracy_reward": 0.17708333488553762, "rewards/format_reward": 0.3750000074505806, "step": 9 }, { "completion_length": 2963.572967529297, "entropy": 0.3544921875, "epoch": 0.011428571428571429, "grad_norm": 0.15688633918762207, "kl": 2.5287270545959473e-05, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.5937500223517418, "reward_std": 0.5099271312355995, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.4166666753590107, "step": 10 }, { "completion_length": 3573.7500610351562, "entropy": 0.37890625, "epoch": 0.012571428571428572, "grad_norm": 0.12983083724975586, "kl": 2.5391578674316406e-05, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "reward": 0.3125000111758709, "reward_std": 0.5802810192108154, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.20833334140479565, "step": 11 }, { "completion_length": 2520.8958740234375, "entropy": 0.39111328125, "epoch": 0.013714285714285714, "grad_norm": 0.20449091494083405, "kl": 3.743171691894531e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.8020833656191826, "reward_std": 0.4411254972219467, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.6562500223517418, "step": 12 }, { "completion_length": 3038.041748046875, "entropy": 0.3828125, "epoch": 0.014857142857142857, "grad_norm": 0.14574433863162994, "kl": 2.5153160095214844e-05, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "reward": 0.6875000298023224, "reward_std": 0.3254704251885414, "rewards/accuracy_reward": 0.22916666697710752, "rewards/format_reward": 0.4583333432674408, "step": 13 }, { "completion_length": 3116.3125610351562, "entropy": 0.37109375, "epoch": 0.016, "grad_norm": 0.202586367726326, "kl": 1.9026920199394226e-05, "learning_rate": 9.333333333333333e-07, "loss": 0.0, "reward": 0.5833333507180214, "reward_std": 0.4630111753940582, "rewards/accuracy_reward": 0.21875001024454832, "rewards/format_reward": 0.3645833395421505, "step": 14 }, { "completion_length": 2924.0521240234375, "entropy": 0.36328125, "epoch": 0.017142857142857144, "grad_norm": 0.09130721539258957, "kl": 1.4469027519226074e-05, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.604166679084301, "reward_std": 0.22134994342923164, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.4062500074505806, "step": 15 }, { "completion_length": 3887.5521850585938, "entropy": 0.4755859375, "epoch": 0.018285714285714287, "grad_norm": 0.11806491017341614, "kl": 2.9832124710083008e-05, "learning_rate": 9.998781585307575e-07, "loss": 0.0, "reward": 0.11458333674818277, "reward_std": 0.26997610181570053, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.07291666977107525, "step": 16 }, { "completion_length": 2579.625030517578, "entropy": 0.44091796875, "epoch": 0.019428571428571427, "grad_norm": 0.2032601535320282, "kl": 3.93986701965332e-05, "learning_rate": 9.99512700102336e-07, "loss": 0.0, "reward": 0.7083333507180214, "reward_std": 0.39187028259038925, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.5104166753590107, "step": 17 }, { "completion_length": 3089.104248046875, "entropy": 0.3671875, "epoch": 0.02057142857142857, "grad_norm": 0.11376938223838806, "kl": 1.2531876564025879e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.5625000251457095, "reward_std": 0.35285963863134384, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.3958333386108279, "step": 18 }, { "completion_length": 3130.760498046875, "entropy": 0.39111328125, "epoch": 0.021714285714285714, "grad_norm": 0.09636794775724411, "kl": 2.8267502784729004e-05, "learning_rate": 9.98051855792412e-07, "loss": 0.0, "reward": 0.8125000111758709, "reward_std": 0.3496965616941452, "rewards/accuracy_reward": 0.36458333395421505, "rewards/format_reward": 0.44791667722165585, "step": 19 }, { "completion_length": 2585.9896545410156, "entropy": 0.329833984375, "epoch": 0.022857142857142857, "grad_norm": 0.15105831623077393, "kl": 6.628036499023438e-05, "learning_rate": 9.969572609838744e-07, "loss": 0.0, "reward": 0.9791666716337204, "reward_std": 0.3452813923358917, "rewards/accuracy_reward": 0.2812500037252903, "rewards/format_reward": 0.6979166716337204, "step": 20 }, { "completion_length": 2804.229248046875, "entropy": 0.42578125, "epoch": 0.024, "grad_norm": 0.2109123021364212, "kl": 0.00016552209854125977, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": 0.6145833432674408, "reward_std": 0.4177238382399082, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.46875002048909664, "step": 21 }, { "completion_length": 1903.1459045410156, "entropy": 0.419921875, "epoch": 0.025142857142857144, "grad_norm": 0.20996998250484467, "kl": 0.00026351213455200195, "learning_rate": 9.940426894506606e-07, "loss": 0.0, "reward": 1.1041667014360428, "reward_std": 0.4033822976052761, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.8125000149011612, "step": 22 }, { "completion_length": 2714.0729370117188, "entropy": 0.36865234375, "epoch": 0.026285714285714287, "grad_norm": 0.16544093191623688, "kl": 0.00011658668518066406, "learning_rate": 9.922242910178859e-07, "loss": 0.0, "reward": 0.6770833507180214, "reward_std": 0.6271640285849571, "rewards/accuracy_reward": 0.1770833432674408, "rewards/format_reward": 0.5000000223517418, "step": 23 }, { "completion_length": 2834.2396850585938, "entropy": 0.373046875, "epoch": 0.027428571428571427, "grad_norm": 0.10939397662878036, "kl": 0.0001084059476852417, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": 0.7916666865348816, "reward_std": 0.5711240321397781, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.572916679084301, "step": 24 }, { "completion_length": 2877.1354370117188, "entropy": 0.4296875, "epoch": 0.02857142857142857, "grad_norm": 0.10193013399839401, "kl": 0.00018364191055297852, "learning_rate": 9.878701917609207e-07, "loss": 0.0, "reward": 0.677083358168602, "reward_std": 0.2898401468992233, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.4375, "step": 25 }, { "completion_length": 3221.2396850585938, "entropy": 0.4248046875, "epoch": 0.029714285714285714, "grad_norm": 0.07458896934986115, "kl": 3.0487775802612305e-05, "learning_rate": 9.853368487582886e-07, "loss": 0.0, "reward": 0.6562500149011612, "reward_std": 0.25371449440717697, "rewards/accuracy_reward": 0.19791666977107525, "rewards/format_reward": 0.4583333358168602, "step": 26 }, { "completion_length": 3297.3959350585938, "entropy": 0.45703125, "epoch": 0.030857142857142857, "grad_norm": 0.0925775095820427, "kl": 0.00012201815843582153, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": 0.541666679084301, "reward_std": 0.4426998719573021, "rewards/accuracy_reward": 0.15625000279396772, "rewards/format_reward": 0.385416679084301, "step": 27 }, { "completion_length": 2984.2188110351562, "entropy": 0.3994140625, "epoch": 0.032, "grad_norm": 0.12723609805107117, "kl": 0.00015980005264282227, "learning_rate": 9.795644345114794e-07, "loss": 0.0, "reward": 0.8437500447034836, "reward_std": 0.48607436567544937, "rewards/accuracy_reward": 0.3333333460614085, "rewards/format_reward": 0.5104166865348816, "step": 28 }, { "completion_length": 3707.2501220703125, "entropy": 0.43408203125, "epoch": 0.03314285714285714, "grad_norm": 0.20130394399166107, "kl": 0.0003858804702758789, "learning_rate": 9.76328489131448e-07, "loss": 0.0, "reward": 0.2500000027939677, "reward_std": 0.37919554859399796, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.18750001024454832, "step": 29 }, { "completion_length": 3099.9063110351562, "entropy": 0.384765625, "epoch": 0.03428571428571429, "grad_norm": 0.13564985990524292, "kl": 0.0005750656127929688, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.8854166828095913, "reward_std": 0.5532158613204956, "rewards/accuracy_reward": 0.3125000009313226, "rewards/format_reward": 0.572916679084301, "step": 30 }, { "completion_length": 3310.8125610351562, "entropy": 0.40087890625, "epoch": 0.03542857142857143, "grad_norm": 0.14703762531280518, "kl": 0.0007152557373046875, "learning_rate": 9.69165882516764e-07, "loss": 0.0, "reward": 0.4583333469927311, "reward_std": 0.4937985762953758, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.2916666716337204, "step": 31 }, { "completion_length": 3543.666748046875, "entropy": 0.4521484375, "epoch": 0.036571428571428574, "grad_norm": 0.11301636695861816, "kl": 0.00032842159271240234, "learning_rate": 9.65243099959949e-07, "loss": 0.0, "reward": 0.6250000223517418, "reward_std": 0.46596524864435196, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.34375, "step": 32 }, { "completion_length": 3395.947998046875, "entropy": 0.384765625, "epoch": 0.037714285714285714, "grad_norm": 0.12107253074645996, "kl": 0.00042450428009033203, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": 0.604166679084301, "reward_std": 0.5497709587216377, "rewards/accuracy_reward": 0.18750000093132257, "rewards/format_reward": 0.41666667722165585, "step": 33 }, { "completion_length": 2621.218780517578, "entropy": 0.45263671875, "epoch": 0.038857142857142854, "grad_norm": 0.15317150950431824, "kl": 0.0013637542724609375, "learning_rate": 9.567251964768342e-07, "loss": 0.0001, "reward": 0.8541666865348816, "reward_std": 0.4670567326247692, "rewards/accuracy_reward": 0.31250001303851604, "rewards/format_reward": 0.5416666828095913, "step": 34 }, { "completion_length": 3166.3958740234375, "entropy": 0.43115234375, "epoch": 0.04, "grad_norm": 0.1469903290271759, "kl": 0.0011509060859680176, "learning_rate": 9.521346881455354e-07, "loss": 0.0, "reward": 0.6458333656191826, "reward_std": 0.6130613833665848, "rewards/accuracy_reward": 0.23958333767950535, "rewards/format_reward": 0.4062500149011612, "step": 35 }, { "completion_length": 3509.697998046875, "entropy": 0.513671875, "epoch": 0.04114285714285714, "grad_norm": 0.11033376306295395, "kl": 0.0011191368103027344, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "reward": 0.23958333395421505, "reward_std": 0.24118434637784958, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.20833334140479565, "step": 36 }, { "completion_length": 3363.8333740234375, "entropy": 0.42138671875, "epoch": 0.04228571428571429, "grad_norm": 0.11778294295072556, "kl": 0.0008115768432617188, "learning_rate": 9.42302986163543e-07, "loss": 0.0, "reward": 0.2812500149011612, "reward_std": 0.13804075866937637, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.25, "step": 37 }, { "completion_length": 3610.8438110351562, "entropy": 0.44677734375, "epoch": 0.04342857142857143, "grad_norm": 0.061884235590696335, "kl": 0.0005736351013183594, "learning_rate": 9.370671165529144e-07, "loss": 0.0, "reward": 0.21875000558793545, "reward_std": 0.17128896713256836, "rewards/accuracy_reward": 0.10416666697710752, "rewards/format_reward": 0.11458333861082792, "step": 38 }, { "completion_length": 2926.4063110351562, "entropy": 0.36669921875, "epoch": 0.044571428571428574, "grad_norm": 0.1068028062582016, "kl": 0.0011527538299560547, "learning_rate": 9.316216432703916e-07, "loss": 0.0, "reward": 0.7708333656191826, "reward_std": 0.1930682435631752, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.5208333507180214, "step": 39 }, { "completion_length": 2785.1146545410156, "entropy": 0.388671875, "epoch": 0.045714285714285714, "grad_norm": 0.17572174966335297, "kl": 0.0032024383544921875, "learning_rate": 9.259695151358214e-07, "loss": 0.0001, "reward": 0.7291666902601719, "reward_std": 0.3721684589982033, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.5312500186264515, "step": 40 }, { "completion_length": 3123.947998046875, "entropy": 0.35791015625, "epoch": 0.046857142857142854, "grad_norm": 0.144905224442482, "kl": 0.0007574558258056641, "learning_rate": 9.20113792876298e-07, "loss": 0.0, "reward": 0.5416666865348816, "reward_std": 0.4893290549516678, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.416666679084301, "step": 41 }, { "completion_length": 3056.197998046875, "entropy": 0.48193359375, "epoch": 0.048, "grad_norm": 0.07001210004091263, "kl": 0.000598907470703125, "learning_rate": 9.140576474687263e-07, "loss": 0.0, "reward": 0.30208333395421505, "reward_std": 0.15690935403108597, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.2812500009313226, "step": 42 }, { "completion_length": 3097.479248046875, "entropy": 0.4033203125, "epoch": 0.04914285714285714, "grad_norm": 0.09348543733358383, "kl": 0.001148223876953125, "learning_rate": 9.078043584226815e-07, "loss": 0.0, "reward": 0.4895833358168602, "reward_std": 0.3020758181810379, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.32291666977107525, "step": 43 }, { "completion_length": 2797.7084197998047, "entropy": 0.39013671875, "epoch": 0.05028571428571429, "grad_norm": 0.15556485950946808, "kl": 0.0014767646789550781, "learning_rate": 9.013573120044966e-07, "loss": 0.0001, "reward": 0.8020833386108279, "reward_std": 0.3607782945036888, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.5312500102445483, "step": 44 }, { "completion_length": 3618.791748046875, "entropy": 0.423828125, "epoch": 0.05142857142857143, "grad_norm": 0.09923144429922104, "kl": 0.0026645660400390625, "learning_rate": 8.9471999940354e-07, "loss": 0.0001, "reward": 0.5833333348855376, "reward_std": 0.4189528524875641, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.3229166781529784, "step": 45 }, { "completion_length": 3482.479248046875, "entropy": 0.50634765625, "epoch": 0.052571428571428575, "grad_norm": 0.12269324064254761, "kl": 0.0013875961303710938, "learning_rate": 8.878960148416747e-07, "loss": 0.0001, "reward": 0.22916667722165585, "reward_std": 0.26679350435733795, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.18750000186264515, "step": 46 }, { "completion_length": 2958.291748046875, "entropy": 0.390625, "epoch": 0.053714285714285714, "grad_norm": 0.16963143646717072, "kl": 0.0011917352676391602, "learning_rate": 8.808890536269229e-07, "loss": 0.0, "reward": 0.8854166967794299, "reward_std": 0.5451135858893394, "rewards/accuracy_reward": 0.3437500149011612, "rewards/format_reward": 0.5416666669771075, "step": 47 }, { "completion_length": 2956.416717529297, "entropy": 0.396484375, "epoch": 0.054857142857142854, "grad_norm": 0.14105089008808136, "kl": 0.0033426284790039062, "learning_rate": 8.737029101523929e-07, "loss": 0.0001, "reward": 0.7395833507180214, "reward_std": 0.5457281768321991, "rewards/accuracy_reward": 0.29166666977107525, "rewards/format_reward": 0.4479166679084301, "step": 48 }, { "completion_length": 2448.1146850585938, "entropy": 0.36865234375, "epoch": 0.056, "grad_norm": 0.15970121324062347, "kl": 0.006764888763427734, "learning_rate": 8.663414758415478e-07, "loss": 0.0003, "reward": 0.895833395421505, "reward_std": 0.464010052382946, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.6458333507180214, "step": 49 }, { "completion_length": 3050.1041870117188, "entropy": 0.34521484375, "epoch": 0.05714285714285714, "grad_norm": 0.12386268377304077, "kl": 0.0011911392211914062, "learning_rate": 8.588087370409302e-07, "loss": 0.0, "reward": 0.6562500325962901, "reward_std": 0.4570996016263962, "rewards/accuracy_reward": 0.2916666818782687, "rewards/format_reward": 0.3645833497866988, "step": 50 }, { "completion_length": 2495.2708740234375, "entropy": 0.44873046875, "epoch": 0.05828571428571429, "grad_norm": 0.12384030222892761, "kl": 0.005596160888671875, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.6875000149011612, "reward_std": 0.3033446706831455, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.5416666716337204, "step": 51 }, { "completion_length": 3027.406280517578, "entropy": 0.384765625, "epoch": 0.05942857142857143, "grad_norm": 0.0901699811220169, "kl": 0.0021982192993164062, "learning_rate": 8.432457529696548e-07, "loss": 0.0001, "reward": 0.8750000298023224, "reward_std": 0.5351639539003372, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.4791666865348816, "step": 52 }, { "completion_length": 2952.8646850585938, "entropy": 0.41845703125, "epoch": 0.060571428571428575, "grad_norm": 0.09236861765384674, "kl": 0.0012669563293457031, "learning_rate": 8.352239353294194e-07, "loss": 0.0001, "reward": 0.8541666865348816, "reward_std": 0.5214647725224495, "rewards/accuracy_reward": 0.260416679084301, "rewards/format_reward": 0.5937500074505806, "step": 53 }, { "completion_length": 2996.1250610351562, "entropy": 0.3837890625, "epoch": 0.061714285714285715, "grad_norm": 0.15135987102985382, "kl": 0.0015659332275390625, "learning_rate": 8.270476638965461e-07, "loss": 0.0001, "reward": 0.9479166939854622, "reward_std": 0.7639089524745941, "rewards/accuracy_reward": 0.4062500111758709, "rewards/format_reward": 0.5416666828095913, "step": 54 }, { "completion_length": 3076.2500610351562, "entropy": 0.4130859375, "epoch": 0.06285714285714286, "grad_norm": 0.12680813670158386, "kl": 0.0023276805877685547, "learning_rate": 8.187213662662538e-07, "loss": 0.0001, "reward": 0.6979166865348816, "reward_std": 0.5675121322274208, "rewards/accuracy_reward": 0.23958333861082792, "rewards/format_reward": 0.458333358168602, "step": 55 }, { "completion_length": 3058.104248046875, "entropy": 0.4072265625, "epoch": 0.064, "grad_norm": 0.10628776252269745, "kl": 0.0009794235229492188, "learning_rate": 8.102495512755938e-07, "loss": 0.0, "reward": 0.6562500298023224, "reward_std": 0.3362164571881294, "rewards/accuracy_reward": 0.19791666697710752, "rewards/format_reward": 0.4583333469927311, "step": 56 }, { "completion_length": 3532.2813110351562, "entropy": 0.3369140625, "epoch": 0.06514285714285714, "grad_norm": 0.09391733258962631, "kl": 0.0005993843078613281, "learning_rate": 8.01636806561836e-07, "loss": 0.0, "reward": 0.3854166865348816, "reward_std": 0.3325711265206337, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.3020833432674408, "step": 57 }, { "completion_length": 2239.1145935058594, "entropy": 0.322998046875, "epoch": 0.06628571428571428, "grad_norm": 0.11698172241449356, "kl": 0.0037631988525390625, "learning_rate": 7.928877960781808e-07, "loss": 0.0002, "reward": 1.0000000223517418, "reward_std": 0.39449498802423477, "rewards/accuracy_reward": 0.2604166669771075, "rewards/format_reward": 0.7395833358168602, "step": 58 }, { "completion_length": 3092.3438110351562, "entropy": 0.3662109375, "epoch": 0.06742857142857143, "grad_norm": 0.10882271081209183, "kl": 0.001026153564453125, "learning_rate": 7.840072575681468e-07, "loss": 0.0, "reward": 0.5625000298023224, "reward_std": 0.39667778089642525, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.36458333395421505, "step": 59 }, { "completion_length": 3120.5209350585938, "entropy": 0.38037109375, "epoch": 0.06857142857142857, "grad_norm": 0.1319083720445633, "kl": 0.0019273757934570312, "learning_rate": 7.75e-07, "loss": 0.0001, "reward": 0.583333358168602, "reward_std": 0.49578939378261566, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.4479166902601719, "step": 60 }, { "completion_length": 2971.9166870117188, "entropy": 0.36669921875, "epoch": 0.06971428571428571, "grad_norm": 0.17734739184379578, "kl": 0.0010924339294433594, "learning_rate": 7.658709009626109e-07, "loss": 0.0, "reward": 0.8020833730697632, "reward_std": 0.5149242952466011, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.5937500149011612, "step": 61 }, { "completion_length": 2529.7188720703125, "entropy": 0.329833984375, "epoch": 0.07085714285714285, "grad_norm": 0.26185768842697144, "kl": 0.016622543334960938, "learning_rate": 7.566249040241553e-07, "loss": 0.0007, "reward": 0.9270833656191826, "reward_std": 0.4443442225456238, "rewards/accuracy_reward": 0.27083334885537624, "rewards/format_reward": 0.6562500149011612, "step": 62 }, { "completion_length": 2196.1250610351562, "entropy": 0.36279296875, "epoch": 0.072, "grad_norm": 0.1202726885676384, "kl": 0.0028791427612304688, "learning_rate": 7.472670160550848e-07, "loss": 0.0001, "reward": 1.1562500596046448, "reward_std": 0.43789636343717575, "rewards/accuracy_reward": 0.385416679084301, "rewards/format_reward": 0.7708333432674408, "step": 63 }, { "completion_length": 3074.041717529297, "entropy": 0.42041015625, "epoch": 0.07314285714285715, "grad_norm": 0.10591074079275131, "kl": 0.0019989013671875, "learning_rate": 7.37802304516818e-07, "loss": 0.0001, "reward": 0.6250000149011612, "reward_std": 0.4529266282916069, "rewards/accuracy_reward": 0.18750000651925802, "rewards/format_reward": 0.4375000074505806, "step": 64 }, { "completion_length": 2871.416748046875, "entropy": 0.365478515625, "epoch": 0.07428571428571429, "grad_norm": 0.16383042931556702, "kl": 0.001850128173828125, "learning_rate": 7.282358947176205e-07, "loss": 0.0001, "reward": 0.6666666865348816, "reward_std": 0.38071464747190475, "rewards/accuracy_reward": 0.19791667722165585, "rewards/format_reward": 0.4687500149011612, "step": 65 }, { "completion_length": 2014.6354370117188, "entropy": 0.33642578125, "epoch": 0.07542857142857143, "grad_norm": 0.26954010128974915, "kl": 0.008558273315429688, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": 0.9375000447034836, "reward_std": 0.35192636400461197, "rewards/accuracy_reward": 0.34375000838190317, "rewards/format_reward": 0.59375, "step": 66 }, { "completion_length": 3587.479248046875, "entropy": 0.36572265625, "epoch": 0.07657142857142857, "grad_norm": 0.08370436728000641, "kl": 0.001728057861328125, "learning_rate": 7.08818754121241e-07, "loss": 0.0001, "reward": 0.22916667442768812, "reward_std": 0.2259194478392601, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.21875000279396772, "step": 67 }, { "completion_length": 2018.3750457763672, "entropy": 0.36474609375, "epoch": 0.07771428571428571, "grad_norm": 0.18221524357795715, "kl": 0.0046844482421875, "learning_rate": 6.989785380482312e-07, "loss": 0.0002, "reward": 0.916666716337204, "reward_std": 0.35510556399822235, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.6562500149011612, "step": 68 }, { "completion_length": 2231.1979370117188, "entropy": 0.41796875, "epoch": 0.07885714285714286, "grad_norm": 0.19485469162464142, "kl": 0.0045928955078125, "learning_rate": 6.890576474687263e-07, "loss": 0.0002, "reward": 0.6666666939854622, "reward_std": 0.3663952201604843, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.6041666865348816, "step": 69 }, { "completion_length": 3180.8854370117188, "entropy": 0.38232421875, "epoch": 0.08, "grad_norm": 0.13257598876953125, "kl": 0.00274658203125, "learning_rate": 6.790614547199906e-07, "loss": 0.0001, "reward": 0.45833334885537624, "reward_std": 0.4246904104948044, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.38541666977107525, "step": 70 }, { "completion_length": 2643.1771240234375, "entropy": 0.43798828125, "epoch": 0.08114285714285714, "grad_norm": 0.13770414888858795, "kl": 0.0029726028442382812, "learning_rate": 6.68995372916741e-07, "loss": 0.0001, "reward": 0.7500000149011612, "reward_std": 0.30269280821084976, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.5000000149011612, "step": 71 }, { "completion_length": 2940.6146545410156, "entropy": 0.4892578125, "epoch": 0.08228571428571428, "grad_norm": 0.20104120671749115, "kl": 0.0030574798583984375, "learning_rate": 6.588648530198504e-07, "loss": 0.0001, "reward": 0.5000000149011612, "reward_std": 0.47806398570537567, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.4375000111758709, "step": 72 }, { "completion_length": 3779.4583740234375, "entropy": 0.513671875, "epoch": 0.08342857142857144, "grad_norm": 0.09181614220142365, "kl": 0.0015926361083984375, "learning_rate": 6.486753808845564e-07, "loss": 0.0001, "reward": 0.3229166744276881, "reward_std": 0.4466712549328804, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.19791667070239782, "step": 73 }, { "completion_length": 3236.4688720703125, "entropy": 0.4130859375, "epoch": 0.08457142857142858, "grad_norm": 0.1495177149772644, "kl": 0.0028803348541259766, "learning_rate": 6.384324742897735e-07, "loss": 0.0001, "reward": 0.645833358168602, "reward_std": 0.4744175747036934, "rewards/accuracy_reward": 0.26041666977107525, "rewards/format_reward": 0.385416679084301, "step": 74 }, { "completion_length": 3159.635498046875, "entropy": 0.404296875, "epoch": 0.08571428571428572, "grad_norm": 0.11836569011211395, "kl": 0.0026121139526367188, "learning_rate": 6.281416799501187e-07, "loss": 0.0001, "reward": 0.697916679084301, "reward_std": 0.46560388058423996, "rewards/accuracy_reward": 0.22916666697710752, "rewards/format_reward": 0.4687500111758709, "step": 75 }, { "completion_length": 2450.041748046875, "entropy": 0.412353515625, "epoch": 0.08685714285714285, "grad_norm": 0.1628679782152176, "kl": 0.0018434524536132812, "learning_rate": 6.178085705122674e-07, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.31900282949209213, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.6041666716337204, "step": 76 }, { "completion_length": 3208.3751220703125, "entropy": 0.453125, "epoch": 0.088, "grad_norm": 0.1415146142244339, "kl": 0.002140045166015625, "learning_rate": 6.074387415372676e-07, "loss": 0.0001, "reward": 0.5104166818782687, "reward_std": 0.44949568808078766, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.4166666744276881, "step": 77 }, { "completion_length": 2896.6771240234375, "entropy": 0.3759765625, "epoch": 0.08914285714285715, "grad_norm": 0.13142429292201996, "kl": 0.0013580322265625, "learning_rate": 5.97037808470444e-07, "loss": 0.0001, "reward": 0.7500000298023224, "reward_std": 0.5779594928026199, "rewards/accuracy_reward": 0.250000006519258, "rewards/format_reward": 0.5000000223517418, "step": 78 }, { "completion_length": 2312.0938110351562, "entropy": 0.3544921875, "epoch": 0.09028571428571429, "grad_norm": 0.1474093347787857, "kl": 0.0019969940185546875, "learning_rate": 5.866114036005362e-07, "loss": 0.0001, "reward": 0.8125000149011612, "reward_std": 0.36936958134174347, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.604166679084301, "step": 79 }, { "completion_length": 3418.0833740234375, "entropy": 0.49755859375, "epoch": 0.09142857142857143, "grad_norm": 0.14068344235420227, "kl": 0.002742767333984375, "learning_rate": 5.761651730097142e-07, "loss": 0.0001, "reward": 0.5208333544433117, "reward_std": 0.4246201291680336, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.3541666716337204, "step": 80 }, { "completion_length": 2992.0729370117188, "entropy": 0.56787109375, "epoch": 0.09257142857142857, "grad_norm": 0.1596606820821762, "kl": 0.005756378173828125, "learning_rate": 5.657047735161255e-07, "loss": 0.0002, "reward": 0.4895833432674408, "reward_std": 0.3185732662677765, "rewards/accuracy_reward": 0.11458333395421505, "rewards/format_reward": 0.3750000074505806, "step": 81 }, { "completion_length": 2483.9584350585938, "entropy": 0.39306640625, "epoch": 0.09371428571428571, "grad_norm": 0.13652034103870392, "kl": 0.002727508544921875, "learning_rate": 5.552358696106288e-07, "loss": 0.0001, "reward": 0.8020833432674408, "reward_std": 0.24248424544930458, "rewards/accuracy_reward": 0.3020833395421505, "rewards/format_reward": 0.5000000074505806, "step": 82 }, { "completion_length": 2964.8646850585938, "entropy": 0.48486328125, "epoch": 0.09485714285714286, "grad_norm": 0.10738710314035416, "kl": 0.00264739990234375, "learning_rate": 5.447641303893714e-07, "loss": 0.0001, "reward": 0.5312500074505806, "reward_std": 0.3985592797398567, "rewards/accuracy_reward": 0.1770833395421505, "rewards/format_reward": 0.3541666716337204, "step": 83 }, { "completion_length": 3069.7396240234375, "entropy": 0.45263671875, "epoch": 0.096, "grad_norm": 0.14286305010318756, "kl": 0.0017604827880859375, "learning_rate": 5.342952264838747e-07, "loss": 0.0001, "reward": 0.739583358168602, "reward_std": 0.44136959314346313, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.4895833432674408, "step": 84 }, { "completion_length": 2664.2188110351562, "entropy": 0.324951171875, "epoch": 0.09714285714285714, "grad_norm": 0.13478516042232513, "kl": 0.002002716064453125, "learning_rate": 5.238348269902859e-07, "loss": 0.0001, "reward": 0.7604166716337204, "reward_std": 0.5178688690066338, "rewards/accuracy_reward": 0.15625000186264515, "rewards/format_reward": 0.6041666679084301, "step": 85 }, { "completion_length": 2774.291748046875, "entropy": 0.465576171875, "epoch": 0.09828571428571428, "grad_norm": 0.16704270243644714, "kl": 0.00365447998046875, "learning_rate": 5.133885963994639e-07, "loss": 0.0001, "reward": 0.6250000102445483, "reward_std": 0.2606133744120598, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.4583333386108279, "step": 86 }, { "completion_length": 2400.302215576172, "entropy": 0.4453125, "epoch": 0.09942857142857142, "grad_norm": 0.23015545308589935, "kl": 0.0040435791015625, "learning_rate": 5.02962191529556e-07, "loss": 0.0002, "reward": 0.8125000447034836, "reward_std": 0.5173326060175896, "rewards/accuracy_reward": 0.18750001024454832, "rewards/format_reward": 0.625, "step": 87 }, { "completion_length": 2469.7396697998047, "entropy": 0.41943359375, "epoch": 0.10057142857142858, "grad_norm": 0.16470564901828766, "kl": 0.0041961669921875, "learning_rate": 4.925612584627324e-07, "loss": 0.0002, "reward": 1.0208333730697632, "reward_std": 0.693773627281189, "rewards/accuracy_reward": 0.3750000186264515, "rewards/format_reward": 0.6458333507180214, "step": 88 }, { "completion_length": 2834.635498046875, "entropy": 0.37939453125, "epoch": 0.10171428571428572, "grad_norm": 0.1899234652519226, "kl": 0.003200531005859375, "learning_rate": 4.821914294877326e-07, "loss": 0.0001, "reward": 0.6562500149011612, "reward_std": 0.5321320816874504, "rewards/accuracy_reward": 0.17708333395421505, "rewards/format_reward": 0.479166679084301, "step": 89 }, { "completion_length": 2226.3959045410156, "entropy": 0.59619140625, "epoch": 0.10285714285714286, "grad_norm": 0.15134188532829285, "kl": 0.0078887939453125, "learning_rate": 4.7185832004988133e-07, "loss": 0.0003, "reward": 0.6458333563059568, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.6145833460614085, "step": 90 }, { "completion_length": 2510.8334045410156, "entropy": 0.423828125, "epoch": 0.104, "grad_norm": 0.15344281494617462, "kl": 0.0041046142578125, "learning_rate": 4.6156752571022637e-07, "loss": 0.0002, "reward": 0.8958333637565374, "reward_std": 0.40820014476776123, "rewards/accuracy_reward": 0.2604166669771075, "rewards/format_reward": 0.6354166818782687, "step": 91 }, { "completion_length": 2551.062530517578, "entropy": 0.390869140625, "epoch": 0.10514285714285715, "grad_norm": 0.12282641232013702, "kl": 0.00506591796875, "learning_rate": 4.513246191154434e-07, "loss": 0.0002, "reward": 0.6770833432674408, "reward_std": 0.3805246874690056, "rewards/accuracy_reward": 0.09375000093132257, "rewards/format_reward": 0.5833333432674408, "step": 92 }, { "completion_length": 3784.7188110351562, "entropy": 0.638671875, "epoch": 0.10628571428571429, "grad_norm": 0.2033790946006775, "kl": 0.0058460235595703125, "learning_rate": 4.4113514698014953e-07, "loss": 0.0002, "reward": 0.0729166679084301, "reward_std": 0.18205293267965317, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0729166679084301, "step": 93 }, { "completion_length": 2915.9375, "entropy": 0.5439453125, "epoch": 0.10742857142857143, "grad_norm": 0.19546350836753845, "kl": 0.004344940185546875, "learning_rate": 4.3100462708325914e-07, "loss": 0.0002, "reward": 0.5833333395421505, "reward_std": 0.427902989089489, "rewards/accuracy_reward": 0.18750000279396772, "rewards/format_reward": 0.3958333358168602, "step": 94 }, { "completion_length": 3644.4688110351562, "entropy": 0.47607421875, "epoch": 0.10857142857142857, "grad_norm": 0.09118141978979111, "kl": 0.0022993087768554688, "learning_rate": 4.209385452800095e-07, "loss": 0.0001, "reward": 0.3958333358168602, "reward_std": 0.4476298391819, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.2916666679084301, "step": 95 }, { "completion_length": 2458.0833740234375, "entropy": 0.37353515625, "epoch": 0.10971428571428571, "grad_norm": 0.16451668739318848, "kl": 0.0038547515869140625, "learning_rate": 4.1094235253127374e-07, "loss": 0.0002, "reward": 0.8750000447034836, "reward_std": 0.4621882885694504, "rewards/accuracy_reward": 0.2708333348855376, "rewards/format_reward": 0.6041666865348816, "step": 96 }, { "completion_length": 2523.0833740234375, "entropy": 0.421875, "epoch": 0.11085714285714286, "grad_norm": 0.1885625571012497, "kl": 0.003082275390625, "learning_rate": 4.0102146195176887e-07, "loss": 0.0001, "reward": 0.927083358168602, "reward_std": 0.5721743106842041, "rewards/accuracy_reward": 0.2708333348855376, "rewards/format_reward": 0.6562500223517418, "step": 97 }, { "completion_length": 2336.5625915527344, "entropy": 0.384033203125, "epoch": 0.112, "grad_norm": 0.15700186789035797, "kl": 0.0025310516357421875, "learning_rate": 3.911812458787591e-07, "loss": 0.0001, "reward": 0.7916666865348816, "reward_std": 0.2110944464802742, "rewards/accuracy_reward": 0.13541666697710752, "rewards/format_reward": 0.6562500223517418, "step": 98 }, { "completion_length": 2524.8958740234375, "entropy": 0.379150390625, "epoch": 0.11314285714285714, "grad_norm": 0.1743498593568802, "kl": 0.003711700439453125, "learning_rate": 3.8142703296283953e-07, "loss": 0.0001, "reward": 0.8125000204890966, "reward_std": 0.4803639128804207, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.5416666697710752, "step": 99 }, { "completion_length": 2306.0521545410156, "entropy": 0.35009765625, "epoch": 0.11428571428571428, "grad_norm": 0.12883873283863068, "kl": 0.003726959228515625, "learning_rate": 3.7176410528237945e-07, "loss": 0.0001, "reward": 1.0416666865348816, "reward_std": 0.44550345838069916, "rewards/accuracy_reward": 0.3437500074505806, "rewards/format_reward": 0.6979166716337204, "step": 100 }, { "completion_length": 2115.5313110351562, "entropy": 0.437255859375, "epoch": 0.11542857142857142, "grad_norm": 0.1613183170557022, "kl": 0.00330352783203125, "learning_rate": 3.62197695483182e-07, "loss": 0.0001, "reward": 0.833333358168602, "reward_std": 0.2652370296418667, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.6770833358168602, "step": 101 }, { "completion_length": 1823.4167175292969, "entropy": 0.369384765625, "epoch": 0.11657142857142858, "grad_norm": 0.11039572954177856, "kl": 0.0045318603515625, "learning_rate": 3.5273298394491515e-07, "loss": 0.0002, "reward": 0.9375000298023224, "reward_std": 0.2463684342801571, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.8125000298023224, "step": 102 }, { "completion_length": 2301.593780517578, "entropy": 0.376953125, "epoch": 0.11771428571428572, "grad_norm": 0.27249184250831604, "kl": 0.00434112548828125, "learning_rate": 3.433750959758446e-07, "loss": 0.0002, "reward": 0.895833358168602, "reward_std": 0.5925451144576073, "rewards/accuracy_reward": 0.19791667815297842, "rewards/format_reward": 0.6979166865348816, "step": 103 }, { "completion_length": 2650.4063720703125, "entropy": 0.45458984375, "epoch": 0.11885714285714286, "grad_norm": 0.14005857706069946, "kl": 0.00519561767578125, "learning_rate": 3.3412909903738936e-07, "loss": 0.0002, "reward": 0.572916679084301, "reward_std": 0.42935075983405113, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.479166679084301, "step": 104 }, { "completion_length": 2248.6666870117188, "entropy": 0.35205078125, "epoch": 0.12, "grad_norm": 0.18584585189819336, "kl": 0.0032196044921875, "learning_rate": 3.250000000000001e-07, "loss": 0.0001, "reward": 0.885416716337204, "reward_std": 0.5438356846570969, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.6354166865348816, "step": 105 }, { "completion_length": 2213.1250915527344, "entropy": 0.30859375, "epoch": 0.12114285714285715, "grad_norm": 0.13285432755947113, "kl": 0.003154754638671875, "learning_rate": 3.159927424318531e-07, "loss": 0.0001, "reward": 1.0625000204890966, "reward_std": 0.4201487675309181, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.6562500055879354, "step": 106 }, { "completion_length": 2495.4166870117188, "entropy": 0.54296875, "epoch": 0.12228571428571429, "grad_norm": 0.2279452681541443, "kl": 0.005664825439453125, "learning_rate": 3.0711220392181934e-07, "loss": 0.0002, "reward": 0.8437500298023224, "reward_std": 0.39580530673265457, "rewards/accuracy_reward": 0.19791666697710752, "rewards/format_reward": 0.645833358168602, "step": 107 }, { "completion_length": 2470.9896545410156, "entropy": 0.41259765625, "epoch": 0.12342857142857143, "grad_norm": 0.1852155178785324, "kl": 0.005123138427734375, "learning_rate": 2.9836319343816397e-07, "loss": 0.0002, "reward": 0.7604167014360428, "reward_std": 0.44793232530355453, "rewards/accuracy_reward": 0.1770833358168602, "rewards/format_reward": 0.583333358168602, "step": 108 }, { "completion_length": 2746.9166870117188, "entropy": 0.41796875, "epoch": 0.12457142857142857, "grad_norm": 0.15881556272506714, "kl": 0.00360107421875, "learning_rate": 2.897504487244061e-07, "loss": 0.0001, "reward": 0.6458333656191826, "reward_std": 0.3334706202149391, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.4895833507180214, "step": 109 }, { "completion_length": 2482.5000610351562, "entropy": 0.400390625, "epoch": 0.12571428571428572, "grad_norm": 0.1606108397245407, "kl": 0.0030384063720703125, "learning_rate": 2.812786337337463e-07, "loss": 0.0001, "reward": 0.8750000298023224, "reward_std": 0.560508705675602, "rewards/accuracy_reward": 0.22916667349636555, "rewards/format_reward": 0.645833358168602, "step": 110 }, { "completion_length": 2674.791748046875, "entropy": 0.486572265625, "epoch": 0.12685714285714286, "grad_norm": 0.14915555715560913, "kl": 0.00421905517578125, "learning_rate": 2.729523361034538e-07, "loss": 0.0002, "reward": 0.614583358168602, "reward_std": 0.38519187271595, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4895833507180214, "step": 111 }, { "completion_length": 2968.5834350585938, "entropy": 0.470703125, "epoch": 0.128, "grad_norm": 0.2177121490240097, "kl": 0.0030193328857421875, "learning_rate": 2.6477606467058035e-07, "loss": 0.0001, "reward": 0.8229166865348816, "reward_std": 0.5380749329924583, "rewards/accuracy_reward": 0.2708333386108279, "rewards/format_reward": 0.5520833507180214, "step": 112 }, { "completion_length": 1850.3125457763672, "entropy": 0.37451171875, "epoch": 0.12914285714285714, "grad_norm": 0.18240460753440857, "kl": 0.004482269287109375, "learning_rate": 2.567542470303452e-07, "loss": 0.0002, "reward": 0.9375000149011612, "reward_std": 0.39232632517814636, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.7395833432674408, "step": 113 }, { "completion_length": 1991.9479675292969, "entropy": 0.34228515625, "epoch": 0.13028571428571428, "grad_norm": 0.12856441736221313, "kl": 0.00384521484375, "learning_rate": 2.488912271385139e-07, "loss": 0.0002, "reward": 0.9687500298023224, "reward_std": 0.41063307225704193, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.802083358168602, "step": 114 }, { "completion_length": 2642.2813110351562, "entropy": 0.470703125, "epoch": 0.13142857142857142, "grad_norm": 0.1557956039905548, "kl": 0.005847930908203125, "learning_rate": 2.411912629590699e-07, "loss": 0.0002, "reward": 0.7500000149011612, "reward_std": 0.3538191542029381, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.5000000149011612, "step": 115 }, { "completion_length": 3452.9271240234375, "entropy": 0.5498046875, "epoch": 0.13257142857142856, "grad_norm": 0.12163397669792175, "kl": 0.00417327880859375, "learning_rate": 2.336585241584522e-07, "loss": 0.0002, "reward": 0.3125000009313226, "reward_std": 0.3494237996637821, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.21875000558793545, "step": 116 }, { "completion_length": 2859.000030517578, "entropy": 0.5322265625, "epoch": 0.1337142857142857, "grad_norm": 0.23236258327960968, "kl": 0.0061187744140625, "learning_rate": 2.2629708984760706e-07, "loss": 0.0002, "reward": 0.48958336375653744, "reward_std": 0.34661681205034256, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.4375000027939677, "step": 117 }, { "completion_length": 2824.354248046875, "entropy": 0.3935546875, "epoch": 0.13485714285714287, "grad_norm": 0.11868440359830856, "kl": 0.002780914306640625, "learning_rate": 2.1911094637307714e-07, "loss": 0.0001, "reward": 1.0416666865348816, "reward_std": 0.6117755249142647, "rewards/accuracy_reward": 0.4270833358168602, "rewards/format_reward": 0.6145833432674408, "step": 118 }, { "completion_length": 2026.5937805175781, "entropy": 0.446533203125, "epoch": 0.136, "grad_norm": 0.1918383240699768, "kl": 0.00518035888671875, "learning_rate": 2.1210398515832536e-07, "loss": 0.0002, "reward": 0.989583358168602, "reward_std": 0.3108450919389725, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.7291666865348816, "step": 119 }, { "completion_length": 1974.4270935058594, "entropy": 0.43359375, "epoch": 0.13714285714285715, "grad_norm": 0.19848716259002686, "kl": 0.006320953369140625, "learning_rate": 2.0528000059645995e-07, "loss": 0.0003, "reward": 0.864583358168602, "reward_std": 0.4301687255501747, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.6979166865348816, "step": 120 }, { "completion_length": 1066.2396240234375, "entropy": 0.30810546875, "epoch": 0.1382857142857143, "grad_norm": 0.15486152470111847, "kl": 0.00482940673828125, "learning_rate": 1.986426879955034e-07, "loss": 0.0002, "reward": 1.2500000298023224, "reward_std": 0.25760992616415024, "rewards/accuracy_reward": 0.3020833386108279, "rewards/format_reward": 0.9479166865348816, "step": 121 }, { "completion_length": 2534.8958740234375, "entropy": 0.4521484375, "epoch": 0.13942857142857143, "grad_norm": 0.12797077000141144, "kl": 0.00380706787109375, "learning_rate": 1.9219564157731844e-07, "loss": 0.0002, "reward": 0.8229167014360428, "reward_std": 0.3391122668981552, "rewards/accuracy_reward": 0.20833333861082792, "rewards/format_reward": 0.6145833432674408, "step": 122 }, { "completion_length": 2572.791748046875, "entropy": 0.451416015625, "epoch": 0.14057142857142857, "grad_norm": 0.1227995902299881, "kl": 0.0032405853271484375, "learning_rate": 1.8594235253127372e-07, "loss": 0.0001, "reward": 0.7083333544433117, "reward_std": 0.3739900141954422, "rewards/accuracy_reward": 0.1354166679084301, "rewards/format_reward": 0.5729166828095913, "step": 123 }, { "completion_length": 2198.6563110351562, "entropy": 0.33984375, "epoch": 0.1417142857142857, "grad_norm": 0.2465568333864212, "kl": 0.015537261962890625, "learning_rate": 1.7988620712370195e-07, "loss": 0.0006, "reward": 0.9687500298023224, "reward_std": 0.5658619552850723, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.7187500298023224, "step": 124 }, { "completion_length": 2865.250030517578, "entropy": 0.431640625, "epoch": 0.14285714285714285, "grad_norm": 0.10473211109638214, "kl": 0.00345611572265625, "learning_rate": 1.7403048486417868e-07, "loss": 0.0001, "reward": 0.6979166697710752, "reward_std": 0.3267679661512375, "rewards/accuracy_reward": 0.30208333395421505, "rewards/format_reward": 0.3958333386108279, "step": 125 }, { "completion_length": 2886.2188720703125, "entropy": 0.4560546875, "epoch": 0.144, "grad_norm": 0.09086798876523972, "kl": 0.0029506683349609375, "learning_rate": 1.6837835672960831e-07, "loss": 0.0001, "reward": 0.7395833358168602, "reward_std": 0.3677559196949005, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.5312500149011612, "step": 126 }, { "completion_length": 2771.3438415527344, "entropy": 0.41796875, "epoch": 0.14514285714285713, "grad_norm": 0.15809084475040436, "kl": 0.004436492919921875, "learning_rate": 1.6293288344708566e-07, "loss": 0.0002, "reward": 0.635416679084301, "reward_std": 0.49130160734057426, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.541666679084301, "step": 127 }, { "completion_length": 2852.291748046875, "entropy": 0.5322265625, "epoch": 0.1462857142857143, "grad_norm": 0.17982318997383118, "kl": 0.004756927490234375, "learning_rate": 1.5769701383645698e-07, "loss": 0.0002, "reward": 0.9166666967794299, "reward_std": 0.5036755502223969, "rewards/accuracy_reward": 0.3645833358168602, "rewards/format_reward": 0.5520833535119891, "step": 128 }, { "completion_length": 3367.9583740234375, "entropy": 0.49658203125, "epoch": 0.14742857142857144, "grad_norm": 0.1593668907880783, "kl": 0.00467681884765625, "learning_rate": 1.5267358321348285e-07, "loss": 0.0002, "reward": 0.4583333507180214, "reward_std": 0.4737073630094528, "rewards/accuracy_reward": 0.1666666753590107, "rewards/format_reward": 0.2916666679084301, "step": 129 }, { "completion_length": 2817.7709350585938, "entropy": 0.5, "epoch": 0.14857142857142858, "grad_norm": 0.16384749114513397, "kl": 0.00354766845703125, "learning_rate": 1.4786531185446452e-07, "loss": 0.0001, "reward": 0.479166679084301, "reward_std": 0.40873220562934875, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.4166666716337204, "step": 130 }, { "completion_length": 2720.8021850585938, "entropy": 0.49658203125, "epoch": 0.14971428571428572, "grad_norm": 0.24187295138835907, "kl": 0.004924774169921875, "learning_rate": 1.432748035231658e-07, "loss": 0.0002, "reward": 0.9062500298023224, "reward_std": 0.504379153251648, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.5312500223517418, "step": 131 }, { "completion_length": 2590.0521545410156, "entropy": 0.4248046875, "epoch": 0.15085714285714286, "grad_norm": 0.13521018624305725, "kl": 0.003265380859375, "learning_rate": 1.3890454406082956e-07, "loss": 0.0001, "reward": 0.833333358168602, "reward_std": 0.5318443104624748, "rewards/accuracy_reward": 0.2916666744276881, "rewards/format_reward": 0.541666679084301, "step": 132 }, { "completion_length": 3037.8854370117188, "entropy": 0.49072265625, "epoch": 0.152, "grad_norm": 0.17249611020088196, "kl": 0.004932403564453125, "learning_rate": 1.3475690004005097e-07, "loss": 0.0002, "reward": 0.5104166865348816, "reward_std": 0.2723224312067032, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.4062500149011612, "step": 133 }, { "completion_length": 2485.322998046875, "entropy": 0.5224609375, "epoch": 0.15314285714285714, "grad_norm": 0.16694706678390503, "kl": 0.00672149658203125, "learning_rate": 1.308341174832359e-07, "loss": 0.0003, "reward": 0.8750000149011612, "reward_std": 0.49518734961748123, "rewards/accuracy_reward": 0.2500000102445483, "rewards/format_reward": 0.6250000149011612, "step": 134 }, { "completion_length": 1746.3958740234375, "entropy": 0.373046875, "epoch": 0.15428571428571428, "grad_norm": 0.1858878880739212, "kl": 0.007049560546875, "learning_rate": 1.2713832064634125e-07, "loss": 0.0003, "reward": 1.1145833432674408, "reward_std": 0.3352552205324173, "rewards/accuracy_reward": 0.43750000558793545, "rewards/format_reward": 0.6770833432674408, "step": 135 }, { "completion_length": 2145.4584045410156, "entropy": 0.35986328125, "epoch": 0.15542857142857142, "grad_norm": 0.196150541305542, "kl": 0.00507354736328125, "learning_rate": 1.2367151086855187e-07, "loss": 0.0002, "reward": 0.9895833358168602, "reward_std": 0.6333772391080856, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.6770833507180214, "step": 136 }, { "completion_length": 2813.510467529297, "entropy": 0.387939453125, "epoch": 0.15657142857142858, "grad_norm": 0.13014180958271027, "kl": 0.00402069091796875, "learning_rate": 1.2043556548852063e-07, "loss": 0.0002, "reward": 0.677083358168602, "reward_std": 0.5024930611252785, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.5312500260770321, "step": 137 }, { "completion_length": 2061.0000610351562, "entropy": 0.35107421875, "epoch": 0.15771428571428572, "grad_norm": 0.10559725016355515, "kl": 0.0037689208984375, "learning_rate": 1.1743223682775649e-07, "loss": 0.0002, "reward": 0.9270833730697632, "reward_std": 0.30403000861406326, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.739583358168602, "step": 138 }, { "completion_length": 3106.291748046875, "entropy": 0.55615234375, "epoch": 0.15885714285714286, "grad_norm": 0.15882417559623718, "kl": 0.00519561767578125, "learning_rate": 1.1466315124171128e-07, "loss": 0.0002, "reward": 0.708333358168602, "reward_std": 0.5456142984330654, "rewards/accuracy_reward": 0.1666666753590107, "rewards/format_reward": 0.541666679084301, "step": 139 }, { "completion_length": 2395.885498046875, "entropy": 0.4853515625, "epoch": 0.16, "grad_norm": 0.2862064242362976, "kl": 0.006809234619140625, "learning_rate": 1.1212980823907929e-07, "loss": 0.0003, "reward": 0.7500000298023224, "reward_std": 0.38956041634082794, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.5833333507180214, "step": 140 }, { "completion_length": 1969.2292175292969, "entropy": 0.33935546875, "epoch": 0.16114285714285714, "grad_norm": 0.17285719513893127, "kl": 0.0047760009765625, "learning_rate": 1.0983357966978745e-07, "loss": 0.0002, "reward": 0.9895833730697632, "reward_std": 0.520443569868803, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.770833358168602, "step": 141 }, { "completion_length": 2584.9688110351562, "entropy": 0.447998046875, "epoch": 0.16228571428571428, "grad_norm": 0.13187262415885925, "kl": 0.0047740936279296875, "learning_rate": 1.0777570898211405e-07, "loss": 0.0002, "reward": 0.9166667014360428, "reward_std": 0.4435262605547905, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.6979166865348816, "step": 142 }, { "completion_length": 2300.7500610351562, "entropy": 0.4326171875, "epoch": 0.16342857142857142, "grad_norm": 0.25766721367836, "kl": 0.00977325439453125, "learning_rate": 1.0595731054933934e-07, "loss": 0.0004, "reward": 0.6875000074505806, "reward_std": 0.3443669453263283, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.6041666716337204, "step": 143 }, { "completion_length": 2849.4688110351562, "entropy": 0.45556640625, "epoch": 0.16457142857142856, "grad_norm": 0.14796483516693115, "kl": 0.00493621826171875, "learning_rate": 1.0437936906629334e-07, "loss": 0.0002, "reward": 0.677083358168602, "reward_std": 0.4717573896050453, "rewards/accuracy_reward": 0.2187500037252903, "rewards/format_reward": 0.4583333432674408, "step": 144 }, { "completion_length": 1885.4479522705078, "entropy": 0.353515625, "epoch": 0.1657142857142857, "grad_norm": 0.1617114096879959, "kl": 0.005008697509765625, "learning_rate": 1.0304273901612565e-07, "loss": 0.0002, "reward": 1.0208333730697632, "reward_std": 0.3080247640609741, "rewards/accuracy_reward": 0.3020833460614085, "rewards/format_reward": 0.7187500149011612, "step": 145 }, { "completion_length": 1947.0312805175781, "entropy": 0.376953125, "epoch": 0.16685714285714287, "grad_norm": 0.11680302768945694, "kl": 0.0033721923828125, "learning_rate": 1.0194814420758804e-07, "loss": 0.0001, "reward": 0.8750000149011612, "reward_std": 0.22604453563690186, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.8020833432674408, "step": 146 }, { "completion_length": 2215.7084045410156, "entropy": 0.392578125, "epoch": 0.168, "grad_norm": 0.21416479349136353, "kl": 0.00583648681640625, "learning_rate": 1.0109617738307911e-07, "loss": 0.0002, "reward": 0.8437500149011612, "reward_std": 0.5214347615838051, "rewards/accuracy_reward": 0.1770833395421505, "rewards/format_reward": 0.6666666865348816, "step": 147 }, { "completion_length": 1631.229248046875, "entropy": 0.302734375, "epoch": 0.16914285714285715, "grad_norm": 0.17106008529663086, "kl": 0.00476837158203125, "learning_rate": 1.0048729989766394e-07, "loss": 0.0002, "reward": 0.9895833730697632, "reward_std": 0.29220427572727203, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.8541666865348816, "step": 148 }, { "completion_length": 2407.8229370117188, "entropy": 0.3408203125, "epoch": 0.1702857142857143, "grad_norm": 0.15983973443508148, "kl": 0.008087158203125, "learning_rate": 1.0012184146924223e-07, "loss": 0.0003, "reward": 0.9270833432674408, "reward_std": 0.48707588016986847, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.6770833432674408, "step": 149 }, { "completion_length": 2178.729217529297, "entropy": 0.36962890625, "epoch": 0.17142857142857143, "grad_norm": 0.19081099331378937, "kl": 0.004444122314453125, "learning_rate": 1e-07, "loss": 0.0002, "reward": 1.0312500298023224, "reward_std": 0.5531396120786667, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.7395833432674408, "step": 150 }, { "epoch": 0.17142857142857143, "step": 150, "total_flos": 0.0, "train_loss": 0.00011944215420650531, "train_runtime": 12092.6435, "train_samples_per_second": 1.191, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }