|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9980456026058632, |
|
"eval_steps": 100, |
|
"global_step": 383, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 870.0854415893555, |
|
"epoch": 0.0026058631921824105, |
|
"grad_norm": 0.16709676384925842, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": -0.0813, |
|
"reward": 0.49043366871774197, |
|
"reward_std": 0.34182496182620525, |
|
"rewards/accuracy_reward": 0.30165815725922585, |
|
"rewards/semantic_entropy_math_reward": 0.18877551332116127, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 879.3475608825684, |
|
"epoch": 0.005211726384364821, |
|
"grad_norm": 0.22682946920394897, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": -0.0873, |
|
"reward": 0.4734876109287143, |
|
"reward_std": 0.31084196362644434, |
|
"rewards/accuracy_reward": 0.2780612204223871, |
|
"rewards/semantic_entropy_math_reward": 0.19542638212442398, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 943.9164390563965, |
|
"epoch": 0.007817589576547232, |
|
"grad_norm": 0.23663833737373352, |
|
"learning_rate": 1.5e-06, |
|
"loss": -0.0952, |
|
"reward": 0.4689322207123041, |
|
"reward_std": 0.31457919254899025, |
|
"rewards/accuracy_reward": 0.27678570710122585, |
|
"rewards/semantic_entropy_math_reward": 0.1921465010382235, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 878.9853134155273, |
|
"epoch": 0.010423452768729642, |
|
"grad_norm": 0.3010266125202179, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": -0.0915, |
|
"reward": 0.48915816470980644, |
|
"reward_std": 0.3204598156735301, |
|
"rewards/accuracy_reward": 0.29783162754029036, |
|
"rewards/semantic_entropy_math_reward": 0.1913265334442258, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 861.0484504699707, |
|
"epoch": 0.013029315960912053, |
|
"grad_norm": 0.1940070539712906, |
|
"learning_rate": 2.5e-06, |
|
"loss": -0.0721, |
|
"reward": 0.45945699140429497, |
|
"reward_std": 0.31874877866357565, |
|
"rewards/accuracy_reward": 0.2710459101945162, |
|
"rewards/semantic_entropy_math_reward": 0.18841107934713364, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 937.8188629150391, |
|
"epoch": 0.015635179153094463, |
|
"grad_norm": 0.06981602311134338, |
|
"learning_rate": 3e-06, |
|
"loss": -0.096, |
|
"reward": 0.5111151468008757, |
|
"reward_std": 0.32009167689830065, |
|
"rewards/accuracy_reward": 0.30102040339261293, |
|
"rewards/semantic_entropy_math_reward": 0.21009475644677877, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 891.3507537841797, |
|
"epoch": 0.018241042345276872, |
|
"grad_norm": 0.1923670619726181, |
|
"learning_rate": 3.5e-06, |
|
"loss": -0.0935, |
|
"reward": 0.5479227416217327, |
|
"reward_std": 0.324985328130424, |
|
"rewards/accuracy_reward": 0.34183673094958067, |
|
"rewards/semantic_entropy_math_reward": 0.2060860088095069, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 889.3501167297363, |
|
"epoch": 0.020846905537459284, |
|
"grad_norm": 0.04959714412689209, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": -0.1075, |
|
"reward": 0.5332543794065714, |
|
"reward_std": 0.31880190316587687, |
|
"rewards/accuracy_reward": 0.3227040730416775, |
|
"rewards/semantic_entropy_math_reward": 0.21055030450224876, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 916.3003616333008, |
|
"epoch": 0.023452768729641693, |
|
"grad_norm": 0.06921309232711792, |
|
"learning_rate": 4.5e-06, |
|
"loss": -0.0841, |
|
"reward": 0.5948433019220829, |
|
"reward_std": 0.2975190822035074, |
|
"rewards/accuracy_reward": 0.3590561132878065, |
|
"rewards/semantic_entropy_math_reward": 0.23578718956559896, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 943.690673828125, |
|
"epoch": 0.026058631921824105, |
|
"grad_norm": 0.0358135886490345, |
|
"learning_rate": 5e-06, |
|
"loss": -0.0922, |
|
"reward": 0.6337463427335024, |
|
"reward_std": 0.2962046544998884, |
|
"rewards/accuracy_reward": 0.38711734022945166, |
|
"rewards/semantic_entropy_math_reward": 0.24662901367992163, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 931.7365913391113, |
|
"epoch": 0.028664495114006514, |
|
"grad_norm": 0.041781261563301086, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": -0.0894, |
|
"reward": 0.6365706976503134, |
|
"reward_std": 0.2825650768354535, |
|
"rewards/accuracy_reward": 0.38647958263754845, |
|
"rewards/semantic_entropy_math_reward": 0.25009111408144236, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 911.2397766113281, |
|
"epoch": 0.031270358306188926, |
|
"grad_norm": 0.06622398644685745, |
|
"learning_rate": 6e-06, |
|
"loss": -0.1081, |
|
"reward": 0.7182033378630877, |
|
"reward_std": 0.28334413934499025, |
|
"rewards/accuracy_reward": 0.43239795323461294, |
|
"rewards/semantic_entropy_math_reward": 0.28580539766699076, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 905.0082626342773, |
|
"epoch": 0.033876221498371335, |
|
"grad_norm": 0.02759523130953312, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": -0.0973, |
|
"reward": 0.6452259533107281, |
|
"reward_std": 0.27447106316685677, |
|
"rewards/accuracy_reward": 0.38201529905200005, |
|
"rewards/semantic_entropy_math_reward": 0.26321063842624426, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 881.1122283935547, |
|
"epoch": 0.036482084690553744, |
|
"grad_norm": 0.02610129304230213, |
|
"learning_rate": 7e-06, |
|
"loss": -0.0942, |
|
"reward": 0.6983418371528387, |
|
"reward_std": 0.27343872655183077, |
|
"rewards/accuracy_reward": 0.4234693720936775, |
|
"rewards/semantic_entropy_math_reward": 0.2748724529519677, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 883.7735748291016, |
|
"epoch": 0.03908794788273615, |
|
"grad_norm": 0.017909087240695953, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": -0.0933, |
|
"reward": 0.6560677662491798, |
|
"reward_std": 0.25880660582333803, |
|
"rewards/accuracy_reward": 0.38265305012464523, |
|
"rewards/semantic_entropy_math_reward": 0.2734147273004055, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 899.4240875244141, |
|
"epoch": 0.04169381107491857, |
|
"grad_norm": 0.0163668654859066, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": -0.0898, |
|
"reward": 0.669825067743659, |
|
"reward_std": 0.2818459044210613, |
|
"rewards/accuracy_reward": 0.39732142351567745, |
|
"rewards/semantic_entropy_math_reward": 0.27250364422798157, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 898.7939796447754, |
|
"epoch": 0.04429967426710098, |
|
"grad_norm": 0.009694581851363182, |
|
"learning_rate": 8.5e-06, |
|
"loss": -0.0763, |
|
"reward": 0.6652696784585714, |
|
"reward_std": 0.22073616785928607, |
|
"rewards/accuracy_reward": 0.3781887646764517, |
|
"rewards/semantic_entropy_math_reward": 0.2870809203013778, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 878.1428489685059, |
|
"epoch": 0.046905537459283386, |
|
"grad_norm": 0.010209346190094948, |
|
"learning_rate": 9e-06, |
|
"loss": -0.0777, |
|
"reward": 0.7228498347103596, |
|
"reward_std": 0.24326845351606607, |
|
"rewards/accuracy_reward": 0.4311224389821291, |
|
"rewards/semantic_entropy_math_reward": 0.29172741062939167, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 896.1135025024414, |
|
"epoch": 0.049511400651465795, |
|
"grad_norm": 0.013440839014947414, |
|
"learning_rate": 9.5e-06, |
|
"loss": -0.0752, |
|
"reward": 0.6634475197643042, |
|
"reward_std": 0.1989616984501481, |
|
"rewards/accuracy_reward": 0.35905611515045166, |
|
"rewards/semantic_entropy_math_reward": 0.3043913906440139, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 839.2117195129395, |
|
"epoch": 0.05211726384364821, |
|
"grad_norm": 0.017011163756251335, |
|
"learning_rate": 1e-05, |
|
"loss": -0.0586, |
|
"reward": 0.7821610607206821, |
|
"reward_std": 0.21515017794445157, |
|
"rewards/accuracy_reward": 0.46938773803412914, |
|
"rewards/semantic_entropy_math_reward": 0.3127733152359724, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 887.4808578491211, |
|
"epoch": 0.05472312703583062, |
|
"grad_norm": 0.009101642295718193, |
|
"learning_rate": 9.999812749151968e-06, |
|
"loss": -0.0595, |
|
"reward": 0.6434037834405899, |
|
"reward_std": 0.1854215101338923, |
|
"rewards/accuracy_reward": 0.35331631638109684, |
|
"rewards/semantic_entropy_math_reward": 0.2900874586775899, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 842.1600532531738, |
|
"epoch": 0.05732899022801303, |
|
"grad_norm": 0.011061098426580429, |
|
"learning_rate": 9.99925101063302e-06, |
|
"loss": -0.0568, |
|
"reward": 0.722485426813364, |
|
"reward_std": 0.21386212622746825, |
|
"rewards/accuracy_reward": 0.40688774874433875, |
|
"rewards/semantic_entropy_math_reward": 0.31559765338897705, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 872.0918083190918, |
|
"epoch": 0.05993485342019544, |
|
"grad_norm": 0.009069804102182388, |
|
"learning_rate": 9.998314826517564e-06, |
|
"loss": -0.0593, |
|
"reward": 0.7820699661970139, |
|
"reward_std": 0.21369126765057445, |
|
"rewards/accuracy_reward": 0.4738520346581936, |
|
"rewards/semantic_entropy_math_reward": 0.30821793153882027, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 790.3061065673828, |
|
"epoch": 0.06254071661237785, |
|
"grad_norm": 0.007478907238692045, |
|
"learning_rate": 9.997004266926105e-06, |
|
"loss": -0.0458, |
|
"reward": 0.7919096164405346, |
|
"reward_std": 0.1941184471361339, |
|
"rewards/accuracy_reward": 0.45918366592377424, |
|
"rewards/semantic_entropy_math_reward": 0.3327259514480829, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 849.5184745788574, |
|
"epoch": 0.06514657980456026, |
|
"grad_norm": 0.012659430503845215, |
|
"learning_rate": 9.995319430020004e-06, |
|
"loss": -0.0484, |
|
"reward": 0.7839832194149494, |
|
"reward_std": 0.1770358441863209, |
|
"rewards/accuracy_reward": 0.46301019191741943, |
|
"rewards/semantic_entropy_math_reward": 0.32097303587943316, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 840.4846801757812, |
|
"epoch": 0.06775244299674267, |
|
"grad_norm": 0.010360866785049438, |
|
"learning_rate": 9.993260441994116e-06, |
|
"loss": -0.0613, |
|
"reward": 0.8100400976836681, |
|
"reward_std": 0.1907408689148724, |
|
"rewards/accuracy_reward": 0.484693868085742, |
|
"rewards/semantic_entropy_math_reward": 0.32534620352089405, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 820.5082778930664, |
|
"epoch": 0.07035830618892508, |
|
"grad_norm": 0.01019768975675106, |
|
"learning_rate": 9.990827457067342e-06, |
|
"loss": -0.0412, |
|
"reward": 0.8268039189279079, |
|
"reward_std": 0.15273668291047215, |
|
"rewards/accuracy_reward": 0.49362243339419365, |
|
"rewards/semantic_entropy_math_reward": 0.33318148739635944, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 836.6026649475098, |
|
"epoch": 0.07296416938110749, |
|
"grad_norm": 0.009926711209118366, |
|
"learning_rate": 9.988020657471078e-06, |
|
"loss": -0.0391, |
|
"reward": 0.728134099394083, |
|
"reward_std": 0.15909703564830124, |
|
"rewards/accuracy_reward": 0.40688774175941944, |
|
"rewards/semantic_entropy_math_reward": 0.32124634832143784, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 793.8048286437988, |
|
"epoch": 0.0755700325732899, |
|
"grad_norm": 0.020279383286833763, |
|
"learning_rate": 9.984840253435569e-06, |
|
"loss": -0.0405, |
|
"reward": 0.8198797274380922, |
|
"reward_std": 0.18700192403048277, |
|
"rewards/accuracy_reward": 0.4929846879094839, |
|
"rewards/semantic_entropy_math_reward": 0.32689503859728575, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 744.598201751709, |
|
"epoch": 0.0781758957654723, |
|
"grad_norm": 0.011854313313961029, |
|
"learning_rate": 9.98128648317415e-06, |
|
"loss": -0.0396, |
|
"reward": 0.850127536803484, |
|
"reward_std": 0.16003184486180544, |
|
"rewards/accuracy_reward": 0.48915815353393555, |
|
"rewards/semantic_entropy_math_reward": 0.3609693832695484, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 764.6517677307129, |
|
"epoch": 0.08078175895765473, |
|
"grad_norm": 0.007330858614295721, |
|
"learning_rate": 9.977359612865424e-06, |
|
"loss": -0.0272, |
|
"reward": 0.9031523056328297, |
|
"reward_std": 0.1402361944783479, |
|
"rewards/accuracy_reward": 0.5363520253449678, |
|
"rewards/semantic_entropy_math_reward": 0.366800295189023, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 734.7104434967041, |
|
"epoch": 0.08338762214983714, |
|
"grad_norm": 0.016964299604296684, |
|
"learning_rate": 9.973059936633308e-06, |
|
"loss": -0.0311, |
|
"reward": 0.8176020421087742, |
|
"reward_std": 0.1600553193129599, |
|
"rewards/accuracy_reward": 0.46619896963238716, |
|
"rewards/semantic_entropy_math_reward": 0.35140305012464523, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 766.8622283935547, |
|
"epoch": 0.08599348534201955, |
|
"grad_norm": 0.013955877162516117, |
|
"learning_rate": 9.968387776525009e-06, |
|
"loss": -0.0309, |
|
"reward": 0.8125910945236683, |
|
"reward_std": 0.1700099449371919, |
|
"rewards/accuracy_reward": 0.46874998696148396, |
|
"rewards/semantic_entropy_math_reward": 0.34384110383689404, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 828.6734504699707, |
|
"epoch": 0.08859934853420195, |
|
"grad_norm": 0.015905598178505898, |
|
"learning_rate": 9.963343482486907e-06, |
|
"loss": -0.0339, |
|
"reward": 0.7439868859946728, |
|
"reward_std": 0.1670152919832617, |
|
"rewards/accuracy_reward": 0.4119897894561291, |
|
"rewards/semantic_entropy_math_reward": 0.33199707977473736, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 804.4808464050293, |
|
"epoch": 0.09120521172638436, |
|
"grad_norm": 0.011662989854812622, |
|
"learning_rate": 9.957927432338332e-06, |
|
"loss": -0.0364, |
|
"reward": 0.7976494319736958, |
|
"reward_std": 0.16072418447583914, |
|
"rewards/accuracy_reward": 0.4547193758189678, |
|
"rewards/semantic_entropy_math_reward": 0.34293002262711525, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 816.8692359924316, |
|
"epoch": 0.09381107491856677, |
|
"grad_norm": 0.014741488732397556, |
|
"learning_rate": 9.952140031743282e-06, |
|
"loss": -0.0329, |
|
"reward": 0.8408345356583595, |
|
"reward_std": 0.15276295854710042, |
|
"rewards/accuracy_reward": 0.48915815353393555, |
|
"rewards/semantic_entropy_math_reward": 0.3516763783991337, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 757.6179695129395, |
|
"epoch": 0.09641693811074918, |
|
"grad_norm": 0.021201658993959427, |
|
"learning_rate": 9.945981714180021e-06, |
|
"loss": -0.0365, |
|
"reward": 0.8421100489795208, |
|
"reward_std": 0.1715679601766169, |
|
"rewards/accuracy_reward": 0.5038265194743872, |
|
"rewards/semantic_entropy_math_reward": 0.33828352577984333, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 770.365421295166, |
|
"epoch": 0.09902280130293159, |
|
"grad_norm": 0.03967665135860443, |
|
"learning_rate": 9.939452940908627e-06, |
|
"loss": -0.0316, |
|
"reward": 0.8511297404766083, |
|
"reward_std": 0.14679769705981016, |
|
"rewards/accuracy_reward": 0.498724477365613, |
|
"rewards/semantic_entropy_math_reward": 0.35240524634718895, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 753.9119720458984, |
|
"epoch": 0.10162866449511401, |
|
"grad_norm": 0.02615044265985489, |
|
"learning_rate": 9.932554200936428e-06, |
|
"loss": -0.0365, |
|
"reward": 0.8091289959847927, |
|
"reward_std": 0.17394380597397685, |
|
"rewards/accuracy_reward": 0.46173468325287104, |
|
"rewards/semantic_entropy_math_reward": 0.3473943155258894, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 727.7244834899902, |
|
"epoch": 0.10423452768729642, |
|
"grad_norm": 0.20424392819404602, |
|
"learning_rate": 9.925286010981394e-06, |
|
"loss": -0.0368, |
|
"reward": 0.8333636894822121, |
|
"reward_std": 0.1578838722780347, |
|
"rewards/accuracy_reward": 0.48214284889400005, |
|
"rewards/semantic_entropy_math_reward": 0.3512208443135023, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 773.675365447998, |
|
"epoch": 0.10684039087947883, |
|
"grad_norm": 0.004368220455944538, |
|
"learning_rate": 9.917648915433413e-06, |
|
"loss": -0.0389, |
|
"reward": 0.8713556751608849, |
|
"reward_std": 0.18799859285354614, |
|
"rewards/accuracy_reward": 0.5153061114251614, |
|
"rewards/semantic_entropy_math_reward": 0.35604955814778805, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 756.2442359924316, |
|
"epoch": 0.10944625407166124, |
|
"grad_norm": 0.004377900157123804, |
|
"learning_rate": 9.909643486313533e-06, |
|
"loss": -0.0419, |
|
"reward": 0.8909438773989677, |
|
"reward_std": 0.15308711864054203, |
|
"rewards/accuracy_reward": 0.5248724389821291, |
|
"rewards/semantic_entropy_math_reward": 0.36607141979038715, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 740.527400970459, |
|
"epoch": 0.11205211726384365, |
|
"grad_norm": 0.004278510343283415, |
|
"learning_rate": 9.901270323231114e-06, |
|
"loss": -0.0376, |
|
"reward": 0.8292638286948204, |
|
"reward_std": 0.16438758978620172, |
|
"rewards/accuracy_reward": 0.483418358489871, |
|
"rewards/semantic_entropy_math_reward": 0.3458454851061106, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 665.3762626647949, |
|
"epoch": 0.11465798045602606, |
|
"grad_norm": 0.004815321881324053, |
|
"learning_rate": 9.892530053338909e-06, |
|
"loss": -0.0286, |
|
"reward": 0.9122630879282951, |
|
"reward_std": 0.16950253094546497, |
|
"rewards/accuracy_reward": 0.5433673392981291, |
|
"rewards/semantic_entropy_math_reward": 0.36889576353132725, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 692.8494720458984, |
|
"epoch": 0.11726384364820847, |
|
"grad_norm": 0.010744983330368996, |
|
"learning_rate": 9.883423331286096e-06, |
|
"loss": -0.025, |
|
"reward": 0.875364426523447, |
|
"reward_std": 0.15541742404457182, |
|
"rewards/accuracy_reward": 0.5051020309329033, |
|
"rewards/semantic_entropy_math_reward": 0.370262386277318, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 702.8003749847412, |
|
"epoch": 0.11986970684039087, |
|
"grad_norm": 0.006386768072843552, |
|
"learning_rate": 9.873950839169248e-06, |
|
"loss": -0.0375, |
|
"reward": 0.8963192403316498, |
|
"reward_std": 0.17361515946686268, |
|
"rewards/accuracy_reward": 0.5344387628138065, |
|
"rewards/semantic_entropy_math_reward": 0.3618804607540369, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 747.270393371582, |
|
"epoch": 0.12247557003257328, |
|
"grad_norm": 0.006052403245121241, |
|
"learning_rate": 9.864113286481237e-06, |
|
"loss": -0.0302, |
|
"reward": 0.8596027493476868, |
|
"reward_std": 0.16451329877600074, |
|
"rewards/accuracy_reward": 0.5146683547645807, |
|
"rewards/semantic_entropy_math_reward": 0.3449344038963318, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 677.6504974365234, |
|
"epoch": 0.1250814332247557, |
|
"grad_norm": 0.010900772176682949, |
|
"learning_rate": 9.853911410058097e-06, |
|
"loss": -0.026, |
|
"reward": 0.8704445790499449, |
|
"reward_std": 0.14085088530555367, |
|
"rewards/accuracy_reward": 0.5165816182270646, |
|
"rewards/semantic_entropy_math_reward": 0.3538629673421383, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 713.0108261108398, |
|
"epoch": 0.1276872964169381, |
|
"grad_norm": 0.004999789409339428, |
|
"learning_rate": 9.843345974023833e-06, |
|
"loss": -0.0185, |
|
"reward": 0.8516763634979725, |
|
"reward_std": 0.14363743993453681, |
|
"rewards/accuracy_reward": 0.4885203968733549, |
|
"rewards/semantic_entropy_math_reward": 0.3631559703499079, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 696.6517677307129, |
|
"epoch": 0.13029315960912052, |
|
"grad_norm": 0.011122237890958786, |
|
"learning_rate": 9.832417769733185e-06, |
|
"loss": -0.0379, |
|
"reward": 0.8595116399228573, |
|
"reward_std": 0.16439938032999635, |
|
"rewards/accuracy_reward": 0.506377536803484, |
|
"rewards/semantic_entropy_math_reward": 0.35313410498201847, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 672.8233242034912, |
|
"epoch": 0.13289902280130292, |
|
"grad_norm": 0.006625417619943619, |
|
"learning_rate": 9.821127615712365e-06, |
|
"loss": -0.0212, |
|
"reward": 0.9153607748448849, |
|
"reward_std": 0.17063219111878425, |
|
"rewards/accuracy_reward": 0.5542091727256775, |
|
"rewards/semantic_entropy_math_reward": 0.3611515983939171, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 678.4897918701172, |
|
"epoch": 0.13550488599348534, |
|
"grad_norm": 0.006391481496393681, |
|
"learning_rate": 9.809476357597738e-06, |
|
"loss": -0.0301, |
|
"reward": 0.8757288567721844, |
|
"reward_std": 0.14586452580988407, |
|
"rewards/accuracy_reward": 0.5012754984200001, |
|
"rewards/semantic_entropy_math_reward": 0.37445334158837795, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 637.1996021270752, |
|
"epoch": 0.13811074918566776, |
|
"grad_norm": 0.004039392340928316, |
|
"learning_rate": 9.797464868072489e-06, |
|
"loss": -0.0229, |
|
"reward": 0.939504336565733, |
|
"reward_std": 0.12869648274499923, |
|
"rewards/accuracy_reward": 0.5529336631298065, |
|
"rewards/semantic_entropy_math_reward": 0.38657068461179733, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 669.5726890563965, |
|
"epoch": 0.14071661237785016, |
|
"grad_norm": 0.004615962039679289, |
|
"learning_rate": 9.785094046801256e-06, |
|
"loss": -0.0263, |
|
"reward": 0.9114431440830231, |
|
"reward_std": 0.14767074608244002, |
|
"rewards/accuracy_reward": 0.5293367225676775, |
|
"rewards/semantic_entropy_math_reward": 0.3821064028888941, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 694.8207740783691, |
|
"epoch": 0.14332247557003258, |
|
"grad_norm": 0.0039047168102115393, |
|
"learning_rate": 9.77236482036275e-06, |
|
"loss": -0.0294, |
|
"reward": 0.8075801469385624, |
|
"reward_std": 0.11974696815013885, |
|
"rewards/accuracy_reward": 0.4477040711790323, |
|
"rewards/semantic_entropy_math_reward": 0.35987609066069126, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 667.4215412139893, |
|
"epoch": 0.14592833876221498, |
|
"grad_norm": 0.0031674006022512913, |
|
"learning_rate": 9.759278142180348e-06, |
|
"loss": -0.034, |
|
"reward": 0.9129008483141661, |
|
"reward_std": 0.1429015859030187, |
|
"rewards/accuracy_reward": 0.5325254974886775, |
|
"rewards/semantic_entropy_math_reward": 0.3803753647953272, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 667.6166915893555, |
|
"epoch": 0.1485342019543974, |
|
"grad_norm": 0.00376137369312346, |
|
"learning_rate": 9.745834992450688e-06, |
|
"loss": -0.027, |
|
"reward": 0.8073068428784609, |
|
"reward_std": 0.1307279309257865, |
|
"rewards/accuracy_reward": 0.4521683547645807, |
|
"rewards/semantic_entropy_math_reward": 0.35513848531991243, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 629.4394016265869, |
|
"epoch": 0.1511400651465798, |
|
"grad_norm": 0.00376802496612072, |
|
"learning_rate": 9.732036378070243e-06, |
|
"loss": -0.0146, |
|
"reward": 0.8919460512697697, |
|
"reward_std": 0.14173119352199137, |
|
"rewards/accuracy_reward": 0.5223214142024517, |
|
"rewards/semantic_entropy_math_reward": 0.36962463706731796, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 679.9802112579346, |
|
"epoch": 0.15374592833876222, |
|
"grad_norm": 0.00410770159214735, |
|
"learning_rate": 9.717883332559911e-06, |
|
"loss": -0.0235, |
|
"reward": 0.8637937270104885, |
|
"reward_std": 0.13675629626959562, |
|
"rewards/accuracy_reward": 0.49298468604683876, |
|
"rewards/semantic_entropy_math_reward": 0.3708090353757143, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 613.1179676055908, |
|
"epoch": 0.1563517915309446, |
|
"grad_norm": 0.0037613979075104, |
|
"learning_rate": 9.703376915987601e-06, |
|
"loss": -0.0233, |
|
"reward": 0.8473032042384148, |
|
"reward_std": 0.1510353204794228, |
|
"rewards/accuracy_reward": 0.47130101174116135, |
|
"rewards/semantic_entropy_math_reward": 0.3760021850466728, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 659.0210304260254, |
|
"epoch": 0.15895765472312703, |
|
"grad_norm": 0.0041074734181165695, |
|
"learning_rate": 9.688518214888836e-06, |
|
"loss": -0.0397, |
|
"reward": 0.8512208238244057, |
|
"reward_std": 0.13073399825952947, |
|
"rewards/accuracy_reward": 0.4853316228836775, |
|
"rewards/semantic_entropy_math_reward": 0.3658892083913088, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 647.3169574737549, |
|
"epoch": 0.16156351791530946, |
|
"grad_norm": 0.004084247630089521, |
|
"learning_rate": 9.673308342185366e-06, |
|
"loss": -0.0256, |
|
"reward": 0.8939504288136959, |
|
"reward_std": 0.1518772984854877, |
|
"rewards/accuracy_reward": 0.5274234591051936, |
|
"rewards/semantic_entropy_math_reward": 0.36652695946395397, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 617.5650367736816, |
|
"epoch": 0.16416938110749185, |
|
"grad_norm": 0.0045938920229673386, |
|
"learning_rate": 9.657748437101819e-06, |
|
"loss": -0.0281, |
|
"reward": 0.9265670254826546, |
|
"reward_std": 0.14439171738922596, |
|
"rewards/accuracy_reward": 0.5573979429900646, |
|
"rewards/semantic_entropy_math_reward": 0.36916909366846085, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 662.421537399292, |
|
"epoch": 0.16677524429967427, |
|
"grad_norm": 0.004075649194419384, |
|
"learning_rate": 9.641839665080363e-06, |
|
"loss": -0.0301, |
|
"reward": 0.9174562618136406, |
|
"reward_std": 0.14708451775368303, |
|
"rewards/accuracy_reward": 0.5350765231996775, |
|
"rewards/semantic_entropy_math_reward": 0.3823797293007374, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 679.7850646972656, |
|
"epoch": 0.16938110749185667, |
|
"grad_norm": 0.003912299871444702, |
|
"learning_rate": 9.625583217693419e-06, |
|
"loss": -0.0303, |
|
"reward": 0.9068877436220646, |
|
"reward_std": 0.13392312987707555, |
|
"rewards/accuracy_reward": 0.5363520327955484, |
|
"rewards/semantic_entropy_math_reward": 0.37053569965064526, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 648.4598083496094, |
|
"epoch": 0.1719869706840391, |
|
"grad_norm": 0.00416301004588604, |
|
"learning_rate": 9.60898031255441e-06, |
|
"loss": -0.0225, |
|
"reward": 0.9016034826636314, |
|
"reward_std": 0.14559987792745233, |
|
"rewards/accuracy_reward": 0.5089285578578711, |
|
"rewards/semantic_entropy_math_reward": 0.3926749173551798, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 573.0261383056641, |
|
"epoch": 0.1745928338762215, |
|
"grad_norm": 0.006119817961007357, |
|
"learning_rate": 9.592032193226564e-06, |
|
"loss": -0.015, |
|
"reward": 0.9476129561662674, |
|
"reward_std": 0.12818793952465057, |
|
"rewards/accuracy_reward": 0.5656887628138065, |
|
"rewards/semantic_entropy_math_reward": 0.38192419335246086, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 651.288890838623, |
|
"epoch": 0.1771986970684039, |
|
"grad_norm": 0.00714410375803709, |
|
"learning_rate": 9.574740129129767e-06, |
|
"loss": -0.0264, |
|
"reward": 0.8074890486896038, |
|
"reward_std": 0.12236620881594718, |
|
"rewards/accuracy_reward": 0.4457908058539033, |
|
"rewards/semantic_entropy_math_reward": 0.36169825680553913, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 643.5050830841064, |
|
"epoch": 0.17980456026058633, |
|
"grad_norm": 0.006653693970292807, |
|
"learning_rate": 9.557105415445485e-06, |
|
"loss": -0.0202, |
|
"reward": 0.9240160100162029, |
|
"reward_std": 0.1365425349213183, |
|
"rewards/accuracy_reward": 0.5446428470313549, |
|
"rewards/semantic_entropy_math_reward": 0.3793731667101383, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 582.908145904541, |
|
"epoch": 0.18241042345276873, |
|
"grad_norm": 0.01532233040779829, |
|
"learning_rate": 9.539129373019755e-06, |
|
"loss": -0.018, |
|
"reward": 0.937226664274931, |
|
"reward_std": 0.13966482668183744, |
|
"rewards/accuracy_reward": 0.5548469256609678, |
|
"rewards/semantic_entropy_math_reward": 0.382379736751318, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 583.0146541595459, |
|
"epoch": 0.18501628664495115, |
|
"grad_norm": 0.004541024565696716, |
|
"learning_rate": 9.520813348264252e-06, |
|
"loss": -0.0259, |
|
"reward": 0.872904509305954, |
|
"reward_std": 0.10791157651692629, |
|
"rewards/accuracy_reward": 0.47704081051051617, |
|
"rewards/semantic_entropy_math_reward": 0.3958636950701475, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 658.2251129150391, |
|
"epoch": 0.18762214983713354, |
|
"grad_norm": 0.00619466882199049, |
|
"learning_rate": 9.502158713055444e-06, |
|
"loss": -0.0395, |
|
"reward": 0.8880284130573273, |
|
"reward_std": 0.1113471242133528, |
|
"rewards/accuracy_reward": 0.5089285634458065, |
|
"rewards/semantic_entropy_math_reward": 0.37909984961152077, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 624.5025367736816, |
|
"epoch": 0.19022801302931597, |
|
"grad_norm": 0.0047431401908397675, |
|
"learning_rate": 9.483166864631837e-06, |
|
"loss": -0.0331, |
|
"reward": 0.8842018693685532, |
|
"reward_std": 0.10373708885163069, |
|
"rewards/accuracy_reward": 0.5019132532179356, |
|
"rewards/semantic_entropy_math_reward": 0.38228863291442394, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 589.4891395568848, |
|
"epoch": 0.19283387622149836, |
|
"grad_norm": 0.004772627260535955, |
|
"learning_rate": 9.46383922548932e-06, |
|
"loss": -0.0249, |
|
"reward": 0.9389577209949493, |
|
"reward_std": 0.11472503608092666, |
|
"rewards/accuracy_reward": 0.545280609279871, |
|
"rewards/semantic_entropy_math_reward": 0.39367711916565895, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 620.705982208252, |
|
"epoch": 0.19543973941368079, |
|
"grad_norm": 0.0062498371116817, |
|
"learning_rate": 9.444177243274619e-06, |
|
"loss": -0.0257, |
|
"reward": 0.9043367132544518, |
|
"reward_std": 0.14091078890487552, |
|
"rewards/accuracy_reward": 0.5248724389821291, |
|
"rewards/semantic_entropy_math_reward": 0.37946427799761295, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 648.9623603820801, |
|
"epoch": 0.19804560260586318, |
|
"grad_norm": 0.004575234837830067, |
|
"learning_rate": 9.424182390676872e-06, |
|
"loss": -0.0282, |
|
"reward": 0.8821063973009586, |
|
"reward_std": 0.11956909950822592, |
|
"rewards/accuracy_reward": 0.5012755002826452, |
|
"rewards/semantic_entropy_math_reward": 0.3808308970183134, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 591.5235843658447, |
|
"epoch": 0.2006514657980456, |
|
"grad_norm": 0.005398905370384455, |
|
"learning_rate": 9.403856165317322e-06, |
|
"loss": -0.0297, |
|
"reward": 0.949161771684885, |
|
"reward_std": 0.12031427887268364, |
|
"rewards/accuracy_reward": 0.5548469219356775, |
|
"rewards/semantic_entropy_math_reward": 0.3943148721009493, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 634.9636363983154, |
|
"epoch": 0.20325732899022803, |
|
"grad_norm": 0.0076393503695726395, |
|
"learning_rate": 9.383200089637143e-06, |
|
"loss": -0.0267, |
|
"reward": 0.8719934187829494, |
|
"reward_std": 0.12535155925434083, |
|
"rewards/accuracy_reward": 0.49489794485270977, |
|
"rewards/semantic_entropy_math_reward": 0.3770954702049494, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 594.6001148223877, |
|
"epoch": 0.20586319218241042, |
|
"grad_norm": 0.0050405478104949, |
|
"learning_rate": 9.362215710783411e-06, |
|
"loss": -0.0178, |
|
"reward": 0.9342201091349125, |
|
"reward_std": 0.1078009803313762, |
|
"rewards/accuracy_reward": 0.5452806055545807, |
|
"rewards/semantic_entropy_math_reward": 0.38893949054181576, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 616.5573806762695, |
|
"epoch": 0.20846905537459284, |
|
"grad_norm": 0.00523959519341588, |
|
"learning_rate": 9.34090460049322e-06, |
|
"loss": -0.0283, |
|
"reward": 0.9657434336841106, |
|
"reward_std": 0.1119958006311208, |
|
"rewards/accuracy_reward": 0.5758928414434195, |
|
"rewards/semantic_entropy_math_reward": 0.38985058665275574, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 597.3839282989502, |
|
"epoch": 0.21107491856677524, |
|
"grad_norm": 0.007267440669238567, |
|
"learning_rate": 9.319268354975958e-06, |
|
"loss": -0.0246, |
|
"reward": 0.9277514442801476, |
|
"reward_std": 0.1274285425315611, |
|
"rewards/accuracy_reward": 0.5382652953267097, |
|
"rewards/semantic_entropy_math_reward": 0.38948614709079266, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 635.056755065918, |
|
"epoch": 0.21368078175895766, |
|
"grad_norm": 0.009285375475883484, |
|
"learning_rate": 9.297308594793757e-06, |
|
"loss": -0.0302, |
|
"reward": 0.9357689209282398, |
|
"reward_std": 0.11352400272153318, |
|
"rewards/accuracy_reward": 0.5414540711790323, |
|
"rewards/semantic_entropy_math_reward": 0.394314868375659, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 599.1862125396729, |
|
"epoch": 0.21628664495114006, |
|
"grad_norm": 0.007068796548992395, |
|
"learning_rate": 9.275026964740101e-06, |
|
"loss": -0.0221, |
|
"reward": 0.8811042085289955, |
|
"reward_std": 0.14417032268829644, |
|
"rewards/accuracy_reward": 0.4999999925494194, |
|
"rewards/semantic_entropy_math_reward": 0.3811042346060276, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 617.5911846160889, |
|
"epoch": 0.21889250814332248, |
|
"grad_norm": 0.006198438350111246, |
|
"learning_rate": 9.252425133716639e-06, |
|
"loss": -0.0388, |
|
"reward": 0.8672558218240738, |
|
"reward_std": 0.12377029308117926, |
|
"rewards/accuracy_reward": 0.4948979541659355, |
|
"rewards/semantic_entropy_math_reward": 0.3723578620702028, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 561.2780570983887, |
|
"epoch": 0.22149837133550487, |
|
"grad_norm": 0.006817261688411236, |
|
"learning_rate": 9.229504794608182e-06, |
|
"loss": -0.0129, |
|
"reward": 0.9309402145445347, |
|
"reward_std": 0.1257460000924766, |
|
"rewards/accuracy_reward": 0.5331632606685162, |
|
"rewards/semantic_entropy_math_reward": 0.3977769538760185, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 547.4725685119629, |
|
"epoch": 0.2241042345276873, |
|
"grad_norm": 0.005772161763161421, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": -0.0404, |
|
"reward": 0.9456996843218803, |
|
"reward_std": 0.1207300984824542, |
|
"rewards/accuracy_reward": 0.5529336631298065, |
|
"rewards/semantic_entropy_math_reward": 0.3927660472691059, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 583.5822620391846, |
|
"epoch": 0.22671009771986972, |
|
"grad_norm": 0.020753948017954826, |
|
"learning_rate": 9.182715482828764e-06, |
|
"loss": -0.0265, |
|
"reward": 0.9639212638139725, |
|
"reward_std": 0.09742264985106885, |
|
"rewards/accuracy_reward": 0.570790808647871, |
|
"rewards/semantic_entropy_math_reward": 0.39313045144081116, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 584.519121170044, |
|
"epoch": 0.2293159609120521, |
|
"grad_norm": 0.007225371431559324, |
|
"learning_rate": 9.158850014693123e-06, |
|
"loss": -0.0268, |
|
"reward": 0.922284971922636, |
|
"reward_std": 0.1539946385892108, |
|
"rewards/accuracy_reward": 0.5414540749043226, |
|
"rewards/semantic_entropy_math_reward": 0.3808309081941843, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 588.8143997192383, |
|
"epoch": 0.23192182410423454, |
|
"grad_norm": 0.006209821905940771, |
|
"learning_rate": 9.134673047280644e-06, |
|
"loss": -0.0311, |
|
"reward": 0.8960458897054195, |
|
"reward_std": 0.09522696980275214, |
|
"rewards/accuracy_reward": 0.5153061179444194, |
|
"rewards/semantic_entropy_math_reward": 0.3807397987693548, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 563.0012645721436, |
|
"epoch": 0.23452768729641693, |
|
"grad_norm": 0.007620004937052727, |
|
"learning_rate": 9.110186391454389e-06, |
|
"loss": -0.0224, |
|
"reward": 0.9369533210992813, |
|
"reward_std": 0.10029464948456734, |
|
"rewards/accuracy_reward": 0.5427295845001936, |
|
"rewards/semantic_entropy_math_reward": 0.39422375708818436, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 562.1103248596191, |
|
"epoch": 0.23713355048859935, |
|
"grad_norm": 0.008318237029016018, |
|
"learning_rate": 9.085391881273182e-06, |
|
"loss": -0.0276, |
|
"reward": 0.9031523168087006, |
|
"reward_std": 0.11616136459633708, |
|
"rewards/accuracy_reward": 0.5197704005986452, |
|
"rewards/semantic_entropy_math_reward": 0.3833819255232811, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 566.0152969360352, |
|
"epoch": 0.23973941368078175, |
|
"grad_norm": 0.013799573294818401, |
|
"learning_rate": 9.060291373854252e-06, |
|
"loss": -0.0309, |
|
"reward": 0.9221938513219357, |
|
"reward_std": 0.1298237289302051, |
|
"rewards/accuracy_reward": 0.5471938699483871, |
|
"rewards/semantic_entropy_math_reward": 0.3749999888241291, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 478.0140209197998, |
|
"epoch": 0.24234527687296417, |
|
"grad_norm": 0.010932566598057747, |
|
"learning_rate": 9.034886749234112e-06, |
|
"loss": -0.0229, |
|
"reward": 0.9631923995912075, |
|
"reward_std": 0.08580980636179447, |
|
"rewards/accuracy_reward": 0.5599489714950323, |
|
"rewards/semantic_entropy_math_reward": 0.4032434243708849, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 528.445779800415, |
|
"epoch": 0.24495114006514657, |
|
"grad_norm": 0.03433404862880707, |
|
"learning_rate": 9.009179910227767e-06, |
|
"loss": -0.0259, |
|
"reward": 0.8987791389226913, |
|
"reward_std": 0.1273018317297101, |
|
"rewards/accuracy_reward": 0.5191326383501291, |
|
"rewards/semantic_entropy_math_reward": 0.3796464875340462, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 566.7614727020264, |
|
"epoch": 0.247557003257329, |
|
"grad_norm": 0.00829783733934164, |
|
"learning_rate": 8.98317278228618e-06, |
|
"loss": -0.0304, |
|
"reward": 0.8782798796892166, |
|
"reward_std": 0.13542052335105836, |
|
"rewards/accuracy_reward": 0.4936224343255162, |
|
"rewards/semantic_entropy_math_reward": 0.3846574295312166, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 506.3705234527588, |
|
"epoch": 0.2501628664495114, |
|
"grad_norm": 0.00881106685847044, |
|
"learning_rate": 8.956867313352055e-06, |
|
"loss": -0.0221, |
|
"reward": 0.9182762429118156, |
|
"reward_std": 0.08380106370896101, |
|
"rewards/accuracy_reward": 0.5191326439380646, |
|
"rewards/semantic_entropy_math_reward": 0.399143585935235, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 570.8501167297363, |
|
"epoch": 0.25276872964169383, |
|
"grad_norm": 0.007192031946033239, |
|
"learning_rate": 8.930265473713939e-06, |
|
"loss": -0.0305, |
|
"reward": 0.9258381798863411, |
|
"reward_std": 0.12165293795987964, |
|
"rewards/accuracy_reward": 0.5440050940960646, |
|
"rewards/semantic_entropy_math_reward": 0.3818330932408571, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 508.7321243286133, |
|
"epoch": 0.2553745928338762, |
|
"grad_norm": 0.007854027673602104, |
|
"learning_rate": 8.90336925585864e-06, |
|
"loss": -0.0173, |
|
"reward": 0.9938957393169403, |
|
"reward_std": 0.10928461601724848, |
|
"rewards/accuracy_reward": 0.5911989696323872, |
|
"rewards/semantic_entropy_math_reward": 0.40269677340984344, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 524.1619777679443, |
|
"epoch": 0.2579804560260586, |
|
"grad_norm": 0.01574082300066948, |
|
"learning_rate": 8.876180674322006e-06, |
|
"loss": -0.024, |
|
"reward": 0.941326517611742, |
|
"reward_std": 0.1133777701179497, |
|
"rewards/accuracy_reward": 0.5459183529019356, |
|
"rewards/semantic_entropy_math_reward": 0.39540815725922585, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 512.2608318328857, |
|
"epoch": 0.26058631921824105, |
|
"grad_norm": 0.00712746474891901, |
|
"learning_rate": 8.84870176553801e-06, |
|
"loss": -0.0223, |
|
"reward": 0.9894314855337143, |
|
"reward_std": 0.10396600887179375, |
|
"rewards/accuracy_reward": 0.5778061039745808, |
|
"rewards/semantic_entropy_math_reward": 0.4116253647953272, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 576.9661922454834, |
|
"epoch": 0.26319218241042347, |
|
"grad_norm": 0.007007178850471973, |
|
"learning_rate": 8.820934587686247e-06, |
|
"loss": -0.0296, |
|
"reward": 0.8965925425291061, |
|
"reward_std": 0.12761994334869087, |
|
"rewards/accuracy_reward": 0.49936224333941936, |
|
"rewards/semantic_entropy_math_reward": 0.39723031409084797, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 513.0254955291748, |
|
"epoch": 0.26579804560260584, |
|
"grad_norm": 0.007423168048262596, |
|
"learning_rate": 8.792881220537752e-06, |
|
"loss": -0.0214, |
|
"reward": 0.9517128206789494, |
|
"reward_std": 0.08766992692835629, |
|
"rewards/accuracy_reward": 0.5465561114251614, |
|
"rewards/semantic_entropy_math_reward": 0.4051567129790783, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 548.6473083496094, |
|
"epoch": 0.26840390879478826, |
|
"grad_norm": 0.008394024334847927, |
|
"learning_rate": 8.764543765299245e-06, |
|
"loss": -0.0183, |
|
"reward": 0.9393221363425255, |
|
"reward_std": 0.1155447622295469, |
|
"rewards/accuracy_reward": 0.541454067453742, |
|
"rewards/semantic_entropy_math_reward": 0.39786806143820286, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 543.691312789917, |
|
"epoch": 0.2710097719869707, |
|
"grad_norm": 0.006529523991048336, |
|
"learning_rate": 8.735924344455732e-06, |
|
"loss": -0.0215, |
|
"reward": 0.9579081423580647, |
|
"reward_std": 0.11347127868793905, |
|
"rewards/accuracy_reward": 0.5554846785962582, |
|
"rewards/semantic_entropy_math_reward": 0.40242345817387104, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 507.02295112609863, |
|
"epoch": 0.2736156351791531, |
|
"grad_norm": 0.007422362454235554, |
|
"learning_rate": 8.707025101611546e-06, |
|
"loss": -0.0244, |
|
"reward": 0.9213738925755024, |
|
"reward_std": 0.10976181924343109, |
|
"rewards/accuracy_reward": 0.5172193758189678, |
|
"rewards/semantic_entropy_math_reward": 0.40415450744330883, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 452.20470428466797, |
|
"epoch": 0.2762214983713355, |
|
"grad_norm": 0.007135653402656317, |
|
"learning_rate": 8.677848201329775e-06, |
|
"loss": -0.0076, |
|
"reward": 0.9969934076070786, |
|
"reward_std": 0.08652752835769206, |
|
"rewards/accuracy_reward": 0.5752550922334194, |
|
"rewards/semantic_entropy_math_reward": 0.4217383284121752, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 510.8156795501709, |
|
"epoch": 0.2788273615635179, |
|
"grad_norm": 0.00994784850627184, |
|
"learning_rate": 8.64839582897015e-06, |
|
"loss": -0.021, |
|
"reward": 0.8670735992491245, |
|
"reward_std": 0.11074995016679168, |
|
"rewards/accuracy_reward": 0.4942601975053549, |
|
"rewards/semantic_entropy_math_reward": 0.3728134222328663, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 523.8329010009766, |
|
"epoch": 0.2814332247557003, |
|
"grad_norm": 0.00812004879117012, |
|
"learning_rate": 8.61867019052535e-06, |
|
"loss": -0.0254, |
|
"reward": 0.9470662958920002, |
|
"reward_std": 0.10799844446592033, |
|
"rewards/accuracy_reward": 0.5350765231996775, |
|
"rewards/semantic_entropy_math_reward": 0.4119897708296776, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 484.8297176361084, |
|
"epoch": 0.28403908794788274, |
|
"grad_norm": 0.00825856626033783, |
|
"learning_rate": 8.588673512455781e-06, |
|
"loss": -0.0122, |
|
"reward": 0.9991800077259541, |
|
"reward_std": 0.10875873418990523, |
|
"rewards/accuracy_reward": 0.5924744792282581, |
|
"rewards/semantic_entropy_math_reward": 0.40670554153621197, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 465.98851013183594, |
|
"epoch": 0.28664495114006516, |
|
"grad_norm": 0.011785555630922318, |
|
"learning_rate": 8.558408041522801e-06, |
|
"loss": -0.011, |
|
"reward": 0.9412353932857513, |
|
"reward_std": 0.11806983663700521, |
|
"rewards/accuracy_reward": 0.5382652934640646, |
|
"rewards/semantic_entropy_math_reward": 0.4029701240360737, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 464.45534896850586, |
|
"epoch": 0.28925081433224753, |
|
"grad_norm": 0.008536071516573429, |
|
"learning_rate": 8.527876044620453e-06, |
|
"loss": -0.011, |
|
"reward": 0.9099853932857513, |
|
"reward_std": 0.12709335889667273, |
|
"rewards/accuracy_reward": 0.5216836612671614, |
|
"rewards/semantic_entropy_math_reward": 0.3883017376065254, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 508.1205291748047, |
|
"epoch": 0.29185667752442995, |
|
"grad_norm": 0.006830580998212099, |
|
"learning_rate": 8.497079808605659e-06, |
|
"loss": -0.0125, |
|
"reward": 0.9495262391865253, |
|
"reward_std": 0.08507522073341534, |
|
"rewards/accuracy_reward": 0.5389030510559678, |
|
"rewards/semantic_entropy_math_reward": 0.41062317602336407, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 468.2008819580078, |
|
"epoch": 0.2944625407166124, |
|
"grad_norm": 0.008947983384132385, |
|
"learning_rate": 8.466021640126946e-06, |
|
"loss": -0.0144, |
|
"reward": 0.9574526138603687, |
|
"reward_std": 0.0999740477418527, |
|
"rewards/accuracy_reward": 0.5503826402127743, |
|
"rewards/semantic_entropy_math_reward": 0.4070699568837881, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 535.3322677612305, |
|
"epoch": 0.2970684039087948, |
|
"grad_norm": 0.011211535893380642, |
|
"learning_rate": 8.434703865451666e-06, |
|
"loss": -0.0204, |
|
"reward": 0.8916727267205715, |
|
"reward_std": 0.13218142627738416, |
|
"rewards/accuracy_reward": 0.5057397801429033, |
|
"rewards/semantic_entropy_math_reward": 0.385932931676507, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 543.2493572235107, |
|
"epoch": 0.2996742671009772, |
|
"grad_norm": 0.010953478515148163, |
|
"learning_rate": 8.403128830291767e-06, |
|
"loss": -0.017, |
|
"reward": 0.8920371569693089, |
|
"reward_std": 0.10963007633108646, |
|
"rewards/accuracy_reward": 0.5076530482620001, |
|
"rewards/semantic_entropy_math_reward": 0.3843841031193733, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 527.7404232025146, |
|
"epoch": 0.3022801302931596, |
|
"grad_norm": 0.012620029971003532, |
|
"learning_rate": 8.371298899628091e-06, |
|
"loss": -0.024, |
|
"reward": 0.9088921137154102, |
|
"reward_std": 0.12522040773183107, |
|
"rewards/accuracy_reward": 0.5242346785962582, |
|
"rewards/semantic_entropy_math_reward": 0.3846574239432812, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 519.883918762207, |
|
"epoch": 0.304885993485342, |
|
"grad_norm": 0.013618368655443192, |
|
"learning_rate": 8.339216457533244e-06, |
|
"loss": -0.0187, |
|
"reward": 0.919278409332037, |
|
"reward_std": 0.1413876386359334, |
|
"rewards/accuracy_reward": 0.5280612204223871, |
|
"rewards/semantic_entropy_math_reward": 0.39121718890964985, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 569.508279800415, |
|
"epoch": 0.30749185667752443, |
|
"grad_norm": 0.009268795140087605, |
|
"learning_rate": 8.306883906993022e-06, |
|
"loss": -0.0223, |
|
"reward": 0.8455721400678158, |
|
"reward_std": 0.14180739130824804, |
|
"rewards/accuracy_reward": 0.4693877473473549, |
|
"rewards/semantic_entropy_math_reward": 0.3761843889951706, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 548.354585647583, |
|
"epoch": 0.31009771986970686, |
|
"grad_norm": 0.008740663528442383, |
|
"learning_rate": 8.274303669726427e-06, |
|
"loss": -0.0185, |
|
"reward": 0.904518935829401, |
|
"reward_std": 0.11498924950137734, |
|
"rewards/accuracy_reward": 0.5191326458007097, |
|
"rewards/semantic_entropy_math_reward": 0.3853862937539816, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 504.8743495941162, |
|
"epoch": 0.3127035830618892, |
|
"grad_norm": 0.008732241578400135, |
|
"learning_rate": 8.24147818600428e-06, |
|
"loss": -0.0162, |
|
"reward": 0.9195517338812351, |
|
"reward_std": 0.10652195988222957, |
|
"rewards/accuracy_reward": 0.5184948872774839, |
|
"rewards/semantic_entropy_math_reward": 0.4010568540543318, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 552.7168216705322, |
|
"epoch": 0.31530944625407165, |
|
"grad_norm": 0.00948347244411707, |
|
"learning_rate": 8.20840991446645e-06, |
|
"loss": -0.0168, |
|
"reward": 0.8840196654200554, |
|
"reward_std": 0.11878083180636168, |
|
"rewards/accuracy_reward": 0.498724477365613, |
|
"rewards/semantic_entropy_math_reward": 0.3852951806038618, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 536.5752468109131, |
|
"epoch": 0.31791530944625407, |
|
"grad_norm": 0.010393758304417133, |
|
"learning_rate": 8.175101331937692e-06, |
|
"loss": -0.0261, |
|
"reward": 0.9547193609178066, |
|
"reward_std": 0.1325584959704429, |
|
"rewards/accuracy_reward": 0.5618622377514839, |
|
"rewards/semantic_entropy_math_reward": 0.3928571343421936, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 514.7365875244141, |
|
"epoch": 0.3205211726384365, |
|
"grad_norm": 0.011021023616194725, |
|
"learning_rate": 8.141554933242135e-06, |
|
"loss": -0.0215, |
|
"reward": 0.839012373238802, |
|
"reward_std": 0.11658453289419413, |
|
"rewards/accuracy_reward": 0.4521683529019356, |
|
"rewards/semantic_entropy_math_reward": 0.3868440166115761, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 546.1747303009033, |
|
"epoch": 0.3231270358306189, |
|
"grad_norm": 0.012457598000764847, |
|
"learning_rate": 8.10777323101642e-06, |
|
"loss": -0.0149, |
|
"reward": 0.8359147049486637, |
|
"reward_std": 0.13787020510062575, |
|
"rewards/accuracy_reward": 0.4540816228836775, |
|
"rewards/semantic_entropy_math_reward": 0.38183307833969593, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 540.0956497192383, |
|
"epoch": 0.3257328990228013, |
|
"grad_norm": 0.00853867456316948, |
|
"learning_rate": 8.073758755521506e-06, |
|
"loss": -0.0105, |
|
"reward": 0.8753644078969955, |
|
"reward_std": 0.10535615705884993, |
|
"rewards/accuracy_reward": 0.4891581516712904, |
|
"rewards/semantic_entropy_math_reward": 0.3862062580883503, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 528.5184841156006, |
|
"epoch": 0.3283387622149837, |
|
"grad_norm": 0.010948549024760723, |
|
"learning_rate": 8.03951405445314e-06, |
|
"loss": -0.0143, |
|
"reward": 0.9124453142285347, |
|
"reward_std": 0.12273865600582212, |
|
"rewards/accuracy_reward": 0.5114795807749033, |
|
"rewards/semantic_entropy_math_reward": 0.40096573159098625, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 544.5784301757812, |
|
"epoch": 0.3309446254071661, |
|
"grad_norm": 0.013710816390812397, |
|
"learning_rate": 8.005041692751055e-06, |
|
"loss": -0.0167, |
|
"reward": 0.9290269501507282, |
|
"reward_std": 0.13117512222379446, |
|
"rewards/accuracy_reward": 0.5286989714950323, |
|
"rewards/semantic_entropy_math_reward": 0.4003279786556959, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 577.0956478118896, |
|
"epoch": 0.33355048859934855, |
|
"grad_norm": 0.03651218116283417, |
|
"learning_rate": 7.970344252406832e-06, |
|
"loss": -0.0183, |
|
"reward": 0.9039722643792629, |
|
"reward_std": 0.1263378494186327, |
|
"rewards/accuracy_reward": 0.5178571362048388, |
|
"rewards/semantic_entropy_math_reward": 0.38611516170203686, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 541.4247360229492, |
|
"epoch": 0.33615635179153097, |
|
"grad_norm": 0.013164433650672436, |
|
"learning_rate": 7.935424332270523e-06, |
|
"loss": -0.0233, |
|
"reward": 0.9340378865599632, |
|
"reward_std": 0.1010712229181081, |
|
"rewards/accuracy_reward": 0.5306122377514839, |
|
"rewards/semantic_entropy_math_reward": 0.4034256376326084, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 581.0784339904785, |
|
"epoch": 0.33876221498371334, |
|
"grad_norm": 0.012624763883650303, |
|
"learning_rate": 7.900284547855992e-06, |
|
"loss": -0.0054, |
|
"reward": 0.8548651225864887, |
|
"reward_std": 0.11518756812438369, |
|
"rewards/accuracy_reward": 0.467474477365613, |
|
"rewards/semantic_entropy_math_reward": 0.3873906545341015, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 534.0159378051758, |
|
"epoch": 0.34136807817589576, |
|
"grad_norm": 0.011506155133247375, |
|
"learning_rate": 7.864927531145012e-06, |
|
"loss": -0.0124, |
|
"reward": 0.9199161604046822, |
|
"reward_std": 0.10405887564411387, |
|
"rewards/accuracy_reward": 0.5204081516712904, |
|
"rewards/semantic_entropy_math_reward": 0.39950801618397236, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 548.1957817077637, |
|
"epoch": 0.3439739413680782, |
|
"grad_norm": 0.013849125243723392, |
|
"learning_rate": 7.829355930390126e-06, |
|
"loss": -0.0222, |
|
"reward": 0.8961370177567005, |
|
"reward_std": 0.11209307186072692, |
|
"rewards/accuracy_reward": 0.487882643006742, |
|
"rewards/semantic_entropy_math_reward": 0.4082543719559908, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 533.2085285186768, |
|
"epoch": 0.3465798045602606, |
|
"grad_norm": 0.007529080845415592, |
|
"learning_rate": 7.7935724099163e-06, |
|
"loss": -0.011, |
|
"reward": 0.9659256264567375, |
|
"reward_std": 0.10825655586086214, |
|
"rewards/accuracy_reward": 0.5688775414600968, |
|
"rewards/semantic_entropy_math_reward": 0.3970481026917696, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 530.067590713501, |
|
"epoch": 0.349185667752443, |
|
"grad_norm": 0.013162490911781788, |
|
"learning_rate": 7.757579649921354e-06, |
|
"loss": -0.0155, |
|
"reward": 0.9479773864150047, |
|
"reward_std": 0.14899065819918178, |
|
"rewards/accuracy_reward": 0.5433673392981291, |
|
"rewards/semantic_entropy_math_reward": 0.4046100489795208, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 564.3252391815186, |
|
"epoch": 0.3517915309446254, |
|
"grad_norm": 0.014966626651585102, |
|
"learning_rate": 7.721380346275221e-06, |
|
"loss": -0.0217, |
|
"reward": 0.9267492741346359, |
|
"reward_std": 0.1269418167648837, |
|
"rewards/accuracy_reward": 0.5306122275069356, |
|
"rewards/semantic_entropy_math_reward": 0.39613701961934566, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 551.4642715454102, |
|
"epoch": 0.3543973941368078, |
|
"grad_norm": 0.007552569266408682, |
|
"learning_rate": 7.684977210318024e-06, |
|
"loss": -0.0128, |
|
"reward": 0.9007835239171982, |
|
"reward_std": 0.10546381218591705, |
|
"rewards/accuracy_reward": 0.5031887609511614, |
|
"rewards/semantic_entropy_math_reward": 0.39759473875164986, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 532.7742252349854, |
|
"epoch": 0.35700325732899024, |
|
"grad_norm": 0.008308288641273975, |
|
"learning_rate": 7.648372968656995e-06, |
|
"loss": -0.01, |
|
"reward": 0.9131742008030415, |
|
"reward_std": 0.11054620456707198, |
|
"rewards/accuracy_reward": 0.5057397875934839, |
|
"rewards/semantic_entropy_math_reward": 0.40743439458310604, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 537.5076389312744, |
|
"epoch": 0.35960912052117266, |
|
"grad_norm": 0.007219830993562937, |
|
"learning_rate": 7.611570362962247e-06, |
|
"loss": -0.0164, |
|
"reward": 0.9657434225082397, |
|
"reward_std": 0.12022296455688775, |
|
"rewards/accuracy_reward": 0.5491071343421936, |
|
"rewards/semantic_entropy_math_reward": 0.41663629189133644, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 516.6587944030762, |
|
"epoch": 0.36221498371335503, |
|
"grad_norm": 0.010173982009291649, |
|
"learning_rate": 7.574572149761437e-06, |
|
"loss": -0.0154, |
|
"reward": 0.9734876044094563, |
|
"reward_std": 0.11138522112742066, |
|
"rewards/accuracy_reward": 0.5695152971893549, |
|
"rewards/semantic_entropy_math_reward": 0.40397229604423046, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 567.7136325836182, |
|
"epoch": 0.36482084690553745, |
|
"grad_norm": 0.008159446530044079, |
|
"learning_rate": 7.5373811002332785e-06, |
|
"loss": -0.0118, |
|
"reward": 0.910987600684166, |
|
"reward_std": 0.09473348397295922, |
|
"rewards/accuracy_reward": 0.5095663145184517, |
|
"rewards/semantic_entropy_math_reward": 0.4014212768524885, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 566.8067493438721, |
|
"epoch": 0.3674267100977199, |
|
"grad_norm": 0.006627434398978949, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": -0.0138, |
|
"reward": 0.9380466379225254, |
|
"reward_std": 0.10335312166716903, |
|
"rewards/accuracy_reward": 0.5382652897387743, |
|
"rewards/semantic_entropy_math_reward": 0.39978133141994476, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 527.3093032836914, |
|
"epoch": 0.3700325732899023, |
|
"grad_norm": 0.007407406345009804, |
|
"learning_rate": 7.462431648918689e-06, |
|
"loss": -0.0173, |
|
"reward": 0.904336716979742, |
|
"reward_std": 0.11712831069598906, |
|
"rewards/accuracy_reward": 0.5070152916014194, |
|
"rewards/semantic_entropy_math_reward": 0.3973214253783226, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 515.315034866333, |
|
"epoch": 0.37263843648208467, |
|
"grad_norm": 0.008976518176496029, |
|
"learning_rate": 7.424678860871584e-06, |
|
"loss": -0.0096, |
|
"reward": 0.9782252088189125, |
|
"reward_std": 0.11401024123188108, |
|
"rewards/accuracy_reward": 0.5752550922334194, |
|
"rewards/semantic_entropy_math_reward": 0.4029701091349125, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 565.4763851165771, |
|
"epoch": 0.3752442996742671, |
|
"grad_norm": 0.008781256154179573, |
|
"learning_rate": 7.3867444635553165e-06, |
|
"loss": -0.0196, |
|
"reward": 0.9317601844668388, |
|
"reward_std": 0.12317358143627644, |
|
"rewards/accuracy_reward": 0.5331632494926453, |
|
"rewards/semantic_entropy_math_reward": 0.398596940562129, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 585.6358242034912, |
|
"epoch": 0.3778501628664495, |
|
"grad_norm": 0.006888892501592636, |
|
"learning_rate": 7.3486312982691134e-06, |
|
"loss": -0.0132, |
|
"reward": 0.8779154568910599, |
|
"reward_std": 0.10448942880611867, |
|
"rewards/accuracy_reward": 0.48278059996664524, |
|
"rewards/semantic_entropy_math_reward": 0.39513483084738255, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 547.9228191375732, |
|
"epoch": 0.38045602605863194, |
|
"grad_norm": 0.009227856993675232, |
|
"learning_rate": 7.310342219701981e-06, |
|
"loss": -0.0185, |
|
"reward": 0.8972303010523319, |
|
"reward_std": 0.13705383567139506, |
|
"rewards/accuracy_reward": 0.49808672443032265, |
|
"rewards/semantic_entropy_math_reward": 0.3991435803472996, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 527.695779800415, |
|
"epoch": 0.38306188925081436, |
|
"grad_norm": 0.010845191776752472, |
|
"learning_rate": 7.271880095718895e-06, |
|
"loss": -0.0189, |
|
"reward": 0.9622813388705254, |
|
"reward_std": 0.10206798242870718, |
|
"rewards/accuracy_reward": 0.5612244810909033, |
|
"rewards/semantic_entropy_math_reward": 0.40105685591697693, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 479.8839225769043, |
|
"epoch": 0.3856677524429967, |
|
"grad_norm": 0.012252271175384521, |
|
"learning_rate": 7.233247807145989e-06, |
|
"loss": -0.0217, |
|
"reward": 0.8895772397518158, |
|
"reward_std": 0.11137077608145773, |
|
"rewards/accuracy_reward": 0.48022957891225815, |
|
"rewards/semantic_entropy_math_reward": 0.4093476627022028, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 499.83226013183594, |
|
"epoch": 0.38827361563517915, |
|
"grad_norm": 0.01276180800050497, |
|
"learning_rate": 7.19444824755478e-06, |
|
"loss": -0.0136, |
|
"reward": 0.9279336594045162, |
|
"reward_std": 0.09600863244850188, |
|
"rewards/accuracy_reward": 0.5312499888241291, |
|
"rewards/semantic_entropy_math_reward": 0.3966836631298065, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 505.0191249847412, |
|
"epoch": 0.39087947882736157, |
|
"grad_norm": 0.009752064011991024, |
|
"learning_rate": 7.155484323045442e-06, |
|
"loss": -0.025, |
|
"reward": 0.8462098762392998, |
|
"reward_std": 0.13227392174303532, |
|
"rewards/accuracy_reward": 0.4540816266089678, |
|
"rewards/semantic_entropy_math_reward": 0.39212826639413834, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 486.4183597564697, |
|
"epoch": 0.393485342019544, |
|
"grad_norm": 0.011875058524310589, |
|
"learning_rate": 7.11635895202914e-06, |
|
"loss": -0.019, |
|
"reward": 0.8894861489534378, |
|
"reward_std": 0.09644467895850539, |
|
"rewards/accuracy_reward": 0.4917091755196452, |
|
"rewards/semantic_entropy_math_reward": 0.39777696318924427, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 516.7933540344238, |
|
"epoch": 0.39609120521172636, |
|
"grad_norm": 0.00881208572536707, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": -0.0209, |
|
"reward": 0.9480685032904148, |
|
"reward_std": 0.10766224935650826, |
|
"rewards/accuracy_reward": 0.5471938634291291, |
|
"rewards/semantic_entropy_math_reward": 0.40087462589144707, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 506.79143714904785, |
|
"epoch": 0.3986970684039088, |
|
"grad_norm": 0.012560566887259483, |
|
"learning_rate": 7.037635604362786e-06, |
|
"loss": -0.0162, |
|
"reward": 0.9439686462283134, |
|
"reward_std": 0.10271801124326885, |
|
"rewards/accuracy_reward": 0.5318877454847097, |
|
"rewards/semantic_entropy_math_reward": 0.41208088397979736, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 438.49106216430664, |
|
"epoch": 0.4013029315960912, |
|
"grad_norm": 0.011885056272149086, |
|
"learning_rate": 6.9980435241181785e-06, |
|
"loss": -0.0173, |
|
"reward": 0.9576348140835762, |
|
"reward_std": 0.09978608437813818, |
|
"rewards/accuracy_reward": 0.541454067453742, |
|
"rewards/semantic_entropy_math_reward": 0.4161807522177696, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 513.610954284668, |
|
"epoch": 0.40390879478827363, |
|
"grad_norm": 0.009011725895106792, |
|
"learning_rate": 6.958301789735853e-06, |
|
"loss": -0.0224, |
|
"reward": 0.8590561151504517, |
|
"reward_std": 0.10832268767990172, |
|
"rewards/accuracy_reward": 0.46811223216354847, |
|
"rewards/semantic_entropy_math_reward": 0.390943868085742, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 480.7538185119629, |
|
"epoch": 0.40651465798045605, |
|
"grad_norm": 0.013393928296864033, |
|
"learning_rate": 6.918413377885193e-06, |
|
"loss": -0.026, |
|
"reward": 0.9114431329071522, |
|
"reward_std": 0.09208173397928476, |
|
"rewards/accuracy_reward": 0.49489794485270977, |
|
"rewards/semantic_entropy_math_reward": 0.4165451843291521, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 491.4189987182617, |
|
"epoch": 0.4091205211726384, |
|
"grad_norm": 0.009988815523684025, |
|
"learning_rate": 6.878381276221777e-06, |
|
"loss": -0.0103, |
|
"reward": 0.8825619332492352, |
|
"reward_std": 0.10942790098488331, |
|
"rewards/accuracy_reward": 0.4846938634291291, |
|
"rewards/semantic_entropy_math_reward": 0.3978680744767189, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 496.1402931213379, |
|
"epoch": 0.41172638436482084, |
|
"grad_norm": 0.008957677520811558, |
|
"learning_rate": 6.838208483163601e-06, |
|
"loss": -0.0255, |
|
"reward": 0.8830174691975117, |
|
"reward_std": 0.10132573859300464, |
|
"rewards/accuracy_reward": 0.49426019191741943, |
|
"rewards/semantic_entropy_math_reward": 0.3887572903186083, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 493.60011863708496, |
|
"epoch": 0.41433224755700326, |
|
"grad_norm": 0.007134555373340845, |
|
"learning_rate": 6.797898007666493e-06, |
|
"loss": -0.0279, |
|
"reward": 0.8826530575752258, |
|
"reward_std": 0.0909608873189427, |
|
"rewards/accuracy_reward": 0.4853316191583872, |
|
"rewards/semantic_entropy_math_reward": 0.3973214253783226, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 483.09437370300293, |
|
"epoch": 0.4169381107491857, |
|
"grad_norm": 0.009405710734426975, |
|
"learning_rate": 6.757452868998737e-06, |
|
"loss": -0.016, |
|
"reward": 0.9526238851249218, |
|
"reward_std": 0.09115565207321197, |
|
"rewards/accuracy_reward": 0.5363520253449678, |
|
"rewards/semantic_entropy_math_reward": 0.4162718504667282, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 475.0184898376465, |
|
"epoch": 0.41954397394136805, |
|
"grad_norm": 0.008445804007351398, |
|
"learning_rate": 6.716876096514944e-06, |
|
"loss": -0.0231, |
|
"reward": 0.9812317453324795, |
|
"reward_std": 0.09919259638991207, |
|
"rewards/accuracy_reward": 0.5631377417594194, |
|
"rewards/semantic_entropy_math_reward": 0.4180940203368664, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 448.816951751709, |
|
"epoch": 0.4221498371335505, |
|
"grad_norm": 0.006540235131978989, |
|
"learning_rate": 6.676170729429132e-06, |
|
"loss": -0.0164, |
|
"reward": 0.9816872999072075, |
|
"reward_std": 0.07881008239928633, |
|
"rewards/accuracy_reward": 0.5535714123398066, |
|
"rewards/semantic_entropy_math_reward": 0.4281158857047558, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 501.0682258605957, |
|
"epoch": 0.4247557003257329, |
|
"grad_norm": 0.006146623287349939, |
|
"learning_rate": 6.635339816587109e-06, |
|
"loss": -0.0158, |
|
"reward": 0.9382288418710232, |
|
"reward_std": 0.09887587034609169, |
|
"rewards/accuracy_reward": 0.5229591736570001, |
|
"rewards/semantic_entropy_math_reward": 0.41526967100799084, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 506.3195095062256, |
|
"epoch": 0.4273615635179153, |
|
"grad_norm": 0.007581164129078388, |
|
"learning_rate": 6.594386416238095e-06, |
|
"loss": -0.0192, |
|
"reward": 0.908618800342083, |
|
"reward_std": 0.12335213355254382, |
|
"rewards/accuracy_reward": 0.5153061114251614, |
|
"rewards/semantic_entropy_math_reward": 0.39331267960369587, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 503.4725742340088, |
|
"epoch": 0.42996742671009774, |
|
"grad_norm": 0.006690696347504854, |
|
"learning_rate": 6.553313595805666e-06, |
|
"loss": -0.018, |
|
"reward": 0.9704810380935669, |
|
"reward_std": 0.08365448092808947, |
|
"rewards/accuracy_reward": 0.5497448891401291, |
|
"rewards/semantic_entropy_math_reward": 0.42073615454137325, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 469.7703971862793, |
|
"epoch": 0.4325732899022801, |
|
"grad_norm": 0.006663058884441853, |
|
"learning_rate": 6.512124431658006e-06, |
|
"loss": -0.0153, |
|
"reward": 0.9451530575752258, |
|
"reward_std": 0.09975362673867494, |
|
"rewards/accuracy_reward": 0.5338010098785162, |
|
"rewards/semantic_entropy_math_reward": 0.411352027207613, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 488.9763946533203, |
|
"epoch": 0.43517915309446253, |
|
"grad_norm": 0.010632487945258617, |
|
"learning_rate": 6.470822008877482e-06, |
|
"loss": -0.013, |
|
"reward": 0.968567755073309, |
|
"reward_std": 0.09277999610640109, |
|
"rewards/accuracy_reward": 0.5682397782802582, |
|
"rewards/semantic_entropy_math_reward": 0.40032798051834106, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 489.58289909362793, |
|
"epoch": 0.43778501628664496, |
|
"grad_norm": 0.006177806295454502, |
|
"learning_rate": 6.4294094210295725e-06, |
|
"loss": -0.0196, |
|
"reward": 0.9651056863367558, |
|
"reward_std": 0.08780966483755037, |
|
"rewards/accuracy_reward": 0.5478316228836775, |
|
"rewards/semantic_entropy_math_reward": 0.4172740466892719, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 475.0695056915283, |
|
"epoch": 0.4403908794788274, |
|
"grad_norm": 0.008250612765550613, |
|
"learning_rate": 6.3878897699311525e-06, |
|
"loss": -0.0085, |
|
"reward": 0.9698432683944702, |
|
"reward_std": 0.09511806583032012, |
|
"rewards/accuracy_reward": 0.5452805999666452, |
|
"rewards/semantic_entropy_math_reward": 0.424562668427825, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 485.93302726745605, |
|
"epoch": 0.44299674267100975, |
|
"grad_norm": 0.00776874041184783, |
|
"learning_rate": 6.346266165418173e-06, |
|
"loss": -0.0268, |
|
"reward": 0.9740342572331429, |
|
"reward_std": 0.09444212296511978, |
|
"rewards/accuracy_reward": 0.554846927523613, |
|
"rewards/semantic_entropy_math_reward": 0.41918730549514294, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 497.7283020019531, |
|
"epoch": 0.44560260586319217, |
|
"grad_norm": 0.007973974570631981, |
|
"learning_rate": 6.304541725112734e-06, |
|
"loss": -0.0158, |
|
"reward": 0.9323068186640739, |
|
"reward_std": 0.09420971409417689, |
|
"rewards/accuracy_reward": 0.5280612092465162, |
|
"rewards/semantic_entropy_math_reward": 0.4042456317692995, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 498.0956573486328, |
|
"epoch": 0.4482084690553746, |
|
"grad_norm": 0.007753515616059303, |
|
"learning_rate": 6.262719574189564e-06, |
|
"loss": -0.0223, |
|
"reward": 0.9188228957355022, |
|
"reward_std": 0.09362561779562384, |
|
"rewards/accuracy_reward": 0.5197703912854195, |
|
"rewards/semantic_entropy_math_reward": 0.39905248768627644, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 470.75508880615234, |
|
"epoch": 0.450814332247557, |
|
"grad_norm": 0.007259304169565439, |
|
"learning_rate": 6.2208028451419575e-06, |
|
"loss": -0.01, |
|
"reward": 0.9154518619179726, |
|
"reward_std": 0.08905957284150645, |
|
"rewards/accuracy_reward": 0.5076530463993549, |
|
"rewards/semantic_entropy_math_reward": 0.4077988266944885, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 454.5631294250488, |
|
"epoch": 0.45342019543973944, |
|
"grad_norm": 0.0075342534109950066, |
|
"learning_rate": 6.178794677547138e-06, |
|
"loss": -0.0105, |
|
"reward": 1.002368789166212, |
|
"reward_std": 0.0902844921220094, |
|
"rewards/accuracy_reward": 0.5733418203890324, |
|
"rewards/semantic_entropy_math_reward": 0.4290269613265991, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 456.7385063171387, |
|
"epoch": 0.4560260586319218, |
|
"grad_norm": 0.008164818398654461, |
|
"learning_rate": 6.136698217831106e-06, |
|
"loss": -0.0083, |
|
"reward": 0.9758564047515392, |
|
"reward_std": 0.08968472620472312, |
|
"rewards/accuracy_reward": 0.5471938662230968, |
|
"rewards/semantic_entropy_math_reward": 0.42866252548992634, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 461.8769073486328, |
|
"epoch": 0.4586319218241042, |
|
"grad_norm": 0.018572386354207993, |
|
"learning_rate": 6.094516619032975e-06, |
|
"loss": -0.0141, |
|
"reward": 0.9573614969849586, |
|
"reward_std": 0.09445905161555856, |
|
"rewards/accuracy_reward": 0.5452805962413549, |
|
"rewards/semantic_entropy_math_reward": 0.41208089515566826, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 460.4668254852295, |
|
"epoch": 0.46123778501628665, |
|
"grad_norm": 0.01110108196735382, |
|
"learning_rate": 6.052253040568804e-06, |
|
"loss": -0.0138, |
|
"reward": 0.9251093156635761, |
|
"reward_std": 0.1255237814038992, |
|
"rewards/accuracy_reward": 0.5191326439380646, |
|
"rewards/semantic_entropy_math_reward": 0.4059766735881567, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 450.43493843078613, |
|
"epoch": 0.4638436482084691, |
|
"grad_norm": 0.010779962874948978, |
|
"learning_rate": 6.009910647994956e-06, |
|
"loss": -0.0185, |
|
"reward": 0.9765852577984333, |
|
"reward_std": 0.10055442503653467, |
|
"rewards/accuracy_reward": 0.560586716979742, |
|
"rewards/semantic_entropy_math_reward": 0.4159985240548849, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 507.37243843078613, |
|
"epoch": 0.46644951140065144, |
|
"grad_norm": 0.009717591106891632, |
|
"learning_rate": 5.967492612770999e-06, |
|
"loss": -0.0185, |
|
"reward": 0.947704054415226, |
|
"reward_std": 0.09296219941461459, |
|
"rewards/accuracy_reward": 0.5401785597205162, |
|
"rewards/semantic_entropy_math_reward": 0.4075255021452904, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 454.82906913757324, |
|
"epoch": 0.46905537459283386, |
|
"grad_norm": 0.007440619170665741, |
|
"learning_rate": 5.925002112022158e-06, |
|
"loss": -0.0158, |
|
"reward": 0.9303935505449772, |
|
"reward_std": 0.10131418029777706, |
|
"rewards/accuracy_reward": 0.5210459046065807, |
|
"rewards/semantic_entropy_math_reward": 0.40934766083955765, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 485.0899181365967, |
|
"epoch": 0.4716612377850163, |
|
"grad_norm": 0.011163070797920227, |
|
"learning_rate": 5.882442328301356e-06, |
|
"loss": -0.025, |
|
"reward": 0.958454791456461, |
|
"reward_std": 0.10061666322872043, |
|
"rewards/accuracy_reward": 0.554209167137742, |
|
"rewards/semantic_entropy_math_reward": 0.4042456094175577, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 465.15815353393555, |
|
"epoch": 0.4742671009771987, |
|
"grad_norm": 0.008569203317165375, |
|
"learning_rate": 5.839816449350824e-06, |
|
"loss": -0.0195, |
|
"reward": 0.9420553632080555, |
|
"reward_std": 0.06814239080995321, |
|
"rewards/accuracy_reward": 0.529336728155613, |
|
"rewards/semantic_entropy_math_reward": 0.41271864995360374, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 514.3443832397461, |
|
"epoch": 0.47687296416938113, |
|
"grad_norm": 0.0077494359575212, |
|
"learning_rate": 5.7971276678633625e-06, |
|
"loss": -0.0187, |
|
"reward": 1.012208428233862, |
|
"reward_std": 0.07250937505159527, |
|
"rewards/accuracy_reward": 0.600127536803484, |
|
"rewards/semantic_entropy_math_reward": 0.4120808970183134, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 487.92983055114746, |
|
"epoch": 0.4794788273615635, |
|
"grad_norm": 0.007494843099266291, |
|
"learning_rate": 5.754379181243179e-06, |
|
"loss": -0.0225, |
|
"reward": 0.9169095903635025, |
|
"reward_std": 0.09801324107684195, |
|
"rewards/accuracy_reward": 0.5159438718110323, |
|
"rewards/semantic_entropy_math_reward": 0.40096573159098625, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 480.8692512512207, |
|
"epoch": 0.4820846905537459, |
|
"grad_norm": 0.008148902095854282, |
|
"learning_rate": 5.711574191366427e-06, |
|
"loss": -0.0216, |
|
"reward": 1.0052842535078526, |
|
"reward_std": 0.09645737172104418, |
|
"rewards/accuracy_reward": 0.5880101881921291, |
|
"rewards/semantic_entropy_math_reward": 0.41727403923869133, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 472.0790710449219, |
|
"epoch": 0.48469055374592834, |
|
"grad_norm": 0.00966109149158001, |
|
"learning_rate": 5.668715904341365e-06, |
|
"loss": -0.0201, |
|
"reward": 0.9480684921145439, |
|
"reward_std": 0.09934344311477616, |
|
"rewards/accuracy_reward": 0.5299744755029678, |
|
"rewards/semantic_entropy_math_reward": 0.4180940203368664, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 475.868616104126, |
|
"epoch": 0.48729641693811077, |
|
"grad_norm": 0.010433156974613667, |
|
"learning_rate": 5.62580753026823e-06, |
|
"loss": -0.0251, |
|
"reward": 0.8839285559952259, |
|
"reward_std": 0.0782707966864109, |
|
"rewards/accuracy_reward": 0.4687499953433871, |
|
"rewards/semantic_entropy_math_reward": 0.4151785559952259, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 476.2665729522705, |
|
"epoch": 0.48990228013029313, |
|
"grad_norm": 0.007323419209569693, |
|
"learning_rate": 5.5828522829987965e-06, |
|
"loss": -0.0156, |
|
"reward": 0.9895225577056408, |
|
"reward_std": 0.08914521127007902, |
|
"rewards/accuracy_reward": 0.5669642705470324, |
|
"rewards/semantic_entropy_math_reward": 0.42255830578505993, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 479.14284896850586, |
|
"epoch": 0.49250814332247556, |
|
"grad_norm": 0.0054732197895646095, |
|
"learning_rate": 5.539853379895656e-06, |
|
"loss": -0.0189, |
|
"reward": 1.0017310306429863, |
|
"reward_std": 0.06712543140747584, |
|
"rewards/accuracy_reward": 0.5733418166637421, |
|
"rewards/semantic_entropy_math_reward": 0.42838921397924423, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 462.00828552246094, |
|
"epoch": 0.495114006514658, |
|
"grad_norm": 0.0109707685187459, |
|
"learning_rate": 5.496814041591234e-06, |
|
"loss": -0.0202, |
|
"reward": 0.9079810194671154, |
|
"reward_std": 0.10293658822774887, |
|
"rewards/accuracy_reward": 0.5012755002826452, |
|
"rewards/semantic_entropy_math_reward": 0.4067055266350508, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 497.70853996276855, |
|
"epoch": 0.4977198697068404, |
|
"grad_norm": 0.008094045333564281, |
|
"learning_rate": 5.453737491746572e-06, |
|
"loss": -0.0197, |
|
"reward": 0.9156340956687927, |
|
"reward_std": 0.0916732897167094, |
|
"rewards/accuracy_reward": 0.5191326476633549, |
|
"rewards/semantic_entropy_math_reward": 0.39650145545601845, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 478.77613639831543, |
|
"epoch": 0.5003257328990228, |
|
"grad_norm": 0.014000942930579185, |
|
"learning_rate": 5.410626956809864e-06, |
|
"loss": -0.0141, |
|
"reward": 0.9786807335913181, |
|
"reward_std": 0.09398427966516465, |
|
"rewards/accuracy_reward": 0.5637754946947098, |
|
"rewards/semantic_entropy_math_reward": 0.4149052444845438, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 456.85266494750977, |
|
"epoch": 0.5029315960912052, |
|
"grad_norm": 0.00816336553543806, |
|
"learning_rate": 5.367485665774802e-06, |
|
"loss": -0.0095, |
|
"reward": 0.9785896465182304, |
|
"reward_std": 0.08977296199009288, |
|
"rewards/accuracy_reward": 0.565051008015871, |
|
"rewards/semantic_entropy_math_reward": 0.4135386198759079, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 449.39922523498535, |
|
"epoch": 0.5055374592833877, |
|
"grad_norm": 0.007704311050474644, |
|
"learning_rate": 5.324316849938715e-06, |
|
"loss": -0.0191, |
|
"reward": 0.9876093044877052, |
|
"reward_std": 0.10061339178355411, |
|
"rewards/accuracy_reward": 0.581632636487484, |
|
"rewards/semantic_entropy_math_reward": 0.40597667545080185, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 450.8992233276367, |
|
"epoch": 0.50814332247557, |
|
"grad_norm": 0.008355666883289814, |
|
"learning_rate": 5.281123742660558e-06, |
|
"loss": -0.0215, |
|
"reward": 0.9452441520988941, |
|
"reward_std": 0.09853553911671042, |
|
"rewards/accuracy_reward": 0.5312499823048711, |
|
"rewards/semantic_entropy_math_reward": 0.41399415768682957, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 475.535701751709, |
|
"epoch": 0.5107491856677524, |
|
"grad_norm": 0.00795848947018385, |
|
"learning_rate": 5.237909579118713e-06, |
|
"loss": -0.0228, |
|
"reward": 0.9396865740418434, |
|
"reward_std": 0.08185443480033427, |
|
"rewards/accuracy_reward": 0.5293367225676775, |
|
"rewards/semantic_entropy_math_reward": 0.41034984588623047, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 444.8494815826416, |
|
"epoch": 0.5133550488599349, |
|
"grad_norm": 0.011833498254418373, |
|
"learning_rate": 5.194677596068689e-06, |
|
"loss": -0.0114, |
|
"reward": 0.9045189321041107, |
|
"reward_std": 0.086035909014754, |
|
"rewards/accuracy_reward": 0.4910714216530323, |
|
"rewards/semantic_entropy_math_reward": 0.413447517901659, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 444.4464168548584, |
|
"epoch": 0.5159609120521172, |
|
"grad_norm": 0.00945763848721981, |
|
"learning_rate": 5.1514310316006835e-06, |
|
"loss": -0.0135, |
|
"reward": 0.9529883228242397, |
|
"reward_std": 0.08475825749337673, |
|
"rewards/accuracy_reward": 0.5350765138864517, |
|
"rewards/semantic_entropy_math_reward": 0.4179117977619171, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 471.7232036590576, |
|
"epoch": 0.5185667752442997, |
|
"grad_norm": 0.009042466059327126, |
|
"learning_rate": 5.1081731248970435e-06, |
|
"loss": -0.0201, |
|
"reward": 0.953717190772295, |
|
"reward_std": 0.09949397918535396, |
|
"rewards/accuracy_reward": 0.5446428433060646, |
|
"rewards/semantic_entropy_math_reward": 0.4090743362903595, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 445.26083183288574, |
|
"epoch": 0.5211726384364821, |
|
"grad_norm": 0.012422044761478901, |
|
"learning_rate": 5.064907115989655e-06, |
|
"loss": -0.0276, |
|
"reward": 0.9502550885081291, |
|
"reward_std": 0.11640744589385577, |
|
"rewards/accuracy_reward": 0.5471938643604517, |
|
"rewards/semantic_entropy_math_reward": 0.4030612148344517, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 475.43494415283203, |
|
"epoch": 0.5237785016286645, |
|
"grad_norm": 0.013881414197385311, |
|
"learning_rate": 5.021636245517261e-06, |
|
"loss": -0.019, |
|
"reward": 0.9939868673682213, |
|
"reward_std": 0.1037318678572774, |
|
"rewards/accuracy_reward": 0.5790816228836775, |
|
"rewards/semantic_entropy_math_reward": 0.4149052333086729, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 469.54272079467773, |
|
"epoch": 0.5263843648208469, |
|
"grad_norm": 0.013678316958248615, |
|
"learning_rate": 4.978363754482741e-06, |
|
"loss": -0.0194, |
|
"reward": 0.8790087252855301, |
|
"reward_std": 0.1069271593587473, |
|
"rewards/accuracy_reward": 0.48915815725922585, |
|
"rewards/semantic_entropy_math_reward": 0.38985056802630424, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 471.13008880615234, |
|
"epoch": 0.5289902280130293, |
|
"grad_norm": 0.012411783449351788, |
|
"learning_rate": 4.935092884010347e-06, |
|
"loss": -0.0129, |
|
"reward": 0.9805028922855854, |
|
"reward_std": 0.09484815062023699, |
|
"rewards/accuracy_reward": 0.5695152897387743, |
|
"rewards/semantic_entropy_math_reward": 0.41098759695887566, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 494.3297119140625, |
|
"epoch": 0.5315960912052117, |
|
"grad_norm": 0.009701291099190712, |
|
"learning_rate": 4.891826875102958e-06, |
|
"loss": -0.0205, |
|
"reward": 0.9369533248245716, |
|
"reward_std": 0.11289398185908794, |
|
"rewards/accuracy_reward": 0.5414540693163872, |
|
"rewards/semantic_entropy_math_reward": 0.3954992648214102, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 433.42409896850586, |
|
"epoch": 0.5342019543973942, |
|
"grad_norm": 0.011294050142168999, |
|
"learning_rate": 4.848568968399317e-06, |
|
"loss": -0.0136, |
|
"reward": 0.9180940166115761, |
|
"reward_std": 0.10581530025228858, |
|
"rewards/accuracy_reward": 0.5121173374354839, |
|
"rewards/semantic_entropy_math_reward": 0.4059766698628664, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 444.698335647583, |
|
"epoch": 0.5368078175895765, |
|
"grad_norm": 0.008201202377676964, |
|
"learning_rate": 4.805322403931312e-06, |
|
"loss": -0.0129, |
|
"reward": 1.0149416662752628, |
|
"reward_std": 0.08048964510089718, |
|
"rewards/accuracy_reward": 0.5956632532179356, |
|
"rewards/semantic_entropy_math_reward": 0.419278422370553, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 476.51465797424316, |
|
"epoch": 0.539413680781759, |
|
"grad_norm": 0.011591507121920586, |
|
"learning_rate": 4.762090420881289e-06, |
|
"loss": -0.0122, |
|
"reward": 0.9323979467153549, |
|
"reward_std": 0.11307998397387564, |
|
"rewards/accuracy_reward": 0.5216836631298065, |
|
"rewards/semantic_entropy_math_reward": 0.41071428172290325, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 476.31631660461426, |
|
"epoch": 0.5420195439739414, |
|
"grad_norm": 0.008181254379451275, |
|
"learning_rate": 4.718876257339444e-06, |
|
"loss": -0.0196, |
|
"reward": 0.9384110532701015, |
|
"reward_std": 0.08237648417707533, |
|
"rewards/accuracy_reward": 0.5223214160650969, |
|
"rewards/semantic_entropy_math_reward": 0.41608964651823044, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 490.5790729522705, |
|
"epoch": 0.5446254071661237, |
|
"grad_norm": 0.009356926195323467, |
|
"learning_rate": 4.6756831500612846e-06, |
|
"loss": -0.0217, |
|
"reward": 0.9779518954455853, |
|
"reward_std": 0.10836848570033908, |
|
"rewards/accuracy_reward": 0.5561224352568388, |
|
"rewards/semantic_entropy_math_reward": 0.4218294396996498, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 516.4917030334473, |
|
"epoch": 0.5472312703583062, |
|
"grad_norm": 0.011323334649205208, |
|
"learning_rate": 4.632514334225201e-06, |
|
"loss": -0.0184, |
|
"reward": 0.9516217112541199, |
|
"reward_std": 0.09244993596803397, |
|
"rewards/accuracy_reward": 0.5248724445700645, |
|
"rewards/semantic_entropy_math_reward": 0.42674924805760384, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 492.48659896850586, |
|
"epoch": 0.5498371335504886, |
|
"grad_norm": 0.009480936452746391, |
|
"learning_rate": 4.589373043190137e-06, |
|
"loss": -0.0143, |
|
"reward": 0.9234693683683872, |
|
"reward_std": 0.10728166525950655, |
|
"rewards/accuracy_reward": 0.5140305981040001, |
|
"rewards/semantic_entropy_math_reward": 0.4094387833029032, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 491.74297523498535, |
|
"epoch": 0.552442996742671, |
|
"grad_norm": 0.010586866177618504, |
|
"learning_rate": 4.546262508253429e-06, |
|
"loss": -0.0127, |
|
"reward": 0.9772230423986912, |
|
"reward_std": 0.10844850935973227, |
|
"rewards/accuracy_reward": 0.5612244736403227, |
|
"rewards/semantic_entropy_math_reward": 0.4159985277801752, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 458.1702690124512, |
|
"epoch": 0.5550488599348534, |
|
"grad_norm": 0.00833533238619566, |
|
"learning_rate": 4.503185958408767e-06, |
|
"loss": -0.021, |
|
"reward": 1.0099307373166084, |
|
"reward_std": 0.10288633638992906, |
|
"rewards/accuracy_reward": 0.5854591727256775, |
|
"rewards/semantic_entropy_math_reward": 0.4244715701788664, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 514.4853172302246, |
|
"epoch": 0.5576547231270358, |
|
"grad_norm": 0.007394696585834026, |
|
"learning_rate": 4.460146620104347e-06, |
|
"loss": -0.0224, |
|
"reward": 0.9210094511508942, |
|
"reward_std": 0.11905729840509593, |
|
"rewards/accuracy_reward": 0.5178571306169033, |
|
"rewards/semantic_entropy_math_reward": 0.4031523112207651, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 476.78442573547363, |
|
"epoch": 0.5602605863192183, |
|
"grad_norm": 0.009832123294472694, |
|
"learning_rate": 4.417147717001205e-06, |
|
"loss": -0.0229, |
|
"reward": 0.8485787101089954, |
|
"reward_std": 0.09484727546805516, |
|
"rewards/accuracy_reward": 0.445790808647871, |
|
"rewards/semantic_entropy_math_reward": 0.40278790704905987, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 459.61988639831543, |
|
"epoch": 0.5628664495114006, |
|
"grad_norm": 0.012813005596399307, |
|
"learning_rate": 4.374192469731771e-06, |
|
"loss": -0.0134, |
|
"reward": 0.9386843740940094, |
|
"reward_std": 0.095578909793403, |
|
"rewards/accuracy_reward": 0.5223214216530323, |
|
"rewards/semantic_entropy_math_reward": 0.41636295430362225, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 434.995530128479, |
|
"epoch": 0.5654723127035831, |
|
"grad_norm": 0.0082651786506176, |
|
"learning_rate": 4.331284095658637e-06, |
|
"loss": -0.0164, |
|
"reward": 0.981778409332037, |
|
"reward_std": 0.09468310995725915, |
|
"rewards/accuracy_reward": 0.5618622284382582, |
|
"rewards/semantic_entropy_math_reward": 0.41991617158055305, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 474.4394016265869, |
|
"epoch": 0.5680781758957655, |
|
"grad_norm": 0.010903375223279, |
|
"learning_rate": 4.2884258086335755e-06, |
|
"loss": -0.007, |
|
"reward": 0.9291180595755577, |
|
"reward_std": 0.11282830289565027, |
|
"rewards/accuracy_reward": 0.5184948835521936, |
|
"rewards/semantic_entropy_math_reward": 0.4106231704354286, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 482.2563667297363, |
|
"epoch": 0.5706840390879478, |
|
"grad_norm": 0.0167391300201416, |
|
"learning_rate": 4.245620818756822e-06, |
|
"loss": -0.0106, |
|
"reward": 0.931213553994894, |
|
"reward_std": 0.08836283767595887, |
|
"rewards/accuracy_reward": 0.5153061132878065, |
|
"rewards/semantic_entropy_math_reward": 0.41590742766857147, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 544.2040729522705, |
|
"epoch": 0.5732899022801303, |
|
"grad_norm": 0.009337971918284893, |
|
"learning_rate": 4.202872332136639e-06, |
|
"loss": -0.0152, |
|
"reward": 0.9517128244042397, |
|
"reward_std": 0.09688535577151924, |
|
"rewards/accuracy_reward": 0.533163245767355, |
|
"rewards/semantic_entropy_math_reward": 0.4185495525598526, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 641.5816173553467, |
|
"epoch": 0.5758957654723127, |
|
"grad_norm": 0.008485456928610802, |
|
"learning_rate": 4.160183550649176e-06, |
|
"loss": -0.0199, |
|
"reward": 0.9877004250884056, |
|
"reward_std": 0.0827121082256781, |
|
"rewards/accuracy_reward": 0.5637754965573549, |
|
"rewards/semantic_entropy_math_reward": 0.4239249173551798, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 570.7506313323975, |
|
"epoch": 0.5785016286644951, |
|
"grad_norm": 0.010655037127435207, |
|
"learning_rate": 4.117557671698648e-06, |
|
"loss": -0.0159, |
|
"reward": 0.9324890524148941, |
|
"reward_std": 0.11734752007760108, |
|
"rewards/accuracy_reward": 0.5325254965573549, |
|
"rewards/semantic_entropy_math_reward": 0.399963553994894, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 640.9151592254639, |
|
"epoch": 0.5811074918566775, |
|
"grad_norm": 0.018679160624742508, |
|
"learning_rate": 4.074997887977843e-06, |
|
"loss": -0.0209, |
|
"reward": 0.9269314669072628, |
|
"reward_std": 0.09998149517923594, |
|
"rewards/accuracy_reward": 0.5242346804589033, |
|
"rewards/semantic_entropy_math_reward": 0.4026967938989401, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 558.047176361084, |
|
"epoch": 0.5837133550488599, |
|
"grad_norm": 0.009510348550975323, |
|
"learning_rate": 4.032507387229002e-06, |
|
"loss": -0.0118, |
|
"reward": 0.9808673299849033, |
|
"reward_std": 0.12139773485250771, |
|
"rewards/accuracy_reward": 0.5605867244303226, |
|
"rewards/semantic_entropy_math_reward": 0.4202805943787098, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 568.3686084747314, |
|
"epoch": 0.5863192182410424, |
|
"grad_norm": 0.009974831715226173, |
|
"learning_rate": 3.9900893520050446e-06, |
|
"loss": -0.0176, |
|
"reward": 0.9728498421609402, |
|
"reward_std": 0.1163054957287386, |
|
"rewards/accuracy_reward": 0.561224477365613, |
|
"rewards/semantic_entropy_math_reward": 0.4116253536194563, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 528.923454284668, |
|
"epoch": 0.5889250814332248, |
|
"grad_norm": 0.012413197197020054, |
|
"learning_rate": 3.9477469594311975e-06, |
|
"loss": -0.0071, |
|
"reward": 0.9459730163216591, |
|
"reward_std": 0.13538508489727974, |
|
"rewards/accuracy_reward": 0.5491071380674839, |
|
"rewards/semantic_entropy_math_reward": 0.3968658857047558, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 507.6760063171387, |
|
"epoch": 0.5915309446254071, |
|
"grad_norm": 0.009683658368885517, |
|
"learning_rate": 3.905483380967027e-06, |
|
"loss": -0.0079, |
|
"reward": 0.9868804439902306, |
|
"reward_std": 0.09878062365169171, |
|
"rewards/accuracy_reward": 0.5644132550805807, |
|
"rewards/semantic_entropy_math_reward": 0.42246719636023045, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 537.5465469360352, |
|
"epoch": 0.5941368078175896, |
|
"grad_norm": 0.018190421164035797, |
|
"learning_rate": 3.863301782168896e-06, |
|
"loss": -0.0161, |
|
"reward": 0.8993257842957973, |
|
"reward_std": 0.10135983489453793, |
|
"rewards/accuracy_reward": 0.4910714142024517, |
|
"rewards/semantic_entropy_math_reward": 0.40825437754392624, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 543.8405494689941, |
|
"epoch": 0.596742671009772, |
|
"grad_norm": 0.011154105886816978, |
|
"learning_rate": 3.821205322452863e-06, |
|
"loss": -0.0188, |
|
"reward": 0.8654336482286453, |
|
"reward_std": 0.1210846480098553, |
|
"rewards/accuracy_reward": 0.47576529532670975, |
|
"rewards/semantic_entropy_math_reward": 0.3896683640778065, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 499.15688133239746, |
|
"epoch": 0.5993485342019544, |
|
"grad_norm": 0.010389345698058605, |
|
"learning_rate": 3.779197154858044e-06, |
|
"loss": -0.0076, |
|
"reward": 0.8955903425812721, |
|
"reward_std": 0.0861530471011065, |
|
"rewards/accuracy_reward": 0.4980867300182581, |
|
"rewards/semantic_entropy_math_reward": 0.39750363677740097, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 535.5503730773926, |
|
"epoch": 0.6019543973941368, |
|
"grad_norm": 0.011222809553146362, |
|
"learning_rate": 3.7372804258104367e-06, |
|
"loss": -0.0143, |
|
"reward": 1.011935107409954, |
|
"reward_std": 0.08841318869963288, |
|
"rewards/accuracy_reward": 0.5835459046065807, |
|
"rewards/semantic_entropy_math_reward": 0.4283891972154379, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 420.7837963104248, |
|
"epoch": 0.6045602605863192, |
|
"grad_norm": 0.01220863126218319, |
|
"learning_rate": 3.695458274887268e-06, |
|
"loss": -0.0126, |
|
"reward": 0.9781341105699539, |
|
"reward_std": 0.09525648917770013, |
|
"rewards/accuracy_reward": 0.5720663126558065, |
|
"rewards/semantic_entropy_math_reward": 0.40606776997447014, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 444.399866104126, |
|
"epoch": 0.6071661237785017, |
|
"grad_norm": 0.008971706964075565, |
|
"learning_rate": 3.6537338345818273e-06, |
|
"loss": -0.0234, |
|
"reward": 0.9351311847567558, |
|
"reward_std": 0.10607956547755748, |
|
"rewards/accuracy_reward": 0.5248724352568388, |
|
"rewards/semantic_entropy_math_reward": 0.41025873832404613, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 408.4878730773926, |
|
"epoch": 0.609771986970684, |
|
"grad_norm": 0.009326077997684479, |
|
"learning_rate": 3.6121102300688504e-06, |
|
"loss": -0.0136, |
|
"reward": 1.0050109177827835, |
|
"reward_std": 0.08207120059523731, |
|
"rewards/accuracy_reward": 0.5695152934640646, |
|
"rewards/semantic_entropy_math_reward": 0.43549559637904167, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 430.00828742980957, |
|
"epoch": 0.6123778501628665, |
|
"grad_norm": 0.01133815012872219, |
|
"learning_rate": 3.5705905789704296e-06, |
|
"loss": -0.0174, |
|
"reward": 0.9181851297616959, |
|
"reward_std": 0.10594801499973983, |
|
"rewards/accuracy_reward": 0.5210459036752582, |
|
"rewards/semantic_entropy_math_reward": 0.39713920652866364, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 408.0299701690674, |
|
"epoch": 0.6149837133550489, |
|
"grad_norm": 0.008568109013140202, |
|
"learning_rate": 3.529177991122519e-06, |
|
"loss": -0.0131, |
|
"reward": 0.958545908331871, |
|
"reward_std": 0.08321992680430412, |
|
"rewards/accuracy_reward": 0.5535714142024517, |
|
"rewards/semantic_entropy_math_reward": 0.4049744922667742, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 436.0191307067871, |
|
"epoch": 0.6175895765472312, |
|
"grad_norm": 0.015509643591940403, |
|
"learning_rate": 3.487875568341995e-06, |
|
"loss": -0.0115, |
|
"reward": 0.8883017376065254, |
|
"reward_std": 0.09416370574035682, |
|
"rewards/accuracy_reward": 0.4859693758189678, |
|
"rewards/semantic_entropy_math_reward": 0.4023323506116867, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 350.04272270202637, |
|
"epoch": 0.6201954397394137, |
|
"grad_norm": 0.012752141803503036, |
|
"learning_rate": 3.446686404194337e-06, |
|
"loss": -0.014, |
|
"reward": 0.9876093231141567, |
|
"reward_std": 0.08275497186696157, |
|
"rewards/accuracy_reward": 0.5644132569432259, |
|
"rewards/semantic_entropy_math_reward": 0.4231960419565439, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 392.4043254852295, |
|
"epoch": 0.6228013029315961, |
|
"grad_norm": 0.014255838468670845, |
|
"learning_rate": 3.4056135837619077e-06, |
|
"loss": -0.0111, |
|
"reward": 1.0281523205339909, |
|
"reward_std": 0.06774875608971342, |
|
"rewards/accuracy_reward": 0.5835459064692259, |
|
"rewards/semantic_entropy_math_reward": 0.4446064066141844, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 403.4055976867676, |
|
"epoch": 0.6254071661237784, |
|
"grad_norm": 0.013257958926260471, |
|
"learning_rate": 3.3646601834128924e-06, |
|
"loss": -0.0138, |
|
"reward": 0.9554482288658619, |
|
"reward_std": 0.09038078342564404, |
|
"rewards/accuracy_reward": 0.5414540711790323, |
|
"rewards/semantic_entropy_math_reward": 0.41399415768682957, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 422.57716178894043, |
|
"epoch": 0.6280130293159609, |
|
"grad_norm": 0.009931429289281368, |
|
"learning_rate": 3.3238292705708675e-06, |
|
"loss": -0.0042, |
|
"reward": 0.9145408011972904, |
|
"reward_std": 0.08421986317262053, |
|
"rewards/accuracy_reward": 0.4961734600365162, |
|
"rewards/semantic_entropy_math_reward": 0.4183673318475485, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 385.666446685791, |
|
"epoch": 0.6306188925081433, |
|
"grad_norm": 0.020119503140449524, |
|
"learning_rate": 3.2831239034850593e-06, |
|
"loss": -0.0077, |
|
"reward": 0.960094753652811, |
|
"reward_std": 0.10691097256494686, |
|
"rewards/accuracy_reward": 0.5389030482620001, |
|
"rewards/semantic_entropy_math_reward": 0.4211916830390692, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 401.9961643218994, |
|
"epoch": 0.6332247557003258, |
|
"grad_norm": 0.015258848667144775, |
|
"learning_rate": 3.2425471310012645e-06, |
|
"loss": -0.0073, |
|
"reward": 0.921100564301014, |
|
"reward_std": 0.09436794393695891, |
|
"rewards/accuracy_reward": 0.5204081535339355, |
|
"rewards/semantic_entropy_math_reward": 0.40069241262972355, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 425.9100704193115, |
|
"epoch": 0.6358306188925081, |
|
"grad_norm": 0.009504982270300388, |
|
"learning_rate": 3.2021019923335093e-06, |
|
"loss": -0.0087, |
|
"reward": 0.8651603311300278, |
|
"reward_std": 0.05741609656251967, |
|
"rewards/accuracy_reward": 0.4534438643604517, |
|
"rewards/semantic_entropy_math_reward": 0.4117164649069309, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 391.7308597564697, |
|
"epoch": 0.6384364820846905, |
|
"grad_norm": 0.008338342420756817, |
|
"learning_rate": 3.1617915168363994e-06, |
|
"loss": -0.0048, |
|
"reward": 0.9585458934307098, |
|
"reward_std": 0.09049107087776065, |
|
"rewards/accuracy_reward": 0.5363520346581936, |
|
"rewards/semantic_entropy_math_reward": 0.42219386994838715, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 426.6033020019531, |
|
"epoch": 0.641042345276873, |
|
"grad_norm": 0.018495287746191025, |
|
"learning_rate": 3.121618723778225e-06, |
|
"loss": -0.0197, |
|
"reward": 0.9272047840058804, |
|
"reward_std": 0.09722855128347874, |
|
"rewards/accuracy_reward": 0.5248724352568388, |
|
"rewards/semantic_entropy_math_reward": 0.4023323468863964, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 403.5931053161621, |
|
"epoch": 0.6436482084690553, |
|
"grad_norm": 0.011493879370391369, |
|
"learning_rate": 3.081586622114809e-06, |
|
"loss": -0.0171, |
|
"reward": 0.9197339601814747, |
|
"reward_std": 0.09083502274006605, |
|
"rewards/accuracy_reward": 0.5076530463993549, |
|
"rewards/semantic_entropy_math_reward": 0.41208089888095856, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 390.2971897125244, |
|
"epoch": 0.6462540716612378, |
|
"grad_norm": 0.012770140543580055, |
|
"learning_rate": 3.041698210264149e-06, |
|
"loss": -0.0035, |
|
"reward": 0.9225582852959633, |
|
"reward_std": 0.10072695807320997, |
|
"rewards/accuracy_reward": 0.5133928395807743, |
|
"rewards/semantic_entropy_math_reward": 0.40916544012725353, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 386.1511402130127, |
|
"epoch": 0.6488599348534202, |
|
"grad_norm": 0.011112012900412083, |
|
"learning_rate": 3.001956475881822e-06, |
|
"loss": -0.0101, |
|
"reward": 0.9212827943265438, |
|
"reward_std": 0.11616118880920112, |
|
"rewards/accuracy_reward": 0.5019132569432259, |
|
"rewards/semantic_entropy_math_reward": 0.4193695206195116, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 416.366060256958, |
|
"epoch": 0.6514657980456026, |
|
"grad_norm": 0.013438215479254723, |
|
"learning_rate": 2.962364395637216e-06, |
|
"loss": -0.0105, |
|
"reward": 0.938411071896553, |
|
"reward_std": 0.09871607949025929, |
|
"rewards/accuracy_reward": 0.5344387628138065, |
|
"rewards/semantic_entropy_math_reward": 0.4039722979068756, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 411.18238830566406, |
|
"epoch": 0.654071661237785, |
|
"grad_norm": 0.014477844350039959, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": -0.0185, |
|
"reward": 0.9585458822548389, |
|
"reward_std": 0.07620244607096538, |
|
"rewards/accuracy_reward": 0.5350765157490969, |
|
"rewards/semantic_entropy_math_reward": 0.4234693832695484, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 429.57652282714844, |
|
"epoch": 0.6566775244299674, |
|
"grad_norm": 0.009758449159562588, |
|
"learning_rate": 2.8836410479708625e-06, |
|
"loss": -0.0066, |
|
"reward": 0.9825072661042213, |
|
"reward_std": 0.0784962668112712, |
|
"rewards/accuracy_reward": 0.5567601975053549, |
|
"rewards/semantic_entropy_math_reward": 0.4257470816373825, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 410.80356216430664, |
|
"epoch": 0.6592833876221499, |
|
"grad_norm": 0.011529424227774143, |
|
"learning_rate": 2.84451567695456e-06, |
|
"loss": -0.0155, |
|
"reward": 0.9254737384617329, |
|
"reward_std": 0.08181263762526214, |
|
"rewards/accuracy_reward": 0.5159438662230968, |
|
"rewards/semantic_entropy_math_reward": 0.409529872238636, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 418.6396598815918, |
|
"epoch": 0.6618892508143323, |
|
"grad_norm": 0.012521790340542793, |
|
"learning_rate": 2.805551752445222e-06, |
|
"loss": -0.0072, |
|
"reward": 0.9676566906273365, |
|
"reward_std": 0.07710790610872209, |
|
"rewards/accuracy_reward": 0.5484693758189678, |
|
"rewards/semantic_entropy_math_reward": 0.41918731294572353, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 418.7059841156006, |
|
"epoch": 0.6644951140065146, |
|
"grad_norm": 0.010554789565503597, |
|
"learning_rate": 2.766752192854012e-06, |
|
"loss": -0.0079, |
|
"reward": 0.9526238888502121, |
|
"reward_std": 0.10265779460314661, |
|
"rewards/accuracy_reward": 0.5369897820055485, |
|
"rewards/semantic_entropy_math_reward": 0.41563410870730877, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 408.3431034088135, |
|
"epoch": 0.6671009771986971, |
|
"grad_norm": 0.009969337843358517, |
|
"learning_rate": 2.728119904281105e-06, |
|
"loss": -0.0084, |
|
"reward": 0.986880462616682, |
|
"reward_std": 0.09043557988479733, |
|
"rewards/accuracy_reward": 0.5682397838681936, |
|
"rewards/semantic_entropy_math_reward": 0.41864065267145634, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 420.5044631958008, |
|
"epoch": 0.6697068403908795, |
|
"grad_norm": 0.011664465069770813, |
|
"learning_rate": 2.6896577802980184e-06, |
|
"loss": -0.0099, |
|
"reward": 0.9866982325911522, |
|
"reward_std": 0.10724300192669034, |
|
"rewards/accuracy_reward": 0.5752550922334194, |
|
"rewards/semantic_entropy_math_reward": 0.4114431384950876, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 453.64093589782715, |
|
"epoch": 0.6723127035830619, |
|
"grad_norm": 0.00969005562365055, |
|
"learning_rate": 2.651368701730889e-06, |
|
"loss": -0.0053, |
|
"reward": 0.9547193720936775, |
|
"reward_std": 0.10146705091756303, |
|
"rewards/accuracy_reward": 0.5331632532179356, |
|
"rewards/semantic_entropy_math_reward": 0.4215561244636774, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 482.51912689208984, |
|
"epoch": 0.6749185667752443, |
|
"grad_norm": 0.009962331503629684, |
|
"learning_rate": 2.6132555364446856e-06, |
|
"loss": -0.0089, |
|
"reward": 0.8847485221922398, |
|
"reward_std": 0.10891495714895427, |
|
"rewards/accuracy_reward": 0.4904336640611291, |
|
"rewards/semantic_entropy_math_reward": 0.3943148609250784, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 472.77039527893066, |
|
"epoch": 0.6775244299674267, |
|
"grad_norm": 0.008120470680296421, |
|
"learning_rate": 2.5753211391284172e-06, |
|
"loss": -0.0056, |
|
"reward": 0.9668367207050323, |
|
"reward_std": 0.08560021687299013, |
|
"rewards/accuracy_reward": 0.5433673411607742, |
|
"rewards/semantic_entropy_math_reward": 0.42346938140690327, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 455.7142734527588, |
|
"epoch": 0.6801302931596092, |
|
"grad_norm": 0.008612104691565037, |
|
"learning_rate": 2.537568351081311e-06, |
|
"loss": -0.0055, |
|
"reward": 0.9637390524148941, |
|
"reward_std": 0.08404424658510834, |
|
"rewards/accuracy_reward": 0.5580357015132904, |
|
"rewards/semantic_entropy_math_reward": 0.4057033322751522, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 452.24807357788086, |
|
"epoch": 0.6827361563517915, |
|
"grad_norm": 0.007962207309901714, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": -0.0213, |
|
"reward": 1.033163245767355, |
|
"reward_std": 0.08445578732062131, |
|
"rewards/accuracy_reward": 0.6065050885081291, |
|
"rewards/semantic_entropy_math_reward": 0.4266581479460001, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 445.8513927459717, |
|
"epoch": 0.6853420195439739, |
|
"grad_norm": 0.00862111710011959, |
|
"learning_rate": 2.4626188997667224e-06, |
|
"loss": -0.0107, |
|
"reward": 0.9684766530990601, |
|
"reward_std": 0.106060303398408, |
|
"rewards/accuracy_reward": 0.5497448798269033, |
|
"rewards/semantic_entropy_math_reward": 0.41873176395893097, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 438.24169731140137, |
|
"epoch": 0.6879478827361564, |
|
"grad_norm": 0.007783900015056133, |
|
"learning_rate": 2.425427850238565e-06, |
|
"loss": -0.0095, |
|
"reward": 0.9946246296167374, |
|
"reward_std": 0.09433889319188893, |
|
"rewards/accuracy_reward": 0.5784438736736774, |
|
"rewards/semantic_entropy_math_reward": 0.4161807559430599, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 446.2295837402344, |
|
"epoch": 0.6905537459283387, |
|
"grad_norm": 0.008394320495426655, |
|
"learning_rate": 2.388429637037753e-06, |
|
"loss": 0.0002, |
|
"reward": 1.0675109289586544, |
|
"reward_std": 0.08982915757223964, |
|
"rewards/accuracy_reward": 0.6301020346581936, |
|
"rewards/semantic_entropy_math_reward": 0.43740889988839626, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 420.28698539733887, |
|
"epoch": 0.6931596091205212, |
|
"grad_norm": 0.011627813801169395, |
|
"learning_rate": 2.3516270313430085e-06, |
|
"loss": -0.0075, |
|
"reward": 0.9636479578912258, |
|
"reward_std": 0.09795083408243954, |
|
"rewards/accuracy_reward": 0.5420918222516775, |
|
"rewards/semantic_entropy_math_reward": 0.4215561132878065, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 504.16006660461426, |
|
"epoch": 0.6957654723127036, |
|
"grad_norm": 0.013352944515645504, |
|
"learning_rate": 2.3150227896819782e-06, |
|
"loss": 0.0002, |
|
"reward": 0.9232871569693089, |
|
"reward_std": 0.11924422497395426, |
|
"rewards/accuracy_reward": 0.5172193739563227, |
|
"rewards/semantic_entropy_math_reward": 0.4060677755624056, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 440.1288185119629, |
|
"epoch": 0.698371335504886, |
|
"grad_norm": 0.008399576880037785, |
|
"learning_rate": 2.278619653724781e-06, |
|
"loss": -0.005, |
|
"reward": 0.9916180558502674, |
|
"reward_std": 0.0687884486396797, |
|
"rewards/accuracy_reward": 0.5637754965573549, |
|
"rewards/semantic_entropy_math_reward": 0.4278425555676222, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 496.5880012512207, |
|
"epoch": 0.7009771986970684, |
|
"grad_norm": 0.01074276678264141, |
|
"learning_rate": 2.2424203500786473e-06, |
|
"loss": -0.0021, |
|
"reward": 0.9390488304197788, |
|
"reward_std": 0.10012663877569139, |
|
"rewards/accuracy_reward": 0.5357142742723227, |
|
"rewards/semantic_entropy_math_reward": 0.403334541246295, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 462.8348159790039, |
|
"epoch": 0.7035830618892508, |
|
"grad_norm": 0.012036530300974846, |
|
"learning_rate": 2.206427590083703e-06, |
|
"loss": -0.0068, |
|
"reward": 0.9727587252855301, |
|
"reward_std": 0.07587114511989057, |
|
"rewards/accuracy_reward": 0.5516581516712904, |
|
"rewards/semantic_entropy_math_reward": 0.42110058292746544, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 462.12371253967285, |
|
"epoch": 0.7061889250814333, |
|
"grad_norm": 0.00853002816438675, |
|
"learning_rate": 2.170644069609876e-06, |
|
"loss": -0.0111, |
|
"reward": 0.9571792744100094, |
|
"reward_std": 0.09096840798156336, |
|
"rewards/accuracy_reward": 0.5503826402127743, |
|
"rewards/semantic_entropy_math_reward": 0.4067966416478157, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 428.95662117004395, |
|
"epoch": 0.7087947882736156, |
|
"grad_norm": 0.009711945429444313, |
|
"learning_rate": 2.1350724688549906e-06, |
|
"loss": -0.0152, |
|
"reward": 0.9361333549022675, |
|
"reward_std": 0.08675176231190562, |
|
"rewards/accuracy_reward": 0.515306118875742, |
|
"rewards/semantic_entropy_math_reward": 0.4208272397518158, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 462.191312789917, |
|
"epoch": 0.711400651465798, |
|
"grad_norm": 0.013711754232645035, |
|
"learning_rate": 2.09971545214401e-06, |
|
"loss": -0.0046, |
|
"reward": 0.9466107748448849, |
|
"reward_std": 0.09364626772003248, |
|
"rewards/accuracy_reward": 0.5338010117411613, |
|
"rewards/semantic_entropy_math_reward": 0.4128097593784332, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 483.0580234527588, |
|
"epoch": 0.7140065146579805, |
|
"grad_norm": 0.008413317613303661, |
|
"learning_rate": 2.0645756677294788e-06, |
|
"loss": -0.0117, |
|
"reward": 0.9661078602075577, |
|
"reward_std": 0.07902665832079947, |
|
"rewards/accuracy_reward": 0.5414540749043226, |
|
"rewards/semantic_entropy_math_reward": 0.4246537797152996, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 485.255090713501, |
|
"epoch": 0.7166123778501629, |
|
"grad_norm": 0.01059303991496563, |
|
"learning_rate": 2.029655747593169e-06, |
|
"loss": -0.0064, |
|
"reward": 0.9963556602597237, |
|
"reward_std": 0.08435712786740623, |
|
"rewards/accuracy_reward": 0.5797193804755807, |
|
"rewards/semantic_entropy_math_reward": 0.41663630679249763, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 524.714916229248, |
|
"epoch": 0.7192182410423453, |
|
"grad_norm": 0.010354903526604176, |
|
"learning_rate": 1.9949583072489455e-06, |
|
"loss": -0.0095, |
|
"reward": 0.9393221400678158, |
|
"reward_std": 0.10403139912523329, |
|
"rewards/accuracy_reward": 0.536352027207613, |
|
"rewards/semantic_entropy_math_reward": 0.4029701128602028, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 493.7672061920166, |
|
"epoch": 0.7218241042345277, |
|
"grad_norm": 0.009775525890290737, |
|
"learning_rate": 1.9604859455468587e-06, |
|
"loss": -0.0051, |
|
"reward": 0.9140852615237236, |
|
"reward_std": 0.07866077253129333, |
|
"rewards/accuracy_reward": 0.48405610769987106, |
|
"rewards/semantic_entropy_math_reward": 0.4300291370600462, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 486.60011863708496, |
|
"epoch": 0.7244299674267101, |
|
"grad_norm": 0.010633684694766998, |
|
"learning_rate": 1.926241244478496e-06, |
|
"loss": -0.0114, |
|
"reward": 0.9836916588246822, |
|
"reward_std": 0.13731968333013356, |
|
"rewards/accuracy_reward": 0.5714285597205162, |
|
"rewards/semantic_entropy_math_reward": 0.4122631140053272, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 490.8520317077637, |
|
"epoch": 0.7270358306188925, |
|
"grad_norm": 0.01332115475088358, |
|
"learning_rate": 1.8922267689835806e-06, |
|
"loss": -0.0069, |
|
"reward": 0.9645590335130692, |
|
"reward_std": 0.08677118818741292, |
|
"rewards/accuracy_reward": 0.5395408049225807, |
|
"rewards/semantic_entropy_math_reward": 0.42501820996403694, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 510.359037399292, |
|
"epoch": 0.7296416938110749, |
|
"grad_norm": 0.010328337550163269, |
|
"learning_rate": 1.8584450667578656e-06, |
|
"loss": -0.0053, |
|
"reward": 0.9415087439119816, |
|
"reward_std": 0.1030702197458595, |
|
"rewards/accuracy_reward": 0.5414540711790323, |
|
"rewards/semantic_entropy_math_reward": 0.4000546522438526, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 488.9508819580078, |
|
"epoch": 0.7322475570032573, |
|
"grad_norm": 0.009401313029229641, |
|
"learning_rate": 1.8248986680623077e-06, |
|
"loss": -0.0144, |
|
"reward": 1.0168549418449402, |
|
"reward_std": 0.08474622888024896, |
|
"rewards/accuracy_reward": 0.5988520290702581, |
|
"rewards/semantic_entropy_math_reward": 0.41800292022526264, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 461.94578552246094, |
|
"epoch": 0.7348534201954398, |
|
"grad_norm": 0.01206615474075079, |
|
"learning_rate": 1.7915900855335506e-06, |
|
"loss": -0.0079, |
|
"reward": 1.0083818957209587, |
|
"reward_std": 0.06797894753981382, |
|
"rewards/accuracy_reward": 0.5841836612671614, |
|
"rewards/semantic_entropy_math_reward": 0.42419823817908764, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 493.0318717956543, |
|
"epoch": 0.7374592833876221, |
|
"grad_norm": 0.009084763005375862, |
|
"learning_rate": 1.7585218139957205e-06, |
|
"loss": -0.0093, |
|
"reward": 1.0141217038035393, |
|
"reward_std": 0.10102251661010087, |
|
"rewards/accuracy_reward": 0.5899234563112259, |
|
"rewards/semantic_entropy_math_reward": 0.42419824562966824, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 480.63264083862305, |
|
"epoch": 0.7400651465798046, |
|
"grad_norm": 0.01057640090584755, |
|
"learning_rate": 1.7256963302735752e-06, |
|
"loss": -0.0137, |
|
"reward": 0.9901603423058987, |
|
"reward_std": 0.11432356107980013, |
|
"rewards/accuracy_reward": 0.5790816191583872, |
|
"rewards/semantic_entropy_math_reward": 0.4110787082463503, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 534.7289409637451, |
|
"epoch": 0.742671009771987, |
|
"grad_norm": 0.011105289682745934, |
|
"learning_rate": 1.6931160930069789e-06, |
|
"loss": -0.0098, |
|
"reward": 0.9258381761610508, |
|
"reward_std": 0.0985641247825697, |
|
"rewards/accuracy_reward": 0.5197703987360001, |
|
"rewards/semantic_entropy_math_reward": 0.4060677830129862, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 492.93940353393555, |
|
"epoch": 0.7452768729641693, |
|
"grad_norm": 0.010538027621805668, |
|
"learning_rate": 1.6607835424667578e-06, |
|
"loss": -0.0052, |
|
"reward": 0.9999999739229679, |
|
"reward_std": 0.0725285234657349, |
|
"rewards/accuracy_reward": 0.5707908049225807, |
|
"rewards/semantic_entropy_math_reward": 0.4292091801762581, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 479.8284320831299, |
|
"epoch": 0.7478827361563518, |
|
"grad_norm": 0.012495830655097961, |
|
"learning_rate": 1.6287011003719105e-06, |
|
"loss": -0.0044, |
|
"reward": 0.9080721437931061, |
|
"reward_std": 0.08464997098781168, |
|
"rewards/accuracy_reward": 0.498724477365613, |
|
"rewards/semantic_entropy_math_reward": 0.40934765711426735, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 479.3641548156738, |
|
"epoch": 0.7504885993485342, |
|
"grad_norm": 0.014318572357296944, |
|
"learning_rate": 1.596871169708235e-06, |
|
"loss": -0.0114, |
|
"reward": 0.9038812033832073, |
|
"reward_std": 0.08901800989406183, |
|
"rewards/accuracy_reward": 0.4942601975053549, |
|
"rewards/semantic_entropy_math_reward": 0.4096209779381752, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 478.834171295166, |
|
"epoch": 0.7530944625407167, |
|
"grad_norm": 0.011369728483259678, |
|
"learning_rate": 1.5652961345483353e-06, |
|
"loss": -0.0118, |
|
"reward": 0.9845116622745991, |
|
"reward_std": 0.10669447341933846, |
|
"rewards/accuracy_reward": 0.5752550875768065, |
|
"rewards/semantic_entropy_math_reward": 0.4092565383762121, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 515.2219257354736, |
|
"epoch": 0.755700325732899, |
|
"grad_norm": 0.01009457278996706, |
|
"learning_rate": 1.5339783598730568e-06, |
|
"loss": -0.0119, |
|
"reward": 0.933946780860424, |
|
"reward_std": 0.09003342734649777, |
|
"rewards/accuracy_reward": 0.5280612148344517, |
|
"rewards/semantic_entropy_math_reward": 0.4058855511248112, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 524.5248641967773, |
|
"epoch": 0.7583061889250814, |
|
"grad_norm": 0.009736046195030212, |
|
"learning_rate": 1.5029201913943425e-06, |
|
"loss": -0.0148, |
|
"reward": 0.9850582927465439, |
|
"reward_std": 0.10781788837630302, |
|
"rewards/accuracy_reward": 0.5733418259769678, |
|
"rewards/semantic_entropy_math_reward": 0.4117164667695761, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 512.2193756103516, |
|
"epoch": 0.7609120521172639, |
|
"grad_norm": 0.007600969634950161, |
|
"learning_rate": 1.4721239553795485e-06, |
|
"loss": -0.0079, |
|
"reward": 0.9823250733315945, |
|
"reward_std": 0.09170323330909014, |
|
"rewards/accuracy_reward": 0.5580357071012259, |
|
"rewards/semantic_entropy_math_reward": 0.4242893476039171, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 450.7780475616455, |
|
"epoch": 0.7635179153094462, |
|
"grad_norm": 0.00899919867515564, |
|
"learning_rate": 1.4415919584771999e-06, |
|
"loss": -0.0121, |
|
"reward": 1.015852764248848, |
|
"reward_std": 0.10539426596369594, |
|
"rewards/accuracy_reward": 0.5943877436220646, |
|
"rewards/semantic_entropy_math_reward": 0.42146499268710613, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 484.11031913757324, |
|
"epoch": 0.7661237785016287, |
|
"grad_norm": 0.012872518040239811, |
|
"learning_rate": 1.4113264875442201e-06, |
|
"loss": -0.0119, |
|
"reward": 0.949526209384203, |
|
"reward_std": 0.08500090334564447, |
|
"rewards/accuracy_reward": 0.5242346823215485, |
|
"rewards/semantic_entropy_math_reward": 0.4252915382385254, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 507.8871078491211, |
|
"epoch": 0.7687296416938111, |
|
"grad_norm": 0.01095646247267723, |
|
"learning_rate": 1.3813298094746491e-06, |
|
"loss": -0.0146, |
|
"reward": 0.9959912374615669, |
|
"reward_std": 0.09144581796135753, |
|
"rewards/accuracy_reward": 0.5682397857308388, |
|
"rewards/semantic_entropy_math_reward": 0.4277514461427927, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 492.8201427459717, |
|
"epoch": 0.7713355048859935, |
|
"grad_norm": 0.007233647629618645, |
|
"learning_rate": 1.35160417102985e-06, |
|
"loss": -0.0137, |
|
"reward": 0.96993438154459, |
|
"reward_std": 0.0985939132515341, |
|
"rewards/accuracy_reward": 0.5580357015132904, |
|
"rewards/semantic_entropy_math_reward": 0.41189867816865444, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 479.56950759887695, |
|
"epoch": 0.7739413680781759, |
|
"grad_norm": 0.010265953838825226, |
|
"learning_rate": 1.3221517986702249e-06, |
|
"loss": -0.0049, |
|
"reward": 0.9783162996172905, |
|
"reward_std": 0.0974840281996876, |
|
"rewards/accuracy_reward": 0.5605867244303226, |
|
"rewards/semantic_entropy_math_reward": 0.41772958263754845, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 514.214277267456, |
|
"epoch": 0.7765472312703583, |
|
"grad_norm": 0.006785084027796984, |
|
"learning_rate": 1.292974898388456e-06, |
|
"loss": -0.015, |
|
"reward": 0.9332179203629494, |
|
"reward_std": 0.08228117297403514, |
|
"rewards/accuracy_reward": 0.5102040730416775, |
|
"rewards/semantic_entropy_math_reward": 0.423013836145401, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 485.87435150146484, |
|
"epoch": 0.7791530944625407, |
|
"grad_norm": 0.016027240082621574, |
|
"learning_rate": 1.2640756555442684e-06, |
|
"loss": -0.0101, |
|
"reward": 0.8885750621557236, |
|
"reward_std": 0.07733660298981704, |
|
"rewards/accuracy_reward": 0.47895407397300005, |
|
"rewards/semantic_entropy_math_reward": 0.40962098725140095, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 505.1874885559082, |
|
"epoch": 0.7817589576547231, |
|
"grad_norm": 0.009378643706440926, |
|
"learning_rate": 1.235456234700756e-06, |
|
"loss": -0.0063, |
|
"reward": 0.9045189283788204, |
|
"reward_std": 0.07878015120513737, |
|
"rewards/accuracy_reward": 0.5012754946947098, |
|
"rewards/semantic_entropy_math_reward": 0.4032434318214655, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 475.78697967529297, |
|
"epoch": 0.7843648208469055, |
|
"grad_norm": 0.008114474825561047, |
|
"learning_rate": 1.207118779462248e-06, |
|
"loss": -0.0127, |
|
"reward": 1.0125728696584702, |
|
"reward_std": 0.08841744478559121, |
|
"rewards/accuracy_reward": 0.5873724296689034, |
|
"rewards/semantic_entropy_math_reward": 0.42520042695105076, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 517.571418762207, |
|
"epoch": 0.786970684039088, |
|
"grad_norm": 0.0094959931448102, |
|
"learning_rate": 1.1790654123137552e-06, |
|
"loss": -0.0013, |
|
"reward": 0.9774963334202766, |
|
"reward_std": 0.08901865262305364, |
|
"rewards/accuracy_reward": 0.5561224445700645, |
|
"rewards/semantic_entropy_math_reward": 0.4213739037513733, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 443.41580295562744, |
|
"epoch": 0.7895765472312704, |
|
"grad_norm": 0.00883433036506176, |
|
"learning_rate": 1.1512982344619904e-06, |
|
"loss": -0.0046, |
|
"reward": 1.045098390430212, |
|
"reward_std": 0.06228679193009157, |
|
"rewards/accuracy_reward": 0.593749986961484, |
|
"rewards/semantic_entropy_math_reward": 0.4513483848422766, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 522.1760082244873, |
|
"epoch": 0.7921824104234527, |
|
"grad_norm": 0.008245239965617657, |
|
"learning_rate": 1.1238193256779955e-06, |
|
"loss": -0.0149, |
|
"reward": 0.9394132420420647, |
|
"reward_std": 0.09620352147612721, |
|
"rewards/accuracy_reward": 0.5293367244303226, |
|
"rewards/semantic_entropy_math_reward": 0.410076517611742, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 448.2991008758545, |
|
"epoch": 0.7947882736156352, |
|
"grad_norm": 0.012398505583405495, |
|
"learning_rate": 1.0966307441413598e-06, |
|
"loss": -0.0058, |
|
"reward": 0.9621902219951153, |
|
"reward_std": 0.08929981710389256, |
|
"rewards/accuracy_reward": 0.5338010061532259, |
|
"rewards/semantic_entropy_math_reward": 0.4283892009407282, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 491.30419731140137, |
|
"epoch": 0.7973941368078176, |
|
"grad_norm": 0.009740319103002548, |
|
"learning_rate": 1.0697345262860638e-06, |
|
"loss": -0.0068, |
|
"reward": 0.9690233021974564, |
|
"reward_std": 0.08115439751418307, |
|
"rewards/accuracy_reward": 0.5459183566272259, |
|
"rewards/semantic_entropy_math_reward": 0.4231049530208111, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 497.92601203918457, |
|
"epoch": 0.8, |
|
"grad_norm": 0.012608438730239868, |
|
"learning_rate": 1.0431326866479457e-06, |
|
"loss": -0.0167, |
|
"reward": 0.8842018768191338, |
|
"reward_std": 0.11789050558581948, |
|
"rewards/accuracy_reward": 0.48724489100277424, |
|
"rewards/semantic_entropy_math_reward": 0.39695698767900467, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 488.2034339904785, |
|
"epoch": 0.8026058631921824, |
|
"grad_norm": 0.008150148205459118, |
|
"learning_rate": 1.01682721771382e-06, |
|
"loss": -0.0213, |
|
"reward": 1.0246902257204056, |
|
"reward_std": 0.07845566415926442, |
|
"rewards/accuracy_reward": 0.5778061132878065, |
|
"rewards/semantic_entropy_math_reward": 0.446884099394083, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 501.41644287109375, |
|
"epoch": 0.8052117263843648, |
|
"grad_norm": 0.009125792421400547, |
|
"learning_rate": 9.908200897722332e-07, |
|
"loss": -0.0093, |
|
"reward": 0.9240160211920738, |
|
"reward_std": 0.09154359906096943, |
|
"rewards/accuracy_reward": 0.5127550903707743, |
|
"rewards/semantic_entropy_math_reward": 0.41126092337071896, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 485.86797523498535, |
|
"epoch": 0.8078175895765473, |
|
"grad_norm": 0.013204295188188553, |
|
"learning_rate": 9.6511325076589e-07, |
|
"loss": -0.0129, |
|
"reward": 0.9015123657882214, |
|
"reward_std": 0.08482572343200445, |
|
"rewards/accuracy_reward": 0.4878826420754194, |
|
"rewards/semantic_entropy_math_reward": 0.4136297255754471, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 472.4763946533203, |
|
"epoch": 0.8104234527687296, |
|
"grad_norm": 0.009827625937759876, |
|
"learning_rate": 9.397086261457511e-07, |
|
"loss": -0.0147, |
|
"reward": 0.9812317602336407, |
|
"reward_std": 0.11015114816837013, |
|
"rewards/accuracy_reward": 0.5656887628138065, |
|
"rewards/semantic_entropy_math_reward": 0.41554298996925354, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 487.07525062561035, |
|
"epoch": 0.8130293159609121, |
|
"grad_norm": 0.014018489047884941, |
|
"learning_rate": 9.146081187268185e-07, |
|
"loss": -0.0131, |
|
"reward": 0.9449708163738251, |
|
"reward_std": 0.10275353176984936, |
|
"rewards/accuracy_reward": 0.5459183566272259, |
|
"rewards/semantic_entropy_math_reward": 0.39905246533453465, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 478.88073348999023, |
|
"epoch": 0.8156351791530945, |
|
"grad_norm": 0.008230605162680149, |
|
"learning_rate": 8.898136085456127e-07, |
|
"loss": -0.0052, |
|
"reward": 0.9649234525859356, |
|
"reward_std": 0.08008495910326019, |
|
"rewards/accuracy_reward": 0.5459183566272259, |
|
"rewards/semantic_entropy_math_reward": 0.41900509409606457, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 477.53187561035156, |
|
"epoch": 0.8182410423452768, |
|
"grad_norm": 0.011597930453717709, |
|
"learning_rate": 8.65326952719357e-07, |
|
"loss": -0.0081, |
|
"reward": 1.0210458859801292, |
|
"reward_std": 0.08366672950796783, |
|
"rewards/accuracy_reward": 0.5816326439380646, |
|
"rewards/semantic_entropy_math_reward": 0.43941325321793556, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 481.01720428466797, |
|
"epoch": 0.8208469055374593, |
|
"grad_norm": 0.01356886513531208, |
|
"learning_rate": 8.411499853068783e-07, |
|
"loss": -0.0092, |
|
"reward": 0.9314868524670601, |
|
"reward_std": 0.09904140722937882, |
|
"rewards/accuracy_reward": 0.5121173355728388, |
|
"rewards/semantic_entropy_math_reward": 0.4193695206195116, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 447.973840713501, |
|
"epoch": 0.8234527687296417, |
|
"grad_norm": 0.0068409559316933155, |
|
"learning_rate": 8.172845171712379e-07, |
|
"loss": -0.0132, |
|
"reward": 1.0037354119122028, |
|
"reward_std": 0.07698465495195705, |
|
"rewards/accuracy_reward": 0.5637755002826452, |
|
"rewards/semantic_entropy_math_reward": 0.4399599079042673, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 483.59756660461426, |
|
"epoch": 0.826058631921824, |
|
"grad_norm": 0.011241559870541096, |
|
"learning_rate": 7.937323358440935e-07, |
|
"loss": -0.017, |
|
"reward": 0.9950801581144333, |
|
"reward_std": 0.08729210350429639, |
|
"rewards/accuracy_reward": 0.5637754984200001, |
|
"rewards/semantic_entropy_math_reward": 0.4313046485185623, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 467.542724609375, |
|
"epoch": 0.8286644951140065, |
|
"grad_norm": 0.014086179435253143, |
|
"learning_rate": 7.70495205391818e-07, |
|
"loss": -0.014, |
|
"reward": 0.9365889057517052, |
|
"reward_std": 0.08978691761149094, |
|
"rewards/accuracy_reward": 0.5280612111091614, |
|
"rewards/semantic_entropy_math_reward": 0.4085276983678341, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 469.43749046325684, |
|
"epoch": 0.8312703583061889, |
|
"grad_norm": 0.009214820340275764, |
|
"learning_rate": 7.475748662833615e-07, |
|
"loss": -0.0077, |
|
"reward": 0.9763119295239449, |
|
"reward_std": 0.09077268361579627, |
|
"rewards/accuracy_reward": 0.5542091699317098, |
|
"rewards/semantic_entropy_math_reward": 0.42210277169942856, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 465.4904251098633, |
|
"epoch": 0.8338762214983714, |
|
"grad_norm": 0.007901620119810104, |
|
"learning_rate": 7.249730352599e-07, |
|
"loss": -0.0133, |
|
"reward": 0.9463374502956867, |
|
"reward_std": 0.08334645116701722, |
|
"rewards/accuracy_reward": 0.5312499832361937, |
|
"rewards/semantic_entropy_math_reward": 0.4150874614715576, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 461.4591751098633, |
|
"epoch": 0.8364820846905537, |
|
"grad_norm": 0.012061836197972298, |
|
"learning_rate": 7.026914052062433e-07, |
|
"loss": -0.006, |
|
"reward": 1.0148505568504333, |
|
"reward_std": 0.08093352918513119, |
|
"rewards/accuracy_reward": 0.5816326476633549, |
|
"rewards/semantic_entropy_math_reward": 0.4332179147750139, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 425.9540729522705, |
|
"epoch": 0.8390879478827361, |
|
"grad_norm": 0.010147208347916603, |
|
"learning_rate": 6.807316450240425e-07, |
|
"loss": -0.008, |
|
"reward": 1.0222302973270416, |
|
"reward_std": 0.055277756706345826, |
|
"rewards/accuracy_reward": 0.5918367300182581, |
|
"rewards/semantic_entropy_math_reward": 0.430393585935235, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 456.4866008758545, |
|
"epoch": 0.8416938110749186, |
|
"grad_norm": 0.0100060123950243, |
|
"learning_rate": 6.590953995067812e-07, |
|
"loss": -0.013, |
|
"reward": 0.9208272397518158, |
|
"reward_std": 0.07600355852628127, |
|
"rewards/accuracy_reward": 0.49808672070503235, |
|
"rewards/semantic_entropy_math_reward": 0.42274050787091255, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 449.3877468109131, |
|
"epoch": 0.844299674267101, |
|
"grad_norm": 0.011219106614589691, |
|
"learning_rate": 6.377842892165892e-07, |
|
"loss": -0.0098, |
|
"reward": 0.9591836631298065, |
|
"reward_std": 0.11074057850055397, |
|
"rewards/accuracy_reward": 0.5427295789122581, |
|
"rewards/semantic_entropy_math_reward": 0.4164540767669678, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 476.1007499694824, |
|
"epoch": 0.8469055374592834, |
|
"grad_norm": 0.02076311595737934, |
|
"learning_rate": 6.167999103628569e-07, |
|
"loss": -0.0083, |
|
"reward": 0.9911625236272812, |
|
"reward_std": 0.10244781541405246, |
|
"rewards/accuracy_reward": 0.5714285615831614, |
|
"rewards/semantic_entropy_math_reward": 0.419733963906765, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 446.6645317077637, |
|
"epoch": 0.8495114006514658, |
|
"grad_norm": 0.019433218985795975, |
|
"learning_rate": 5.961438346826792e-07, |
|
"loss": -0.0009, |
|
"reward": 0.9219205342233181, |
|
"reward_std": 0.07894981937715784, |
|
"rewards/accuracy_reward": 0.5012755040079355, |
|
"rewards/semantic_entropy_math_reward": 0.4206450283527374, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 459.9164447784424, |
|
"epoch": 0.8521172638436482, |
|
"grad_norm": 0.009447368793189526, |
|
"learning_rate": 5.758176093231294e-07, |
|
"loss": -0.0137, |
|
"reward": 1.0237791389226913, |
|
"reward_std": 0.08699435199378058, |
|
"rewards/accuracy_reward": 0.5956632476300001, |
|
"rewards/semantic_entropy_math_reward": 0.4281158745288849, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 486.7385063171387, |
|
"epoch": 0.8547231270358306, |
|
"grad_norm": 0.012096602469682693, |
|
"learning_rate": 5.558227567253832e-07, |
|
"loss": -0.0081, |
|
"reward": 0.9570881724357605, |
|
"reward_std": 0.12206664809491485, |
|
"rewards/accuracy_reward": 0.531249986961484, |
|
"rewards/semantic_entropy_math_reward": 0.4258381836116314, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 475.43876457214355, |
|
"epoch": 0.857328990228013, |
|
"grad_norm": 0.00933607667684555, |
|
"learning_rate": 5.361607745106817e-07, |
|
"loss": -0.0169, |
|
"reward": 0.9730320535600185, |
|
"reward_std": 0.08804468740709126, |
|
"rewards/accuracy_reward": 0.5567601844668388, |
|
"rewards/semantic_entropy_math_reward": 0.4162718579173088, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 431.8603210449219, |
|
"epoch": 0.8599348534201955, |
|
"grad_norm": 0.02015325427055359, |
|
"learning_rate": 5.168331353681643e-07, |
|
"loss": -0.0021, |
|
"reward": 0.9969023317098618, |
|
"reward_std": 0.09806176437996328, |
|
"rewards/accuracy_reward": 0.5733418297022581, |
|
"rewards/semantic_entropy_math_reward": 0.42356047965586185, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 427.3131294250488, |
|
"epoch": 0.8625407166123779, |
|
"grad_norm": 0.013669871725142002, |
|
"learning_rate": 4.97841286944557e-07, |
|
"loss": -0.0153, |
|
"reward": 1.0245991051197052, |
|
"reward_std": 0.07894148712512106, |
|
"rewards/accuracy_reward": 0.578443868085742, |
|
"rewards/semantic_entropy_math_reward": 0.4461552295833826, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 422.89731216430664, |
|
"epoch": 0.8651465798045602, |
|
"grad_norm": 0.010783183388411999, |
|
"learning_rate": 4.791866517357491e-07, |
|
"loss": -0.0006, |
|
"reward": 1.0032798871397972, |
|
"reward_std": 0.07655263572814874, |
|
"rewards/accuracy_reward": 0.5758928395807743, |
|
"rewards/semantic_entropy_math_reward": 0.42738700844347477, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 433.96299934387207, |
|
"epoch": 0.8677524429967427, |
|
"grad_norm": 0.010686203837394714, |
|
"learning_rate": 4.608706269802471e-07, |
|
"loss": -0.0031, |
|
"reward": 1.0082908011972904, |
|
"reward_std": 0.0961291438434273, |
|
"rewards/accuracy_reward": 0.5848214142024517, |
|
"rewards/semantic_entropy_math_reward": 0.42346937395632267, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 483.29973793029785, |
|
"epoch": 0.8703583061889251, |
|
"grad_norm": 0.013967434875667095, |
|
"learning_rate": 4.428945845545168e-07, |
|
"loss": -0.0167, |
|
"reward": 0.9419642761349678, |
|
"reward_std": 0.08668096817564219, |
|
"rewards/accuracy_reward": 0.5433673374354839, |
|
"rewards/semantic_entropy_math_reward": 0.3985969349741936, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 453.30356216430664, |
|
"epoch": 0.8729641693811075, |
|
"grad_norm": 0.009853254072368145, |
|
"learning_rate": 4.2525987087023433e-07, |
|
"loss": -0.0047, |
|
"reward": 0.9753097556531429, |
|
"reward_std": 0.09639269160106778, |
|
"rewards/accuracy_reward": 0.5452805999666452, |
|
"rewards/semantic_entropy_math_reward": 0.43002915009856224, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 431.1064987182617, |
|
"epoch": 0.8755700325732899, |
|
"grad_norm": 0.009506824426352978, |
|
"learning_rate": 4.0796780677343606e-07, |
|
"loss": -0.0053, |
|
"reward": 0.9678389020264149, |
|
"reward_std": 0.0869278380414471, |
|
"rewards/accuracy_reward": 0.5497448872774839, |
|
"rewards/semantic_entropy_math_reward": 0.41809402219951153, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 432.71044731140137, |
|
"epoch": 0.8781758957654723, |
|
"grad_norm": 0.012929832562804222, |
|
"learning_rate": 3.910196874455896e-07, |
|
"loss": -0.0077, |
|
"reward": 0.9853316098451614, |
|
"reward_std": 0.08627731067826971, |
|
"rewards/accuracy_reward": 0.5535714197903872, |
|
"rewards/semantic_entropy_math_reward": 0.43176020309329033, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 440.03825759887695, |
|
"epoch": 0.8807817589576548, |
|
"grad_norm": 0.01218812633305788, |
|
"learning_rate": 3.744167823065814e-07, |
|
"loss": -0.0122, |
|
"reward": 0.9091654308140278, |
|
"reward_std": 0.09821587102487683, |
|
"rewards/accuracy_reward": 0.4968112148344517, |
|
"rewards/semantic_entropy_math_reward": 0.4123542197048664, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 457.76976108551025, |
|
"epoch": 0.8833876221498371, |
|
"grad_norm": 0.008668128401041031, |
|
"learning_rate": 3.581603349196372e-07, |
|
"loss": -0.0077, |
|
"reward": 0.930757999420166, |
|
"reward_std": 0.07734870351850986, |
|
"rewards/accuracy_reward": 0.5197703987360001, |
|
"rewards/semantic_entropy_math_reward": 0.41098760068416595, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 442.6613426208496, |
|
"epoch": 0.8859934853420195, |
|
"grad_norm": 0.008681760169565678, |
|
"learning_rate": 3.4225156289818096e-07, |
|
"loss": -0.0059, |
|
"reward": 1.0034620836377144, |
|
"reward_std": 0.08158292178995907, |
|
"rewards/accuracy_reward": 0.5816326476633549, |
|
"rewards/semantic_entropy_math_reward": 0.42182943783700466, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 485.2876148223877, |
|
"epoch": 0.888599348534202, |
|
"grad_norm": 0.008547916077077389, |
|
"learning_rate": 3.26691657814634e-07, |
|
"loss": -0.0087, |
|
"reward": 0.9770408011972904, |
|
"reward_std": 0.07393853948451579, |
|
"rewards/accuracy_reward": 0.5433673355728388, |
|
"rewards/semantic_entropy_math_reward": 0.43367345817387104, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 452.9138927459717, |
|
"epoch": 0.8912052117263843, |
|
"grad_norm": 0.01058235950767994, |
|
"learning_rate": 3.1148178511116624e-07, |
|
"loss": -0.0017, |
|
"reward": 1.0015488117933273, |
|
"reward_std": 0.09451375540811568, |
|
"rewards/accuracy_reward": 0.5701530463993549, |
|
"rewards/semantic_entropy_math_reward": 0.4313957579433918, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 476.1976947784424, |
|
"epoch": 0.8938110749185668, |
|
"grad_norm": 0.015603607520461082, |
|
"learning_rate": 2.966230840124007e-07, |
|
"loss": -0.0084, |
|
"reward": 0.9884292893111706, |
|
"reward_std": 0.0848599491873756, |
|
"rewards/accuracy_reward": 0.548469377681613, |
|
"rewards/semantic_entropy_math_reward": 0.4399598930031061, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 474.1007595062256, |
|
"epoch": 0.8964169381107492, |
|
"grad_norm": 0.009830434806644917, |
|
"learning_rate": 2.821166674400905e-07, |
|
"loss": -0.0111, |
|
"reward": 0.9547193609178066, |
|
"reward_std": 0.08753847831394523, |
|
"rewards/accuracy_reward": 0.5414540711790323, |
|
"rewards/semantic_entropy_math_reward": 0.4132652934640646, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 462.9668254852295, |
|
"epoch": 0.8990228013029316, |
|
"grad_norm": 0.012285560369491577, |
|
"learning_rate": 2.6796362192975766e-07, |
|
"loss": -0.0061, |
|
"reward": 0.9971756413578987, |
|
"reward_std": 0.10323424229864031, |
|
"rewards/accuracy_reward": 0.5733418203890324, |
|
"rewards/semantic_entropy_math_reward": 0.4238338116556406, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 448.88455963134766, |
|
"epoch": 0.901628664495114, |
|
"grad_norm": 0.010707194916903973, |
|
"learning_rate": 2.5416500754931294e-07, |
|
"loss": -0.0088, |
|
"reward": 1.0225036218762398, |
|
"reward_std": 0.09066123812226579, |
|
"rewards/accuracy_reward": 0.6001275330781937, |
|
"rewards/semantic_entropy_math_reward": 0.4223760813474655, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 447.4311122894287, |
|
"epoch": 0.9042345276872964, |
|
"grad_norm": 0.009493795223534107, |
|
"learning_rate": 2.407218578196524e-07, |
|
"loss": -0.0071, |
|
"reward": 0.9620080180466175, |
|
"reward_std": 0.09404018375789747, |
|
"rewards/accuracy_reward": 0.5382652971893549, |
|
"rewards/semantic_entropy_math_reward": 0.4237427096813917, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 486.0082836151123, |
|
"epoch": 0.9068403908794789, |
|
"grad_norm": 0.010954687371850014, |
|
"learning_rate": 2.2763517963725169e-07, |
|
"loss": -0.0128, |
|
"reward": 0.9949890375137329, |
|
"reward_std": 0.095983712002635, |
|
"rewards/accuracy_reward": 0.5829081572592258, |
|
"rewards/semantic_entropy_math_reward": 0.4120809007436037, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 431.5510139465332, |
|
"epoch": 0.9094462540716612, |
|
"grad_norm": 0.012899513356387615, |
|
"learning_rate": 2.1490595319874574e-07, |
|
"loss": -0.0081, |
|
"reward": 0.9666545018553734, |
|
"reward_std": 0.09545677714049816, |
|
"rewards/accuracy_reward": 0.5471938643604517, |
|
"rewards/semantic_entropy_math_reward": 0.41946063563227654, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 485.63200759887695, |
|
"epoch": 0.9120521172638436, |
|
"grad_norm": 0.01148886326700449, |
|
"learning_rate": 2.0253513192751374e-07, |
|
"loss": -0.0082, |
|
"reward": 0.9554482325911522, |
|
"reward_std": 0.100743565504672, |
|
"rewards/accuracy_reward": 0.5440050885081291, |
|
"rewards/semantic_entropy_math_reward": 0.4114431384950876, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 460.51784324645996, |
|
"epoch": 0.9146579804560261, |
|
"grad_norm": 0.012849576771259308, |
|
"learning_rate": 1.905236424022633e-07, |
|
"loss": -0.012, |
|
"reward": 0.8979591764509678, |
|
"reward_std": 0.09470229386352003, |
|
"rewards/accuracy_reward": 0.4872448891401291, |
|
"rewards/semantic_entropy_math_reward": 0.41071427799761295, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 487.3086643218994, |
|
"epoch": 0.9172638436482085, |
|
"grad_norm": 0.01112964004278183, |
|
"learning_rate": 1.7887238428763553e-07, |
|
"loss": -0.0101, |
|
"reward": 0.931213553994894, |
|
"reward_std": 0.10900401265826076, |
|
"rewards/accuracy_reward": 0.5197703968733549, |
|
"rewards/semantic_entropy_math_reward": 0.4114431384950876, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 445.08800315856934, |
|
"epoch": 0.9198697068403909, |
|
"grad_norm": 0.009342041797935963, |
|
"learning_rate": 1.6758223026681507e-07, |
|
"loss": -0.0039, |
|
"reward": 0.9932579882442951, |
|
"reward_std": 0.06484666049072985, |
|
"rewards/accuracy_reward": 0.5561224408447742, |
|
"rewards/semantic_entropy_math_reward": 0.4371355604380369, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 441.0204029083252, |
|
"epoch": 0.9224755700325733, |
|
"grad_norm": 0.012000740505754948, |
|
"learning_rate": 1.5665402597616842e-07, |
|
"loss": -0.0088, |
|
"reward": 0.9781340956687927, |
|
"reward_std": 0.09719502518419176, |
|
"rewards/accuracy_reward": 0.5682397782802582, |
|
"rewards/semantic_entropy_math_reward": 0.40989430248737335, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 419.2601947784424, |
|
"epoch": 0.9250814332247557, |
|
"grad_norm": 0.013637058436870575, |
|
"learning_rate": 1.4608858994190344e-07, |
|
"loss": -0.0084, |
|
"reward": 0.9946246184408665, |
|
"reward_std": 0.09013567195506766, |
|
"rewards/accuracy_reward": 0.5758928433060646, |
|
"rewards/semantic_entropy_math_reward": 0.41873177886009216, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 424.69897270202637, |
|
"epoch": 0.9276872964169381, |
|
"grad_norm": 0.011202794499695301, |
|
"learning_rate": 1.358867135187636e-07, |
|
"loss": -0.0031, |
|
"reward": 1.043093990534544, |
|
"reward_std": 0.07236083660973236, |
|
"rewards/accuracy_reward": 0.6052295807749033, |
|
"rewards/semantic_entropy_math_reward": 0.43786441907286644, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 453.50764656066895, |
|
"epoch": 0.9302931596091205, |
|
"grad_norm": 0.011242231354117393, |
|
"learning_rate": 1.2604916083075236e-07, |
|
"loss": -0.0184, |
|
"reward": 0.9417820461094379, |
|
"reward_std": 0.11088896542787552, |
|
"rewards/accuracy_reward": 0.5344387609511614, |
|
"rewards/semantic_entropy_math_reward": 0.40734328515827656, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 401.16899394989014, |
|
"epoch": 0.9328990228013029, |
|
"grad_norm": 0.009380885399878025, |
|
"learning_rate": 1.1657666871390471e-07, |
|
"loss": 0.0012, |
|
"reward": 1.0115706995129585, |
|
"reward_std": 0.08255202136933804, |
|
"rewards/accuracy_reward": 0.5841836612671614, |
|
"rewards/semantic_entropy_math_reward": 0.42738702334463596, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 450.27869033813477, |
|
"epoch": 0.9355048859934854, |
|
"grad_norm": 0.012342661619186401, |
|
"learning_rate": 1.0746994666109234e-07, |
|
"loss": -0.0078, |
|
"reward": 0.9944424033164978, |
|
"reward_std": 0.08844125363975763, |
|
"rewards/accuracy_reward": 0.5758928433060646, |
|
"rewards/semantic_entropy_math_reward": 0.41854955442249775, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 439.46044921875, |
|
"epoch": 0.9381107491856677, |
|
"grad_norm": 0.01147682499140501, |
|
"learning_rate": 9.872967676888611e-08, |
|
"loss": -0.0142, |
|
"reward": 0.9127186425030231, |
|
"reward_std": 0.11078671208815649, |
|
"rewards/accuracy_reward": 0.5121173411607742, |
|
"rewards/semantic_entropy_math_reward": 0.4006012938916683, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 484.5235786437988, |
|
"epoch": 0.9407166123778502, |
|
"grad_norm": 0.023287979885935783, |
|
"learning_rate": 9.035651368646647e-08, |
|
"loss": -0.0154, |
|
"reward": 0.8902150094509125, |
|
"reward_std": 0.10496682248776779, |
|
"rewards/accuracy_reward": 0.47257651947438717, |
|
"rewards/semantic_entropy_math_reward": 0.41763847321271896, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 460.7825222015381, |
|
"epoch": 0.9433224755700326, |
|
"grad_norm": 0.010122385807335377, |
|
"learning_rate": 8.235108456658814e-08, |
|
"loss": -0.0086, |
|
"reward": 0.9826894924044609, |
|
"reward_std": 0.09733415581285954, |
|
"rewards/accuracy_reward": 0.5644132476300001, |
|
"rewards/semantic_entropy_math_reward": 0.4182762373238802, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 488.81886291503906, |
|
"epoch": 0.9459283387622149, |
|
"grad_norm": 0.010546687990427017, |
|
"learning_rate": 7.471398901860772e-08, |
|
"loss": -0.0247, |
|
"reward": 0.9782251939177513, |
|
"reward_std": 0.10399020987097174, |
|
"rewards/accuracy_reward": 0.5605867225676775, |
|
"rewards/semantic_entropy_math_reward": 0.41763848438858986, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 421.8590488433838, |
|
"epoch": 0.9485342019543974, |
|
"grad_norm": 0.011549089103937149, |
|
"learning_rate": 6.744579906357185e-08, |
|
"loss": -0.0123, |
|
"reward": 0.9907980933785439, |
|
"reward_std": 0.08595408496330492, |
|
"rewards/accuracy_reward": 0.5567601937800646, |
|
"rewards/semantic_entropy_math_reward": 0.434037895873189, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 454.81504821777344, |
|
"epoch": 0.9511400651465798, |
|
"grad_norm": 0.012615454383194447, |
|
"learning_rate": 6.054705909137426e-08, |
|
"loss": -0.011, |
|
"reward": 0.9697521664202213, |
|
"reward_std": 0.10905627987813205, |
|
"rewards/accuracy_reward": 0.5516581553965807, |
|
"rewards/semantic_entropy_math_reward": 0.4180940240621567, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 454.48596382141113, |
|
"epoch": 0.9537459283387623, |
|
"grad_norm": 0.010557249188423157, |
|
"learning_rate": 5.401828581997948e-08, |
|
"loss": -0.0029, |
|
"reward": 0.9792273677885532, |
|
"reward_std": 0.08946075174026191, |
|
"rewards/accuracy_reward": 0.5497448872774839, |
|
"rewards/semantic_entropy_math_reward": 0.42948249727487564, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 438.12499046325684, |
|
"epoch": 0.9563517915309446, |
|
"grad_norm": 0.008979610167443752, |
|
"learning_rate": 4.7859968256719344e-08, |
|
"loss": -0.0124, |
|
"reward": 0.9787718281149864, |
|
"reward_std": 0.08572447136975825, |
|
"rewards/accuracy_reward": 0.5452805999666452, |
|
"rewards/semantic_entropy_math_reward": 0.43349125795066357, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 449.46810150146484, |
|
"epoch": 0.958957654723127, |
|
"grad_norm": 0.01041451282799244, |
|
"learning_rate": 4.207256766166845e-08, |
|
"loss": -0.0071, |
|
"reward": 0.9146319180727005, |
|
"reward_std": 0.08887826814316213, |
|
"rewards/accuracy_reward": 0.5063775405287743, |
|
"rewards/semantic_entropy_math_reward": 0.40825437009334564, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 447.2391471862793, |
|
"epoch": 0.9615635179153095, |
|
"grad_norm": 0.008584603667259216, |
|
"learning_rate": 3.665651751309451e-08, |
|
"loss": -0.0096, |
|
"reward": 0.9768585935235023, |
|
"reward_std": 0.08100897446274757, |
|
"rewards/accuracy_reward": 0.544005086645484, |
|
"rewards/semantic_entropy_math_reward": 0.43285347521305084, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 423.11478424072266, |
|
"epoch": 0.9641693811074918, |
|
"grad_norm": 0.009456335566937923, |
|
"learning_rate": 3.16122234749916e-08, |
|
"loss": -0.0034, |
|
"reward": 1.026512373238802, |
|
"reward_std": 0.08707130316179246, |
|
"rewards/accuracy_reward": 0.5943877454847097, |
|
"rewards/semantic_entropy_math_reward": 0.4321246314793825, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 447.5299644470215, |
|
"epoch": 0.9667752442996743, |
|
"grad_norm": 0.015312188304960728, |
|
"learning_rate": 2.6940063366693303e-08, |
|
"loss": -0.0019, |
|
"reward": 0.9723943062126637, |
|
"reward_std": 0.08906970918178558, |
|
"rewards/accuracy_reward": 0.5535714235156775, |
|
"rewards/semantic_entropy_math_reward": 0.41882289201021194, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 449.52231216430664, |
|
"epoch": 0.9693811074918567, |
|
"grad_norm": 0.009211807511746883, |
|
"learning_rate": 2.264038713457706e-08, |
|
"loss": -0.0086, |
|
"reward": 0.9643768034875393, |
|
"reward_std": 0.09445711580337957, |
|
"rewards/accuracy_reward": 0.542091827839613, |
|
"rewards/semantic_entropy_math_reward": 0.42228496447205544, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 427.89858627319336, |
|
"epoch": 0.971986970684039, |
|
"grad_norm": 0.014192759990692139, |
|
"learning_rate": 1.8713516825851207e-08, |
|
"loss": -0.0046, |
|
"reward": 0.975127536803484, |
|
"reward_std": 0.08639455388765782, |
|
"rewards/accuracy_reward": 0.5382652953267097, |
|
"rewards/semantic_entropy_math_reward": 0.4368622377514839, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 458.57715606689453, |
|
"epoch": 0.9745928338762215, |
|
"grad_norm": 0.017490204423666, |
|
"learning_rate": 1.51597465644332e-08, |
|
"loss": -0.0092, |
|
"reward": 0.9249271154403687, |
|
"reward_std": 0.10873797861859202, |
|
"rewards/accuracy_reward": 0.5191326383501291, |
|
"rewards/semantic_entropy_math_reward": 0.40579446218907833, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 458.8858337402344, |
|
"epoch": 0.9771986970684039, |
|
"grad_norm": 0.01276995800435543, |
|
"learning_rate": 1.1979342528922189e-08, |
|
"loss": -0.0095, |
|
"reward": 0.9507106244564056, |
|
"reward_std": 0.1035163603228284, |
|
"rewards/accuracy_reward": 0.5350765194743872, |
|
"rewards/semantic_entropy_math_reward": 0.4156340938061476, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 443.0637664794922, |
|
"epoch": 0.9798045602605863, |
|
"grad_norm": 0.008754832670092583, |
|
"learning_rate": 9.1725429326589e-09, |
|
"loss": -0.0068, |
|
"reward": 0.987335991114378, |
|
"reward_std": 0.07803339039674029, |
|
"rewards/accuracy_reward": 0.5542091745883226, |
|
"rewards/semantic_entropy_math_reward": 0.4331268183887005, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 438.10521697998047, |
|
"epoch": 0.9824104234527687, |
|
"grad_norm": 0.01094447448849678, |
|
"learning_rate": 6.739558005884883e-09, |
|
"loss": -0.0073, |
|
"reward": 0.945426382124424, |
|
"reward_std": 0.07054417789913714, |
|
"rewards/accuracy_reward": 0.5318877473473549, |
|
"rewards/semantic_entropy_math_reward": 0.41353862918913364, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 422.69897079467773, |
|
"epoch": 0.9850162866449511, |
|
"grad_norm": 0.01369545143097639, |
|
"learning_rate": 4.6805699799967744e-09, |
|
"loss": -0.0054, |
|
"reward": 0.9959912300109863, |
|
"reward_std": 0.09532720362767577, |
|
"rewards/accuracy_reward": 0.5682397801429033, |
|
"rewards/semantic_entropy_math_reward": 0.42775145545601845, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 425.08226013183594, |
|
"epoch": 0.9876221498371336, |
|
"grad_norm": 0.009248544462025166, |
|
"learning_rate": 2.995733073895557e-09, |
|
"loss": -0.0082, |
|
"reward": 1.0608600415289402, |
|
"reward_std": 0.09763774264138192, |
|
"rewards/accuracy_reward": 0.6352040730416775, |
|
"rewards/semantic_entropy_math_reward": 0.425655959174037, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 419.81759452819824, |
|
"epoch": 0.990228013029316, |
|
"grad_norm": 0.008247760124504566, |
|
"learning_rate": 1.6851734824380184e-09, |
|
"loss": -0.0027, |
|
"reward": 0.985878262668848, |
|
"reward_std": 0.08342396467924118, |
|
"rewards/accuracy_reward": 0.5720663201063871, |
|
"rewards/semantic_entropy_math_reward": 0.41381194815039635, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 452.42090797424316, |
|
"epoch": 0.9928338762214983, |
|
"grad_norm": 0.011512123979628086, |
|
"learning_rate": 7.48989366980979e-10, |
|
"loss": -0.0088, |
|
"reward": 0.9500728640705347, |
|
"reward_std": 0.08807020282256417, |
|
"rewards/accuracy_reward": 0.5459183580242097, |
|
"rewards/semantic_entropy_math_reward": 0.4041545204818249, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 460.566951751709, |
|
"epoch": 0.9954397394136808, |
|
"grad_norm": 0.014522072859108448, |
|
"learning_rate": 1.872508480332824e-10, |
|
"loss": -0.0131, |
|
"reward": 0.9883381687104702, |
|
"reward_std": 0.09443258820101619, |
|
"rewards/accuracy_reward": 0.5573979429900646, |
|
"rewards/semantic_entropy_math_reward": 0.43094020895659924, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 440.04973220825195, |
|
"epoch": 0.9980456026058632, |
|
"grad_norm": 0.015413707122206688, |
|
"learning_rate": 0.0, |
|
"loss": -0.0133, |
|
"reward": 0.9645590074360371, |
|
"reward_std": 0.0993940774933435, |
|
"rewards/accuracy_reward": 0.5497448854148388, |
|
"rewards/semantic_entropy_math_reward": 0.4148141276091337, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9980456026058632, |
|
"step": 383, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 2.7488, |
|
"train_samples_per_second": 31267.524, |
|
"train_steps_per_second": 139.332 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 383, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|