|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9965010496850945, |
|
"eval_steps": 100, |
|
"global_step": 178, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 548.2455501556396, |
|
"epoch": 0.005598320503848845, |
|
"grad_norm": 0.01954779587686062, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.238839291036129, |
|
"reward_std": 0.11606137279886752, |
|
"rewards/semantic_entropy_math_reward": 0.238839291036129, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 526.9025402069092, |
|
"epoch": 0.01119664100769769, |
|
"grad_norm": 0.018136516213417053, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2369791674427688, |
|
"reward_std": 0.10657330928370357, |
|
"rewards/semantic_entropy_math_reward": 0.2369791674427688, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 562.8943519592285, |
|
"epoch": 0.016794961511546535, |
|
"grad_norm": 0.01578659377992153, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.23115079675335437, |
|
"reward_std": 0.09216207754798234, |
|
"rewards/semantic_entropy_math_reward": 0.23115079675335437, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 571.5796279907227, |
|
"epoch": 0.02239328201539538, |
|
"grad_norm": 0.018513264134526253, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2011408838443458, |
|
"reward_std": 0.09730021236464381, |
|
"rewards/semantic_entropy_math_reward": 0.2011408838443458, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 569.882453918457, |
|
"epoch": 0.02799160251924423, |
|
"grad_norm": 0.023015329614281654, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.23325893050059676, |
|
"reward_std": 0.11453290120698512, |
|
"rewards/semantic_entropy_math_reward": 0.23325893050059676, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 541.0096778869629, |
|
"epoch": 0.03358992302309307, |
|
"grad_norm": 0.017617080360651016, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2580605214461684, |
|
"reward_std": 0.1201386651955545, |
|
"rewards/semantic_entropy_math_reward": 0.2580605214461684, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 579.73512840271, |
|
"epoch": 0.03918824352694192, |
|
"grad_norm": 0.0158968698233366, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2048611151985824, |
|
"reward_std": 0.0955489007756114, |
|
"rewards/semantic_entropy_math_reward": 0.2048611151985824, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 515.7500076293945, |
|
"epoch": 0.04478656403079076, |
|
"grad_norm": 0.022326918318867683, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.21825397666543722, |
|
"reward_std": 0.0982005288824439, |
|
"rewards/semantic_entropy_math_reward": 0.21825397666543722, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 590.3921222686768, |
|
"epoch": 0.05038488453463961, |
|
"grad_norm": 0.016080491244792938, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.21205357322469354, |
|
"reward_std": 0.10619991598650813, |
|
"rewards/semantic_entropy_math_reward": 0.21205357322469354, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 581.6651878356934, |
|
"epoch": 0.05598320503848846, |
|
"grad_norm": 0.017059899866580963, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.21899801935069263, |
|
"reward_std": 0.10072531108744442, |
|
"rewards/semantic_entropy_math_reward": 0.21899801935069263, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 571.7886981964111, |
|
"epoch": 0.0615815255423373, |
|
"grad_norm": 0.01674991473555565, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2311508022248745, |
|
"reward_std": 0.10565085639245808, |
|
"rewards/semantic_entropy_math_reward": 0.2311508022248745, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 562.854923248291, |
|
"epoch": 0.06717984604618614, |
|
"grad_norm": 0.018872996792197227, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2327629025094211, |
|
"reward_std": 0.10614555445499718, |
|
"rewards/semantic_entropy_math_reward": 0.2327629025094211, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 568.9709930419922, |
|
"epoch": 0.072778166550035, |
|
"grad_norm": 0.015940172597765923, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.23288691393099725, |
|
"reward_std": 0.10073893028311431, |
|
"rewards/semantic_entropy_math_reward": 0.23288691393099725, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 593.6733703613281, |
|
"epoch": 0.07837648705388384, |
|
"grad_norm": 0.015898453071713448, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.22792659373953938, |
|
"reward_std": 0.09615750901866704, |
|
"rewards/semantic_entropy_math_reward": 0.22792659373953938, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 584.9248657226562, |
|
"epoch": 0.08397480755773268, |
|
"grad_norm": 0.014255843125283718, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2467758092097938, |
|
"reward_std": 0.08825354627333581, |
|
"rewards/semantic_entropy_math_reward": 0.2467758092097938, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 516.8154811859131, |
|
"epoch": 0.08957312806158152, |
|
"grad_norm": 0.022125836461782455, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2909226221963763, |
|
"reward_std": 0.12354861758649349, |
|
"rewards/semantic_entropy_math_reward": 0.2909226221963763, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 551.7343845367432, |
|
"epoch": 0.09517144856543037, |
|
"grad_norm": 0.016181064769625664, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2682291769888252, |
|
"reward_std": 0.10501077049411833, |
|
"rewards/semantic_entropy_math_reward": 0.2682291769888252, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 618.2076015472412, |
|
"epoch": 0.10076976906927922, |
|
"grad_norm": 0.019598443061113358, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2562004067003727, |
|
"reward_std": 0.11642820481210947, |
|
"rewards/semantic_entropy_math_reward": 0.2562004067003727, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 561.2827529907227, |
|
"epoch": 0.10636808957312806, |
|
"grad_norm": 0.015643073245882988, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.26202877750620246, |
|
"reward_std": 0.10126263811253011, |
|
"rewards/semantic_entropy_math_reward": 0.26202877750620246, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 579.395845413208, |
|
"epoch": 0.11196641007697691, |
|
"grad_norm": 0.017556050792336464, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2761656870134175, |
|
"reward_std": 0.11453696829266846, |
|
"rewards/semantic_entropy_math_reward": 0.2761656870134175, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 580.3199481964111, |
|
"epoch": 0.11756473058082575, |
|
"grad_norm": 0.01937718316912651, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2857142901048064, |
|
"reward_std": 0.10658370074816048, |
|
"rewards/semantic_entropy_math_reward": 0.2857142901048064, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 638.837064743042, |
|
"epoch": 0.1231630510846746, |
|
"grad_norm": 0.018222520127892494, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.21614584047347307, |
|
"reward_std": 0.0887793127913028, |
|
"rewards/semantic_entropy_math_reward": 0.21614584047347307, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 616.3043251037598, |
|
"epoch": 0.12876137158852344, |
|
"grad_norm": 0.015998607501387596, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2881944440305233, |
|
"reward_std": 0.09268029552185908, |
|
"rewards/semantic_entropy_math_reward": 0.2881944440305233, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 608.8288803100586, |
|
"epoch": 0.13435969209237228, |
|
"grad_norm": 0.015030119568109512, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2903025895357132, |
|
"reward_std": 0.10364439443219453, |
|
"rewards/semantic_entropy_math_reward": 0.2903025895357132, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 569.1443557739258, |
|
"epoch": 0.13995801259622112, |
|
"grad_norm": 0.01883069984614849, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2831101273186505, |
|
"reward_std": 0.10425049322657287, |
|
"rewards/semantic_entropy_math_reward": 0.2831101273186505, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 573.0587940216064, |
|
"epoch": 0.14555633310007, |
|
"grad_norm": 0.019200004637241364, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2563244076445699, |
|
"reward_std": 0.09196250140666962, |
|
"rewards/semantic_entropy_math_reward": 0.2563244076445699, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 601.1659317016602, |
|
"epoch": 0.15115465360391883, |
|
"grad_norm": 0.016727037727832794, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.26822917023673654, |
|
"reward_std": 0.09728709328919649, |
|
"rewards/semantic_entropy_math_reward": 0.26822917023673654, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 615.2797718048096, |
|
"epoch": 0.15675297410776767, |
|
"grad_norm": 0.01875486597418785, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3012152877636254, |
|
"reward_std": 0.11328667728230357, |
|
"rewards/semantic_entropy_math_reward": 0.3012152877636254, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 611.0647449493408, |
|
"epoch": 0.16235129461161651, |
|
"grad_norm": 0.01615230180323124, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.33482143841683865, |
|
"reward_std": 0.09433427458861843, |
|
"rewards/semantic_entropy_math_reward": 0.33482143841683865, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 622.8876724243164, |
|
"epoch": 0.16794961511546536, |
|
"grad_norm": 0.016654223203659058, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.28807044588029385, |
|
"reward_std": 0.1034616008400917, |
|
"rewards/semantic_entropy_math_reward": 0.28807044588029385, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 639.4181709289551, |
|
"epoch": 0.1735479356193142, |
|
"grad_norm": 0.01327348593622446, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2810019925236702, |
|
"reward_std": 0.09964999603107572, |
|
"rewards/semantic_entropy_math_reward": 0.2810019925236702, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 629.0714416503906, |
|
"epoch": 0.17914625612316304, |
|
"grad_norm": 0.017067549750208855, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2867063535377383, |
|
"reward_std": 0.10988868214190006, |
|
"rewards/semantic_entropy_math_reward": 0.2867063535377383, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 622.2953987121582, |
|
"epoch": 0.1847445766270119, |
|
"grad_norm": 0.01429623831063509, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3038194472901523, |
|
"reward_std": 0.10957263736054301, |
|
"rewards/semantic_entropy_math_reward": 0.3038194472901523, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 597.1503067016602, |
|
"epoch": 0.19034289713086075, |
|
"grad_norm": 0.013799657113850117, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3093998031690717, |
|
"reward_std": 0.10120404063491151, |
|
"rewards/semantic_entropy_math_reward": 0.3093998031690717, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 636.1056709289551, |
|
"epoch": 0.1959412176347096, |
|
"grad_norm": 0.011626561172306538, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3293650816194713, |
|
"reward_std": 0.09622950968332589, |
|
"rewards/semantic_entropy_math_reward": 0.3293650816194713, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 591.780517578125, |
|
"epoch": 0.20153953813855843, |
|
"grad_norm": 0.011638457886874676, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.38231647573411465, |
|
"reward_std": 0.11920557962730527, |
|
"rewards/semantic_entropy_math_reward": 0.38231647573411465, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 580.5230712890625, |
|
"epoch": 0.20713785864240727, |
|
"grad_norm": 0.012719900347292423, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.4064980298280716, |
|
"reward_std": 0.12387050059624016, |
|
"rewards/semantic_entropy_math_reward": 0.4064980298280716, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 617.6317043304443, |
|
"epoch": 0.21273617914625612, |
|
"grad_norm": 0.012224080041050911, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.30778770707547665, |
|
"reward_std": 0.08965765126049519, |
|
"rewards/semantic_entropy_math_reward": 0.30778770707547665, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 608.4538803100586, |
|
"epoch": 0.21833449965010496, |
|
"grad_norm": 0.011945121921598911, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3193204468116164, |
|
"reward_std": 0.10897644911892712, |
|
"rewards/semantic_entropy_math_reward": 0.3193204468116164, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 642.5602779388428, |
|
"epoch": 0.22393282015395383, |
|
"grad_norm": 0.012211363762617111, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.30456350091844797, |
|
"reward_std": 0.10713349282741547, |
|
"rewards/semantic_entropy_math_reward": 0.30456350091844797, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 639.2768001556396, |
|
"epoch": 0.22953114065780267, |
|
"grad_norm": 0.012564162723720074, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.29551092034671456, |
|
"reward_std": 0.10152047942392528, |
|
"rewards/semantic_entropy_math_reward": 0.29551092034671456, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 621.8891487121582, |
|
"epoch": 0.2351294611616515, |
|
"grad_norm": 0.01134682260453701, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3234127121977508, |
|
"reward_std": 0.1085170682054013, |
|
"rewards/semantic_entropy_math_reward": 0.3234127121977508, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 628.6324501037598, |
|
"epoch": 0.24072778166550035, |
|
"grad_norm": 0.012990601360797882, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.31237600184977055, |
|
"reward_std": 0.09812885848805308, |
|
"rewards/semantic_entropy_math_reward": 0.31237600184977055, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 627.9442024230957, |
|
"epoch": 0.2463261021693492, |
|
"grad_norm": 0.012126186862587929, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3050595261156559, |
|
"reward_std": 0.10460688779130578, |
|
"rewards/semantic_entropy_math_reward": 0.3050595261156559, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 628.8229331970215, |
|
"epoch": 0.25192442267319803, |
|
"grad_norm": 0.011053094640374184, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.32576886005699635, |
|
"reward_std": 0.09965397394262254, |
|
"rewards/semantic_entropy_math_reward": 0.32576886005699635, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 607.1689109802246, |
|
"epoch": 0.2575227431770469, |
|
"grad_norm": 0.012990483082830906, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3343254020437598, |
|
"reward_std": 0.11526623973622918, |
|
"rewards/semantic_entropy_math_reward": 0.3343254020437598, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 623.7559700012207, |
|
"epoch": 0.2631210636808957, |
|
"grad_norm": 0.01174214854836464, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3116319542750716, |
|
"reward_std": 0.09533112193457782, |
|
"rewards/semantic_entropy_math_reward": 0.3116319542750716, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 629.1674213409424, |
|
"epoch": 0.26871938418474456, |
|
"grad_norm": 0.01178384106606245, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.31150794867426157, |
|
"reward_std": 0.10652086476329714, |
|
"rewards/semantic_entropy_math_reward": 0.31150794867426157, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 652.0907974243164, |
|
"epoch": 0.2743177046885934, |
|
"grad_norm": 0.011742531321942806, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2786458325572312, |
|
"reward_std": 0.09241121797822416, |
|
"rewards/semantic_entropy_math_reward": 0.2786458325572312, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 643.3266448974609, |
|
"epoch": 0.27991602519244224, |
|
"grad_norm": 0.011445428244769573, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.27740576677024364, |
|
"reward_std": 0.08390604832675308, |
|
"rewards/semantic_entropy_math_reward": 0.27740576677024364, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 605.7641487121582, |
|
"epoch": 0.28551434569629114, |
|
"grad_norm": 0.042504504323005676, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.29414683301001787, |
|
"reward_std": 0.11176870320923626, |
|
"rewards/semantic_entropy_math_reward": 0.29414683301001787, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 599.8273887634277, |
|
"epoch": 0.29111266620014, |
|
"grad_norm": 0.014151890762150288, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.30481151957064867, |
|
"reward_std": 0.09737780690193176, |
|
"rewards/semantic_entropy_math_reward": 0.30481151957064867, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 617.6384086608887, |
|
"epoch": 0.2967109867039888, |
|
"grad_norm": 0.012832826003432274, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.34548612125217915, |
|
"reward_std": 0.10560432635247707, |
|
"rewards/semantic_entropy_math_reward": 0.34548612125217915, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 639.7872161865234, |
|
"epoch": 0.30230930720783766, |
|
"grad_norm": 0.013676963746547699, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.29154266975820065, |
|
"reward_std": 0.09982788749039173, |
|
"rewards/semantic_entropy_math_reward": 0.29154266975820065, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 622.1547737121582, |
|
"epoch": 0.3079076277116865, |
|
"grad_norm": 0.012248532846570015, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3303571534343064, |
|
"reward_std": 0.10564735904335976, |
|
"rewards/semantic_entropy_math_reward": 0.3303571534343064, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 612.3623600006104, |
|
"epoch": 0.31350594821553535, |
|
"grad_norm": 0.012541095726191998, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.37437996733933687, |
|
"reward_std": 0.11628623493015766, |
|
"rewards/semantic_entropy_math_reward": 0.37437996733933687, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 590.5974807739258, |
|
"epoch": 0.3191042687193842, |
|
"grad_norm": 0.01250051986426115, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3829365186393261, |
|
"reward_std": 0.11953789182007313, |
|
"rewards/semantic_entropy_math_reward": 0.3829365186393261, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 629.9375152587891, |
|
"epoch": 0.32470258922323303, |
|
"grad_norm": 0.01244643796235323, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.30481151584535837, |
|
"reward_std": 0.10653208615258336, |
|
"rewards/semantic_entropy_math_reward": 0.30481151584535837, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 663.9650478363037, |
|
"epoch": 0.33030090972708187, |
|
"grad_norm": 0.011867938563227654, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2619047740008682, |
|
"reward_std": 0.08327494573313743, |
|
"rewards/semantic_entropy_math_reward": 0.2619047740008682, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 645.1510581970215, |
|
"epoch": 0.3358992302309307, |
|
"grad_norm": 0.01233300007879734, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3126240177080035, |
|
"reward_std": 0.10046433750540018, |
|
"rewards/semantic_entropy_math_reward": 0.3126240177080035, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 623.23512840271, |
|
"epoch": 0.34149755073477955, |
|
"grad_norm": 0.012479487806558609, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3306051716208458, |
|
"reward_std": 0.10351803922094405, |
|
"rewards/semantic_entropy_math_reward": 0.3306051716208458, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 632.4717350006104, |
|
"epoch": 0.3470958712386284, |
|
"grad_norm": 0.012427465058863163, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.31994048738852143, |
|
"reward_std": 0.10373943694867194, |
|
"rewards/semantic_entropy_math_reward": 0.31994048738852143, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 623.9613189697266, |
|
"epoch": 0.35269419174247724, |
|
"grad_norm": 0.013336232863366604, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.31547620333731174, |
|
"reward_std": 0.11058970703743398, |
|
"rewards/semantic_entropy_math_reward": 0.31547620333731174, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 640.9360294342041, |
|
"epoch": 0.3582925122463261, |
|
"grad_norm": 0.011113813146948814, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.33519346360117197, |
|
"reward_std": 0.08675730333197862, |
|
"rewards/semantic_entropy_math_reward": 0.33519346360117197, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 663.0587882995605, |
|
"epoch": 0.363890832750175, |
|
"grad_norm": 0.01317086722701788, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2875744178891182, |
|
"reward_std": 0.09582534979563206, |
|
"rewards/semantic_entropy_math_reward": 0.2875744178891182, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 625.5915279388428, |
|
"epoch": 0.3694891532540238, |
|
"grad_norm": 0.013479222543537617, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3365575489588082, |
|
"reward_std": 0.10884693474508822, |
|
"rewards/semantic_entropy_math_reward": 0.3365575489588082, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 642.8943557739258, |
|
"epoch": 0.37508747375787266, |
|
"grad_norm": 0.012459836900234222, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2860863204114139, |
|
"reward_std": 0.09167324285954237, |
|
"rewards/semantic_entropy_math_reward": 0.2860863204114139, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 594.4159355163574, |
|
"epoch": 0.3806857942617215, |
|
"grad_norm": 0.013866654597222805, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.37289187777787447, |
|
"reward_std": 0.10999573534354568, |
|
"rewards/semantic_entropy_math_reward": 0.37289187777787447, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 640.9055099487305, |
|
"epoch": 0.38628411476557034, |
|
"grad_norm": 0.012836214154958725, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2891865139827132, |
|
"reward_std": 0.10698562301695347, |
|
"rewards/semantic_entropy_math_reward": 0.2891865139827132, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 623.5349769592285, |
|
"epoch": 0.3918824352694192, |
|
"grad_norm": 0.013130308128893375, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3031994132325053, |
|
"reward_std": 0.10316204163245857, |
|
"rewards/semantic_entropy_math_reward": 0.3031994132325053, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 593.7172718048096, |
|
"epoch": 0.397480755773268, |
|
"grad_norm": 0.013811892829835415, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.36693950183689594, |
|
"reward_std": 0.10302408610004932, |
|
"rewards/semantic_entropy_math_reward": 0.36693950183689594, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 597.6815605163574, |
|
"epoch": 0.40307907627711687, |
|
"grad_norm": 0.013042716309428215, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.359002991579473, |
|
"reward_std": 0.12008155998773873, |
|
"rewards/semantic_entropy_math_reward": 0.359002991579473, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 594.0282955169678, |
|
"epoch": 0.4086773967809657, |
|
"grad_norm": 0.01363166980445385, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3308531828224659, |
|
"reward_std": 0.1005927964579314, |
|
"rewards/semantic_entropy_math_reward": 0.3308531828224659, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 638.837064743042, |
|
"epoch": 0.41427571728481455, |
|
"grad_norm": 0.01323084905743599, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2997271902859211, |
|
"reward_std": 0.1091692647896707, |
|
"rewards/semantic_entropy_math_reward": 0.2997271902859211, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 627.7797698974609, |
|
"epoch": 0.4198740377886634, |
|
"grad_norm": 0.013865278102457523, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3164682723581791, |
|
"reward_std": 0.10915092006325722, |
|
"rewards/semantic_entropy_math_reward": 0.3164682723581791, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 623.7016448974609, |
|
"epoch": 0.42547235829251223, |
|
"grad_norm": 0.013494770042598248, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.36892362777143717, |
|
"reward_std": 0.101050935103558, |
|
"rewards/semantic_entropy_math_reward": 0.36892362777143717, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 632.0245666503906, |
|
"epoch": 0.4310706787963611, |
|
"grad_norm": 0.013310288079082966, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.31349207321181893, |
|
"reward_std": 0.10522611381020397, |
|
"rewards/semantic_entropy_math_reward": 0.31349207321181893, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 629.9278392791748, |
|
"epoch": 0.4366689993002099, |
|
"grad_norm": 0.012805829755961895, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.30803572107106447, |
|
"reward_std": 0.09856258635409176, |
|
"rewards/semantic_entropy_math_reward": 0.30803572107106447, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 607.979175567627, |
|
"epoch": 0.44226731980405876, |
|
"grad_norm": 0.013692095875740051, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.32142858440056443, |
|
"reward_std": 0.1157424389384687, |
|
"rewards/semantic_entropy_math_reward": 0.32142858440056443, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 616.4501533508301, |
|
"epoch": 0.44786564030790765, |
|
"grad_norm": 0.014579751528799534, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3225446487776935, |
|
"reward_std": 0.10182817134773359, |
|
"rewards/semantic_entropy_math_reward": 0.3225446487776935, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 654.9092407226562, |
|
"epoch": 0.4534639608117565, |
|
"grad_norm": 0.013945615850389004, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3175843362696469, |
|
"reward_std": 0.09967625606805086, |
|
"rewards/semantic_entropy_math_reward": 0.3175843362696469, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 615.4084930419922, |
|
"epoch": 0.45906228131560534, |
|
"grad_norm": 0.014666857197880745, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3359375102445483, |
|
"reward_std": 0.10500352433882654, |
|
"rewards/semantic_entropy_math_reward": 0.3359375102445483, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 615.1317005157471, |
|
"epoch": 0.4646606018194542, |
|
"grad_norm": 0.014408071525394917, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3276289766654372, |
|
"reward_std": 0.10603603813797235, |
|
"rewards/semantic_entropy_math_reward": 0.3276289766654372, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 566.331111907959, |
|
"epoch": 0.470258922323303, |
|
"grad_norm": 0.015806101262569427, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.40525794867426157, |
|
"reward_std": 0.12493724888190627, |
|
"rewards/semantic_entropy_math_reward": 0.40525794867426157, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 651.4486694335938, |
|
"epoch": 0.47585724282715186, |
|
"grad_norm": 0.014175321906805038, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.29402281949296594, |
|
"reward_std": 0.10067101614549756, |
|
"rewards/semantic_entropy_math_reward": 0.29402281949296594, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 627.6443481445312, |
|
"epoch": 0.4814555633310007, |
|
"grad_norm": 0.016120119020342827, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3126240139827132, |
|
"reward_std": 0.10280289477668703, |
|
"rewards/semantic_entropy_math_reward": 0.3126240139827132, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 635.4985237121582, |
|
"epoch": 0.48705388383484954, |
|
"grad_norm": 0.014732223004102707, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3711557686328888, |
|
"reward_std": 0.1085683039855212, |
|
"rewards/semantic_entropy_math_reward": 0.3711557686328888, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 598.274564743042, |
|
"epoch": 0.4926522043386984, |
|
"grad_norm": 0.015387475490570068, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.39397322107106447, |
|
"reward_std": 0.12118404218927026, |
|
"rewards/semantic_entropy_math_reward": 0.39397322107106447, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 607.96950340271, |
|
"epoch": 0.4982505248425472, |
|
"grad_norm": 0.015765592455863953, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.35466271452605724, |
|
"reward_std": 0.10943171451799572, |
|
"rewards/semantic_entropy_math_reward": 0.35466271452605724, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 585.819206237793, |
|
"epoch": 0.5038488453463961, |
|
"grad_norm": 0.016726847738027573, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3410218358039856, |
|
"reward_std": 0.1104447974357754, |
|
"rewards/semantic_entropy_math_reward": 0.3410218358039856, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 637.3958435058594, |
|
"epoch": 0.509447165850245, |
|
"grad_norm": 0.015591591596603394, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3288690596818924, |
|
"reward_std": 0.10522727854549885, |
|
"rewards/semantic_entropy_math_reward": 0.3288690596818924, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 625.2872085571289, |
|
"epoch": 0.5150454863540938, |
|
"grad_norm": 0.016664857044816017, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.32118055783212185, |
|
"reward_std": 0.12649832549504936, |
|
"rewards/semantic_entropy_math_reward": 0.32118055783212185, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 593.0282936096191, |
|
"epoch": 0.5206438068579426, |
|
"grad_norm": 0.01702903024852276, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.33296132273972034, |
|
"reward_std": 0.12325585260987282, |
|
"rewards/semantic_entropy_math_reward": 0.33296132273972034, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 617.7559642791748, |
|
"epoch": 0.5262421273617914, |
|
"grad_norm": 0.0172974094748497, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.316096234600991, |
|
"reward_std": 0.1146786012686789, |
|
"rewards/semantic_entropy_math_reward": 0.316096234600991, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 636.2671279907227, |
|
"epoch": 0.5318404478656403, |
|
"grad_norm": 0.017720209434628487, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2721974281594157, |
|
"reward_std": 0.10946512292139232, |
|
"rewards/semantic_entropy_math_reward": 0.2721974281594157, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 586.1860256195068, |
|
"epoch": 0.5374387683694891, |
|
"grad_norm": 0.01675945706665516, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.34561012499034405, |
|
"reward_std": 0.08876445493660867, |
|
"rewards/semantic_entropy_math_reward": 0.34561012499034405, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 611.3735218048096, |
|
"epoch": 0.543037088873338, |
|
"grad_norm": 0.01725374162197113, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.36408730782568455, |
|
"reward_std": 0.11099315108731389, |
|
"rewards/semantic_entropy_math_reward": 0.36408730782568455, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 614.9278392791748, |
|
"epoch": 0.5486354093771868, |
|
"grad_norm": 0.017101502045989037, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.32390873692929745, |
|
"reward_std": 0.10804228414781392, |
|
"rewards/semantic_entropy_math_reward": 0.32390873692929745, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 607.1049213409424, |
|
"epoch": 0.5542337298810357, |
|
"grad_norm": 0.019047001376748085, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.35553076304495335, |
|
"reward_std": 0.1101542457472533, |
|
"rewards/semantic_entropy_math_reward": 0.35553076304495335, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 622.6064147949219, |
|
"epoch": 0.5598320503848845, |
|
"grad_norm": 0.019254466518759727, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3213045708835125, |
|
"reward_std": 0.09209134639240801, |
|
"rewards/semantic_entropy_math_reward": 0.3213045708835125, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 598.7961406707764, |
|
"epoch": 0.5654303708887334, |
|
"grad_norm": 0.020497458055615425, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.33866569120436907, |
|
"reward_std": 0.10611562361009419, |
|
"rewards/semantic_entropy_math_reward": 0.33866569120436907, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 650.3459911346436, |
|
"epoch": 0.5710286913925823, |
|
"grad_norm": 0.020331766456365585, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2831101296469569, |
|
"reward_std": 0.09731970471329987, |
|
"rewards/semantic_entropy_math_reward": 0.2831101296469569, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 607.9479351043701, |
|
"epoch": 0.5766270118964311, |
|
"grad_norm": 0.02547612227499485, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.32688493095338345, |
|
"reward_std": 0.10671929223462939, |
|
"rewards/semantic_entropy_math_reward": 0.32688493095338345, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 633.948673248291, |
|
"epoch": 0.58222533240028, |
|
"grad_norm": 0.020476222038269043, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3084077490493655, |
|
"reward_std": 0.1001708343392238, |
|
"rewards/semantic_entropy_math_reward": 0.3084077490493655, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 586.1733665466309, |
|
"epoch": 0.5878236529041287, |
|
"grad_norm": 0.030577119439840317, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3710317499935627, |
|
"reward_std": 0.1044681896455586, |
|
"rewards/semantic_entropy_math_reward": 0.3710317499935627, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 584.3757553100586, |
|
"epoch": 0.5934219734079776, |
|
"grad_norm": 0.030157793313264847, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.36545139644294977, |
|
"reward_std": 0.10664909472689033, |
|
"rewards/semantic_entropy_math_reward": 0.36545139644294977, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 610.7224864959717, |
|
"epoch": 0.5990202939118264, |
|
"grad_norm": 0.02985748089849949, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.361111125908792, |
|
"reward_std": 0.10544992447830737, |
|
"rewards/semantic_entropy_math_reward": 0.361111125908792, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 603.1250133514404, |
|
"epoch": 0.6046186144156753, |
|
"grad_norm": 0.03455930948257446, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3306051706895232, |
|
"reward_std": 0.09714385017286986, |
|
"rewards/semantic_entropy_math_reward": 0.3306051706895232, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 607.9910755157471, |
|
"epoch": 0.6102169349195241, |
|
"grad_norm": 0.08368990570306778, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3103918735869229, |
|
"reward_std": 0.12056175642646849, |
|
"rewards/semantic_entropy_math_reward": 0.3103918735869229, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 583.9657821655273, |
|
"epoch": 0.615815255423373, |
|
"grad_norm": 0.06647204607725143, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.40166171081364155, |
|
"reward_std": 0.11974087287671864, |
|
"rewards/semantic_entropy_math_reward": 0.40166171081364155, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 617.5520992279053, |
|
"epoch": 0.6214135759272218, |
|
"grad_norm": 0.06331615895032883, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.31026787124574184, |
|
"reward_std": 0.0944897800218314, |
|
"rewards/semantic_entropy_math_reward": 0.31026787124574184, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 597.5513515472412, |
|
"epoch": 0.6270118964310707, |
|
"grad_norm": 0.1182793602347374, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.3891369206830859, |
|
"reward_std": 0.1367324935272336, |
|
"rewards/semantic_entropy_math_reward": 0.3891369206830859, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 599.026050567627, |
|
"epoch": 0.6326102169349195, |
|
"grad_norm": 0.32519498467445374, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.34784227423369884, |
|
"reward_std": 0.11579506384441629, |
|
"rewards/semantic_entropy_math_reward": 0.34784227423369884, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 605.5989627838135, |
|
"epoch": 0.6382085374387684, |
|
"grad_norm": 0.30284300446510315, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.35925100883468986, |
|
"reward_std": 0.09489296341780573, |
|
"rewards/semantic_entropy_math_reward": 0.35925100883468986, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 588.3794746398926, |
|
"epoch": 0.6438068579426172, |
|
"grad_norm": 0.5091419219970703, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2926587341353297, |
|
"reward_std": 0.10896235378459096, |
|
"rewards/semantic_entropy_math_reward": 0.2926587341353297, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 585.5081958770752, |
|
"epoch": 0.6494051784464661, |
|
"grad_norm": 1.0533686876296997, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.302951404126361, |
|
"reward_std": 0.10090323083568364, |
|
"rewards/semantic_entropy_math_reward": 0.302951404126361, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 564.0706882476807, |
|
"epoch": 0.655003498950315, |
|
"grad_norm": 1.4623417854309082, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2781498096883297, |
|
"reward_std": 0.10372308688238263, |
|
"rewards/semantic_entropy_math_reward": 0.2781498096883297, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 590.0558166503906, |
|
"epoch": 0.6606018194541637, |
|
"grad_norm": 1.9276891946792603, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.2147817499935627, |
|
"reward_std": 0.0795764367794618, |
|
"rewards/semantic_entropy_math_reward": 0.2147817499935627, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 631.612361907959, |
|
"epoch": 0.6662001399580126, |
|
"grad_norm": 2.644303321838379, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.17051091720350087, |
|
"reward_std": 0.07038468832615763, |
|
"rewards/semantic_entropy_math_reward": 0.17051091720350087, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 666.3355770111084, |
|
"epoch": 0.6717984604618614, |
|
"grad_norm": 3.114243984222412, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.13591270288452506, |
|
"reward_std": 0.06976182584185153, |
|
"rewards/semantic_entropy_math_reward": 0.13591270288452506, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 745.9434585571289, |
|
"epoch": 0.6773967809657103, |
|
"grad_norm": 3.7797133922576904, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.07477678649593145, |
|
"reward_std": 0.03976587820216082, |
|
"rewards/semantic_entropy_math_reward": 0.07477678649593145, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 807.1480865478516, |
|
"epoch": 0.6829951014695591, |
|
"grad_norm": 2.0227487087249756, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.03819444542750716, |
|
"reward_std": 0.017828965152148157, |
|
"rewards/semantic_entropy_math_reward": 0.03819444542750716, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 865.9404907226562, |
|
"epoch": 0.688593421973408, |
|
"grad_norm": 0.9964215755462646, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.007936508511193097, |
|
"reward_std": 0.004420435056090355, |
|
"rewards/semantic_entropy_math_reward": 0.007936508511193097, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 884.2105827331543, |
|
"epoch": 0.6941917424772568, |
|
"grad_norm": 0.30527350306510925, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0017361112404614687, |
|
"reward_std": 0.0005792402662336826, |
|
"rewards/semantic_entropy_math_reward": 0.0017361112404614687, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 922.7299270629883, |
|
"epoch": 0.6997900629811057, |
|
"grad_norm": 0.14305374026298523, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.003100198577158153, |
|
"reward_std": 0.001635652908589691, |
|
"rewards/semantic_entropy_math_reward": 0.003100198577158153, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 945.5491256713867, |
|
"epoch": 0.7053883834849545, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/semantic_entropy_math_reward": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 936.7715911865234, |
|
"epoch": 0.7109867039888034, |
|
"grad_norm": 0.2572309970855713, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.002852182718925178, |
|
"reward_std": 0.001252256624866277, |
|
"rewards/semantic_entropy_math_reward": 0.002852182718925178, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 928.2775459289551, |
|
"epoch": 0.7165850244926522, |
|
"grad_norm": 0.1054806336760521, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0008680556202307343, |
|
"reward_std": 0.0002896201331168413, |
|
"rewards/semantic_entropy_math_reward": 0.0008680556202307343, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 933.6108818054199, |
|
"epoch": 0.722183344996501, |
|
"grad_norm": 0.2958269417285919, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0019841270986944437, |
|
"reward_std": 0.0009626364917494357, |
|
"rewards/semantic_entropy_math_reward": 0.0019841270986944437, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 934.7604331970215, |
|
"epoch": 0.72778166550035, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/semantic_entropy_math_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 943.2418365478516, |
|
"epoch": 0.7333799860041987, |
|
"grad_norm": 0.4255366623401642, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0008680556202307343, |
|
"reward_std": 0.0002896201331168413, |
|
"rewards/semantic_entropy_math_reward": 0.0008680556202307343, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 928.0707015991211, |
|
"epoch": 0.7389783065080476, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/semantic_entropy_math_reward": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 920.4546318054199, |
|
"epoch": 0.7445766270118964, |
|
"grad_norm": 0.25327780842781067, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0008680556202307343, |
|
"reward_std": 0.0002896201331168413, |
|
"rewards/semantic_entropy_math_reward": 0.0008680556202307343, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 894.5632667541504, |
|
"epoch": 0.7501749475157453, |
|
"grad_norm": 0.29155024886131287, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0008680556202307343, |
|
"reward_std": 0.0002896201331168413, |
|
"rewards/semantic_entropy_math_reward": 0.0008680556202307343, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 903.4895935058594, |
|
"epoch": 0.7557732680195941, |
|
"grad_norm": 0.313747376203537, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0019841270986944437, |
|
"reward_std": 0.0009626365499570966, |
|
"rewards/semantic_entropy_math_reward": 0.0019841270986944437, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 858.8898887634277, |
|
"epoch": 0.761371588523443, |
|
"grad_norm": 0.23342648148536682, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.002604166860692203, |
|
"reward_std": 0.000868860399350524, |
|
"rewards/semantic_entropy_math_reward": 0.002604166860692203, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 880.4099922180176, |
|
"epoch": 0.7669699090272918, |
|
"grad_norm": 0.863322913646698, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.002604166860692203, |
|
"reward_std": 0.000868860399350524, |
|
"rewards/semantic_entropy_math_reward": 0.002604166860692203, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 870.8638534545898, |
|
"epoch": 0.7725682295311407, |
|
"grad_norm": 0.23507745563983917, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.004092262242920697, |
|
"reward_std": 0.002001996588660404, |
|
"rewards/semantic_entropy_math_reward": 0.004092262242920697, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 826.1190567016602, |
|
"epoch": 0.7781665500349895, |
|
"grad_norm": 0.5723982453346252, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.005828373366966844, |
|
"reward_std": 0.00254713196773082, |
|
"rewards/semantic_entropy_math_reward": 0.005828373366966844, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 822.2314186096191, |
|
"epoch": 0.7837648705388384, |
|
"grad_norm": 1.1474024057388306, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.012896826025098562, |
|
"reward_std": 0.006677947181742638, |
|
"rewards/semantic_entropy_math_reward": 0.012896826025098562, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 847.2269515991211, |
|
"epoch": 0.7893631910426872, |
|
"grad_norm": 1.597180724143982, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.011656746733933687, |
|
"reward_std": 0.005111316044349223, |
|
"rewards/semantic_entropy_math_reward": 0.011656746733933687, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 842.1793327331543, |
|
"epoch": 0.794961511546536, |
|
"grad_norm": 0.7798174619674683, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.012152778450399637, |
|
"reward_std": 0.004957869183272123, |
|
"rewards/semantic_entropy_math_reward": 0.012152778450399637, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 831.2462921142578, |
|
"epoch": 0.8005598320503848, |
|
"grad_norm": 2.8579814434051514, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.028769842814654112, |
|
"reward_std": 0.01406612026039511, |
|
"rewards/semantic_entropy_math_reward": 0.028769842814654112, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 789.3541831970215, |
|
"epoch": 0.8061581525542337, |
|
"grad_norm": 3.0082154273986816, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.026289683999493718, |
|
"reward_std": 0.014727714413311332, |
|
"rewards/semantic_entropy_math_reward": 0.026289683999493718, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 766.1949501037598, |
|
"epoch": 0.8117564730580826, |
|
"grad_norm": 5.142900466918945, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.049107144586741924, |
|
"reward_std": 0.028222940571140498, |
|
"rewards/semantic_entropy_math_reward": 0.049107144586741924, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 758.3102760314941, |
|
"epoch": 0.8173547935619314, |
|
"grad_norm": 2.1914703845977783, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.071180559694767, |
|
"reward_std": 0.042430724075529724, |
|
"rewards/semantic_entropy_math_reward": 0.071180559694767, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 705.601203918457, |
|
"epoch": 0.8229531140657803, |
|
"grad_norm": 2.148831605911255, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.09474206739105284, |
|
"reward_std": 0.051515123690478504, |
|
"rewards/semantic_entropy_math_reward": 0.09474206739105284, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 714.9263496398926, |
|
"epoch": 0.8285514345696291, |
|
"grad_norm": 2.954876184463501, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.10466270102187991, |
|
"reward_std": 0.06257531465962529, |
|
"rewards/semantic_entropy_math_reward": 0.10466270102187991, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 674.5491104125977, |
|
"epoch": 0.834149755073478, |
|
"grad_norm": 2.0350775718688965, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.13653274183161557, |
|
"reward_std": 0.07629197556525469, |
|
"rewards/semantic_entropy_math_reward": 0.13653274183161557, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 637.178581237793, |
|
"epoch": 0.8397480755773268, |
|
"grad_norm": 3.029639482498169, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.1703869104385376, |
|
"reward_std": 0.08733957540243864, |
|
"rewards/semantic_entropy_math_reward": 0.1703869104385376, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 673.2440567016602, |
|
"epoch": 0.8453463960811757, |
|
"grad_norm": 1.96150803565979, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.14583333884365857, |
|
"reward_std": 0.08530042838538066, |
|
"rewards/semantic_entropy_math_reward": 0.14583333884365857, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 618.9092407226562, |
|
"epoch": 0.8509447165850245, |
|
"grad_norm": 1.5204460620880127, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.12810020195320249, |
|
"reward_std": 0.07424356217961758, |
|
"rewards/semantic_entropy_math_reward": 0.12810020195320249, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 596.5007553100586, |
|
"epoch": 0.8565430370888734, |
|
"grad_norm": 2.1267242431640625, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.07378472597338259, |
|
"reward_std": 0.04104530799668282, |
|
"rewards/semantic_entropy_math_reward": 0.07378472597338259, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 596.3943500518799, |
|
"epoch": 0.8621413575927221, |
|
"grad_norm": 1.156010627746582, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.051835319376550615, |
|
"reward_std": 0.028525067784357816, |
|
"rewards/semantic_entropy_math_reward": 0.051835319376550615, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 649.7128067016602, |
|
"epoch": 0.867739678096571, |
|
"grad_norm": 0.9627940058708191, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.03881448460742831, |
|
"reward_std": 0.022329314553644508, |
|
"rewards/semantic_entropy_math_reward": 0.03881448460742831, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 678.5580520629883, |
|
"epoch": 0.8733379986004198, |
|
"grad_norm": 1.0966858863830566, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.037822422687895596, |
|
"reward_std": 0.0146516069653444, |
|
"rewards/semantic_entropy_math_reward": 0.037822422687895596, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 727.682300567627, |
|
"epoch": 0.8789363191042687, |
|
"grad_norm": 1.220819115638733, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.04303075547795743, |
|
"reward_std": 0.02123075199779123, |
|
"rewards/semantic_entropy_math_reward": 0.04303075547795743, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 693.4948043823242, |
|
"epoch": 0.8845346396081175, |
|
"grad_norm": 2.7909467220306396, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.09176587720867246, |
|
"reward_std": 0.04625473154010251, |
|
"rewards/semantic_entropy_math_reward": 0.09176587720867246, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 734.0967407226562, |
|
"epoch": 0.8901329601119664, |
|
"grad_norm": 1.8816090822219849, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.09660218469798565, |
|
"reward_std": 0.0377241056994535, |
|
"rewards/semantic_entropy_math_reward": 0.09660218469798565, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 750.3869190216064, |
|
"epoch": 0.8957312806158153, |
|
"grad_norm": 1.124207854270935, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.09176587732508779, |
|
"reward_std": 0.04596630920423195, |
|
"rewards/semantic_entropy_math_reward": 0.09176587732508779, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 760.7053718566895, |
|
"epoch": 0.9013296011196641, |
|
"grad_norm": 1.2950589656829834, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.09536210435908288, |
|
"reward_std": 0.05483918351819739, |
|
"rewards/semantic_entropy_math_reward": 0.09536210435908288, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 761.5186157226562, |
|
"epoch": 0.906927921623513, |
|
"grad_norm": 0.7260070443153381, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.10466269659809768, |
|
"reward_std": 0.04580701712984592, |
|
"rewards/semantic_entropy_math_reward": 0.10466269659809768, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 713.2239723205566, |
|
"epoch": 0.9125262421273618, |
|
"grad_norm": 0.9426623582839966, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.13727678637951612, |
|
"reward_std": 0.06796611158642918, |
|
"rewards/semantic_entropy_math_reward": 0.13727678637951612, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 705.643611907959, |
|
"epoch": 0.9181245626312107, |
|
"grad_norm": 1.4740947484970093, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.18241567676886916, |
|
"reward_std": 0.08067136688623577, |
|
"rewards/semantic_entropy_math_reward": 0.18241567676886916, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 731.9352836608887, |
|
"epoch": 0.9237228831350595, |
|
"grad_norm": 0.8142076134681702, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.15947421244345605, |
|
"reward_std": 0.0733173037879169, |
|
"rewards/semantic_entropy_math_reward": 0.15947421244345605, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 794.4799270629883, |
|
"epoch": 0.9293212036389084, |
|
"grad_norm": 0.5677748322486877, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.14360119448974729, |
|
"reward_std": 0.06343726580962539, |
|
"rewards/semantic_entropy_math_reward": 0.14360119448974729, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 767.6942100524902, |
|
"epoch": 0.9349195241427571, |
|
"grad_norm": 0.707513153553009, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.1733631044626236, |
|
"reward_std": 0.06966668769018725, |
|
"rewards/semantic_entropy_math_reward": 0.1733631044626236, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 822.2001686096191, |
|
"epoch": 0.940517844646606, |
|
"grad_norm": 0.5552101731300354, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.1324404844781384, |
|
"reward_std": 0.05633544718148187, |
|
"rewards/semantic_entropy_math_reward": 0.1324404844781384, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 841.3861694335938, |
|
"epoch": 0.9461161651504548, |
|
"grad_norm": 0.26897794008255005, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.13591270486358553, |
|
"reward_std": 0.04643521481193602, |
|
"rewards/semantic_entropy_math_reward": 0.13591270486358553, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 842.628734588623, |
|
"epoch": 0.9517144856543037, |
|
"grad_norm": 0.4329053461551666, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.12388393026776612, |
|
"reward_std": 0.06243461259873584, |
|
"rewards/semantic_entropy_math_reward": 0.12388393026776612, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 852.5639991760254, |
|
"epoch": 0.9573128061581525, |
|
"grad_norm": 0.3284468352794647, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.12909226480405778, |
|
"reward_std": 0.05489638983272016, |
|
"rewards/semantic_entropy_math_reward": 0.12909226480405778, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 864.0989837646484, |
|
"epoch": 0.9629111266620014, |
|
"grad_norm": 0.4691842794418335, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.12165178672876209, |
|
"reward_std": 0.05811122967861593, |
|
"rewards/semantic_entropy_math_reward": 0.12165178672876209, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 894.2678718566895, |
|
"epoch": 0.9685094471658502, |
|
"grad_norm": 0.43808817863464355, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.0978422649204731, |
|
"reward_std": 0.043283838051138446, |
|
"rewards/semantic_entropy_math_reward": 0.0978422649204731, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 890.3757591247559, |
|
"epoch": 0.9741077676696991, |
|
"grad_norm": 0.7790409922599792, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.09747024194803089, |
|
"reward_std": 0.04836806608363986, |
|
"rewards/semantic_entropy_math_reward": 0.09747024194803089, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 922.7834930419922, |
|
"epoch": 0.979706088173548, |
|
"grad_norm": 1.0157604217529297, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.06386408989783376, |
|
"reward_std": 0.03726256394293159, |
|
"rewards/semantic_entropy_math_reward": 0.06386408989783376, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 921.322193145752, |
|
"epoch": 0.9853044086773968, |
|
"grad_norm": 1.7826768159866333, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.07279266219120473, |
|
"reward_std": 0.04188264685217291, |
|
"rewards/semantic_entropy_math_reward": 0.07279266219120473, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 897.9613227844238, |
|
"epoch": 0.9909027291812457, |
|
"grad_norm": 5.7943010330200195, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.10453869169577956, |
|
"reward_std": 0.04441492026671767, |
|
"rewards/semantic_entropy_math_reward": 0.10453869169577956, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 899.3638496398926, |
|
"epoch": 0.9965010496850945, |
|
"grad_norm": 3.6061346530914307, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.08655754150822759, |
|
"reward_std": 0.03804769192356616, |
|
"rewards/semantic_entropy_math_reward": 0.08655754150822759, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.9965010496850945, |
|
"step": 178, |
|
"total_flos": 0.0, |
|
"train_loss": 5.786010212772085e-09, |
|
"train_runtime": 7433.3342, |
|
"train_samples_per_second": 2.691, |
|
"train_steps_per_second": 0.024 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 178, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|