{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9965010496850945, "eval_steps": 100, "global_step": 178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 548.2455501556396, "epoch": 0.005598320503848845, "grad_norm": 0.01954779587686062, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.238839291036129, "reward_std": 0.11606137279886752, "rewards/semantic_entropy_math_reward": 0.238839291036129, "step": 1 }, { "completion_length": 526.9025402069092, "epoch": 0.01119664100769769, "grad_norm": 0.018136516213417053, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2369791674427688, "reward_std": 0.10657330928370357, "rewards/semantic_entropy_math_reward": 0.2369791674427688, "step": 2 }, { "completion_length": 562.8943519592285, "epoch": 0.016794961511546535, "grad_norm": 0.01578659377992153, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.23115079675335437, "reward_std": 0.09216207754798234, "rewards/semantic_entropy_math_reward": 0.23115079675335437, "step": 3 }, { "completion_length": 571.5796279907227, "epoch": 0.02239328201539538, "grad_norm": 0.018513264134526253, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2011408838443458, "reward_std": 0.09730021236464381, "rewards/semantic_entropy_math_reward": 0.2011408838443458, "step": 4 }, { "completion_length": 569.882453918457, "epoch": 0.02799160251924423, "grad_norm": 0.023015329614281654, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.23325893050059676, "reward_std": 0.11453290120698512, "rewards/semantic_entropy_math_reward": 0.23325893050059676, "step": 5 }, { "completion_length": 541.0096778869629, "epoch": 0.03358992302309307, "grad_norm": 0.017617080360651016, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2580605214461684, "reward_std": 0.1201386651955545, "rewards/semantic_entropy_math_reward": 0.2580605214461684, "step": 6 }, { "completion_length": 579.73512840271, "epoch": 0.03918824352694192, "grad_norm": 0.0158968698233366, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2048611151985824, "reward_std": 0.0955489007756114, "rewards/semantic_entropy_math_reward": 0.2048611151985824, "step": 7 }, { "completion_length": 515.7500076293945, "epoch": 0.04478656403079076, "grad_norm": 0.022326918318867683, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.21825397666543722, "reward_std": 0.0982005288824439, "rewards/semantic_entropy_math_reward": 0.21825397666543722, "step": 8 }, { "completion_length": 590.3921222686768, "epoch": 0.05038488453463961, "grad_norm": 0.016080491244792938, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.21205357322469354, "reward_std": 0.10619991598650813, "rewards/semantic_entropy_math_reward": 0.21205357322469354, "step": 9 }, { "completion_length": 581.6651878356934, "epoch": 0.05598320503848846, "grad_norm": 0.017059899866580963, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.21899801935069263, "reward_std": 0.10072531108744442, "rewards/semantic_entropy_math_reward": 0.21899801935069263, "step": 10 }, { "completion_length": 571.7886981964111, "epoch": 0.0615815255423373, "grad_norm": 0.01674991473555565, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2311508022248745, "reward_std": 0.10565085639245808, "rewards/semantic_entropy_math_reward": 0.2311508022248745, "step": 11 }, { "completion_length": 562.854923248291, "epoch": 0.06717984604618614, "grad_norm": 0.018872996792197227, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2327629025094211, "reward_std": 0.10614555445499718, "rewards/semantic_entropy_math_reward": 0.2327629025094211, "step": 12 }, { "completion_length": 568.9709930419922, "epoch": 0.072778166550035, "grad_norm": 0.015940172597765923, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.23288691393099725, "reward_std": 0.10073893028311431, "rewards/semantic_entropy_math_reward": 0.23288691393099725, "step": 13 }, { "completion_length": 593.6733703613281, "epoch": 0.07837648705388384, "grad_norm": 0.015898453071713448, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.22792659373953938, "reward_std": 0.09615750901866704, "rewards/semantic_entropy_math_reward": 0.22792659373953938, "step": 14 }, { "completion_length": 584.9248657226562, "epoch": 0.08397480755773268, "grad_norm": 0.014255843125283718, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2467758092097938, "reward_std": 0.08825354627333581, "rewards/semantic_entropy_math_reward": 0.2467758092097938, "step": 15 }, { "completion_length": 516.8154811859131, "epoch": 0.08957312806158152, "grad_norm": 0.022125836461782455, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2909226221963763, "reward_std": 0.12354861758649349, "rewards/semantic_entropy_math_reward": 0.2909226221963763, "step": 16 }, { "completion_length": 551.7343845367432, "epoch": 0.09517144856543037, "grad_norm": 0.016181064769625664, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2682291769888252, "reward_std": 0.10501077049411833, "rewards/semantic_entropy_math_reward": 0.2682291769888252, "step": 17 }, { "completion_length": 618.2076015472412, "epoch": 0.10076976906927922, "grad_norm": 0.019598443061113358, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2562004067003727, "reward_std": 0.11642820481210947, "rewards/semantic_entropy_math_reward": 0.2562004067003727, "step": 18 }, { "completion_length": 561.2827529907227, "epoch": 0.10636808957312806, "grad_norm": 0.015643073245882988, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.26202877750620246, "reward_std": 0.10126263811253011, "rewards/semantic_entropy_math_reward": 0.26202877750620246, "step": 19 }, { "completion_length": 579.395845413208, "epoch": 0.11196641007697691, "grad_norm": 0.017556050792336464, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2761656870134175, "reward_std": 0.11453696829266846, "rewards/semantic_entropy_math_reward": 0.2761656870134175, "step": 20 }, { "completion_length": 580.3199481964111, "epoch": 0.11756473058082575, "grad_norm": 0.01937718316912651, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2857142901048064, "reward_std": 0.10658370074816048, "rewards/semantic_entropy_math_reward": 0.2857142901048064, "step": 21 }, { "completion_length": 638.837064743042, "epoch": 0.1231630510846746, "grad_norm": 0.018222520127892494, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.21614584047347307, "reward_std": 0.0887793127913028, "rewards/semantic_entropy_math_reward": 0.21614584047347307, "step": 22 }, { "completion_length": 616.3043251037598, "epoch": 0.12876137158852344, "grad_norm": 0.015998607501387596, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2881944440305233, "reward_std": 0.09268029552185908, "rewards/semantic_entropy_math_reward": 0.2881944440305233, "step": 23 }, { "completion_length": 608.8288803100586, "epoch": 0.13435969209237228, "grad_norm": 0.015030119568109512, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2903025895357132, "reward_std": 0.10364439443219453, "rewards/semantic_entropy_math_reward": 0.2903025895357132, "step": 24 }, { "completion_length": 569.1443557739258, "epoch": 0.13995801259622112, "grad_norm": 0.01883069984614849, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2831101273186505, "reward_std": 0.10425049322657287, "rewards/semantic_entropy_math_reward": 0.2831101273186505, "step": 25 }, { "completion_length": 573.0587940216064, "epoch": 0.14555633310007, "grad_norm": 0.019200004637241364, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2563244076445699, "reward_std": 0.09196250140666962, "rewards/semantic_entropy_math_reward": 0.2563244076445699, "step": 26 }, { "completion_length": 601.1659317016602, "epoch": 0.15115465360391883, "grad_norm": 0.016727037727832794, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.26822917023673654, "reward_std": 0.09728709328919649, "rewards/semantic_entropy_math_reward": 0.26822917023673654, "step": 27 }, { "completion_length": 615.2797718048096, "epoch": 0.15675297410776767, "grad_norm": 0.01875486597418785, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3012152877636254, "reward_std": 0.11328667728230357, "rewards/semantic_entropy_math_reward": 0.3012152877636254, "step": 28 }, { "completion_length": 611.0647449493408, "epoch": 0.16235129461161651, "grad_norm": 0.01615230180323124, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.33482143841683865, "reward_std": 0.09433427458861843, "rewards/semantic_entropy_math_reward": 0.33482143841683865, "step": 29 }, { "completion_length": 622.8876724243164, "epoch": 0.16794961511546536, "grad_norm": 0.016654223203659058, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.28807044588029385, "reward_std": 0.1034616008400917, "rewards/semantic_entropy_math_reward": 0.28807044588029385, "step": 30 }, { "completion_length": 639.4181709289551, "epoch": 0.1735479356193142, "grad_norm": 0.01327348593622446, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2810019925236702, "reward_std": 0.09964999603107572, "rewards/semantic_entropy_math_reward": 0.2810019925236702, "step": 31 }, { "completion_length": 629.0714416503906, "epoch": 0.17914625612316304, "grad_norm": 0.017067549750208855, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2867063535377383, "reward_std": 0.10988868214190006, "rewards/semantic_entropy_math_reward": 0.2867063535377383, "step": 32 }, { "completion_length": 622.2953987121582, "epoch": 0.1847445766270119, "grad_norm": 0.01429623831063509, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3038194472901523, "reward_std": 0.10957263736054301, "rewards/semantic_entropy_math_reward": 0.3038194472901523, "step": 33 }, { "completion_length": 597.1503067016602, "epoch": 0.19034289713086075, "grad_norm": 0.013799657113850117, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3093998031690717, "reward_std": 0.10120404063491151, "rewards/semantic_entropy_math_reward": 0.3093998031690717, "step": 34 }, { "completion_length": 636.1056709289551, "epoch": 0.1959412176347096, "grad_norm": 0.011626561172306538, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3293650816194713, "reward_std": 0.09622950968332589, "rewards/semantic_entropy_math_reward": 0.3293650816194713, "step": 35 }, { "completion_length": 591.780517578125, "epoch": 0.20153953813855843, "grad_norm": 0.011638457886874676, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.38231647573411465, "reward_std": 0.11920557962730527, "rewards/semantic_entropy_math_reward": 0.38231647573411465, "step": 36 }, { "completion_length": 580.5230712890625, "epoch": 0.20713785864240727, "grad_norm": 0.012719900347292423, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.4064980298280716, "reward_std": 0.12387050059624016, "rewards/semantic_entropy_math_reward": 0.4064980298280716, "step": 37 }, { "completion_length": 617.6317043304443, "epoch": 0.21273617914625612, "grad_norm": 0.012224080041050911, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.30778770707547665, "reward_std": 0.08965765126049519, "rewards/semantic_entropy_math_reward": 0.30778770707547665, "step": 38 }, { "completion_length": 608.4538803100586, "epoch": 0.21833449965010496, "grad_norm": 0.011945121921598911, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3193204468116164, "reward_std": 0.10897644911892712, "rewards/semantic_entropy_math_reward": 0.3193204468116164, "step": 39 }, { "completion_length": 642.5602779388428, "epoch": 0.22393282015395383, "grad_norm": 0.012211363762617111, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.30456350091844797, "reward_std": 0.10713349282741547, "rewards/semantic_entropy_math_reward": 0.30456350091844797, "step": 40 }, { "completion_length": 639.2768001556396, "epoch": 0.22953114065780267, "grad_norm": 0.012564162723720074, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.29551092034671456, "reward_std": 0.10152047942392528, "rewards/semantic_entropy_math_reward": 0.29551092034671456, "step": 41 }, { "completion_length": 621.8891487121582, "epoch": 0.2351294611616515, "grad_norm": 0.01134682260453701, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3234127121977508, "reward_std": 0.1085170682054013, "rewards/semantic_entropy_math_reward": 0.3234127121977508, "step": 42 }, { "completion_length": 628.6324501037598, "epoch": 0.24072778166550035, "grad_norm": 0.012990601360797882, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.31237600184977055, "reward_std": 0.09812885848805308, "rewards/semantic_entropy_math_reward": 0.31237600184977055, "step": 43 }, { "completion_length": 627.9442024230957, "epoch": 0.2463261021693492, "grad_norm": 0.012126186862587929, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3050595261156559, "reward_std": 0.10460688779130578, "rewards/semantic_entropy_math_reward": 0.3050595261156559, "step": 44 }, { "completion_length": 628.8229331970215, "epoch": 0.25192442267319803, "grad_norm": 0.011053094640374184, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.32576886005699635, "reward_std": 0.09965397394262254, "rewards/semantic_entropy_math_reward": 0.32576886005699635, "step": 45 }, { "completion_length": 607.1689109802246, "epoch": 0.2575227431770469, "grad_norm": 0.012990483082830906, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3343254020437598, "reward_std": 0.11526623973622918, "rewards/semantic_entropy_math_reward": 0.3343254020437598, "step": 46 }, { "completion_length": 623.7559700012207, "epoch": 0.2631210636808957, "grad_norm": 0.01174214854836464, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3116319542750716, "reward_std": 0.09533112193457782, "rewards/semantic_entropy_math_reward": 0.3116319542750716, "step": 47 }, { "completion_length": 629.1674213409424, "epoch": 0.26871938418474456, "grad_norm": 0.01178384106606245, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.31150794867426157, "reward_std": 0.10652086476329714, "rewards/semantic_entropy_math_reward": 0.31150794867426157, "step": 48 }, { "completion_length": 652.0907974243164, "epoch": 0.2743177046885934, "grad_norm": 0.011742531321942806, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2786458325572312, "reward_std": 0.09241121797822416, "rewards/semantic_entropy_math_reward": 0.2786458325572312, "step": 49 }, { "completion_length": 643.3266448974609, "epoch": 0.27991602519244224, "grad_norm": 0.011445428244769573, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.27740576677024364, "reward_std": 0.08390604832675308, "rewards/semantic_entropy_math_reward": 0.27740576677024364, "step": 50 }, { "completion_length": 605.7641487121582, "epoch": 0.28551434569629114, "grad_norm": 0.042504504323005676, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.29414683301001787, "reward_std": 0.11176870320923626, "rewards/semantic_entropy_math_reward": 0.29414683301001787, "step": 51 }, { "completion_length": 599.8273887634277, "epoch": 0.29111266620014, "grad_norm": 0.014151890762150288, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.30481151957064867, "reward_std": 0.09737780690193176, "rewards/semantic_entropy_math_reward": 0.30481151957064867, "step": 52 }, { "completion_length": 617.6384086608887, "epoch": 0.2967109867039888, "grad_norm": 0.012832826003432274, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.34548612125217915, "reward_std": 0.10560432635247707, "rewards/semantic_entropy_math_reward": 0.34548612125217915, "step": 53 }, { "completion_length": 639.7872161865234, "epoch": 0.30230930720783766, "grad_norm": 0.013676963746547699, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.29154266975820065, "reward_std": 0.09982788749039173, "rewards/semantic_entropy_math_reward": 0.29154266975820065, "step": 54 }, { "completion_length": 622.1547737121582, "epoch": 0.3079076277116865, "grad_norm": 0.012248532846570015, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3303571534343064, "reward_std": 0.10564735904335976, "rewards/semantic_entropy_math_reward": 0.3303571534343064, "step": 55 }, { "completion_length": 612.3623600006104, "epoch": 0.31350594821553535, "grad_norm": 0.012541095726191998, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.37437996733933687, "reward_std": 0.11628623493015766, "rewards/semantic_entropy_math_reward": 0.37437996733933687, "step": 56 }, { "completion_length": 590.5974807739258, "epoch": 0.3191042687193842, "grad_norm": 0.01250051986426115, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3829365186393261, "reward_std": 0.11953789182007313, "rewards/semantic_entropy_math_reward": 0.3829365186393261, "step": 57 }, { "completion_length": 629.9375152587891, "epoch": 0.32470258922323303, "grad_norm": 0.01244643796235323, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.30481151584535837, "reward_std": 0.10653208615258336, "rewards/semantic_entropy_math_reward": 0.30481151584535837, "step": 58 }, { "completion_length": 663.9650478363037, "epoch": 0.33030090972708187, "grad_norm": 0.011867938563227654, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2619047740008682, "reward_std": 0.08327494573313743, "rewards/semantic_entropy_math_reward": 0.2619047740008682, "step": 59 }, { "completion_length": 645.1510581970215, "epoch": 0.3358992302309307, "grad_norm": 0.01233300007879734, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3126240177080035, "reward_std": 0.10046433750540018, "rewards/semantic_entropy_math_reward": 0.3126240177080035, "step": 60 }, { "completion_length": 623.23512840271, "epoch": 0.34149755073477955, "grad_norm": 0.012479487806558609, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3306051716208458, "reward_std": 0.10351803922094405, "rewards/semantic_entropy_math_reward": 0.3306051716208458, "step": 61 }, { "completion_length": 632.4717350006104, "epoch": 0.3470958712386284, "grad_norm": 0.012427465058863163, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.31994048738852143, "reward_std": 0.10373943694867194, "rewards/semantic_entropy_math_reward": 0.31994048738852143, "step": 62 }, { "completion_length": 623.9613189697266, "epoch": 0.35269419174247724, "grad_norm": 0.013336232863366604, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.31547620333731174, "reward_std": 0.11058970703743398, "rewards/semantic_entropy_math_reward": 0.31547620333731174, "step": 63 }, { "completion_length": 640.9360294342041, "epoch": 0.3582925122463261, "grad_norm": 0.011113813146948814, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.33519346360117197, "reward_std": 0.08675730333197862, "rewards/semantic_entropy_math_reward": 0.33519346360117197, "step": 64 }, { "completion_length": 663.0587882995605, "epoch": 0.363890832750175, "grad_norm": 0.01317086722701788, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2875744178891182, "reward_std": 0.09582534979563206, "rewards/semantic_entropy_math_reward": 0.2875744178891182, "step": 65 }, { "completion_length": 625.5915279388428, "epoch": 0.3694891532540238, "grad_norm": 0.013479222543537617, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3365575489588082, "reward_std": 0.10884693474508822, "rewards/semantic_entropy_math_reward": 0.3365575489588082, "step": 66 }, { "completion_length": 642.8943557739258, "epoch": 0.37508747375787266, "grad_norm": 0.012459836900234222, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2860863204114139, "reward_std": 0.09167324285954237, "rewards/semantic_entropy_math_reward": 0.2860863204114139, "step": 67 }, { "completion_length": 594.4159355163574, "epoch": 0.3806857942617215, "grad_norm": 0.013866654597222805, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.37289187777787447, "reward_std": 0.10999573534354568, "rewards/semantic_entropy_math_reward": 0.37289187777787447, "step": 68 }, { "completion_length": 640.9055099487305, "epoch": 0.38628411476557034, "grad_norm": 0.012836214154958725, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2891865139827132, "reward_std": 0.10698562301695347, "rewards/semantic_entropy_math_reward": 0.2891865139827132, "step": 69 }, { "completion_length": 623.5349769592285, "epoch": 0.3918824352694192, "grad_norm": 0.013130308128893375, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3031994132325053, "reward_std": 0.10316204163245857, "rewards/semantic_entropy_math_reward": 0.3031994132325053, "step": 70 }, { "completion_length": 593.7172718048096, "epoch": 0.397480755773268, "grad_norm": 0.013811892829835415, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.36693950183689594, "reward_std": 0.10302408610004932, "rewards/semantic_entropy_math_reward": 0.36693950183689594, "step": 71 }, { "completion_length": 597.6815605163574, "epoch": 0.40307907627711687, "grad_norm": 0.013042716309428215, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.359002991579473, "reward_std": 0.12008155998773873, "rewards/semantic_entropy_math_reward": 0.359002991579473, "step": 72 }, { "completion_length": 594.0282955169678, "epoch": 0.4086773967809657, "grad_norm": 0.01363166980445385, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3308531828224659, "reward_std": 0.1005927964579314, "rewards/semantic_entropy_math_reward": 0.3308531828224659, "step": 73 }, { "completion_length": 638.837064743042, "epoch": 0.41427571728481455, "grad_norm": 0.01323084905743599, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2997271902859211, "reward_std": 0.1091692647896707, "rewards/semantic_entropy_math_reward": 0.2997271902859211, "step": 74 }, { "completion_length": 627.7797698974609, "epoch": 0.4198740377886634, "grad_norm": 0.013865278102457523, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3164682723581791, "reward_std": 0.10915092006325722, "rewards/semantic_entropy_math_reward": 0.3164682723581791, "step": 75 }, { "completion_length": 623.7016448974609, "epoch": 0.42547235829251223, "grad_norm": 0.013494770042598248, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.36892362777143717, "reward_std": 0.101050935103558, "rewards/semantic_entropy_math_reward": 0.36892362777143717, "step": 76 }, { "completion_length": 632.0245666503906, "epoch": 0.4310706787963611, "grad_norm": 0.013310288079082966, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.31349207321181893, "reward_std": 0.10522611381020397, "rewards/semantic_entropy_math_reward": 0.31349207321181893, "step": 77 }, { "completion_length": 629.9278392791748, "epoch": 0.4366689993002099, "grad_norm": 0.012805829755961895, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.30803572107106447, "reward_std": 0.09856258635409176, "rewards/semantic_entropy_math_reward": 0.30803572107106447, "step": 78 }, { "completion_length": 607.979175567627, "epoch": 0.44226731980405876, "grad_norm": 0.013692095875740051, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.32142858440056443, "reward_std": 0.1157424389384687, "rewards/semantic_entropy_math_reward": 0.32142858440056443, "step": 79 }, { "completion_length": 616.4501533508301, "epoch": 0.44786564030790765, "grad_norm": 0.014579751528799534, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3225446487776935, "reward_std": 0.10182817134773359, "rewards/semantic_entropy_math_reward": 0.3225446487776935, "step": 80 }, { "completion_length": 654.9092407226562, "epoch": 0.4534639608117565, "grad_norm": 0.013945615850389004, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3175843362696469, "reward_std": 0.09967625606805086, "rewards/semantic_entropy_math_reward": 0.3175843362696469, "step": 81 }, { "completion_length": 615.4084930419922, "epoch": 0.45906228131560534, "grad_norm": 0.014666857197880745, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3359375102445483, "reward_std": 0.10500352433882654, "rewards/semantic_entropy_math_reward": 0.3359375102445483, "step": 82 }, { "completion_length": 615.1317005157471, "epoch": 0.4646606018194542, "grad_norm": 0.014408071525394917, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3276289766654372, "reward_std": 0.10603603813797235, "rewards/semantic_entropy_math_reward": 0.3276289766654372, "step": 83 }, { "completion_length": 566.331111907959, "epoch": 0.470258922323303, "grad_norm": 0.015806101262569427, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.40525794867426157, "reward_std": 0.12493724888190627, "rewards/semantic_entropy_math_reward": 0.40525794867426157, "step": 84 }, { "completion_length": 651.4486694335938, "epoch": 0.47585724282715186, "grad_norm": 0.014175321906805038, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.29402281949296594, "reward_std": 0.10067101614549756, "rewards/semantic_entropy_math_reward": 0.29402281949296594, "step": 85 }, { "completion_length": 627.6443481445312, "epoch": 0.4814555633310007, "grad_norm": 0.016120119020342827, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3126240139827132, "reward_std": 0.10280289477668703, "rewards/semantic_entropy_math_reward": 0.3126240139827132, "step": 86 }, { "completion_length": 635.4985237121582, "epoch": 0.48705388383484954, "grad_norm": 0.014732223004102707, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3711557686328888, "reward_std": 0.1085683039855212, "rewards/semantic_entropy_math_reward": 0.3711557686328888, "step": 87 }, { "completion_length": 598.274564743042, "epoch": 0.4926522043386984, "grad_norm": 0.015387475490570068, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.39397322107106447, "reward_std": 0.12118404218927026, "rewards/semantic_entropy_math_reward": 0.39397322107106447, "step": 88 }, { "completion_length": 607.96950340271, "epoch": 0.4982505248425472, "grad_norm": 0.015765592455863953, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.35466271452605724, "reward_std": 0.10943171451799572, "rewards/semantic_entropy_math_reward": 0.35466271452605724, "step": 89 }, { "completion_length": 585.819206237793, "epoch": 0.5038488453463961, "grad_norm": 0.016726847738027573, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3410218358039856, "reward_std": 0.1104447974357754, "rewards/semantic_entropy_math_reward": 0.3410218358039856, "step": 90 }, { "completion_length": 637.3958435058594, "epoch": 0.509447165850245, "grad_norm": 0.015591591596603394, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3288690596818924, "reward_std": 0.10522727854549885, "rewards/semantic_entropy_math_reward": 0.3288690596818924, "step": 91 }, { "completion_length": 625.2872085571289, "epoch": 0.5150454863540938, "grad_norm": 0.016664857044816017, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.32118055783212185, "reward_std": 0.12649832549504936, "rewards/semantic_entropy_math_reward": 0.32118055783212185, "step": 92 }, { "completion_length": 593.0282936096191, "epoch": 0.5206438068579426, "grad_norm": 0.01702903024852276, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.33296132273972034, "reward_std": 0.12325585260987282, "rewards/semantic_entropy_math_reward": 0.33296132273972034, "step": 93 }, { "completion_length": 617.7559642791748, "epoch": 0.5262421273617914, "grad_norm": 0.0172974094748497, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.316096234600991, "reward_std": 0.1146786012686789, "rewards/semantic_entropy_math_reward": 0.316096234600991, "step": 94 }, { "completion_length": 636.2671279907227, "epoch": 0.5318404478656403, "grad_norm": 0.017720209434628487, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2721974281594157, "reward_std": 0.10946512292139232, "rewards/semantic_entropy_math_reward": 0.2721974281594157, "step": 95 }, { "completion_length": 586.1860256195068, "epoch": 0.5374387683694891, "grad_norm": 0.01675945706665516, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.34561012499034405, "reward_std": 0.08876445493660867, "rewards/semantic_entropy_math_reward": 0.34561012499034405, "step": 96 }, { "completion_length": 611.3735218048096, "epoch": 0.543037088873338, "grad_norm": 0.01725374162197113, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.36408730782568455, "reward_std": 0.11099315108731389, "rewards/semantic_entropy_math_reward": 0.36408730782568455, "step": 97 }, { "completion_length": 614.9278392791748, "epoch": 0.5486354093771868, "grad_norm": 0.017101502045989037, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.32390873692929745, "reward_std": 0.10804228414781392, "rewards/semantic_entropy_math_reward": 0.32390873692929745, "step": 98 }, { "completion_length": 607.1049213409424, "epoch": 0.5542337298810357, "grad_norm": 0.019047001376748085, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.35553076304495335, "reward_std": 0.1101542457472533, "rewards/semantic_entropy_math_reward": 0.35553076304495335, "step": 99 }, { "completion_length": 622.6064147949219, "epoch": 0.5598320503848845, "grad_norm": 0.019254466518759727, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3213045708835125, "reward_std": 0.09209134639240801, "rewards/semantic_entropy_math_reward": 0.3213045708835125, "step": 100 }, { "completion_length": 598.7961406707764, "epoch": 0.5654303708887334, "grad_norm": 0.020497458055615425, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.33866569120436907, "reward_std": 0.10611562361009419, "rewards/semantic_entropy_math_reward": 0.33866569120436907, "step": 101 }, { "completion_length": 650.3459911346436, "epoch": 0.5710286913925823, "grad_norm": 0.020331766456365585, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2831101296469569, "reward_std": 0.09731970471329987, "rewards/semantic_entropy_math_reward": 0.2831101296469569, "step": 102 }, { "completion_length": 607.9479351043701, "epoch": 0.5766270118964311, "grad_norm": 0.02547612227499485, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.32688493095338345, "reward_std": 0.10671929223462939, "rewards/semantic_entropy_math_reward": 0.32688493095338345, "step": 103 }, { "completion_length": 633.948673248291, "epoch": 0.58222533240028, "grad_norm": 0.020476222038269043, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3084077490493655, "reward_std": 0.1001708343392238, "rewards/semantic_entropy_math_reward": 0.3084077490493655, "step": 104 }, { "completion_length": 586.1733665466309, "epoch": 0.5878236529041287, "grad_norm": 0.030577119439840317, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3710317499935627, "reward_std": 0.1044681896455586, "rewards/semantic_entropy_math_reward": 0.3710317499935627, "step": 105 }, { "completion_length": 584.3757553100586, "epoch": 0.5934219734079776, "grad_norm": 0.030157793313264847, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.36545139644294977, "reward_std": 0.10664909472689033, "rewards/semantic_entropy_math_reward": 0.36545139644294977, "step": 106 }, { "completion_length": 610.7224864959717, "epoch": 0.5990202939118264, "grad_norm": 0.02985748089849949, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.361111125908792, "reward_std": 0.10544992447830737, "rewards/semantic_entropy_math_reward": 0.361111125908792, "step": 107 }, { "completion_length": 603.1250133514404, "epoch": 0.6046186144156753, "grad_norm": 0.03455930948257446, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3306051706895232, "reward_std": 0.09714385017286986, "rewards/semantic_entropy_math_reward": 0.3306051706895232, "step": 108 }, { "completion_length": 607.9910755157471, "epoch": 0.6102169349195241, "grad_norm": 0.08368990570306778, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3103918735869229, "reward_std": 0.12056175642646849, "rewards/semantic_entropy_math_reward": 0.3103918735869229, "step": 109 }, { "completion_length": 583.9657821655273, "epoch": 0.615815255423373, "grad_norm": 0.06647204607725143, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.40166171081364155, "reward_std": 0.11974087287671864, "rewards/semantic_entropy_math_reward": 0.40166171081364155, "step": 110 }, { "completion_length": 617.5520992279053, "epoch": 0.6214135759272218, "grad_norm": 0.06331615895032883, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.31026787124574184, "reward_std": 0.0944897800218314, "rewards/semantic_entropy_math_reward": 0.31026787124574184, "step": 111 }, { "completion_length": 597.5513515472412, "epoch": 0.6270118964310707, "grad_norm": 0.1182793602347374, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.3891369206830859, "reward_std": 0.1367324935272336, "rewards/semantic_entropy_math_reward": 0.3891369206830859, "step": 112 }, { "completion_length": 599.026050567627, "epoch": 0.6326102169349195, "grad_norm": 0.32519498467445374, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.34784227423369884, "reward_std": 0.11579506384441629, "rewards/semantic_entropy_math_reward": 0.34784227423369884, "step": 113 }, { "completion_length": 605.5989627838135, "epoch": 0.6382085374387684, "grad_norm": 0.30284300446510315, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.35925100883468986, "reward_std": 0.09489296341780573, "rewards/semantic_entropy_math_reward": 0.35925100883468986, "step": 114 }, { "completion_length": 588.3794746398926, "epoch": 0.6438068579426172, "grad_norm": 0.5091419219970703, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2926587341353297, "reward_std": 0.10896235378459096, "rewards/semantic_entropy_math_reward": 0.2926587341353297, "step": 115 }, { "completion_length": 585.5081958770752, "epoch": 0.6494051784464661, "grad_norm": 1.0533686876296997, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.302951404126361, "reward_std": 0.10090323083568364, "rewards/semantic_entropy_math_reward": 0.302951404126361, "step": 116 }, { "completion_length": 564.0706882476807, "epoch": 0.655003498950315, "grad_norm": 1.4623417854309082, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2781498096883297, "reward_std": 0.10372308688238263, "rewards/semantic_entropy_math_reward": 0.2781498096883297, "step": 117 }, { "completion_length": 590.0558166503906, "epoch": 0.6606018194541637, "grad_norm": 1.9276891946792603, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.2147817499935627, "reward_std": 0.0795764367794618, "rewards/semantic_entropy_math_reward": 0.2147817499935627, "step": 118 }, { "completion_length": 631.612361907959, "epoch": 0.6662001399580126, "grad_norm": 2.644303321838379, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.17051091720350087, "reward_std": 0.07038468832615763, "rewards/semantic_entropy_math_reward": 0.17051091720350087, "step": 119 }, { "completion_length": 666.3355770111084, "epoch": 0.6717984604618614, "grad_norm": 3.114243984222412, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.13591270288452506, "reward_std": 0.06976182584185153, "rewards/semantic_entropy_math_reward": 0.13591270288452506, "step": 120 }, { "completion_length": 745.9434585571289, "epoch": 0.6773967809657103, "grad_norm": 3.7797133922576904, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.07477678649593145, "reward_std": 0.03976587820216082, "rewards/semantic_entropy_math_reward": 0.07477678649593145, "step": 121 }, { "completion_length": 807.1480865478516, "epoch": 0.6829951014695591, "grad_norm": 2.0227487087249756, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.03819444542750716, "reward_std": 0.017828965152148157, "rewards/semantic_entropy_math_reward": 0.03819444542750716, "step": 122 }, { "completion_length": 865.9404907226562, "epoch": 0.688593421973408, "grad_norm": 0.9964215755462646, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.007936508511193097, "reward_std": 0.004420435056090355, "rewards/semantic_entropy_math_reward": 0.007936508511193097, "step": 123 }, { "completion_length": 884.2105827331543, "epoch": 0.6941917424772568, "grad_norm": 0.30527350306510925, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0017361112404614687, "reward_std": 0.0005792402662336826, "rewards/semantic_entropy_math_reward": 0.0017361112404614687, "step": 124 }, { "completion_length": 922.7299270629883, "epoch": 0.6997900629811057, "grad_norm": 0.14305374026298523, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.003100198577158153, "reward_std": 0.001635652908589691, "rewards/semantic_entropy_math_reward": 0.003100198577158153, "step": 125 }, { "completion_length": 945.5491256713867, "epoch": 0.7053883834849545, "grad_norm": 0.0, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 126 }, { "completion_length": 936.7715911865234, "epoch": 0.7109867039888034, "grad_norm": 0.2572309970855713, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.002852182718925178, "reward_std": 0.001252256624866277, "rewards/semantic_entropy_math_reward": 0.002852182718925178, "step": 127 }, { "completion_length": 928.2775459289551, "epoch": 0.7165850244926522, "grad_norm": 0.1054806336760521, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0008680556202307343, "reward_std": 0.0002896201331168413, "rewards/semantic_entropy_math_reward": 0.0008680556202307343, "step": 128 }, { "completion_length": 933.6108818054199, "epoch": 0.722183344996501, "grad_norm": 0.2958269417285919, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0019841270986944437, "reward_std": 0.0009626364917494357, "rewards/semantic_entropy_math_reward": 0.0019841270986944437, "step": 129 }, { "completion_length": 934.7604331970215, "epoch": 0.72778166550035, "grad_norm": 0.0, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 130 }, { "completion_length": 943.2418365478516, "epoch": 0.7333799860041987, "grad_norm": 0.4255366623401642, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0008680556202307343, "reward_std": 0.0002896201331168413, "rewards/semantic_entropy_math_reward": 0.0008680556202307343, "step": 131 }, { "completion_length": 928.0707015991211, "epoch": 0.7389783065080476, "grad_norm": 0.0, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 132 }, { "completion_length": 920.4546318054199, "epoch": 0.7445766270118964, "grad_norm": 0.25327780842781067, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0008680556202307343, "reward_std": 0.0002896201331168413, "rewards/semantic_entropy_math_reward": 0.0008680556202307343, "step": 133 }, { "completion_length": 894.5632667541504, "epoch": 0.7501749475157453, "grad_norm": 0.29155024886131287, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0008680556202307343, "reward_std": 0.0002896201331168413, "rewards/semantic_entropy_math_reward": 0.0008680556202307343, "step": 134 }, { "completion_length": 903.4895935058594, "epoch": 0.7557732680195941, "grad_norm": 0.313747376203537, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0019841270986944437, "reward_std": 0.0009626365499570966, "rewards/semantic_entropy_math_reward": 0.0019841270986944437, "step": 135 }, { "completion_length": 858.8898887634277, "epoch": 0.761371588523443, "grad_norm": 0.23342648148536682, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.002604166860692203, "reward_std": 0.000868860399350524, "rewards/semantic_entropy_math_reward": 0.002604166860692203, "step": 136 }, { "completion_length": 880.4099922180176, "epoch": 0.7669699090272918, "grad_norm": 0.863322913646698, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.002604166860692203, "reward_std": 0.000868860399350524, "rewards/semantic_entropy_math_reward": 0.002604166860692203, "step": 137 }, { "completion_length": 870.8638534545898, "epoch": 0.7725682295311407, "grad_norm": 0.23507745563983917, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.004092262242920697, "reward_std": 0.002001996588660404, "rewards/semantic_entropy_math_reward": 0.004092262242920697, "step": 138 }, { "completion_length": 826.1190567016602, "epoch": 0.7781665500349895, "grad_norm": 0.5723982453346252, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.005828373366966844, "reward_std": 0.00254713196773082, "rewards/semantic_entropy_math_reward": 0.005828373366966844, "step": 139 }, { "completion_length": 822.2314186096191, "epoch": 0.7837648705388384, "grad_norm": 1.1474024057388306, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.012896826025098562, "reward_std": 0.006677947181742638, "rewards/semantic_entropy_math_reward": 0.012896826025098562, "step": 140 }, { "completion_length": 847.2269515991211, "epoch": 0.7893631910426872, "grad_norm": 1.597180724143982, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.011656746733933687, "reward_std": 0.005111316044349223, "rewards/semantic_entropy_math_reward": 0.011656746733933687, "step": 141 }, { "completion_length": 842.1793327331543, "epoch": 0.794961511546536, "grad_norm": 0.7798174619674683, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.012152778450399637, "reward_std": 0.004957869183272123, "rewards/semantic_entropy_math_reward": 0.012152778450399637, "step": 142 }, { "completion_length": 831.2462921142578, "epoch": 0.8005598320503848, "grad_norm": 2.8579814434051514, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.028769842814654112, "reward_std": 0.01406612026039511, "rewards/semantic_entropy_math_reward": 0.028769842814654112, "step": 143 }, { "completion_length": 789.3541831970215, "epoch": 0.8061581525542337, "grad_norm": 3.0082154273986816, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.026289683999493718, "reward_std": 0.014727714413311332, "rewards/semantic_entropy_math_reward": 0.026289683999493718, "step": 144 }, { "completion_length": 766.1949501037598, "epoch": 0.8117564730580826, "grad_norm": 5.142900466918945, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.049107144586741924, "reward_std": 0.028222940571140498, "rewards/semantic_entropy_math_reward": 0.049107144586741924, "step": 145 }, { "completion_length": 758.3102760314941, "epoch": 0.8173547935619314, "grad_norm": 2.1914703845977783, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.071180559694767, "reward_std": 0.042430724075529724, "rewards/semantic_entropy_math_reward": 0.071180559694767, "step": 146 }, { "completion_length": 705.601203918457, "epoch": 0.8229531140657803, "grad_norm": 2.148831605911255, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09474206739105284, "reward_std": 0.051515123690478504, "rewards/semantic_entropy_math_reward": 0.09474206739105284, "step": 147 }, { "completion_length": 714.9263496398926, "epoch": 0.8285514345696291, "grad_norm": 2.954876184463501, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.10466270102187991, "reward_std": 0.06257531465962529, "rewards/semantic_entropy_math_reward": 0.10466270102187991, "step": 148 }, { "completion_length": 674.5491104125977, "epoch": 0.834149755073478, "grad_norm": 2.0350775718688965, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.13653274183161557, "reward_std": 0.07629197556525469, "rewards/semantic_entropy_math_reward": 0.13653274183161557, "step": 149 }, { "completion_length": 637.178581237793, "epoch": 0.8397480755773268, "grad_norm": 3.029639482498169, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.1703869104385376, "reward_std": 0.08733957540243864, "rewards/semantic_entropy_math_reward": 0.1703869104385376, "step": 150 }, { "completion_length": 673.2440567016602, "epoch": 0.8453463960811757, "grad_norm": 1.96150803565979, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.14583333884365857, "reward_std": 0.08530042838538066, "rewards/semantic_entropy_math_reward": 0.14583333884365857, "step": 151 }, { "completion_length": 618.9092407226562, "epoch": 0.8509447165850245, "grad_norm": 1.5204460620880127, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.12810020195320249, "reward_std": 0.07424356217961758, "rewards/semantic_entropy_math_reward": 0.12810020195320249, "step": 152 }, { "completion_length": 596.5007553100586, "epoch": 0.8565430370888734, "grad_norm": 2.1267242431640625, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.07378472597338259, "reward_std": 0.04104530799668282, "rewards/semantic_entropy_math_reward": 0.07378472597338259, "step": 153 }, { "completion_length": 596.3943500518799, "epoch": 0.8621413575927221, "grad_norm": 1.156010627746582, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.051835319376550615, "reward_std": 0.028525067784357816, "rewards/semantic_entropy_math_reward": 0.051835319376550615, "step": 154 }, { "completion_length": 649.7128067016602, "epoch": 0.867739678096571, "grad_norm": 0.9627940058708191, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.03881448460742831, "reward_std": 0.022329314553644508, "rewards/semantic_entropy_math_reward": 0.03881448460742831, "step": 155 }, { "completion_length": 678.5580520629883, "epoch": 0.8733379986004198, "grad_norm": 1.0966858863830566, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.037822422687895596, "reward_std": 0.0146516069653444, "rewards/semantic_entropy_math_reward": 0.037822422687895596, "step": 156 }, { "completion_length": 727.682300567627, "epoch": 0.8789363191042687, "grad_norm": 1.220819115638733, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.04303075547795743, "reward_std": 0.02123075199779123, "rewards/semantic_entropy_math_reward": 0.04303075547795743, "step": 157 }, { "completion_length": 693.4948043823242, "epoch": 0.8845346396081175, "grad_norm": 2.7909467220306396, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09176587720867246, "reward_std": 0.04625473154010251, "rewards/semantic_entropy_math_reward": 0.09176587720867246, "step": 158 }, { "completion_length": 734.0967407226562, "epoch": 0.8901329601119664, "grad_norm": 1.8816090822219849, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09660218469798565, "reward_std": 0.0377241056994535, "rewards/semantic_entropy_math_reward": 0.09660218469798565, "step": 159 }, { "completion_length": 750.3869190216064, "epoch": 0.8957312806158153, "grad_norm": 1.124207854270935, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09176587732508779, "reward_std": 0.04596630920423195, "rewards/semantic_entropy_math_reward": 0.09176587732508779, "step": 160 }, { "completion_length": 760.7053718566895, "epoch": 0.9013296011196641, "grad_norm": 1.2950589656829834, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09536210435908288, "reward_std": 0.05483918351819739, "rewards/semantic_entropy_math_reward": 0.09536210435908288, "step": 161 }, { "completion_length": 761.5186157226562, "epoch": 0.906927921623513, "grad_norm": 0.7260070443153381, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.10466269659809768, "reward_std": 0.04580701712984592, "rewards/semantic_entropy_math_reward": 0.10466269659809768, "step": 162 }, { "completion_length": 713.2239723205566, "epoch": 0.9125262421273618, "grad_norm": 0.9426623582839966, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.13727678637951612, "reward_std": 0.06796611158642918, "rewards/semantic_entropy_math_reward": 0.13727678637951612, "step": 163 }, { "completion_length": 705.643611907959, "epoch": 0.9181245626312107, "grad_norm": 1.4740947484970093, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.18241567676886916, "reward_std": 0.08067136688623577, "rewards/semantic_entropy_math_reward": 0.18241567676886916, "step": 164 }, { "completion_length": 731.9352836608887, "epoch": 0.9237228831350595, "grad_norm": 0.8142076134681702, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.15947421244345605, "reward_std": 0.0733173037879169, "rewards/semantic_entropy_math_reward": 0.15947421244345605, "step": 165 }, { "completion_length": 794.4799270629883, "epoch": 0.9293212036389084, "grad_norm": 0.5677748322486877, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.14360119448974729, "reward_std": 0.06343726580962539, "rewards/semantic_entropy_math_reward": 0.14360119448974729, "step": 166 }, { "completion_length": 767.6942100524902, "epoch": 0.9349195241427571, "grad_norm": 0.707513153553009, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.1733631044626236, "reward_std": 0.06966668769018725, "rewards/semantic_entropy_math_reward": 0.1733631044626236, "step": 167 }, { "completion_length": 822.2001686096191, "epoch": 0.940517844646606, "grad_norm": 0.5552101731300354, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.1324404844781384, "reward_std": 0.05633544718148187, "rewards/semantic_entropy_math_reward": 0.1324404844781384, "step": 168 }, { "completion_length": 841.3861694335938, "epoch": 0.9461161651504548, "grad_norm": 0.26897794008255005, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.13591270486358553, "reward_std": 0.04643521481193602, "rewards/semantic_entropy_math_reward": 0.13591270486358553, "step": 169 }, { "completion_length": 842.628734588623, "epoch": 0.9517144856543037, "grad_norm": 0.4329053461551666, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.12388393026776612, "reward_std": 0.06243461259873584, "rewards/semantic_entropy_math_reward": 0.12388393026776612, "step": 170 }, { "completion_length": 852.5639991760254, "epoch": 0.9573128061581525, "grad_norm": 0.3284468352794647, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.12909226480405778, "reward_std": 0.05489638983272016, "rewards/semantic_entropy_math_reward": 0.12909226480405778, "step": 171 }, { "completion_length": 864.0989837646484, "epoch": 0.9629111266620014, "grad_norm": 0.4691842794418335, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.12165178672876209, "reward_std": 0.05811122967861593, "rewards/semantic_entropy_math_reward": 0.12165178672876209, "step": 172 }, { "completion_length": 894.2678718566895, "epoch": 0.9685094471658502, "grad_norm": 0.43808817863464355, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.0978422649204731, "reward_std": 0.043283838051138446, "rewards/semantic_entropy_math_reward": 0.0978422649204731, "step": 173 }, { "completion_length": 890.3757591247559, "epoch": 0.9741077676696991, "grad_norm": 0.7790409922599792, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09747024194803089, "reward_std": 0.04836806608363986, "rewards/semantic_entropy_math_reward": 0.09747024194803089, "step": 174 }, { "completion_length": 922.7834930419922, "epoch": 0.979706088173548, "grad_norm": 1.0157604217529297, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.06386408989783376, "reward_std": 0.03726256394293159, "rewards/semantic_entropy_math_reward": 0.06386408989783376, "step": 175 }, { "completion_length": 921.322193145752, "epoch": 0.9853044086773968, "grad_norm": 1.7826768159866333, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.07279266219120473, "reward_std": 0.04188264685217291, "rewards/semantic_entropy_math_reward": 0.07279266219120473, "step": 176 }, { "completion_length": 897.9613227844238, "epoch": 0.9909027291812457, "grad_norm": 5.7943010330200195, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.10453869169577956, "reward_std": 0.04441492026671767, "rewards/semantic_entropy_math_reward": 0.10453869169577956, "step": 177 }, { "completion_length": 899.3638496398926, "epoch": 0.9965010496850945, "grad_norm": 3.6061346530914307, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.08655754150822759, "reward_std": 0.03804769192356616, "rewards/semantic_entropy_math_reward": 0.08655754150822759, "step": 178 }, { "epoch": 0.9965010496850945, "step": 178, "total_flos": 0.0, "train_loss": 5.786010212772085e-09, "train_runtime": 7433.3342, "train_samples_per_second": 2.691, "train_steps_per_second": 0.024 } ], "logging_steps": 1, "max_steps": 178, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }