Haitao999's picture
Model save
89040f1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9965010496850945,
"eval_steps": 100,
"global_step": 178,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 548.2455501556396,
"epoch": 0.005598320503848845,
"grad_norm": 0.01954779587686062,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.238839291036129,
"reward_std": 0.11606137279886752,
"rewards/semantic_entropy_math_reward": 0.238839291036129,
"step": 1
},
{
"completion_length": 526.9025402069092,
"epoch": 0.01119664100769769,
"grad_norm": 0.018136516213417053,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2369791674427688,
"reward_std": 0.10657330928370357,
"rewards/semantic_entropy_math_reward": 0.2369791674427688,
"step": 2
},
{
"completion_length": 562.8943519592285,
"epoch": 0.016794961511546535,
"grad_norm": 0.01578659377992153,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.23115079675335437,
"reward_std": 0.09216207754798234,
"rewards/semantic_entropy_math_reward": 0.23115079675335437,
"step": 3
},
{
"completion_length": 571.5796279907227,
"epoch": 0.02239328201539538,
"grad_norm": 0.018513264134526253,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2011408838443458,
"reward_std": 0.09730021236464381,
"rewards/semantic_entropy_math_reward": 0.2011408838443458,
"step": 4
},
{
"completion_length": 569.882453918457,
"epoch": 0.02799160251924423,
"grad_norm": 0.023015329614281654,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.23325893050059676,
"reward_std": 0.11453290120698512,
"rewards/semantic_entropy_math_reward": 0.23325893050059676,
"step": 5
},
{
"completion_length": 541.0096778869629,
"epoch": 0.03358992302309307,
"grad_norm": 0.017617080360651016,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2580605214461684,
"reward_std": 0.1201386651955545,
"rewards/semantic_entropy_math_reward": 0.2580605214461684,
"step": 6
},
{
"completion_length": 579.73512840271,
"epoch": 0.03918824352694192,
"grad_norm": 0.0158968698233366,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2048611151985824,
"reward_std": 0.0955489007756114,
"rewards/semantic_entropy_math_reward": 0.2048611151985824,
"step": 7
},
{
"completion_length": 515.7500076293945,
"epoch": 0.04478656403079076,
"grad_norm": 0.022326918318867683,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.21825397666543722,
"reward_std": 0.0982005288824439,
"rewards/semantic_entropy_math_reward": 0.21825397666543722,
"step": 8
},
{
"completion_length": 590.3921222686768,
"epoch": 0.05038488453463961,
"grad_norm": 0.016080491244792938,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.21205357322469354,
"reward_std": 0.10619991598650813,
"rewards/semantic_entropy_math_reward": 0.21205357322469354,
"step": 9
},
{
"completion_length": 581.6651878356934,
"epoch": 0.05598320503848846,
"grad_norm": 0.017059899866580963,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.21899801935069263,
"reward_std": 0.10072531108744442,
"rewards/semantic_entropy_math_reward": 0.21899801935069263,
"step": 10
},
{
"completion_length": 571.7886981964111,
"epoch": 0.0615815255423373,
"grad_norm": 0.01674991473555565,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2311508022248745,
"reward_std": 0.10565085639245808,
"rewards/semantic_entropy_math_reward": 0.2311508022248745,
"step": 11
},
{
"completion_length": 562.854923248291,
"epoch": 0.06717984604618614,
"grad_norm": 0.018872996792197227,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2327629025094211,
"reward_std": 0.10614555445499718,
"rewards/semantic_entropy_math_reward": 0.2327629025094211,
"step": 12
},
{
"completion_length": 568.9709930419922,
"epoch": 0.072778166550035,
"grad_norm": 0.015940172597765923,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.23288691393099725,
"reward_std": 0.10073893028311431,
"rewards/semantic_entropy_math_reward": 0.23288691393099725,
"step": 13
},
{
"completion_length": 593.6733703613281,
"epoch": 0.07837648705388384,
"grad_norm": 0.015898453071713448,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.22792659373953938,
"reward_std": 0.09615750901866704,
"rewards/semantic_entropy_math_reward": 0.22792659373953938,
"step": 14
},
{
"completion_length": 584.9248657226562,
"epoch": 0.08397480755773268,
"grad_norm": 0.014255843125283718,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2467758092097938,
"reward_std": 0.08825354627333581,
"rewards/semantic_entropy_math_reward": 0.2467758092097938,
"step": 15
},
{
"completion_length": 516.8154811859131,
"epoch": 0.08957312806158152,
"grad_norm": 0.022125836461782455,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2909226221963763,
"reward_std": 0.12354861758649349,
"rewards/semantic_entropy_math_reward": 0.2909226221963763,
"step": 16
},
{
"completion_length": 551.7343845367432,
"epoch": 0.09517144856543037,
"grad_norm": 0.016181064769625664,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2682291769888252,
"reward_std": 0.10501077049411833,
"rewards/semantic_entropy_math_reward": 0.2682291769888252,
"step": 17
},
{
"completion_length": 618.2076015472412,
"epoch": 0.10076976906927922,
"grad_norm": 0.019598443061113358,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2562004067003727,
"reward_std": 0.11642820481210947,
"rewards/semantic_entropy_math_reward": 0.2562004067003727,
"step": 18
},
{
"completion_length": 561.2827529907227,
"epoch": 0.10636808957312806,
"grad_norm": 0.015643073245882988,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.26202877750620246,
"reward_std": 0.10126263811253011,
"rewards/semantic_entropy_math_reward": 0.26202877750620246,
"step": 19
},
{
"completion_length": 579.395845413208,
"epoch": 0.11196641007697691,
"grad_norm": 0.017556050792336464,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2761656870134175,
"reward_std": 0.11453696829266846,
"rewards/semantic_entropy_math_reward": 0.2761656870134175,
"step": 20
},
{
"completion_length": 580.3199481964111,
"epoch": 0.11756473058082575,
"grad_norm": 0.01937718316912651,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2857142901048064,
"reward_std": 0.10658370074816048,
"rewards/semantic_entropy_math_reward": 0.2857142901048064,
"step": 21
},
{
"completion_length": 638.837064743042,
"epoch": 0.1231630510846746,
"grad_norm": 0.018222520127892494,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.21614584047347307,
"reward_std": 0.0887793127913028,
"rewards/semantic_entropy_math_reward": 0.21614584047347307,
"step": 22
},
{
"completion_length": 616.3043251037598,
"epoch": 0.12876137158852344,
"grad_norm": 0.015998607501387596,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2881944440305233,
"reward_std": 0.09268029552185908,
"rewards/semantic_entropy_math_reward": 0.2881944440305233,
"step": 23
},
{
"completion_length": 608.8288803100586,
"epoch": 0.13435969209237228,
"grad_norm": 0.015030119568109512,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2903025895357132,
"reward_std": 0.10364439443219453,
"rewards/semantic_entropy_math_reward": 0.2903025895357132,
"step": 24
},
{
"completion_length": 569.1443557739258,
"epoch": 0.13995801259622112,
"grad_norm": 0.01883069984614849,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2831101273186505,
"reward_std": 0.10425049322657287,
"rewards/semantic_entropy_math_reward": 0.2831101273186505,
"step": 25
},
{
"completion_length": 573.0587940216064,
"epoch": 0.14555633310007,
"grad_norm": 0.019200004637241364,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2563244076445699,
"reward_std": 0.09196250140666962,
"rewards/semantic_entropy_math_reward": 0.2563244076445699,
"step": 26
},
{
"completion_length": 601.1659317016602,
"epoch": 0.15115465360391883,
"grad_norm": 0.016727037727832794,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.26822917023673654,
"reward_std": 0.09728709328919649,
"rewards/semantic_entropy_math_reward": 0.26822917023673654,
"step": 27
},
{
"completion_length": 615.2797718048096,
"epoch": 0.15675297410776767,
"grad_norm": 0.01875486597418785,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3012152877636254,
"reward_std": 0.11328667728230357,
"rewards/semantic_entropy_math_reward": 0.3012152877636254,
"step": 28
},
{
"completion_length": 611.0647449493408,
"epoch": 0.16235129461161651,
"grad_norm": 0.01615230180323124,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.33482143841683865,
"reward_std": 0.09433427458861843,
"rewards/semantic_entropy_math_reward": 0.33482143841683865,
"step": 29
},
{
"completion_length": 622.8876724243164,
"epoch": 0.16794961511546536,
"grad_norm": 0.016654223203659058,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.28807044588029385,
"reward_std": 0.1034616008400917,
"rewards/semantic_entropy_math_reward": 0.28807044588029385,
"step": 30
},
{
"completion_length": 639.4181709289551,
"epoch": 0.1735479356193142,
"grad_norm": 0.01327348593622446,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2810019925236702,
"reward_std": 0.09964999603107572,
"rewards/semantic_entropy_math_reward": 0.2810019925236702,
"step": 31
},
{
"completion_length": 629.0714416503906,
"epoch": 0.17914625612316304,
"grad_norm": 0.017067549750208855,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2867063535377383,
"reward_std": 0.10988868214190006,
"rewards/semantic_entropy_math_reward": 0.2867063535377383,
"step": 32
},
{
"completion_length": 622.2953987121582,
"epoch": 0.1847445766270119,
"grad_norm": 0.01429623831063509,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3038194472901523,
"reward_std": 0.10957263736054301,
"rewards/semantic_entropy_math_reward": 0.3038194472901523,
"step": 33
},
{
"completion_length": 597.1503067016602,
"epoch": 0.19034289713086075,
"grad_norm": 0.013799657113850117,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3093998031690717,
"reward_std": 0.10120404063491151,
"rewards/semantic_entropy_math_reward": 0.3093998031690717,
"step": 34
},
{
"completion_length": 636.1056709289551,
"epoch": 0.1959412176347096,
"grad_norm": 0.011626561172306538,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3293650816194713,
"reward_std": 0.09622950968332589,
"rewards/semantic_entropy_math_reward": 0.3293650816194713,
"step": 35
},
{
"completion_length": 591.780517578125,
"epoch": 0.20153953813855843,
"grad_norm": 0.011638457886874676,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.38231647573411465,
"reward_std": 0.11920557962730527,
"rewards/semantic_entropy_math_reward": 0.38231647573411465,
"step": 36
},
{
"completion_length": 580.5230712890625,
"epoch": 0.20713785864240727,
"grad_norm": 0.012719900347292423,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.4064980298280716,
"reward_std": 0.12387050059624016,
"rewards/semantic_entropy_math_reward": 0.4064980298280716,
"step": 37
},
{
"completion_length": 617.6317043304443,
"epoch": 0.21273617914625612,
"grad_norm": 0.012224080041050911,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.30778770707547665,
"reward_std": 0.08965765126049519,
"rewards/semantic_entropy_math_reward": 0.30778770707547665,
"step": 38
},
{
"completion_length": 608.4538803100586,
"epoch": 0.21833449965010496,
"grad_norm": 0.011945121921598911,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3193204468116164,
"reward_std": 0.10897644911892712,
"rewards/semantic_entropy_math_reward": 0.3193204468116164,
"step": 39
},
{
"completion_length": 642.5602779388428,
"epoch": 0.22393282015395383,
"grad_norm": 0.012211363762617111,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.30456350091844797,
"reward_std": 0.10713349282741547,
"rewards/semantic_entropy_math_reward": 0.30456350091844797,
"step": 40
},
{
"completion_length": 639.2768001556396,
"epoch": 0.22953114065780267,
"grad_norm": 0.012564162723720074,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.29551092034671456,
"reward_std": 0.10152047942392528,
"rewards/semantic_entropy_math_reward": 0.29551092034671456,
"step": 41
},
{
"completion_length": 621.8891487121582,
"epoch": 0.2351294611616515,
"grad_norm": 0.01134682260453701,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3234127121977508,
"reward_std": 0.1085170682054013,
"rewards/semantic_entropy_math_reward": 0.3234127121977508,
"step": 42
},
{
"completion_length": 628.6324501037598,
"epoch": 0.24072778166550035,
"grad_norm": 0.012990601360797882,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.31237600184977055,
"reward_std": 0.09812885848805308,
"rewards/semantic_entropy_math_reward": 0.31237600184977055,
"step": 43
},
{
"completion_length": 627.9442024230957,
"epoch": 0.2463261021693492,
"grad_norm": 0.012126186862587929,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3050595261156559,
"reward_std": 0.10460688779130578,
"rewards/semantic_entropy_math_reward": 0.3050595261156559,
"step": 44
},
{
"completion_length": 628.8229331970215,
"epoch": 0.25192442267319803,
"grad_norm": 0.011053094640374184,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.32576886005699635,
"reward_std": 0.09965397394262254,
"rewards/semantic_entropy_math_reward": 0.32576886005699635,
"step": 45
},
{
"completion_length": 607.1689109802246,
"epoch": 0.2575227431770469,
"grad_norm": 0.012990483082830906,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3343254020437598,
"reward_std": 0.11526623973622918,
"rewards/semantic_entropy_math_reward": 0.3343254020437598,
"step": 46
},
{
"completion_length": 623.7559700012207,
"epoch": 0.2631210636808957,
"grad_norm": 0.01174214854836464,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3116319542750716,
"reward_std": 0.09533112193457782,
"rewards/semantic_entropy_math_reward": 0.3116319542750716,
"step": 47
},
{
"completion_length": 629.1674213409424,
"epoch": 0.26871938418474456,
"grad_norm": 0.01178384106606245,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.31150794867426157,
"reward_std": 0.10652086476329714,
"rewards/semantic_entropy_math_reward": 0.31150794867426157,
"step": 48
},
{
"completion_length": 652.0907974243164,
"epoch": 0.2743177046885934,
"grad_norm": 0.011742531321942806,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2786458325572312,
"reward_std": 0.09241121797822416,
"rewards/semantic_entropy_math_reward": 0.2786458325572312,
"step": 49
},
{
"completion_length": 643.3266448974609,
"epoch": 0.27991602519244224,
"grad_norm": 0.011445428244769573,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.27740576677024364,
"reward_std": 0.08390604832675308,
"rewards/semantic_entropy_math_reward": 0.27740576677024364,
"step": 50
},
{
"completion_length": 605.7641487121582,
"epoch": 0.28551434569629114,
"grad_norm": 0.042504504323005676,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.29414683301001787,
"reward_std": 0.11176870320923626,
"rewards/semantic_entropy_math_reward": 0.29414683301001787,
"step": 51
},
{
"completion_length": 599.8273887634277,
"epoch": 0.29111266620014,
"grad_norm": 0.014151890762150288,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.30481151957064867,
"reward_std": 0.09737780690193176,
"rewards/semantic_entropy_math_reward": 0.30481151957064867,
"step": 52
},
{
"completion_length": 617.6384086608887,
"epoch": 0.2967109867039888,
"grad_norm": 0.012832826003432274,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.34548612125217915,
"reward_std": 0.10560432635247707,
"rewards/semantic_entropy_math_reward": 0.34548612125217915,
"step": 53
},
{
"completion_length": 639.7872161865234,
"epoch": 0.30230930720783766,
"grad_norm": 0.013676963746547699,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.29154266975820065,
"reward_std": 0.09982788749039173,
"rewards/semantic_entropy_math_reward": 0.29154266975820065,
"step": 54
},
{
"completion_length": 622.1547737121582,
"epoch": 0.3079076277116865,
"grad_norm": 0.012248532846570015,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3303571534343064,
"reward_std": 0.10564735904335976,
"rewards/semantic_entropy_math_reward": 0.3303571534343064,
"step": 55
},
{
"completion_length": 612.3623600006104,
"epoch": 0.31350594821553535,
"grad_norm": 0.012541095726191998,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.37437996733933687,
"reward_std": 0.11628623493015766,
"rewards/semantic_entropy_math_reward": 0.37437996733933687,
"step": 56
},
{
"completion_length": 590.5974807739258,
"epoch": 0.3191042687193842,
"grad_norm": 0.01250051986426115,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3829365186393261,
"reward_std": 0.11953789182007313,
"rewards/semantic_entropy_math_reward": 0.3829365186393261,
"step": 57
},
{
"completion_length": 629.9375152587891,
"epoch": 0.32470258922323303,
"grad_norm": 0.01244643796235323,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.30481151584535837,
"reward_std": 0.10653208615258336,
"rewards/semantic_entropy_math_reward": 0.30481151584535837,
"step": 58
},
{
"completion_length": 663.9650478363037,
"epoch": 0.33030090972708187,
"grad_norm": 0.011867938563227654,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2619047740008682,
"reward_std": 0.08327494573313743,
"rewards/semantic_entropy_math_reward": 0.2619047740008682,
"step": 59
},
{
"completion_length": 645.1510581970215,
"epoch": 0.3358992302309307,
"grad_norm": 0.01233300007879734,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3126240177080035,
"reward_std": 0.10046433750540018,
"rewards/semantic_entropy_math_reward": 0.3126240177080035,
"step": 60
},
{
"completion_length": 623.23512840271,
"epoch": 0.34149755073477955,
"grad_norm": 0.012479487806558609,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3306051716208458,
"reward_std": 0.10351803922094405,
"rewards/semantic_entropy_math_reward": 0.3306051716208458,
"step": 61
},
{
"completion_length": 632.4717350006104,
"epoch": 0.3470958712386284,
"grad_norm": 0.012427465058863163,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.31994048738852143,
"reward_std": 0.10373943694867194,
"rewards/semantic_entropy_math_reward": 0.31994048738852143,
"step": 62
},
{
"completion_length": 623.9613189697266,
"epoch": 0.35269419174247724,
"grad_norm": 0.013336232863366604,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.31547620333731174,
"reward_std": 0.11058970703743398,
"rewards/semantic_entropy_math_reward": 0.31547620333731174,
"step": 63
},
{
"completion_length": 640.9360294342041,
"epoch": 0.3582925122463261,
"grad_norm": 0.011113813146948814,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.33519346360117197,
"reward_std": 0.08675730333197862,
"rewards/semantic_entropy_math_reward": 0.33519346360117197,
"step": 64
},
{
"completion_length": 663.0587882995605,
"epoch": 0.363890832750175,
"grad_norm": 0.01317086722701788,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2875744178891182,
"reward_std": 0.09582534979563206,
"rewards/semantic_entropy_math_reward": 0.2875744178891182,
"step": 65
},
{
"completion_length": 625.5915279388428,
"epoch": 0.3694891532540238,
"grad_norm": 0.013479222543537617,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3365575489588082,
"reward_std": 0.10884693474508822,
"rewards/semantic_entropy_math_reward": 0.3365575489588082,
"step": 66
},
{
"completion_length": 642.8943557739258,
"epoch": 0.37508747375787266,
"grad_norm": 0.012459836900234222,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2860863204114139,
"reward_std": 0.09167324285954237,
"rewards/semantic_entropy_math_reward": 0.2860863204114139,
"step": 67
},
{
"completion_length": 594.4159355163574,
"epoch": 0.3806857942617215,
"grad_norm": 0.013866654597222805,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.37289187777787447,
"reward_std": 0.10999573534354568,
"rewards/semantic_entropy_math_reward": 0.37289187777787447,
"step": 68
},
{
"completion_length": 640.9055099487305,
"epoch": 0.38628411476557034,
"grad_norm": 0.012836214154958725,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2891865139827132,
"reward_std": 0.10698562301695347,
"rewards/semantic_entropy_math_reward": 0.2891865139827132,
"step": 69
},
{
"completion_length": 623.5349769592285,
"epoch": 0.3918824352694192,
"grad_norm": 0.013130308128893375,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3031994132325053,
"reward_std": 0.10316204163245857,
"rewards/semantic_entropy_math_reward": 0.3031994132325053,
"step": 70
},
{
"completion_length": 593.7172718048096,
"epoch": 0.397480755773268,
"grad_norm": 0.013811892829835415,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.36693950183689594,
"reward_std": 0.10302408610004932,
"rewards/semantic_entropy_math_reward": 0.36693950183689594,
"step": 71
},
{
"completion_length": 597.6815605163574,
"epoch": 0.40307907627711687,
"grad_norm": 0.013042716309428215,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.359002991579473,
"reward_std": 0.12008155998773873,
"rewards/semantic_entropy_math_reward": 0.359002991579473,
"step": 72
},
{
"completion_length": 594.0282955169678,
"epoch": 0.4086773967809657,
"grad_norm": 0.01363166980445385,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3308531828224659,
"reward_std": 0.1005927964579314,
"rewards/semantic_entropy_math_reward": 0.3308531828224659,
"step": 73
},
{
"completion_length": 638.837064743042,
"epoch": 0.41427571728481455,
"grad_norm": 0.01323084905743599,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2997271902859211,
"reward_std": 0.1091692647896707,
"rewards/semantic_entropy_math_reward": 0.2997271902859211,
"step": 74
},
{
"completion_length": 627.7797698974609,
"epoch": 0.4198740377886634,
"grad_norm": 0.013865278102457523,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3164682723581791,
"reward_std": 0.10915092006325722,
"rewards/semantic_entropy_math_reward": 0.3164682723581791,
"step": 75
},
{
"completion_length": 623.7016448974609,
"epoch": 0.42547235829251223,
"grad_norm": 0.013494770042598248,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.36892362777143717,
"reward_std": 0.101050935103558,
"rewards/semantic_entropy_math_reward": 0.36892362777143717,
"step": 76
},
{
"completion_length": 632.0245666503906,
"epoch": 0.4310706787963611,
"grad_norm": 0.013310288079082966,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.31349207321181893,
"reward_std": 0.10522611381020397,
"rewards/semantic_entropy_math_reward": 0.31349207321181893,
"step": 77
},
{
"completion_length": 629.9278392791748,
"epoch": 0.4366689993002099,
"grad_norm": 0.012805829755961895,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.30803572107106447,
"reward_std": 0.09856258635409176,
"rewards/semantic_entropy_math_reward": 0.30803572107106447,
"step": 78
},
{
"completion_length": 607.979175567627,
"epoch": 0.44226731980405876,
"grad_norm": 0.013692095875740051,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.32142858440056443,
"reward_std": 0.1157424389384687,
"rewards/semantic_entropy_math_reward": 0.32142858440056443,
"step": 79
},
{
"completion_length": 616.4501533508301,
"epoch": 0.44786564030790765,
"grad_norm": 0.014579751528799534,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3225446487776935,
"reward_std": 0.10182817134773359,
"rewards/semantic_entropy_math_reward": 0.3225446487776935,
"step": 80
},
{
"completion_length": 654.9092407226562,
"epoch": 0.4534639608117565,
"grad_norm": 0.013945615850389004,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3175843362696469,
"reward_std": 0.09967625606805086,
"rewards/semantic_entropy_math_reward": 0.3175843362696469,
"step": 81
},
{
"completion_length": 615.4084930419922,
"epoch": 0.45906228131560534,
"grad_norm": 0.014666857197880745,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3359375102445483,
"reward_std": 0.10500352433882654,
"rewards/semantic_entropy_math_reward": 0.3359375102445483,
"step": 82
},
{
"completion_length": 615.1317005157471,
"epoch": 0.4646606018194542,
"grad_norm": 0.014408071525394917,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3276289766654372,
"reward_std": 0.10603603813797235,
"rewards/semantic_entropy_math_reward": 0.3276289766654372,
"step": 83
},
{
"completion_length": 566.331111907959,
"epoch": 0.470258922323303,
"grad_norm": 0.015806101262569427,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.40525794867426157,
"reward_std": 0.12493724888190627,
"rewards/semantic_entropy_math_reward": 0.40525794867426157,
"step": 84
},
{
"completion_length": 651.4486694335938,
"epoch": 0.47585724282715186,
"grad_norm": 0.014175321906805038,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.29402281949296594,
"reward_std": 0.10067101614549756,
"rewards/semantic_entropy_math_reward": 0.29402281949296594,
"step": 85
},
{
"completion_length": 627.6443481445312,
"epoch": 0.4814555633310007,
"grad_norm": 0.016120119020342827,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3126240139827132,
"reward_std": 0.10280289477668703,
"rewards/semantic_entropy_math_reward": 0.3126240139827132,
"step": 86
},
{
"completion_length": 635.4985237121582,
"epoch": 0.48705388383484954,
"grad_norm": 0.014732223004102707,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3711557686328888,
"reward_std": 0.1085683039855212,
"rewards/semantic_entropy_math_reward": 0.3711557686328888,
"step": 87
},
{
"completion_length": 598.274564743042,
"epoch": 0.4926522043386984,
"grad_norm": 0.015387475490570068,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.39397322107106447,
"reward_std": 0.12118404218927026,
"rewards/semantic_entropy_math_reward": 0.39397322107106447,
"step": 88
},
{
"completion_length": 607.96950340271,
"epoch": 0.4982505248425472,
"grad_norm": 0.015765592455863953,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.35466271452605724,
"reward_std": 0.10943171451799572,
"rewards/semantic_entropy_math_reward": 0.35466271452605724,
"step": 89
},
{
"completion_length": 585.819206237793,
"epoch": 0.5038488453463961,
"grad_norm": 0.016726847738027573,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3410218358039856,
"reward_std": 0.1104447974357754,
"rewards/semantic_entropy_math_reward": 0.3410218358039856,
"step": 90
},
{
"completion_length": 637.3958435058594,
"epoch": 0.509447165850245,
"grad_norm": 0.015591591596603394,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3288690596818924,
"reward_std": 0.10522727854549885,
"rewards/semantic_entropy_math_reward": 0.3288690596818924,
"step": 91
},
{
"completion_length": 625.2872085571289,
"epoch": 0.5150454863540938,
"grad_norm": 0.016664857044816017,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.32118055783212185,
"reward_std": 0.12649832549504936,
"rewards/semantic_entropy_math_reward": 0.32118055783212185,
"step": 92
},
{
"completion_length": 593.0282936096191,
"epoch": 0.5206438068579426,
"grad_norm": 0.01702903024852276,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.33296132273972034,
"reward_std": 0.12325585260987282,
"rewards/semantic_entropy_math_reward": 0.33296132273972034,
"step": 93
},
{
"completion_length": 617.7559642791748,
"epoch": 0.5262421273617914,
"grad_norm": 0.0172974094748497,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.316096234600991,
"reward_std": 0.1146786012686789,
"rewards/semantic_entropy_math_reward": 0.316096234600991,
"step": 94
},
{
"completion_length": 636.2671279907227,
"epoch": 0.5318404478656403,
"grad_norm": 0.017720209434628487,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2721974281594157,
"reward_std": 0.10946512292139232,
"rewards/semantic_entropy_math_reward": 0.2721974281594157,
"step": 95
},
{
"completion_length": 586.1860256195068,
"epoch": 0.5374387683694891,
"grad_norm": 0.01675945706665516,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.34561012499034405,
"reward_std": 0.08876445493660867,
"rewards/semantic_entropy_math_reward": 0.34561012499034405,
"step": 96
},
{
"completion_length": 611.3735218048096,
"epoch": 0.543037088873338,
"grad_norm": 0.01725374162197113,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.36408730782568455,
"reward_std": 0.11099315108731389,
"rewards/semantic_entropy_math_reward": 0.36408730782568455,
"step": 97
},
{
"completion_length": 614.9278392791748,
"epoch": 0.5486354093771868,
"grad_norm": 0.017101502045989037,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.32390873692929745,
"reward_std": 0.10804228414781392,
"rewards/semantic_entropy_math_reward": 0.32390873692929745,
"step": 98
},
{
"completion_length": 607.1049213409424,
"epoch": 0.5542337298810357,
"grad_norm": 0.019047001376748085,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.35553076304495335,
"reward_std": 0.1101542457472533,
"rewards/semantic_entropy_math_reward": 0.35553076304495335,
"step": 99
},
{
"completion_length": 622.6064147949219,
"epoch": 0.5598320503848845,
"grad_norm": 0.019254466518759727,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3213045708835125,
"reward_std": 0.09209134639240801,
"rewards/semantic_entropy_math_reward": 0.3213045708835125,
"step": 100
},
{
"completion_length": 598.7961406707764,
"epoch": 0.5654303708887334,
"grad_norm": 0.020497458055615425,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.33866569120436907,
"reward_std": 0.10611562361009419,
"rewards/semantic_entropy_math_reward": 0.33866569120436907,
"step": 101
},
{
"completion_length": 650.3459911346436,
"epoch": 0.5710286913925823,
"grad_norm": 0.020331766456365585,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2831101296469569,
"reward_std": 0.09731970471329987,
"rewards/semantic_entropy_math_reward": 0.2831101296469569,
"step": 102
},
{
"completion_length": 607.9479351043701,
"epoch": 0.5766270118964311,
"grad_norm": 0.02547612227499485,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.32688493095338345,
"reward_std": 0.10671929223462939,
"rewards/semantic_entropy_math_reward": 0.32688493095338345,
"step": 103
},
{
"completion_length": 633.948673248291,
"epoch": 0.58222533240028,
"grad_norm": 0.020476222038269043,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3084077490493655,
"reward_std": 0.1001708343392238,
"rewards/semantic_entropy_math_reward": 0.3084077490493655,
"step": 104
},
{
"completion_length": 586.1733665466309,
"epoch": 0.5878236529041287,
"grad_norm": 0.030577119439840317,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3710317499935627,
"reward_std": 0.1044681896455586,
"rewards/semantic_entropy_math_reward": 0.3710317499935627,
"step": 105
},
{
"completion_length": 584.3757553100586,
"epoch": 0.5934219734079776,
"grad_norm": 0.030157793313264847,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.36545139644294977,
"reward_std": 0.10664909472689033,
"rewards/semantic_entropy_math_reward": 0.36545139644294977,
"step": 106
},
{
"completion_length": 610.7224864959717,
"epoch": 0.5990202939118264,
"grad_norm": 0.02985748089849949,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.361111125908792,
"reward_std": 0.10544992447830737,
"rewards/semantic_entropy_math_reward": 0.361111125908792,
"step": 107
},
{
"completion_length": 603.1250133514404,
"epoch": 0.6046186144156753,
"grad_norm": 0.03455930948257446,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3306051706895232,
"reward_std": 0.09714385017286986,
"rewards/semantic_entropy_math_reward": 0.3306051706895232,
"step": 108
},
{
"completion_length": 607.9910755157471,
"epoch": 0.6102169349195241,
"grad_norm": 0.08368990570306778,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3103918735869229,
"reward_std": 0.12056175642646849,
"rewards/semantic_entropy_math_reward": 0.3103918735869229,
"step": 109
},
{
"completion_length": 583.9657821655273,
"epoch": 0.615815255423373,
"grad_norm": 0.06647204607725143,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.40166171081364155,
"reward_std": 0.11974087287671864,
"rewards/semantic_entropy_math_reward": 0.40166171081364155,
"step": 110
},
{
"completion_length": 617.5520992279053,
"epoch": 0.6214135759272218,
"grad_norm": 0.06331615895032883,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.31026787124574184,
"reward_std": 0.0944897800218314,
"rewards/semantic_entropy_math_reward": 0.31026787124574184,
"step": 111
},
{
"completion_length": 597.5513515472412,
"epoch": 0.6270118964310707,
"grad_norm": 0.1182793602347374,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.3891369206830859,
"reward_std": 0.1367324935272336,
"rewards/semantic_entropy_math_reward": 0.3891369206830859,
"step": 112
},
{
"completion_length": 599.026050567627,
"epoch": 0.6326102169349195,
"grad_norm": 0.32519498467445374,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.34784227423369884,
"reward_std": 0.11579506384441629,
"rewards/semantic_entropy_math_reward": 0.34784227423369884,
"step": 113
},
{
"completion_length": 605.5989627838135,
"epoch": 0.6382085374387684,
"grad_norm": 0.30284300446510315,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.35925100883468986,
"reward_std": 0.09489296341780573,
"rewards/semantic_entropy_math_reward": 0.35925100883468986,
"step": 114
},
{
"completion_length": 588.3794746398926,
"epoch": 0.6438068579426172,
"grad_norm": 0.5091419219970703,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2926587341353297,
"reward_std": 0.10896235378459096,
"rewards/semantic_entropy_math_reward": 0.2926587341353297,
"step": 115
},
{
"completion_length": 585.5081958770752,
"epoch": 0.6494051784464661,
"grad_norm": 1.0533686876296997,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.302951404126361,
"reward_std": 0.10090323083568364,
"rewards/semantic_entropy_math_reward": 0.302951404126361,
"step": 116
},
{
"completion_length": 564.0706882476807,
"epoch": 0.655003498950315,
"grad_norm": 1.4623417854309082,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2781498096883297,
"reward_std": 0.10372308688238263,
"rewards/semantic_entropy_math_reward": 0.2781498096883297,
"step": 117
},
{
"completion_length": 590.0558166503906,
"epoch": 0.6606018194541637,
"grad_norm": 1.9276891946792603,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.2147817499935627,
"reward_std": 0.0795764367794618,
"rewards/semantic_entropy_math_reward": 0.2147817499935627,
"step": 118
},
{
"completion_length": 631.612361907959,
"epoch": 0.6662001399580126,
"grad_norm": 2.644303321838379,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.17051091720350087,
"reward_std": 0.07038468832615763,
"rewards/semantic_entropy_math_reward": 0.17051091720350087,
"step": 119
},
{
"completion_length": 666.3355770111084,
"epoch": 0.6717984604618614,
"grad_norm": 3.114243984222412,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.13591270288452506,
"reward_std": 0.06976182584185153,
"rewards/semantic_entropy_math_reward": 0.13591270288452506,
"step": 120
},
{
"completion_length": 745.9434585571289,
"epoch": 0.6773967809657103,
"grad_norm": 3.7797133922576904,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.07477678649593145,
"reward_std": 0.03976587820216082,
"rewards/semantic_entropy_math_reward": 0.07477678649593145,
"step": 121
},
{
"completion_length": 807.1480865478516,
"epoch": 0.6829951014695591,
"grad_norm": 2.0227487087249756,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03819444542750716,
"reward_std": 0.017828965152148157,
"rewards/semantic_entropy_math_reward": 0.03819444542750716,
"step": 122
},
{
"completion_length": 865.9404907226562,
"epoch": 0.688593421973408,
"grad_norm": 0.9964215755462646,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.007936508511193097,
"reward_std": 0.004420435056090355,
"rewards/semantic_entropy_math_reward": 0.007936508511193097,
"step": 123
},
{
"completion_length": 884.2105827331543,
"epoch": 0.6941917424772568,
"grad_norm": 0.30527350306510925,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0017361112404614687,
"reward_std": 0.0005792402662336826,
"rewards/semantic_entropy_math_reward": 0.0017361112404614687,
"step": 124
},
{
"completion_length": 922.7299270629883,
"epoch": 0.6997900629811057,
"grad_norm": 0.14305374026298523,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.003100198577158153,
"reward_std": 0.001635652908589691,
"rewards/semantic_entropy_math_reward": 0.003100198577158153,
"step": 125
},
{
"completion_length": 945.5491256713867,
"epoch": 0.7053883834849545,
"grad_norm": 0.0,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/semantic_entropy_math_reward": 0.0,
"step": 126
},
{
"completion_length": 936.7715911865234,
"epoch": 0.7109867039888034,
"grad_norm": 0.2572309970855713,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.002852182718925178,
"reward_std": 0.001252256624866277,
"rewards/semantic_entropy_math_reward": 0.002852182718925178,
"step": 127
},
{
"completion_length": 928.2775459289551,
"epoch": 0.7165850244926522,
"grad_norm": 0.1054806336760521,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0008680556202307343,
"reward_std": 0.0002896201331168413,
"rewards/semantic_entropy_math_reward": 0.0008680556202307343,
"step": 128
},
{
"completion_length": 933.6108818054199,
"epoch": 0.722183344996501,
"grad_norm": 0.2958269417285919,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0019841270986944437,
"reward_std": 0.0009626364917494357,
"rewards/semantic_entropy_math_reward": 0.0019841270986944437,
"step": 129
},
{
"completion_length": 934.7604331970215,
"epoch": 0.72778166550035,
"grad_norm": 0.0,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/semantic_entropy_math_reward": 0.0,
"step": 130
},
{
"completion_length": 943.2418365478516,
"epoch": 0.7333799860041987,
"grad_norm": 0.4255366623401642,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0008680556202307343,
"reward_std": 0.0002896201331168413,
"rewards/semantic_entropy_math_reward": 0.0008680556202307343,
"step": 131
},
{
"completion_length": 928.0707015991211,
"epoch": 0.7389783065080476,
"grad_norm": 0.0,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/semantic_entropy_math_reward": 0.0,
"step": 132
},
{
"completion_length": 920.4546318054199,
"epoch": 0.7445766270118964,
"grad_norm": 0.25327780842781067,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0008680556202307343,
"reward_std": 0.0002896201331168413,
"rewards/semantic_entropy_math_reward": 0.0008680556202307343,
"step": 133
},
{
"completion_length": 894.5632667541504,
"epoch": 0.7501749475157453,
"grad_norm": 0.29155024886131287,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0008680556202307343,
"reward_std": 0.0002896201331168413,
"rewards/semantic_entropy_math_reward": 0.0008680556202307343,
"step": 134
},
{
"completion_length": 903.4895935058594,
"epoch": 0.7557732680195941,
"grad_norm": 0.313747376203537,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0019841270986944437,
"reward_std": 0.0009626365499570966,
"rewards/semantic_entropy_math_reward": 0.0019841270986944437,
"step": 135
},
{
"completion_length": 858.8898887634277,
"epoch": 0.761371588523443,
"grad_norm": 0.23342648148536682,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.002604166860692203,
"reward_std": 0.000868860399350524,
"rewards/semantic_entropy_math_reward": 0.002604166860692203,
"step": 136
},
{
"completion_length": 880.4099922180176,
"epoch": 0.7669699090272918,
"grad_norm": 0.863322913646698,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.002604166860692203,
"reward_std": 0.000868860399350524,
"rewards/semantic_entropy_math_reward": 0.002604166860692203,
"step": 137
},
{
"completion_length": 870.8638534545898,
"epoch": 0.7725682295311407,
"grad_norm": 0.23507745563983917,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.004092262242920697,
"reward_std": 0.002001996588660404,
"rewards/semantic_entropy_math_reward": 0.004092262242920697,
"step": 138
},
{
"completion_length": 826.1190567016602,
"epoch": 0.7781665500349895,
"grad_norm": 0.5723982453346252,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.005828373366966844,
"reward_std": 0.00254713196773082,
"rewards/semantic_entropy_math_reward": 0.005828373366966844,
"step": 139
},
{
"completion_length": 822.2314186096191,
"epoch": 0.7837648705388384,
"grad_norm": 1.1474024057388306,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.012896826025098562,
"reward_std": 0.006677947181742638,
"rewards/semantic_entropy_math_reward": 0.012896826025098562,
"step": 140
},
{
"completion_length": 847.2269515991211,
"epoch": 0.7893631910426872,
"grad_norm": 1.597180724143982,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.011656746733933687,
"reward_std": 0.005111316044349223,
"rewards/semantic_entropy_math_reward": 0.011656746733933687,
"step": 141
},
{
"completion_length": 842.1793327331543,
"epoch": 0.794961511546536,
"grad_norm": 0.7798174619674683,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.012152778450399637,
"reward_std": 0.004957869183272123,
"rewards/semantic_entropy_math_reward": 0.012152778450399637,
"step": 142
},
{
"completion_length": 831.2462921142578,
"epoch": 0.8005598320503848,
"grad_norm": 2.8579814434051514,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.028769842814654112,
"reward_std": 0.01406612026039511,
"rewards/semantic_entropy_math_reward": 0.028769842814654112,
"step": 143
},
{
"completion_length": 789.3541831970215,
"epoch": 0.8061581525542337,
"grad_norm": 3.0082154273986816,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.026289683999493718,
"reward_std": 0.014727714413311332,
"rewards/semantic_entropy_math_reward": 0.026289683999493718,
"step": 144
},
{
"completion_length": 766.1949501037598,
"epoch": 0.8117564730580826,
"grad_norm": 5.142900466918945,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.049107144586741924,
"reward_std": 0.028222940571140498,
"rewards/semantic_entropy_math_reward": 0.049107144586741924,
"step": 145
},
{
"completion_length": 758.3102760314941,
"epoch": 0.8173547935619314,
"grad_norm": 2.1914703845977783,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.071180559694767,
"reward_std": 0.042430724075529724,
"rewards/semantic_entropy_math_reward": 0.071180559694767,
"step": 146
},
{
"completion_length": 705.601203918457,
"epoch": 0.8229531140657803,
"grad_norm": 2.148831605911255,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.09474206739105284,
"reward_std": 0.051515123690478504,
"rewards/semantic_entropy_math_reward": 0.09474206739105284,
"step": 147
},
{
"completion_length": 714.9263496398926,
"epoch": 0.8285514345696291,
"grad_norm": 2.954876184463501,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.10466270102187991,
"reward_std": 0.06257531465962529,
"rewards/semantic_entropy_math_reward": 0.10466270102187991,
"step": 148
},
{
"completion_length": 674.5491104125977,
"epoch": 0.834149755073478,
"grad_norm": 2.0350775718688965,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.13653274183161557,
"reward_std": 0.07629197556525469,
"rewards/semantic_entropy_math_reward": 0.13653274183161557,
"step": 149
},
{
"completion_length": 637.178581237793,
"epoch": 0.8397480755773268,
"grad_norm": 3.029639482498169,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.1703869104385376,
"reward_std": 0.08733957540243864,
"rewards/semantic_entropy_math_reward": 0.1703869104385376,
"step": 150
},
{
"completion_length": 673.2440567016602,
"epoch": 0.8453463960811757,
"grad_norm": 1.96150803565979,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.14583333884365857,
"reward_std": 0.08530042838538066,
"rewards/semantic_entropy_math_reward": 0.14583333884365857,
"step": 151
},
{
"completion_length": 618.9092407226562,
"epoch": 0.8509447165850245,
"grad_norm": 1.5204460620880127,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.12810020195320249,
"reward_std": 0.07424356217961758,
"rewards/semantic_entropy_math_reward": 0.12810020195320249,
"step": 152
},
{
"completion_length": 596.5007553100586,
"epoch": 0.8565430370888734,
"grad_norm": 2.1267242431640625,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.07378472597338259,
"reward_std": 0.04104530799668282,
"rewards/semantic_entropy_math_reward": 0.07378472597338259,
"step": 153
},
{
"completion_length": 596.3943500518799,
"epoch": 0.8621413575927221,
"grad_norm": 1.156010627746582,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.051835319376550615,
"reward_std": 0.028525067784357816,
"rewards/semantic_entropy_math_reward": 0.051835319376550615,
"step": 154
},
{
"completion_length": 649.7128067016602,
"epoch": 0.867739678096571,
"grad_norm": 0.9627940058708191,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03881448460742831,
"reward_std": 0.022329314553644508,
"rewards/semantic_entropy_math_reward": 0.03881448460742831,
"step": 155
},
{
"completion_length": 678.5580520629883,
"epoch": 0.8733379986004198,
"grad_norm": 1.0966858863830566,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.037822422687895596,
"reward_std": 0.0146516069653444,
"rewards/semantic_entropy_math_reward": 0.037822422687895596,
"step": 156
},
{
"completion_length": 727.682300567627,
"epoch": 0.8789363191042687,
"grad_norm": 1.220819115638733,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04303075547795743,
"reward_std": 0.02123075199779123,
"rewards/semantic_entropy_math_reward": 0.04303075547795743,
"step": 157
},
{
"completion_length": 693.4948043823242,
"epoch": 0.8845346396081175,
"grad_norm": 2.7909467220306396,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.09176587720867246,
"reward_std": 0.04625473154010251,
"rewards/semantic_entropy_math_reward": 0.09176587720867246,
"step": 158
},
{
"completion_length": 734.0967407226562,
"epoch": 0.8901329601119664,
"grad_norm": 1.8816090822219849,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.09660218469798565,
"reward_std": 0.0377241056994535,
"rewards/semantic_entropy_math_reward": 0.09660218469798565,
"step": 159
},
{
"completion_length": 750.3869190216064,
"epoch": 0.8957312806158153,
"grad_norm": 1.124207854270935,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.09176587732508779,
"reward_std": 0.04596630920423195,
"rewards/semantic_entropy_math_reward": 0.09176587732508779,
"step": 160
},
{
"completion_length": 760.7053718566895,
"epoch": 0.9013296011196641,
"grad_norm": 1.2950589656829834,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.09536210435908288,
"reward_std": 0.05483918351819739,
"rewards/semantic_entropy_math_reward": 0.09536210435908288,
"step": 161
},
{
"completion_length": 761.5186157226562,
"epoch": 0.906927921623513,
"grad_norm": 0.7260070443153381,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.10466269659809768,
"reward_std": 0.04580701712984592,
"rewards/semantic_entropy_math_reward": 0.10466269659809768,
"step": 162
},
{
"completion_length": 713.2239723205566,
"epoch": 0.9125262421273618,
"grad_norm": 0.9426623582839966,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.13727678637951612,
"reward_std": 0.06796611158642918,
"rewards/semantic_entropy_math_reward": 0.13727678637951612,
"step": 163
},
{
"completion_length": 705.643611907959,
"epoch": 0.9181245626312107,
"grad_norm": 1.4740947484970093,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.18241567676886916,
"reward_std": 0.08067136688623577,
"rewards/semantic_entropy_math_reward": 0.18241567676886916,
"step": 164
},
{
"completion_length": 731.9352836608887,
"epoch": 0.9237228831350595,
"grad_norm": 0.8142076134681702,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.15947421244345605,
"reward_std": 0.0733173037879169,
"rewards/semantic_entropy_math_reward": 0.15947421244345605,
"step": 165
},
{
"completion_length": 794.4799270629883,
"epoch": 0.9293212036389084,
"grad_norm": 0.5677748322486877,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.14360119448974729,
"reward_std": 0.06343726580962539,
"rewards/semantic_entropy_math_reward": 0.14360119448974729,
"step": 166
},
{
"completion_length": 767.6942100524902,
"epoch": 0.9349195241427571,
"grad_norm": 0.707513153553009,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.1733631044626236,
"reward_std": 0.06966668769018725,
"rewards/semantic_entropy_math_reward": 0.1733631044626236,
"step": 167
},
{
"completion_length": 822.2001686096191,
"epoch": 0.940517844646606,
"grad_norm": 0.5552101731300354,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.1324404844781384,
"reward_std": 0.05633544718148187,
"rewards/semantic_entropy_math_reward": 0.1324404844781384,
"step": 168
},
{
"completion_length": 841.3861694335938,
"epoch": 0.9461161651504548,
"grad_norm": 0.26897794008255005,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.13591270486358553,
"reward_std": 0.04643521481193602,
"rewards/semantic_entropy_math_reward": 0.13591270486358553,
"step": 169
},
{
"completion_length": 842.628734588623,
"epoch": 0.9517144856543037,
"grad_norm": 0.4329053461551666,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.12388393026776612,
"reward_std": 0.06243461259873584,
"rewards/semantic_entropy_math_reward": 0.12388393026776612,
"step": 170
},
{
"completion_length": 852.5639991760254,
"epoch": 0.9573128061581525,
"grad_norm": 0.3284468352794647,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.12909226480405778,
"reward_std": 0.05489638983272016,
"rewards/semantic_entropy_math_reward": 0.12909226480405778,
"step": 171
},
{
"completion_length": 864.0989837646484,
"epoch": 0.9629111266620014,
"grad_norm": 0.4691842794418335,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.12165178672876209,
"reward_std": 0.05811122967861593,
"rewards/semantic_entropy_math_reward": 0.12165178672876209,
"step": 172
},
{
"completion_length": 894.2678718566895,
"epoch": 0.9685094471658502,
"grad_norm": 0.43808817863464355,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0978422649204731,
"reward_std": 0.043283838051138446,
"rewards/semantic_entropy_math_reward": 0.0978422649204731,
"step": 173
},
{
"completion_length": 890.3757591247559,
"epoch": 0.9741077676696991,
"grad_norm": 0.7790409922599792,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.09747024194803089,
"reward_std": 0.04836806608363986,
"rewards/semantic_entropy_math_reward": 0.09747024194803089,
"step": 174
},
{
"completion_length": 922.7834930419922,
"epoch": 0.979706088173548,
"grad_norm": 1.0157604217529297,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.06386408989783376,
"reward_std": 0.03726256394293159,
"rewards/semantic_entropy_math_reward": 0.06386408989783376,
"step": 175
},
{
"completion_length": 921.322193145752,
"epoch": 0.9853044086773968,
"grad_norm": 1.7826768159866333,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.07279266219120473,
"reward_std": 0.04188264685217291,
"rewards/semantic_entropy_math_reward": 0.07279266219120473,
"step": 176
},
{
"completion_length": 897.9613227844238,
"epoch": 0.9909027291812457,
"grad_norm": 5.7943010330200195,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.10453869169577956,
"reward_std": 0.04441492026671767,
"rewards/semantic_entropy_math_reward": 0.10453869169577956,
"step": 177
},
{
"completion_length": 899.3638496398926,
"epoch": 0.9965010496850945,
"grad_norm": 3.6061346530914307,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.08655754150822759,
"reward_std": 0.03804769192356616,
"rewards/semantic_entropy_math_reward": 0.08655754150822759,
"step": 178
},
{
"epoch": 0.9965010496850945,
"step": 178,
"total_flos": 0.0,
"train_loss": 5.786010212772085e-09,
"train_runtime": 7433.3342,
"train_samples_per_second": 2.691,
"train_steps_per_second": 0.024
}
],
"logging_steps": 1,
"max_steps": 178,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}