{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999429921003339, "eval_steps": 100, "global_step": 767, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 837.117338180542, "epoch": 0.0013030377066536364, "grad_norm": 0.11719735711812973, "kl": 0.0, "learning_rate": 1.298701298701299e-07, "loss": 0.0, "reward": 0.6811224427074194, "reward_std": 0.3973510405048728, "rewards/accuracy_reward": 0.29081632010638714, "rewards/semantic_entropy_math_reward": 0.39030611142516136, "step": 1 }, { "completion_length": 972.1976890563965, "epoch": 0.0026060754133072728, "grad_norm": 0.2169051319360733, "kl": 0.0, "learning_rate": 2.597402597402598e-07, "loss": 0.0, "reward": 0.6302842609584332, "reward_std": 0.39035186264663935, "rewards/accuracy_reward": 0.2716836668550968, "rewards/semantic_entropy_math_reward": 0.3586005736142397, "step": 2 }, { "completion_length": 926.5216674804688, "epoch": 0.003909113119960909, "grad_norm": 0.16815422475337982, "kl": 0.0001952052116394043, "learning_rate": 3.896103896103896e-07, "loss": 0.0, "reward": 0.683673458173871, "reward_std": 0.4213488204404712, "rewards/accuracy_reward": 0.31632652459666133, "rewards/semantic_entropy_math_reward": 0.36734691448509693, "step": 3 }, { "completion_length": 881.6836547851562, "epoch": 0.0052121508266145455, "grad_norm": 0.4654393494129181, "kl": 0.00020813941955566406, "learning_rate": 5.194805194805196e-07, "loss": 0.0, "reward": 0.6614431571215391, "reward_std": 0.377839676104486, "rewards/accuracy_reward": 0.27551020169630647, "rewards/semantic_entropy_math_reward": 0.385932931676507, "step": 4 }, { "completion_length": 956.8762588500977, "epoch": 0.006515188533268181, "grad_norm": 0.19370803236961365, "kl": 0.00020241737365722656, "learning_rate": 6.493506493506493e-07, "loss": 0.0, "reward": 0.6102405320852995, "reward_std": 0.388237239792943, "rewards/accuracy_reward": 0.2640306055545807, "rewards/semantic_entropy_math_reward": 0.3462098930031061, "step": 5 }, { "completion_length": 938.7933540344238, "epoch": 0.007818226239921818, "grad_norm": 0.09594990313053131, "kl": 0.00023156404495239258, "learning_rate": 7.792207792207792e-07, "loss": 0.0, "reward": 0.6949708424508572, "reward_std": 0.3696737764403224, "rewards/accuracy_reward": 0.311224483884871, "rewards/semantic_entropy_math_reward": 0.38374633714556694, "step": 6 }, { "completion_length": 873.9221801757812, "epoch": 0.009121263946575454, "grad_norm": 0.5217923521995544, "kl": 0.00040721893310546875, "learning_rate": 9.090909090909091e-07, "loss": 0.0, "reward": 0.6257288511842489, "reward_std": 0.35030856262892485, "rewards/accuracy_reward": 0.25510203931480646, "rewards/semantic_entropy_math_reward": 0.370626799762249, "step": 7 }, { "completion_length": 887.3800849914551, "epoch": 0.010424301653229091, "grad_norm": 0.13424217700958252, "kl": 0.0002671480178833008, "learning_rate": 1.0389610389610392e-06, "loss": 0.0, "reward": 0.7221210040152073, "reward_std": 0.3779106102883816, "rewards/accuracy_reward": 0.32780611608177423, "rewards/semantic_entropy_math_reward": 0.3943148562684655, "step": 8 }, { "completion_length": 860.005090713501, "epoch": 0.011727339359882726, "grad_norm": 0.16939085721969604, "kl": 0.0003975033760070801, "learning_rate": 1.168831168831169e-06, "loss": 0.0, "reward": 0.6339285708963871, "reward_std": 0.3661261713132262, "rewards/accuracy_reward": 0.27678570710122585, "rewards/semantic_entropy_math_reward": 0.357142835855484, "step": 9 }, { "completion_length": 860.2563591003418, "epoch": 0.013030377066536362, "grad_norm": 0.22313131392002106, "kl": 0.0009191036224365234, "learning_rate": 1.2987012987012986e-06, "loss": 0.0, "reward": 0.6468658875674009, "reward_std": 0.37869824562221766, "rewards/accuracy_reward": 0.2602040767669678, "rewards/semantic_entropy_math_reward": 0.38666180428117514, "step": 10 }, { "completion_length": 912.9655456542969, "epoch": 0.014333414773189999, "grad_norm": 0.13730381429195404, "kl": 0.0010298490524291992, "learning_rate": 1.4285714285714286e-06, "loss": 0.0, "reward": 0.7288629822432995, "reward_std": 0.38233583606779575, "rewards/accuracy_reward": 0.3112244848161936, "rewards/semantic_entropy_math_reward": 0.4176384676247835, "step": 11 }, { "completion_length": 887.2882499694824, "epoch": 0.015636452479843636, "grad_norm": 0.08811619132757187, "kl": 0.0010454654693603516, "learning_rate": 1.5584415584415584e-06, "loss": 0.0, "reward": 0.7310495655983686, "reward_std": 0.3922089450061321, "rewards/accuracy_reward": 0.31122448295354843, "rewards/semantic_entropy_math_reward": 0.41982507333159447, "step": 12 }, { "completion_length": 957.2002410888672, "epoch": 0.01693949018649727, "grad_norm": 0.08093774318695068, "kl": 0.0013872385025024414, "learning_rate": 1.6883116883116885e-06, "loss": 0.0001, "reward": 0.7572886236011982, "reward_std": 0.39389276318252087, "rewards/accuracy_reward": 0.3265306046232581, "rewards/semantic_entropy_math_reward": 0.43075800873339176, "step": 13 }, { "completion_length": 892.6275367736816, "epoch": 0.01824252789315091, "grad_norm": 0.060123980045318604, "kl": 0.0018470287322998047, "learning_rate": 1.8181818181818183e-06, "loss": 0.0001, "reward": 0.8494897894561291, "reward_std": 0.36673240549862385, "rewards/accuracy_reward": 0.3928571380674839, "rewards/semantic_entropy_math_reward": 0.456632636487484, "step": 14 }, { "completion_length": 894.3673400878906, "epoch": 0.019545565599804544, "grad_norm": 0.022749852389097214, "kl": 0.0034775733947753906, "learning_rate": 1.9480519480519483e-06, "loss": 0.0001, "reward": 0.8185131289064884, "reward_std": 0.38146936148405075, "rewards/accuracy_reward": 0.32908162544481456, "rewards/semantic_entropy_math_reward": 0.48943147249519825, "step": 15 }, { "completion_length": 846.9183502197266, "epoch": 0.020848603306458182, "grad_norm": 0.020909909158945084, "kl": 0.0016388893127441406, "learning_rate": 2.0779220779220784e-06, "loss": 0.0001, "reward": 0.8575072810053825, "reward_std": 0.4253711849451065, "rewards/accuracy_reward": 0.38010203279554844, "rewards/semantic_entropy_math_reward": 0.47740523517131805, "step": 16 }, { "completion_length": 882.2754821777344, "epoch": 0.022151641013111817, "grad_norm": 0.01860440894961357, "kl": 0.001783609390258789, "learning_rate": 2.207792207792208e-06, "loss": 0.0001, "reward": 0.8316326588392258, "reward_std": 0.39866786170750856, "rewards/accuracy_reward": 0.3545918306335807, "rewards/semantic_entropy_math_reward": 0.47704081051051617, "step": 17 }, { "completion_length": 950.2474327087402, "epoch": 0.023454678719765452, "grad_norm": 0.023692086338996887, "kl": 0.0025060176849365234, "learning_rate": 2.337662337662338e-06, "loss": 0.0001, "reward": 0.8241618098691106, "reward_std": 0.3441563588567078, "rewards/accuracy_reward": 0.36096938140690327, "rewards/semantic_entropy_math_reward": 0.4631924098357558, "step": 18 }, { "completion_length": 878.4017677307129, "epoch": 0.02475771642641909, "grad_norm": 0.014910629019141197, "kl": 0.0018167495727539062, "learning_rate": 2.4675324675324676e-06, "loss": 0.0001, "reward": 0.9615524858236313, "reward_std": 0.32960202172398567, "rewards/accuracy_reward": 0.4196428544819355, "rewards/semantic_entropy_math_reward": 0.5419096015393734, "step": 19 }, { "completion_length": 872.860954284668, "epoch": 0.026060754133072725, "grad_norm": 0.01278847735375166, "kl": 0.0022249221801757812, "learning_rate": 2.597402597402597e-06, "loss": 0.0001, "reward": 0.978498537093401, "reward_std": 0.3813723949715495, "rewards/accuracy_reward": 0.4336734665557742, "rewards/semantic_entropy_math_reward": 0.5448250565677881, "step": 20 }, { "completion_length": 948.8890113830566, "epoch": 0.027363791839726363, "grad_norm": 0.00900535099208355, "kl": 0.002373218536376953, "learning_rate": 2.7272727272727272e-06, "loss": 0.0001, "reward": 0.9145408179610968, "reward_std": 0.36285410914570093, "rewards/accuracy_reward": 0.39413264486938715, "rewards/semantic_entropy_math_reward": 0.520408159121871, "step": 21 }, { "completion_length": 826.0229454040527, "epoch": 0.028666829546379998, "grad_norm": 0.006731751374900341, "kl": 0.0026221275329589844, "learning_rate": 2.8571428571428573e-06, "loss": 0.0001, "reward": 1.045007273554802, "reward_std": 0.36006426997482777, "rewards/accuracy_reward": 0.4655612148344517, "rewards/semantic_entropy_math_reward": 0.5794460419565439, "step": 22 }, { "completion_length": 876.6007499694824, "epoch": 0.029969867253033633, "grad_norm": 0.007352499291300774, "kl": 0.0028810501098632812, "learning_rate": 2.9870129870129873e-06, "loss": 0.0001, "reward": 0.8857507407665253, "reward_std": 0.32827571872621775, "rewards/accuracy_reward": 0.36096938140690327, "rewards/semantic_entropy_math_reward": 0.5247813425958157, "step": 23 }, { "completion_length": 876.844367980957, "epoch": 0.03127290495968727, "grad_norm": 0.019214602187275887, "kl": 0.0031137466430664062, "learning_rate": 3.116883116883117e-06, "loss": 0.0001, "reward": 1.0346209965646267, "reward_std": 0.38960837246850133, "rewards/accuracy_reward": 0.4693877431564033, "rewards/semantic_entropy_math_reward": 0.5652332212775946, "step": 24 }, { "completion_length": 883.5548286437988, "epoch": 0.03257594266634091, "grad_norm": 0.011122680269181728, "kl": 0.005339145660400391, "learning_rate": 3.246753246753247e-06, "loss": 0.0002, "reward": 0.9342201203107834, "reward_std": 0.3377914186567068, "rewards/accuracy_reward": 0.401785708963871, "rewards/semantic_entropy_math_reward": 0.5324343908578157, "step": 25 }, { "completion_length": 812.6938591003418, "epoch": 0.03387898037299454, "grad_norm": 0.007444577291607857, "kl": 0.0033130645751953125, "learning_rate": 3.376623376623377e-06, "loss": 0.0001, "reward": 1.0052842311561108, "reward_std": 0.32810083008371294, "rewards/accuracy_reward": 0.4247448882088065, "rewards/semantic_entropy_math_reward": 0.5805393513292074, "step": 26 }, { "completion_length": 902.4158020019531, "epoch": 0.03518201807964818, "grad_norm": 0.011285696178674698, "kl": 0.0042247772216796875, "learning_rate": 3.506493506493507e-06, "loss": 0.0002, "reward": 1.0041909366846085, "reward_std": 0.3440978638827801, "rewards/accuracy_reward": 0.465561218559742, "rewards/semantic_entropy_math_reward": 0.538629712536931, "step": 27 }, { "completion_length": 844.1390075683594, "epoch": 0.03648505578630182, "grad_norm": 0.0073396568186581135, "kl": 0.0029196739196777344, "learning_rate": 3.6363636363636366e-06, "loss": 0.0001, "reward": 0.9125364236533642, "reward_std": 0.29272227128967643, "rewards/accuracy_reward": 0.35969386901706457, "rewards/semantic_entropy_math_reward": 0.5528425462543964, "step": 28 }, { "completion_length": 822.195140838623, "epoch": 0.03778809349295545, "grad_norm": 0.020257532596588135, "kl": 0.0035996437072753906, "learning_rate": 3.7662337662337666e-06, "loss": 0.0001, "reward": 0.9602769706398249, "reward_std": 0.3424716182053089, "rewards/accuracy_reward": 0.39540815842337906, "rewards/semantic_entropy_math_reward": 0.5648687873035669, "step": 29 }, { "completion_length": 872.8749771118164, "epoch": 0.03909113119960909, "grad_norm": 0.013227947056293488, "kl": 0.002673625946044922, "learning_rate": 3.896103896103897e-06, "loss": 0.0001, "reward": 0.8850218430161476, "reward_std": 0.30675759073346853, "rewards/accuracy_reward": 0.36607142072170973, "rewards/semantic_entropy_math_reward": 0.5189504232257605, "step": 30 }, { "completion_length": 862.5892677307129, "epoch": 0.040394168906262726, "grad_norm": 0.014418520964682102, "kl": 0.004436492919921875, "learning_rate": 4.025974025974026e-06, "loss": 0.0002, "reward": 0.9806851223111153, "reward_std": 0.30820578522980213, "rewards/accuracy_reward": 0.4107142804423347, "rewards/semantic_entropy_math_reward": 0.5699708443135023, "step": 31 }, { "completion_length": 914.6390151977539, "epoch": 0.041697206612916364, "grad_norm": 0.007041982375085354, "kl": 0.0035076141357421875, "learning_rate": 4.155844155844157e-06, "loss": 0.0001, "reward": 0.9010568410158157, "reward_std": 0.2954637464135885, "rewards/accuracy_reward": 0.38647958636283875, "rewards/semantic_entropy_math_reward": 0.5145772434771061, "step": 32 }, { "completion_length": 825.9298286437988, "epoch": 0.043000244319569995, "grad_norm": 0.018051927909255028, "kl": 0.005063056945800781, "learning_rate": 4.2857142857142855e-06, "loss": 0.0002, "reward": 1.0628644227981567, "reward_std": 0.3121971767395735, "rewards/accuracy_reward": 0.45790815353393555, "rewards/semantic_entropy_math_reward": 0.6049562580883503, "step": 33 }, { "completion_length": 864.1938667297363, "epoch": 0.044303282026223634, "grad_norm": 0.006250304169952869, "kl": 0.0040149688720703125, "learning_rate": 4.415584415584416e-06, "loss": 0.0002, "reward": 0.9241982474923134, "reward_std": 0.269044600892812, "rewards/accuracy_reward": 0.3749999925494194, "rewards/semantic_entropy_math_reward": 0.5491982270032167, "step": 34 }, { "completion_length": 852.3354454040527, "epoch": 0.04560631973287727, "grad_norm": 0.018021654337644577, "kl": 0.004012107849121094, "learning_rate": 4.5454545454545455e-06, "loss": 0.0002, "reward": 0.9327623881399632, "reward_std": 0.3168298453092575, "rewards/accuracy_reward": 0.3864795882254839, "rewards/semantic_entropy_math_reward": 0.5462827850133181, "step": 35 }, { "completion_length": 840.0777816772461, "epoch": 0.046909357439530903, "grad_norm": 0.006817657966166735, "kl": 0.0049877166748046875, "learning_rate": 4.675324675324676e-06, "loss": 0.0002, "reward": 1.0307944566011429, "reward_std": 0.3188680615276098, "rewards/accuracy_reward": 0.45280611515045166, "rewards/semantic_entropy_math_reward": 0.5779883302748203, "step": 36 }, { "completion_length": 876.6913070678711, "epoch": 0.04821239514618454, "grad_norm": 0.008254569955170155, "kl": 0.005570411682128906, "learning_rate": 4.805194805194806e-06, "loss": 0.0002, "reward": 0.8990524746477604, "reward_std": 0.23824061546474695, "rewards/accuracy_reward": 0.33418366592377424, "rewards/semantic_entropy_math_reward": 0.5648687966167927, "step": 37 }, { "completion_length": 794.7346801757812, "epoch": 0.04951543285283818, "grad_norm": 0.00637207692489028, "kl": 0.004886627197265625, "learning_rate": 4.935064935064935e-06, "loss": 0.0002, "reward": 1.0145772472023964, "reward_std": 0.250257789157331, "rewards/accuracy_reward": 0.3877550959587097, "rewards/semantic_entropy_math_reward": 0.6268221437931061, "step": 38 }, { "completion_length": 816.334171295166, "epoch": 0.05081847055949182, "grad_norm": 0.006815893575549126, "kl": 0.003961086273193359, "learning_rate": 5.064935064935065e-06, "loss": 0.0002, "reward": 1.0349854193627834, "reward_std": 0.2880831710062921, "rewards/accuracy_reward": 0.4566326476633549, "rewards/semantic_entropy_math_reward": 0.5783527530729771, "step": 39 }, { "completion_length": 830.1492233276367, "epoch": 0.05212150826614545, "grad_norm": 0.0067922151647508144, "kl": 0.005191802978515625, "learning_rate": 5.194805194805194e-06, "loss": 0.0002, "reward": 1.0590378902852535, "reward_std": 0.35004573687911034, "rewards/accuracy_reward": 0.4668367230333388, "rewards/semantic_entropy_math_reward": 0.5922011528164148, "step": 40 }, { "completion_length": 828.845645904541, "epoch": 0.05342454597279909, "grad_norm": 0.005164754576981068, "kl": 0.0037794113159179688, "learning_rate": 5.324675324675325e-06, "loss": 0.0002, "reward": 0.9498906619846821, "reward_std": 0.252530072350055, "rewards/accuracy_reward": 0.3686224427074194, "rewards/semantic_entropy_math_reward": 0.5812681969255209, "step": 41 }, { "completion_length": 784.6721687316895, "epoch": 0.054727583679452727, "grad_norm": 0.012475547380745411, "kl": 0.006168365478515625, "learning_rate": 5.4545454545454545e-06, "loss": 0.0002, "reward": 0.9402332231402397, "reward_std": 0.25077344570308924, "rewards/accuracy_reward": 0.3622448925161734, "rewards/semantic_entropy_math_reward": 0.5779883246868849, "step": 42 }, { "completion_length": 817.7576332092285, "epoch": 0.05603062138610636, "grad_norm": 0.007497240789234638, "kl": 0.004801750183105469, "learning_rate": 5.584415584415585e-06, "loss": 0.0002, "reward": 0.9518950320780277, "reward_std": 0.25486068893224, "rewards/accuracy_reward": 0.35969387111254036, "rewards/semantic_entropy_math_reward": 0.5922011416405439, "step": 43 }, { "completion_length": 796.2729396820068, "epoch": 0.057333659092759996, "grad_norm": 0.005530911963433027, "kl": 0.004511833190917969, "learning_rate": 5.7142857142857145e-06, "loss": 0.0002, "reward": 1.068877525627613, "reward_std": 0.26923465402796865, "rewards/accuracy_reward": 0.44387753680348396, "rewards/semantic_entropy_math_reward": 0.6249999832361937, "step": 44 }, { "completion_length": 825.4961490631104, "epoch": 0.058636696799413635, "grad_norm": 0.006700224243104458, "kl": 0.004664421081542969, "learning_rate": 5.844155844155844e-06, "loss": 0.0002, "reward": 0.982507299631834, "reward_std": 0.27642543241381645, "rewards/accuracy_reward": 0.40051019936800003, "rewards/semantic_entropy_math_reward": 0.5819970965385437, "step": 45 }, { "completion_length": 834.4808540344238, "epoch": 0.059939734506067266, "grad_norm": 0.005738178268074989, "kl": 0.0040416717529296875, "learning_rate": 5.9740259740259746e-06, "loss": 0.0002, "reward": 1.1045918501913548, "reward_std": 0.25190831208601594, "rewards/accuracy_reward": 0.48724489007145166, "rewards/semantic_entropy_math_reward": 0.6173469312489033, "step": 46 }, { "completion_length": 735.9196319580078, "epoch": 0.061242772212720904, "grad_norm": 0.0074202013202011585, "kl": 0.004734992980957031, "learning_rate": 6.103896103896104e-06, "loss": 0.0002, "reward": 1.0723396502435207, "reward_std": 0.24583148723468184, "rewards/accuracy_reward": 0.42219386994838715, "rewards/semantic_entropy_math_reward": 0.6501457560807467, "step": 47 }, { "completion_length": 675.4680938720703, "epoch": 0.06254580991937454, "grad_norm": 0.010823776945471764, "kl": 0.00669097900390625, "learning_rate": 6.233766233766234e-06, "loss": 0.0003, "reward": 1.0912900753319263, "reward_std": 0.23417045222595334, "rewards/accuracy_reward": 0.46045917738229036, "rewards/semantic_entropy_math_reward": 0.6308308802545071, "step": 48 }, { "completion_length": 777.5943756103516, "epoch": 0.06384884762602817, "grad_norm": 0.006463744677603245, "kl": 0.004790306091308594, "learning_rate": 6.363636363636364e-06, "loss": 0.0002, "reward": 1.1270043663680553, "reward_std": 0.2139966154936701, "rewards/accuracy_reward": 0.44515305384993553, "rewards/semantic_entropy_math_reward": 0.6818513125181198, "step": 49 }, { "completion_length": 770.5675811767578, "epoch": 0.06515188533268182, "grad_norm": 0.0070662652142345905, "kl": 0.005097389221191406, "learning_rate": 6.493506493506494e-06, "loss": 0.0002, "reward": 1.047740526497364, "reward_std": 0.22461897239554673, "rewards/accuracy_reward": 0.4515306055545807, "rewards/semantic_entropy_math_reward": 0.5962099004536867, "step": 50 }, { "completion_length": 771.9272727966309, "epoch": 0.06645492303933545, "grad_norm": 0.008104674518108368, "kl": 0.0056781768798828125, "learning_rate": 6.623376623376624e-06, "loss": 0.0002, "reward": 1.083636999130249, "reward_std": 0.26672434131614864, "rewards/accuracy_reward": 0.47066325321793556, "rewards/semantic_entropy_math_reward": 0.6129737384617329, "step": 51 }, { "completion_length": 752.4757537841797, "epoch": 0.06775796074598908, "grad_norm": 0.008416865020990372, "kl": 0.005208015441894531, "learning_rate": 6.753246753246754e-06, "loss": 0.0002, "reward": 1.074344016611576, "reward_std": 0.3144884016364813, "rewards/accuracy_reward": 0.4693877473473549, "rewards/semantic_entropy_math_reward": 0.6049562711268663, "step": 52 }, { "completion_length": 720.4795722961426, "epoch": 0.06906099845264273, "grad_norm": 0.006763579789549112, "kl": 0.005793571472167969, "learning_rate": 6.8831168831168835e-06, "loss": 0.0002, "reward": 1.1663629487156868, "reward_std": 0.24514551227912307, "rewards/accuracy_reward": 0.5165816247463226, "rewards/semantic_entropy_math_reward": 0.6497813239693642, "step": 53 }, { "completion_length": 722.3532943725586, "epoch": 0.07036403615929636, "grad_norm": 0.009582942351698875, "kl": 0.005523681640625, "learning_rate": 7.012987012987014e-06, "loss": 0.0002, "reward": 1.134110789746046, "reward_std": 0.2409876617603004, "rewards/accuracy_reward": 0.4846938671544194, "rewards/semantic_entropy_math_reward": 0.6494168881326914, "step": 54 }, { "completion_length": 714.0586566925049, "epoch": 0.07166707386594999, "grad_norm": 0.011050197295844555, "kl": 0.0056095123291015625, "learning_rate": 7.1428571428571436e-06, "loss": 0.0002, "reward": 1.0712463296949863, "reward_std": 0.2608937746845186, "rewards/accuracy_reward": 0.4221938708797097, "rewards/semantic_entropy_math_reward": 0.6490524634718895, "step": 55 }, { "completion_length": 734.0395278930664, "epoch": 0.07297011157260364, "grad_norm": 0.04811783879995346, "kl": 0.005349159240722656, "learning_rate": 7.272727272727273e-06, "loss": 0.0002, "reward": 1.0807215794920921, "reward_std": 0.22207323630573228, "rewards/accuracy_reward": 0.4426020343089476, "rewards/semantic_entropy_math_reward": 0.6381195047870278, "step": 56 }, { "completion_length": 705.2244758605957, "epoch": 0.07427314927925727, "grad_norm": 0.009875384159386158, "kl": 0.006084442138671875, "learning_rate": 7.402597402597404e-06, "loss": 0.0002, "reward": 1.0444606319069862, "reward_std": 0.2517742062918842, "rewards/accuracy_reward": 0.42857141606509686, "rewards/semantic_entropy_math_reward": 0.6158892009407282, "step": 57 }, { "completion_length": 708.8584022521973, "epoch": 0.0755761869859109, "grad_norm": 0.006270533427596092, "kl": 0.006121635437011719, "learning_rate": 7.532467532467533e-06, "loss": 0.0002, "reward": 1.157434392720461, "reward_std": 0.24472255888395011, "rewards/accuracy_reward": 0.5204081498086452, "rewards/semantic_entropy_math_reward": 0.6370262205600739, "step": 58 }, { "completion_length": 677.4030475616455, "epoch": 0.07687922469256454, "grad_norm": 0.006590430624783039, "kl": 0.005878448486328125, "learning_rate": 7.662337662337663e-06, "loss": 0.0002, "reward": 1.1669095940887928, "reward_std": 0.20257475320249796, "rewards/accuracy_reward": 0.46683672128710896, "rewards/semantic_entropy_math_reward": 0.7000728715211153, "step": 59 }, { "completion_length": 707.8966674804688, "epoch": 0.07818226239921817, "grad_norm": 0.00708845816552639, "kl": 0.005919456481933594, "learning_rate": 7.792207792207793e-06, "loss": 0.0002, "reward": 1.129008736461401, "reward_std": 0.24630564730614424, "rewards/accuracy_reward": 0.4617346879094839, "rewards/semantic_entropy_math_reward": 0.6672740392386913, "step": 60 }, { "completion_length": 684.0229454040527, "epoch": 0.07948530010587182, "grad_norm": 0.005163955502212048, "kl": 0.005597114562988281, "learning_rate": 7.922077922077924e-06, "loss": 0.0002, "reward": 1.2408891879022121, "reward_std": 0.2254499546252191, "rewards/accuracy_reward": 0.5484693786129355, "rewards/semantic_entropy_math_reward": 0.6924197934567928, "step": 61 }, { "completion_length": 706.8915710449219, "epoch": 0.08078833781252545, "grad_norm": 0.00732272956520319, "kl": 0.0057201385498046875, "learning_rate": 8.051948051948052e-06, "loss": 0.0002, "reward": 1.1681851595640182, "reward_std": 0.24154668161645532, "rewards/accuracy_reward": 0.5063775405287743, "rewards/semantic_entropy_math_reward": 0.6618075594305992, "step": 62 }, { "completion_length": 723.9043197631836, "epoch": 0.08209137551917908, "grad_norm": 0.007626292295753956, "kl": 0.005852699279785156, "learning_rate": 8.181818181818183e-06, "loss": 0.0002, "reward": 1.111880462616682, "reward_std": 0.24433484650216997, "rewards/accuracy_reward": 0.4515306018292904, "rewards/semantic_entropy_math_reward": 0.6603498365730047, "step": 63 }, { "completion_length": 685.8379974365234, "epoch": 0.08339441322583273, "grad_norm": 0.007170382887125015, "kl": 0.006209373474121094, "learning_rate": 8.311688311688313e-06, "loss": 0.0002, "reward": 1.1762026064097881, "reward_std": 0.24945610132999718, "rewards/accuracy_reward": 0.506377542973496, "rewards/semantic_entropy_math_reward": 0.6698250770568848, "step": 64 }, { "completion_length": 680.1504917144775, "epoch": 0.08469745093248636, "grad_norm": 0.0058379219844937325, "kl": 0.005846977233886719, "learning_rate": 8.441558441558442e-06, "loss": 0.0002, "reward": 1.183673482388258, "reward_std": 0.20530655985930935, "rewards/accuracy_reward": 0.49234692100435495, "rewards/semantic_entropy_math_reward": 0.6913265231996775, "step": 65 }, { "completion_length": 729.7436065673828, "epoch": 0.08600048863913999, "grad_norm": 0.0060434285551309586, "kl": 0.005603790283203125, "learning_rate": 8.571428571428571e-06, "loss": 0.0002, "reward": 1.1290087476372719, "reward_std": 0.24698832631111145, "rewards/accuracy_reward": 0.4719387646764517, "rewards/semantic_entropy_math_reward": 0.6570699643343687, "step": 66 }, { "completion_length": 762.172176361084, "epoch": 0.08730352634579364, "grad_norm": 0.006218090653419495, "kl": 0.004633903503417969, "learning_rate": 8.701298701298701e-06, "loss": 0.0002, "reward": 1.0672376081347466, "reward_std": 0.20497948792763054, "rewards/accuracy_reward": 0.4374999860301614, "rewards/semantic_entropy_math_reward": 0.6297375969588757, "step": 67 }, { "completion_length": 821.9221801757812, "epoch": 0.08860656405244727, "grad_norm": 0.0054735965095460415, "kl": 0.004580497741699219, "learning_rate": 8.831168831168832e-06, "loss": 0.0002, "reward": 1.0165816433727741, "reward_std": 0.21466172579675913, "rewards/accuracy_reward": 0.38903060369193554, "rewards/semantic_entropy_math_reward": 0.6275510117411613, "step": 68 }, { "completion_length": 773.0267658233643, "epoch": 0.0899096017591009, "grad_norm": 0.00755363330245018, "kl": 0.006371498107910156, "learning_rate": 8.96103896103896e-06, "loss": 0.0003, "reward": 1.1849489472806454, "reward_std": 0.24870864278636873, "rewards/accuracy_reward": 0.49617346189916134, "rewards/semantic_entropy_math_reward": 0.6887755021452904, "step": 69 }, { "completion_length": 771.0675868988037, "epoch": 0.09121263946575454, "grad_norm": 0.008616450242698193, "kl": 0.00557708740234375, "learning_rate": 9.090909090909091e-06, "loss": 0.0002, "reward": 1.1115160323679447, "reward_std": 0.27686935919336975, "rewards/accuracy_reward": 0.4795918297022581, "rewards/semantic_entropy_math_reward": 0.6319242026656866, "step": 70 }, { "completion_length": 847.4808464050293, "epoch": 0.09251567717240818, "grad_norm": 0.017135659232735634, "kl": 0.004952430725097656, "learning_rate": 9.220779220779221e-06, "loss": 0.0002, "reward": 1.1049562320113182, "reward_std": 0.23575837165117264, "rewards/accuracy_reward": 0.47959183156490326, "rewards/semantic_entropy_math_reward": 0.625364413484931, "step": 71 }, { "completion_length": 791.7283020019531, "epoch": 0.09381871487906181, "grad_norm": 0.006112267728894949, "kl": 0.004946708679199219, "learning_rate": 9.350649350649352e-06, "loss": 0.0002, "reward": 1.055575791746378, "reward_std": 0.21914992504753172, "rewards/accuracy_reward": 0.4221938671544194, "rewards/semantic_entropy_math_reward": 0.6333819087594748, "step": 72 }, { "completion_length": 761.0497398376465, "epoch": 0.09512175258571545, "grad_norm": 0.007089715451002121, "kl": 0.0054779052734375, "learning_rate": 9.48051948051948e-06, "loss": 0.0002, "reward": 1.069241989403963, "reward_std": 0.24503467418253422, "rewards/accuracy_reward": 0.44642856530845165, "rewards/semantic_entropy_math_reward": 0.622813418507576, "step": 73 }, { "completion_length": 703.5420818328857, "epoch": 0.09642479029236908, "grad_norm": 0.00609151367098093, "kl": 0.006160736083984375, "learning_rate": 9.610389610389611e-06, "loss": 0.0002, "reward": 1.204992700368166, "reward_std": 0.2674535112455487, "rewards/accuracy_reward": 0.5420918259769678, "rewards/semantic_entropy_math_reward": 0.6629008762538433, "step": 74 }, { "completion_length": 770.8252372741699, "epoch": 0.09772782799902271, "grad_norm": 0.005476623307913542, "kl": 0.005555152893066406, "learning_rate": 9.740259740259742e-06, "loss": 0.0002, "reward": 1.0958454981446266, "reward_std": 0.2314315689727664, "rewards/accuracy_reward": 0.4336734600365162, "rewards/semantic_entropy_math_reward": 0.6621719971299171, "step": 75 }, { "completion_length": 768.9056015014648, "epoch": 0.09903086570567636, "grad_norm": 0.009125325828790665, "kl": 0.00666046142578125, "learning_rate": 9.87012987012987e-06, "loss": 0.0003, "reward": 1.1446792744100094, "reward_std": 0.22204886749386787, "rewards/accuracy_reward": 0.48979590740054846, "rewards/semantic_entropy_math_reward": 0.6548833660781384, "step": 76 }, { "completion_length": 797.0293197631836, "epoch": 0.10033390341232999, "grad_norm": 0.0064984578639268875, "kl": 0.005627632141113281, "learning_rate": 1e-05, "loss": 0.0002, "reward": 1.0490160211920738, "reward_std": 0.23936751019209623, "rewards/accuracy_reward": 0.4196428470313549, "rewards/semantic_entropy_math_reward": 0.6293731536716223, "step": 77 }, { "completion_length": 736.6721744537354, "epoch": 0.10163694111898364, "grad_norm": 0.006235254462808371, "kl": 0.00672149658203125, "learning_rate": 9.999948174819623e-06, "loss": 0.0003, "reward": 1.1306486949324608, "reward_std": 0.26312900334596634, "rewards/accuracy_reward": 0.4655612129718065, "rewards/semantic_entropy_math_reward": 0.6650874465703964, "step": 78 }, { "completion_length": 711.0140151977539, "epoch": 0.10293997882563727, "grad_norm": 0.010870655067265034, "kl": 0.007700920104980469, "learning_rate": 9.999792700352826e-06, "loss": 0.0003, "reward": 1.1301020421087742, "reward_std": 0.26513777766376734, "rewards/accuracy_reward": 0.4795918222516775, "rewards/semantic_entropy_math_reward": 0.6505102030932903, "step": 79 }, { "completion_length": 727.3035545349121, "epoch": 0.1042430165322909, "grad_norm": 0.00799601711332798, "kl": 0.010301589965820312, "learning_rate": 9.999533579822611e-06, "loss": 0.0004, "reward": 1.1239066869020462, "reward_std": 0.2541914158500731, "rewards/accuracy_reward": 0.46173468604683876, "rewards/semantic_entropy_math_reward": 0.6621719934046268, "step": 80 }, { "completion_length": 807.3596820831299, "epoch": 0.10554605423894454, "grad_norm": 0.006246128585189581, "kl": 0.0056362152099609375, "learning_rate": 9.999170818600562e-06, "loss": 0.0002, "reward": 1.1827623546123505, "reward_std": 0.23597976006567478, "rewards/accuracy_reward": 0.5038265259936452, "rewards/semantic_entropy_math_reward": 0.6789358705282211, "step": 81 }, { "completion_length": 728.6020240783691, "epoch": 0.10684909194559818, "grad_norm": 0.00840029027312994, "kl": 0.007867813110351562, "learning_rate": 9.998704424206747e-06, "loss": 0.0003, "reward": 1.1078717038035393, "reward_std": 0.2740425090305507, "rewards/accuracy_reward": 0.4744897875934839, "rewards/semantic_entropy_math_reward": 0.6333819050341845, "step": 82 }, { "completion_length": 747.1441192626953, "epoch": 0.10815212965225181, "grad_norm": 0.007311175111681223, "kl": 0.006852149963378906, "learning_rate": 9.998134406309555e-06, "loss": 0.0003, "reward": 1.1171647161245346, "reward_std": 0.229319516394753, "rewards/accuracy_reward": 0.4706632550805807, "rewards/semantic_entropy_math_reward": 0.6465014219284058, "step": 83 }, { "completion_length": 780.1785621643066, "epoch": 0.10945516735890545, "grad_norm": 0.007676111999899149, "kl": 0.005618095397949219, "learning_rate": 9.997460776725497e-06, "loss": 0.0002, "reward": 1.1324708238244057, "reward_std": 0.23473380913492292, "rewards/accuracy_reward": 0.45790815772488713, "rewards/semantic_entropy_math_reward": 0.6745626777410507, "step": 84 }, { "completion_length": 756.9502372741699, "epoch": 0.11075820506555908, "grad_norm": 0.006404530722647905, "kl": 0.0058383941650390625, "learning_rate": 9.996683549418964e-06, "loss": 0.0002, "reward": 1.0763483680784702, "reward_std": 0.2944581815972924, "rewards/accuracy_reward": 0.4706632597371936, "rewards/semantic_entropy_math_reward": 0.6056851223111153, "step": 85 }, { "completion_length": 712.3609504699707, "epoch": 0.11206124277221272, "grad_norm": 0.0070160492323338985, "kl": 0.0070285797119140625, "learning_rate": 9.995802740501933e-06, "loss": 0.0003, "reward": 1.074708454310894, "reward_std": 0.2228950799908489, "rewards/accuracy_reward": 0.4311224361881614, "rewards/semantic_entropy_math_reward": 0.6435859780758619, "step": 86 }, { "completion_length": 740.5918350219727, "epoch": 0.11336428047886636, "grad_norm": 0.007996910251677036, "kl": 0.007010459899902344, "learning_rate": 9.994818368233639e-06, "loss": 0.0003, "reward": 1.1594387665390968, "reward_std": 0.25375659577548504, "rewards/accuracy_reward": 0.5012754946947098, "rewards/semantic_entropy_math_reward": 0.6581632513552904, "step": 87 }, { "completion_length": 677.190034866333, "epoch": 0.11466731818551999, "grad_norm": 0.0055174194276332855, "kl": 0.006993293762207031, "learning_rate": 9.993730453020187e-06, "loss": 0.0003, "reward": 1.192237600684166, "reward_std": 0.22190570668317378, "rewards/accuracy_reward": 0.5012754965573549, "rewards/semantic_entropy_math_reward": 0.690962091088295, "step": 88 }, { "completion_length": 732.2283020019531, "epoch": 0.11597035589217362, "grad_norm": 0.0052662077359855175, "kl": 0.0077686309814453125, "learning_rate": 9.99253901741414e-06, "loss": 0.0003, "reward": 1.16472302749753, "reward_std": 0.22657143790274858, "rewards/accuracy_reward": 0.4795918259769678, "rewards/semantic_entropy_math_reward": 0.6851311828941107, "step": 89 }, { "completion_length": 725.0816097259521, "epoch": 0.11727339359882727, "grad_norm": 0.005457949358969927, "kl": 0.006816864013671875, "learning_rate": 9.991244086114046e-06, "loss": 0.0003, "reward": 1.1350218504667282, "reward_std": 0.2144629955291748, "rewards/accuracy_reward": 0.46301019564270973, "rewards/semantic_entropy_math_reward": 0.6720116473734379, "step": 90 }, { "completion_length": 713.6632537841797, "epoch": 0.1185764313054809, "grad_norm": 0.006386930122971535, "kl": 0.006923675537109375, "learning_rate": 9.989845685963917e-06, "loss": 0.0003, "reward": 1.2295917980372906, "reward_std": 0.2634762260131538, "rewards/accuracy_reward": 0.5433673374354839, "rewards/semantic_entropy_math_reward": 0.6862244680523872, "step": 91 }, { "completion_length": 750.0395202636719, "epoch": 0.11987946901213453, "grad_norm": 0.005538599099963903, "kl": 0.0076656341552734375, "learning_rate": 9.988343845952697e-06, "loss": 0.0003, "reward": 1.1725582890212536, "reward_std": 0.20838514482602477, "rewards/accuracy_reward": 0.49107141699641943, "rewards/semantic_entropy_math_reward": 0.681486863642931, "step": 92 }, { "completion_length": 798.3979377746582, "epoch": 0.12118250671878818, "grad_norm": 0.005649934522807598, "kl": 0.006389617919921875, "learning_rate": 9.986738597213633e-06, "loss": 0.0003, "reward": 1.1459547989070415, "reward_std": 0.2266796543262899, "rewards/accuracy_reward": 0.4961734563112259, "rewards/semantic_entropy_math_reward": 0.6497813202440739, "step": 93 }, { "completion_length": 775.1683578491211, "epoch": 0.12248554442544181, "grad_norm": 0.0056807007640600204, "kl": 0.0063762664794921875, "learning_rate": 9.98502997302365e-06, "loss": 0.0003, "reward": 1.17620262876153, "reward_std": 0.24629885656759143, "rewards/accuracy_reward": 0.5140306014800444, "rewards/semantic_entropy_math_reward": 0.6621719859540462, "step": 94 }, { "completion_length": 690.7142639160156, "epoch": 0.12378858213209545, "grad_norm": 0.006946495268493891, "kl": 0.008794784545898438, "learning_rate": 9.983218008802648e-06, "loss": 0.0004, "reward": 1.230867337435484, "reward_std": 0.2220900694373995, "rewards/accuracy_reward": 0.5420918259769678, "rewards/semantic_entropy_math_reward": 0.6887755058705807, "step": 95 }, { "completion_length": 745.7142753601074, "epoch": 0.12509161983874909, "grad_norm": 0.005949975922703743, "kl": 0.0076656341552734375, "learning_rate": 9.98130274211278e-06, "loss": 0.0003, "reward": 1.1672740317881107, "reward_std": 0.22445385437458754, "rewards/accuracy_reward": 0.5051020309329033, "rewards/semantic_entropy_math_reward": 0.6621719934046268, "step": 96 }, { "completion_length": 710.3328971862793, "epoch": 0.12639465754540272, "grad_norm": 0.005919306073337793, "kl": 0.008236885070800781, "learning_rate": 9.979284212657658e-06, "loss": 0.0003, "reward": 1.1614431515336037, "reward_std": 0.24563464429229498, "rewards/accuracy_reward": 0.4795918297022581, "rewards/semantic_entropy_math_reward": 0.6818512957543135, "step": 97 }, { "completion_length": 770.6938629150391, "epoch": 0.12769769525205635, "grad_norm": 0.005864322185516357, "kl": 0.007312774658203125, "learning_rate": 9.977162462281544e-06, "loss": 0.0003, "reward": 1.1680029034614563, "reward_std": 0.2181876003742218, "rewards/accuracy_reward": 0.505102027207613, "rewards/semantic_entropy_math_reward": 0.6629008483141661, "step": 98 }, { "completion_length": 728.6862125396729, "epoch": 0.12900073295870998, "grad_norm": 0.007728188764303923, "kl": 0.009893417358398438, "learning_rate": 9.97493753496848e-06, "loss": 0.0004, "reward": 1.1860422529280186, "reward_std": 0.2188318921253085, "rewards/accuracy_reward": 0.48852040339261293, "rewards/semantic_entropy_math_reward": 0.6975218579173088, "step": 99 }, { "completion_length": 774.9145202636719, "epoch": 0.13030377066536364, "grad_norm": 0.005985771771520376, "kl": 0.008409500122070312, "learning_rate": 9.972609476841368e-06, "loss": 0.0003, "reward": 1.1155247502028942, "reward_std": 0.2292306530289352, "rewards/accuracy_reward": 0.47959183249622583, "rewards/semantic_entropy_math_reward": 0.635932931676507, "step": 100 }, { "completion_length": 760.4540634155273, "epoch": 0.13160680837201727, "grad_norm": 0.008021770976483822, "kl": 0.00884246826171875, "learning_rate": 9.970178336161018e-06, "loss": 0.0004, "reward": 1.1248177662491798, "reward_std": 0.1914176745340228, "rewards/accuracy_reward": 0.47831631638109684, "rewards/semantic_entropy_math_reward": 0.6465014442801476, "step": 101 }, { "completion_length": 677.7091636657715, "epoch": 0.1329098460786709, "grad_norm": 0.007061308715492487, "kl": 0.010564804077148438, "learning_rate": 9.967644163325157e-06, "loss": 0.0004, "reward": 1.2334183529019356, "reward_std": 0.2896037925966084, "rewards/accuracy_reward": 0.5548469256609678, "rewards/semantic_entropy_math_reward": 0.6785714030265808, "step": 102 }, { "completion_length": 720.9017639160156, "epoch": 0.13421288378532453, "grad_norm": 0.007274166215211153, "kl": 0.008790969848632812, "learning_rate": 9.965007010867366e-06, "loss": 0.0004, "reward": 1.0907434448599815, "reward_std": 0.26489617116749287, "rewards/accuracy_reward": 0.4642857052385807, "rewards/semantic_entropy_math_reward": 0.6264577116817236, "step": 103 }, { "completion_length": 741.5803394317627, "epoch": 0.13551592149197816, "grad_norm": 0.006455298978835344, "kl": 0.009304046630859375, "learning_rate": 9.962266933456008e-06, "loss": 0.0004, "reward": 1.112791534513235, "reward_std": 0.20856561261462048, "rewards/accuracy_reward": 0.4604591727256775, "rewards/semantic_entropy_math_reward": 0.6523323319852352, "step": 104 }, { "completion_length": 693.9948921203613, "epoch": 0.13681895919863182, "grad_norm": 0.006805942859500647, "kl": 0.00899505615234375, "learning_rate": 9.959423987893086e-06, "loss": 0.0004, "reward": 1.2904518954455853, "reward_std": 0.19335565413348377, "rewards/accuracy_reward": 0.5561224389821291, "rewards/semantic_entropy_math_reward": 0.7343294061720371, "step": 105 }, { "completion_length": 696.8188667297363, "epoch": 0.13812199690528545, "grad_norm": 0.005381173919886351, "kl": 0.008138656616210938, "learning_rate": 9.956478233113066e-06, "loss": 0.0003, "reward": 1.2348760776221752, "reward_std": 0.1950448069255799, "rewards/accuracy_reward": 0.5471938662230968, "rewards/semantic_entropy_math_reward": 0.6876822039484978, "step": 106 }, { "completion_length": 725.4081420898438, "epoch": 0.13942503461193909, "grad_norm": 0.006238647736608982, "kl": 0.008451461791992188, "learning_rate": 9.953429730181653e-06, "loss": 0.0003, "reward": 1.1971573941409588, "reward_std": 0.21925521024968475, "rewards/accuracy_reward": 0.5025510098785162, "rewards/semantic_entropy_math_reward": 0.6946063935756683, "step": 107 }, { "completion_length": 735.8915596008301, "epoch": 0.14072807231859272, "grad_norm": 0.006119231693446636, "kl": 0.0101318359375, "learning_rate": 9.95027854229454e-06, "loss": 0.0004, "reward": 1.0717929862439632, "reward_std": 0.30305144749581814, "rewards/accuracy_reward": 0.45918366499245167, "rewards/semantic_entropy_math_reward": 0.6126093342900276, "step": 108 }, { "completion_length": 714.9578971862793, "epoch": 0.14203111002524635, "grad_norm": 0.0058670626021921635, "kl": 0.009630203247070312, "learning_rate": 9.947024734776076e-06, "loss": 0.0004, "reward": 1.1654518991708755, "reward_std": 0.20603651460260153, "rewards/accuracy_reward": 0.4617346813902259, "rewards/semantic_entropy_math_reward": 0.7037171721458435, "step": 109 }, { "completion_length": 737.3609523773193, "epoch": 0.14333414773189998, "grad_norm": 0.0065796938724815845, "kl": 0.010492324829101562, "learning_rate": 9.943668375077926e-06, "loss": 0.0004, "reward": 1.0865524765104055, "reward_std": 0.24795687710866332, "rewards/accuracy_reward": 0.44770407548639923, "rewards/semantic_entropy_math_reward": 0.6388483829796314, "step": 110 }, { "completion_length": 771.7091636657715, "epoch": 0.14463718543855364, "grad_norm": 0.006352514028549194, "kl": 0.009588241577148438, "learning_rate": 9.940209532777666e-06, "loss": 0.0004, "reward": 1.1995262280106544, "reward_std": 0.2844707975164056, "rewards/accuracy_reward": 0.5318877408280969, "rewards/semantic_entropy_math_reward": 0.6676384750753641, "step": 111 }, { "completion_length": 715.2691173553467, "epoch": 0.14594022314520727, "grad_norm": 0.00713386619463563, "kl": 0.0105438232421875, "learning_rate": 9.93664827957735e-06, "loss": 0.0004, "reward": 1.1946064122021198, "reward_std": 0.2418018700554967, "rewards/accuracy_reward": 0.5025510154664516, "rewards/semantic_entropy_math_reward": 0.6920553874224424, "step": 112 }, { "completion_length": 738.9527931213379, "epoch": 0.1472432608518609, "grad_norm": 0.00607394939288497, "kl": 0.010150909423828125, "learning_rate": 9.932984689302012e-06, "loss": 0.0004, "reward": 1.1397594884037971, "reward_std": 0.253134717233479, "rewards/accuracy_reward": 0.4936224380508065, "rewards/semantic_entropy_math_reward": 0.6461370103061199, "step": 113 }, { "completion_length": 723.4846839904785, "epoch": 0.14854629855851453, "grad_norm": 0.0067606037482619286, "kl": 0.010318756103515625, "learning_rate": 9.929218837898143e-06, "loss": 0.0004, "reward": 1.0701530389487743, "reward_std": 0.22500360454432666, "rewards/accuracy_reward": 0.4119897875934839, "rewards/semantic_entropy_math_reward": 0.6581632569432259, "step": 114 }, { "completion_length": 791.5063648223877, "epoch": 0.14984933626516816, "grad_norm": 0.007142480928450823, "kl": 0.0089569091796875, "learning_rate": 9.925350803432112e-06, "loss": 0.0004, "reward": 1.124817781150341, "reward_std": 0.23698072507977486, "rewards/accuracy_reward": 0.4783163219690323, "rewards/semantic_entropy_math_reward": 0.6465014442801476, "step": 115 }, { "completion_length": 689.8826446533203, "epoch": 0.1511523739718218, "grad_norm": 0.006536894012242556, "kl": 0.0104522705078125, "learning_rate": 9.921380666088558e-06, "loss": 0.0004, "reward": 1.1958819068968296, "reward_std": 0.2152222206350416, "rewards/accuracy_reward": 0.4961734600365162, "rewards/semantic_entropy_math_reward": 0.6997084431350231, "step": 116 }, { "completion_length": 771.0216636657715, "epoch": 0.15245541167847546, "grad_norm": 0.00589456083253026, "kl": 0.010189056396484375, "learning_rate": 9.917308508168712e-06, "loss": 0.0004, "reward": 1.219023309648037, "reward_std": 0.22552279522642493, "rewards/accuracy_reward": 0.5255101881921291, "rewards/semantic_entropy_math_reward": 0.6935130879282951, "step": 117 }, { "completion_length": 743.1377449035645, "epoch": 0.1537584493851291, "grad_norm": 0.006651968229562044, "kl": 0.009731292724609375, "learning_rate": 9.913134414088698e-06, "loss": 0.0004, "reward": 1.1268221661448479, "reward_std": 0.19260207808110863, "rewards/accuracy_reward": 0.464285702444613, "rewards/semantic_entropy_math_reward": 0.6625364348292351, "step": 118 }, { "completion_length": 699.6390113830566, "epoch": 0.15506148709178272, "grad_norm": 0.008722213096916676, "kl": 0.013345718383789062, "learning_rate": 9.908858470377793e-06, "loss": 0.0005, "reward": 1.0730684995651245, "reward_std": 0.2153183789923787, "rewards/accuracy_reward": 0.40433672815561295, "rewards/semantic_entropy_math_reward": 0.668731739744544, "step": 119 }, { "completion_length": 750.123706817627, "epoch": 0.15636452479843635, "grad_norm": 0.00737353228032589, "kl": 0.011110305786132812, "learning_rate": 9.904480765676617e-06, "loss": 0.0004, "reward": 1.14322155341506, "reward_std": 0.27635968942195177, "rewards/accuracy_reward": 0.4948979504406452, "rewards/semantic_entropy_math_reward": 0.6483236011117697, "step": 120 }, { "completion_length": 765.6798324584961, "epoch": 0.15766756250508998, "grad_norm": 0.0055794911459088326, "kl": 0.010175704956054688, "learning_rate": 9.9000013907353e-06, "loss": 0.0004, "reward": 1.1423104964196682, "reward_std": 0.21809990890324116, "rewards/accuracy_reward": 0.47576529532670975, "rewards/semantic_entropy_math_reward": 0.6665451861917973, "step": 121 }, { "completion_length": 731.644115447998, "epoch": 0.15897060021174364, "grad_norm": 0.007198716513812542, "kl": 0.011560440063476562, "learning_rate": 9.895420438411616e-06, "loss": 0.0005, "reward": 1.0765306167304516, "reward_std": 0.24987194733694196, "rewards/accuracy_reward": 0.44387754471972585, "rewards/semantic_entropy_math_reward": 0.6326530482620001, "step": 122 }, { "completion_length": 767.0816116333008, "epoch": 0.16027363791839727, "grad_norm": 0.005763104185461998, "kl": 0.010362625122070312, "learning_rate": 9.890738003669029e-06, "loss": 0.0004, "reward": 1.2033527493476868, "reward_std": 0.24556104559451342, "rewards/accuracy_reward": 0.5433673355728388, "rewards/semantic_entropy_math_reward": 0.6599854193627834, "step": 123 }, { "completion_length": 725.8902931213379, "epoch": 0.1615766756250509, "grad_norm": 0.009651200845837593, "kl": 0.012035369873046875, "learning_rate": 9.885954183574753e-06, "loss": 0.0005, "reward": 1.0985786989331245, "reward_std": 0.21907822508364916, "rewards/accuracy_reward": 0.4477040749043226, "rewards/semantic_entropy_math_reward": 0.6508746184408665, "step": 124 }, { "completion_length": 730.7640132904053, "epoch": 0.16287971333170453, "grad_norm": 0.008391596376895905, "kl": 0.013095855712890625, "learning_rate": 9.881069077297724e-06, "loss": 0.0005, "reward": 1.120809007436037, "reward_std": 0.2568598391953856, "rewards/accuracy_reward": 0.48596938140690327, "rewards/semantic_entropy_math_reward": 0.6348396353423595, "step": 125 }, { "completion_length": 731.2283039093018, "epoch": 0.16418275103835817, "grad_norm": 0.009345101192593575, "kl": 0.013418197631835938, "learning_rate": 9.876082786106546e-06, "loss": 0.0005, "reward": 1.1614431589841843, "reward_std": 0.2437346400693059, "rewards/accuracy_reward": 0.5204081479460001, "rewards/semantic_entropy_math_reward": 0.6410349830985069, "step": 126 }, { "completion_length": 810.7856960296631, "epoch": 0.1654857887450118, "grad_norm": 0.006475828588008881, "kl": 0.0144500732421875, "learning_rate": 9.870995413367397e-06, "loss": 0.0006, "reward": 1.134475201368332, "reward_std": 0.21247584954835474, "rewards/accuracy_reward": 0.46428570337593555, "rewards/semantic_entropy_math_reward": 0.6701894905418158, "step": 127 }, { "completion_length": 756.459171295166, "epoch": 0.16678882645166546, "grad_norm": 0.007593442685902119, "kl": 0.017543792724609375, "learning_rate": 9.865807064541878e-06, "loss": 0.0007, "reward": 1.1639941595494747, "reward_std": 0.2205681074410677, "rewards/accuracy_reward": 0.5025510154664516, "rewards/semantic_entropy_math_reward": 0.6614431031048298, "step": 128 }, { "completion_length": 790.0433578491211, "epoch": 0.1680918641583191, "grad_norm": 0.006996171083301306, "kl": 0.017299652099609375, "learning_rate": 9.860517847184837e-06, "loss": 0.0007, "reward": 1.1056851521134377, "reward_std": 0.2734520174562931, "rewards/accuracy_reward": 0.49489794857800007, "rewards/semantic_entropy_math_reward": 0.6107871476560831, "step": 129 }, { "completion_length": 833.7206420898438, "epoch": 0.16939490186497272, "grad_norm": 0.008687090128660202, "kl": 0.016979217529296875, "learning_rate": 9.855127870942131e-06, "loss": 0.0007, "reward": 1.0849125161767006, "reward_std": 0.2633068151772022, "rewards/accuracy_reward": 0.4846938634291291, "rewards/semantic_entropy_math_reward": 0.6002186480909586, "step": 130 }, { "completion_length": 855.3520278930664, "epoch": 0.17069793957162635, "grad_norm": 0.0072541856206953526, "kl": 0.021335601806640625, "learning_rate": 9.849637247548356e-06, "loss": 0.0009, "reward": 1.1249999813735485, "reward_std": 0.2602782789617777, "rewards/accuracy_reward": 0.48469386994838715, "rewards/semantic_entropy_math_reward": 0.6403061002492905, "step": 131 }, { "completion_length": 802.4795684814453, "epoch": 0.17200097727827998, "grad_norm": 0.006454484537243843, "kl": 0.024652481079101562, "learning_rate": 9.844046090824533e-06, "loss": 0.001, "reward": 1.1264577023684978, "reward_std": 0.2818593478295952, "rewards/accuracy_reward": 0.4872448882088065, "rewards/semantic_entropy_math_reward": 0.6392128095030785, "step": 132 }, { "completion_length": 730.4987144470215, "epoch": 0.1733040149849336, "grad_norm": 0.0065299877896904945, "kl": 0.0326385498046875, "learning_rate": 9.83835451667574e-06, "loss": 0.0013, "reward": 1.229956228286028, "reward_std": 0.22485816711559892, "rewards/accuracy_reward": 0.5459183547645807, "rewards/semantic_entropy_math_reward": 0.6840378846973181, "step": 133 }, { "completion_length": 791.2231979370117, "epoch": 0.17460705269158727, "grad_norm": 0.007751519791781902, "kl": 0.0498809814453125, "learning_rate": 9.832562643088724e-06, "loss": 0.002, "reward": 1.1494168862700462, "reward_std": 0.24367713811807334, "rewards/accuracy_reward": 0.5204081535339355, "rewards/semantic_entropy_math_reward": 0.6290087401866913, "step": 134 }, { "completion_length": 848.2780418395996, "epoch": 0.1759100903982409, "grad_norm": 0.006717671174556017, "kl": 0.07633209228515625, "learning_rate": 9.826670590129442e-06, "loss": 0.0031, "reward": 1.0566690973937511, "reward_std": 0.17720634723082185, "rewards/accuracy_reward": 0.44515305012464523, "rewards/semantic_entropy_math_reward": 0.6115160081535578, "step": 135 }, { "completion_length": 822.2372245788574, "epoch": 0.17721312810489454, "grad_norm": 0.0061748516745865345, "kl": 0.0682830810546875, "learning_rate": 9.820678479940573e-06, "loss": 0.0027, "reward": 1.063228864222765, "reward_std": 0.19930668687447906, "rewards/accuracy_reward": 0.4170918297022581, "rewards/semantic_entropy_math_reward": 0.6461370214819908, "step": 136 }, { "completion_length": 861.7295761108398, "epoch": 0.17851616581154817, "grad_norm": 0.005776498932391405, "kl": 0.055389404296875, "learning_rate": 9.814586436738998e-06, "loss": 0.0022, "reward": 1.1244533248245716, "reward_std": 0.25053994404152036, "rewards/accuracy_reward": 0.5038265194743872, "rewards/semantic_entropy_math_reward": 0.620626812800765, "step": 137 }, { "completion_length": 780.7550773620605, "epoch": 0.1798192035182018, "grad_norm": 0.0057667335495352745, "kl": 0.04409027099609375, "learning_rate": 9.808394586813209e-06, "loss": 0.0018, "reward": 1.158709891140461, "reward_std": 0.24468013271689415, "rewards/accuracy_reward": 0.521683668717742, "rewards/semantic_entropy_math_reward": 0.6370262112468481, "step": 138 }, { "completion_length": 745.9706478118896, "epoch": 0.18112224122485546, "grad_norm": 0.004698788281530142, "kl": 0.034824371337890625, "learning_rate": 9.802103058520704e-06, "loss": 0.0014, "reward": 1.2505466304719448, "reward_std": 0.21848437120206654, "rewards/accuracy_reward": 0.5548469331115484, "rewards/semantic_entropy_math_reward": 0.6956996973603964, "step": 139 }, { "completion_length": 699.2576446533203, "epoch": 0.1824252789315091, "grad_norm": 0.006096448749303818, "kl": 0.03365325927734375, "learning_rate": 9.795711982285317e-06, "loss": 0.0013, "reward": 1.200437281280756, "reward_std": 0.25988480169326067, "rewards/accuracy_reward": 0.5229591764509678, "rewards/semantic_entropy_math_reward": 0.6774781178683043, "step": 140 }, { "completion_length": 754.3443698883057, "epoch": 0.18372831663816272, "grad_norm": 0.005306684877723455, "kl": 0.030506134033203125, "learning_rate": 9.78922149059452e-06, "loss": 0.0012, "reward": 1.1568877287209034, "reward_std": 0.2266323179937899, "rewards/accuracy_reward": 0.4681122386828065, "rewards/semantic_entropy_math_reward": 0.6887754928320646, "step": 141 }, { "completion_length": 680.1568698883057, "epoch": 0.18503135434481635, "grad_norm": 0.008213664405047894, "kl": 0.02951812744140625, "learning_rate": 9.782631717996675e-06, "loss": 0.0012, "reward": 1.2224853932857513, "reward_std": 0.19257353967987, "rewards/accuracy_reward": 0.503826517611742, "rewards/semantic_entropy_math_reward": 0.7186588756740093, "step": 142 }, { "completion_length": 736.8226928710938, "epoch": 0.18633439205146998, "grad_norm": 0.00466786976903677, "kl": 0.03533935546875, "learning_rate": 9.775942801098241e-06, "loss": 0.0014, "reward": 1.2230320572853088, "reward_std": 0.162883774144575, "rewards/accuracy_reward": 0.5204081498086452, "rewards/semantic_entropy_math_reward": 0.7026238739490509, "step": 143 }, { "completion_length": 842.9999847412109, "epoch": 0.18763742975812361, "grad_norm": 0.009062922559678555, "kl": 0.035717010498046875, "learning_rate": 9.76915487856095e-06, "loss": 0.0014, "reward": 1.100947517901659, "reward_std": 0.24765546899288893, "rewards/accuracy_reward": 0.4617346841841936, "rewards/semantic_entropy_math_reward": 0.6392128188163042, "step": 144 }, { "completion_length": 715.4017753601074, "epoch": 0.18894046746477727, "grad_norm": 0.006133963353931904, "kl": 0.036640167236328125, "learning_rate": 9.762268091098926e-06, "loss": 0.0015, "reward": 1.1989795677363873, "reward_std": 0.18175394041463733, "rewards/accuracy_reward": 0.4846938643604517, "rewards/semantic_entropy_math_reward": 0.714285708963871, "step": 145 }, { "completion_length": 723.8086605072021, "epoch": 0.1902435051714309, "grad_norm": 0.004137095529586077, "kl": 0.022266387939453125, "learning_rate": 9.755282581475769e-06, "loss": 0.0009, "reward": 1.242893572896719, "reward_std": 0.18324366473825648, "rewards/accuracy_reward": 0.5140306036919355, "rewards/semantic_entropy_math_reward": 0.7288629487156868, "step": 146 }, { "completion_length": 747.6900329589844, "epoch": 0.19154654287808454, "grad_norm": 0.004832756239920855, "kl": 0.029720306396484375, "learning_rate": 9.748198494501598e-06, "loss": 0.0012, "reward": 1.2485422678291798, "reward_std": 0.21063116868026555, "rewards/accuracy_reward": 0.5331632494926453, "rewards/semantic_entropy_math_reward": 0.7153789922595024, "step": 147 }, { "completion_length": 726.1045799255371, "epoch": 0.19284958058473817, "grad_norm": 0.0063225822523236275, "kl": 0.035915374755859375, "learning_rate": 9.741015977030046e-06, "loss": 0.0014, "reward": 1.1594387628138065, "reward_std": 0.26789093925617635, "rewards/accuracy_reward": 0.5012755002826452, "rewards/semantic_entropy_math_reward": 0.6581632625311613, "step": 148 }, { "completion_length": 723.3124771118164, "epoch": 0.1941526182913918, "grad_norm": 0.005702209193259478, "kl": 0.03449249267578125, "learning_rate": 9.733735177955219e-06, "loss": 0.0014, "reward": 1.2328716926276684, "reward_std": 0.22931992495432496, "rewards/accuracy_reward": 0.5153061151504517, "rewards/semantic_entropy_math_reward": 0.7175655961036682, "step": 149 }, { "completion_length": 747.2027835845947, "epoch": 0.19545565599804543, "grad_norm": 0.008735745213925838, "kl": 0.03743743896484375, "learning_rate": 9.72635624820861e-06, "loss": 0.0015, "reward": 1.1514212638139725, "reward_std": 0.24103434477001429, "rewards/accuracy_reward": 0.49617346189916134, "rewards/semantic_entropy_math_reward": 0.6552477888762951, "step": 150 }, { "completion_length": 750.0637550354004, "epoch": 0.1967586937046991, "grad_norm": 0.0072867595590651035, "kl": 0.041553497314453125, "learning_rate": 9.71887934075596e-06, "loss": 0.0017, "reward": 1.082361489534378, "reward_std": 0.23858080920763314, "rewards/accuracy_reward": 0.4464285681024194, "rewards/semantic_entropy_math_reward": 0.635932931676507, "step": 151 }, { "completion_length": 812.7907962799072, "epoch": 0.19806173141135272, "grad_norm": 0.0060833231545984745, "kl": 0.04632568359375, "learning_rate": 9.711304610594104e-06, "loss": 0.0019, "reward": 1.087645784020424, "reward_std": 0.24173568130936474, "rewards/accuracy_reward": 0.47066325321793556, "rewards/semantic_entropy_math_reward": 0.6169824711978436, "step": 152 }, { "completion_length": 776.985954284668, "epoch": 0.19936476911800635, "grad_norm": 0.005852220579981804, "kl": 0.05850982666015625, "learning_rate": 9.703632214747742e-06, "loss": 0.0023, "reward": 1.0922011584043503, "reward_std": 0.26367821014719084, "rewards/accuracy_reward": 0.464285702444613, "rewards/semantic_entropy_math_reward": 0.6279154270887375, "step": 153 }, { "completion_length": 761.9055938720703, "epoch": 0.20066780682465998, "grad_norm": 0.0070666056126356125, "kl": 0.07137680053710938, "learning_rate": 9.695862312266195e-06, "loss": 0.0029, "reward": 1.174198243767023, "reward_std": 0.24346035765483975, "rewards/accuracy_reward": 0.5076530482620001, "rewards/semantic_entropy_math_reward": 0.6665451787412167, "step": 154 }, { "completion_length": 818.8711585998535, "epoch": 0.20197084453131361, "grad_norm": 0.006547208875417709, "kl": 0.0753173828125, "learning_rate": 9.687995064220102e-06, "loss": 0.003, "reward": 1.027514562010765, "reward_std": 0.2596653157379478, "rewards/accuracy_reward": 0.42729590833187103, "rewards/semantic_entropy_math_reward": 0.6002186369150877, "step": 155 }, { "completion_length": 800.0127334594727, "epoch": 0.20327388223796727, "grad_norm": 0.0065093012526631355, "kl": 0.0836181640625, "learning_rate": 9.680030633698083e-06, "loss": 0.0033, "reward": 1.127004373818636, "reward_std": 0.25171618862077594, "rewards/accuracy_reward": 0.483418358489871, "rewards/semantic_entropy_math_reward": 0.6435860004276037, "step": 156 }, { "completion_length": 801.2257423400879, "epoch": 0.2045769199446209, "grad_norm": 0.009425929747521877, "kl": 0.107269287109375, "learning_rate": 9.671969185803357e-06, "loss": 0.0043, "reward": 1.0307944528758526, "reward_std": 0.2676779478788376, "rewards/accuracy_reward": 0.41454081051051617, "rewards/semantic_entropy_math_reward": 0.6162536218762398, "step": 157 }, { "completion_length": 780.9183521270752, "epoch": 0.20587995765127454, "grad_norm": 0.00592389702796936, "kl": 0.085113525390625, "learning_rate": 9.66381088765032e-06, "loss": 0.0034, "reward": 1.1452259421348572, "reward_std": 0.2685589883476496, "rewards/accuracy_reward": 0.5267857052385807, "rewards/semantic_entropy_math_reward": 0.6184402164071798, "step": 158 }, { "completion_length": 835.7104415893555, "epoch": 0.20718299535792817, "grad_norm": 0.006882313173264265, "kl": 0.10106658935546875, "learning_rate": 9.65555590836108e-06, "loss": 0.004, "reward": 1.0805393531918526, "reward_std": 0.31268593668937683, "rewards/accuracy_reward": 0.48469386994838715, "rewards/semantic_entropy_math_reward": 0.5958454832434654, "step": 159 }, { "completion_length": 678.399227142334, "epoch": 0.2084860330645818, "grad_norm": 0.007788940332829952, "kl": 0.06047821044921875, "learning_rate": 9.647204419061957e-06, "loss": 0.0024, "reward": 1.3197886049747467, "reward_std": 0.23159618326462805, "rewards/accuracy_reward": 0.5956632513552904, "rewards/semantic_entropy_math_reward": 0.7241253443062305, "step": 160 }, { "completion_length": 693.2359523773193, "epoch": 0.20978907077123543, "grad_norm": 0.004874850623309612, "kl": 0.06043243408203125, "learning_rate": 9.638756592879923e-06, "loss": 0.0024, "reward": 1.2104591578245163, "reward_std": 0.21325235278345644, "rewards/accuracy_reward": 0.5063775395974517, "rewards/semantic_entropy_math_reward": 0.7040816154330969, "step": 161 }, { "completion_length": 801.1670761108398, "epoch": 0.2110921084778891, "grad_norm": 0.005604679696261883, "kl": 0.07735252380371094, "learning_rate": 9.630212604939026e-06, "loss": 0.0031, "reward": 1.1643586084246635, "reward_std": 0.23105582501739264, "rewards/accuracy_reward": 0.497448967769742, "rewards/semantic_entropy_math_reward": 0.6669096201658249, "step": 162 }, { "completion_length": 761.9119720458984, "epoch": 0.21239514618454272, "grad_norm": 0.008337643928825855, "kl": 0.08140182495117188, "learning_rate": 9.621572632356754e-06, "loss": 0.0033, "reward": 1.1495991200208664, "reward_std": 0.2650262452661991, "rewards/accuracy_reward": 0.4783163219690323, "rewards/semantic_entropy_math_reward": 0.6712827757000923, "step": 163 }, { "completion_length": 819.7907981872559, "epoch": 0.21369818389119635, "grad_norm": 0.005642049014568329, "kl": 0.06630325317382812, "learning_rate": 9.61283685424036e-06, "loss": 0.0027, "reward": 1.1328352876007557, "reward_std": 0.28228158690035343, "rewards/accuracy_reward": 0.483418358489871, "rewards/semantic_entropy_math_reward": 0.649416895583272, "step": 164 }, { "completion_length": 765.8902854919434, "epoch": 0.21500122159784998, "grad_norm": 0.005377195309847593, "kl": 0.0746612548828125, "learning_rate": 9.604005451683154e-06, "loss": 0.003, "reward": 1.1798469256609678, "reward_std": 0.23891328554600477, "rewards/accuracy_reward": 0.5012755021452904, "rewards/semantic_entropy_math_reward": 0.6785714086145163, "step": 165 }, { "completion_length": 785.1313591003418, "epoch": 0.21630425930450362, "grad_norm": 0.005879529751837254, "kl": 0.091949462890625, "learning_rate": 9.59507860776075e-06, "loss": 0.0037, "reward": 1.063411083072424, "reward_std": 0.2700031818822026, "rewards/accuracy_reward": 0.4362244810909033, "rewards/semantic_entropy_math_reward": 0.6271865535527468, "step": 166 }, { "completion_length": 765.9374809265137, "epoch": 0.21760729701115725, "grad_norm": 0.005799470003694296, "kl": 0.07891082763671875, "learning_rate": 9.586056507527266e-06, "loss": 0.0032, "reward": 1.1459548026323318, "reward_std": 0.22966598463244736, "rewards/accuracy_reward": 0.5012755002826452, "rewards/semantic_entropy_math_reward": 0.64467928186059, "step": 167 }, { "completion_length": 731.075231552124, "epoch": 0.2189103347178109, "grad_norm": 0.00807306356728077, "kl": 0.09747314453125, "learning_rate": 9.57693933801149e-06, "loss": 0.0039, "reward": 1.0706996954977512, "reward_std": 0.29186924546957016, "rewards/accuracy_reward": 0.4668367262929678, "rewards/semantic_entropy_math_reward": 0.6038629673421383, "step": 168 }, { "completion_length": 775.2283096313477, "epoch": 0.22021337242446454, "grad_norm": 0.008333943784236908, "kl": 0.084716796875, "learning_rate": 9.567727288213005e-06, "loss": 0.0034, "reward": 1.0974854081869125, "reward_std": 0.29219970013946295, "rewards/accuracy_reward": 0.47576529160141945, "rewards/semantic_entropy_math_reward": 0.6217200867831707, "step": 169 }, { "completion_length": 763.2652816772461, "epoch": 0.22151641013111817, "grad_norm": 0.006703939288854599, "kl": 0.1121978759765625, "learning_rate": 9.558420549098269e-06, "loss": 0.0045, "reward": 1.0852769613265991, "reward_std": 0.29799219965934753, "rewards/accuracy_reward": 0.47193877026438713, "rewards/semantic_entropy_math_reward": 0.613338178023696, "step": 170 }, { "completion_length": 748.6556015014648, "epoch": 0.2228194478377718, "grad_norm": 0.005565381608903408, "kl": 0.11814117431640625, "learning_rate": 9.549019313596652e-06, "loss": 0.0047, "reward": 1.238338191062212, "reward_std": 0.3297378746792674, "rewards/accuracy_reward": 0.5816326476633549, "rewards/semantic_entropy_math_reward": 0.6567055210471153, "step": 171 }, { "completion_length": 797.4744644165039, "epoch": 0.22412248554442543, "grad_norm": 0.007363718934357166, "kl": 0.12139892578125, "learning_rate": 9.539523776596446e-06, "loss": 0.0049, "reward": 1.020590366795659, "reward_std": 0.2655470692552626, "rewards/accuracy_reward": 0.42984693218022585, "rewards/semantic_entropy_math_reward": 0.5907434113323689, "step": 172 }, { "completion_length": 795.8405437469482, "epoch": 0.2254255232510791, "grad_norm": 0.0047944397665560246, "kl": 0.10210418701171875, "learning_rate": 9.529934134940819e-06, "loss": 0.0041, "reward": 1.1086005605757236, "reward_std": 0.24513375293463469, "rewards/accuracy_reward": 0.46428570710122585, "rewards/semantic_entropy_math_reward": 0.6443148553371429, "step": 173 }, { "completion_length": 850.6096706390381, "epoch": 0.22672856095773272, "grad_norm": 0.0056090704165399075, "kl": 0.1551513671875, "learning_rate": 9.520250587423733e-06, "loss": 0.0062, "reward": 1.0730684958398342, "reward_std": 0.26498850155621767, "rewards/accuracy_reward": 0.4732142724096775, "rewards/semantic_entropy_math_reward": 0.5998542197048664, "step": 174 }, { "completion_length": 856.496150970459, "epoch": 0.22803159866438635, "grad_norm": 0.0057459184899926186, "kl": 0.1326141357421875, "learning_rate": 9.510473334785828e-06, "loss": 0.0053, "reward": 1.0280612297356129, "reward_std": 0.3099081702530384, "rewards/accuracy_reward": 0.4387755049392581, "rewards/semantic_entropy_math_reward": 0.5892857033759356, "step": 175 }, { "completion_length": 811.6032943725586, "epoch": 0.22933463637103998, "grad_norm": 0.005281019024550915, "kl": 0.131805419921875, "learning_rate": 9.500602579710256e-06, "loss": 0.0053, "reward": 1.058126799762249, "reward_std": 0.2955819093622267, "rewards/accuracy_reward": 0.4706632550805807, "rewards/semantic_entropy_math_reward": 0.5874635279178619, "step": 176 }, { "completion_length": 817.4170799255371, "epoch": 0.23063767407769362, "grad_norm": 0.010456100106239319, "kl": 0.162567138671875, "learning_rate": 9.490638526818482e-06, "loss": 0.0065, "reward": 1.013848403468728, "reward_std": 0.272765482775867, "rewards/accuracy_reward": 0.4183673458173871, "rewards/semantic_entropy_math_reward": 0.5954810436815023, "step": 177 }, { "completion_length": 792.9438591003418, "epoch": 0.23194071178434725, "grad_norm": 0.0055226427502930164, "kl": 0.1409759521484375, "learning_rate": 9.480581382666041e-06, "loss": 0.0056, "reward": 1.1080539338290691, "reward_std": 0.288189803250134, "rewards/accuracy_reward": 0.4885204005986452, "rewards/semantic_entropy_math_reward": 0.6195334941148758, "step": 178 }, { "completion_length": 795.8469314575195, "epoch": 0.2332437494910009, "grad_norm": 0.023137899115681648, "kl": 0.161651611328125, "learning_rate": 9.470431355738257e-06, "loss": 0.0065, "reward": 1.0402696579694748, "reward_std": 0.29460147581994534, "rewards/accuracy_reward": 0.4426020346581936, "rewards/semantic_entropy_math_reward": 0.5976676307618618, "step": 179 }, { "completion_length": 781.4515190124512, "epoch": 0.23454678719765454, "grad_norm": 0.006625569891184568, "kl": 0.1616058349609375, "learning_rate": 9.460188656445921e-06, "loss": 0.0065, "reward": 1.0306122172623873, "reward_std": 0.2928948850603774, "rewards/accuracy_reward": 0.4489795845001936, "rewards/semantic_entropy_math_reward": 0.5816326308995485, "step": 180 }, { "completion_length": 781.4272651672363, "epoch": 0.23584982490430817, "grad_norm": 0.006482996046543121, "kl": 0.175018310546875, "learning_rate": 9.449853497120928e-06, "loss": 0.007, "reward": 1.0770772248506546, "reward_std": 0.23921551229432225, "rewards/accuracy_reward": 0.46045917365700006, "rewards/semantic_entropy_math_reward": 0.6166180670261383, "step": 181 }, { "completion_length": 818.1045799255371, "epoch": 0.2371528626109618, "grad_norm": 0.007163103669881821, "kl": 0.166778564453125, "learning_rate": 9.439426092011877e-06, "loss": 0.0067, "reward": 1.0295189432799816, "reward_std": 0.25230085477232933, "rewards/accuracy_reward": 0.44642856158316135, "rewards/semantic_entropy_math_reward": 0.583090340718627, "step": 182 }, { "completion_length": 810.9170742034912, "epoch": 0.23845590031761543, "grad_norm": 0.006750667933374643, "kl": 0.208953857421875, "learning_rate": 9.428906657279629e-06, "loss": 0.0084, "reward": 0.9462463557720184, "reward_std": 0.36967444978654385, "rewards/accuracy_reward": 0.42219386994838715, "rewards/semantic_entropy_math_reward": 0.5240524746477604, "step": 183 }, { "completion_length": 879.8252296447754, "epoch": 0.23975893802426906, "grad_norm": 0.008494066074490547, "kl": 0.2569580078125, "learning_rate": 9.418295410992821e-06, "loss": 0.0103, "reward": 0.9077988397330046, "reward_std": 0.35520277731120586, "rewards/accuracy_reward": 0.4107142835855484, "rewards/semantic_entropy_math_reward": 0.4970845356583595, "step": 184 }, { "completion_length": 799.8048400878906, "epoch": 0.24106197573092272, "grad_norm": 0.008310383185744286, "kl": 0.253021240234375, "learning_rate": 9.407592573123359e-06, "loss": 0.0101, "reward": 1.030794434249401, "reward_std": 0.2828732808120549, "rewards/accuracy_reward": 0.44260203558951616, "rewards/semantic_entropy_math_reward": 0.5881923921406269, "step": 185 }, { "completion_length": 781.0637626647949, "epoch": 0.24236501343757635, "grad_norm": 0.006934929173439741, "kl": 0.23968505859375, "learning_rate": 9.396798365541841e-06, "loss": 0.0096, "reward": 1.0470116510987282, "reward_std": 0.3275171648710966, "rewards/accuracy_reward": 0.47704081051051617, "rewards/semantic_entropy_math_reward": 0.5699708182364702, "step": 186 }, { "completion_length": 870.1798286437988, "epoch": 0.24366805114422999, "grad_norm": 0.00771751906722784, "kl": 0.261871337890625, "learning_rate": 9.385913012012972e-06, "loss": 0.0105, "reward": 0.8799198027700186, "reward_std": 0.31368094123899937, "rewards/accuracy_reward": 0.3762755021452904, "rewards/semantic_entropy_math_reward": 0.5036442968994379, "step": 187 }, { "completion_length": 928.5956420898438, "epoch": 0.24497108885088362, "grad_norm": 0.01331934705376625, "kl": 0.356719970703125, "learning_rate": 9.374936738190913e-06, "loss": 0.0143, "reward": 0.7877186611294746, "reward_std": 0.2927225036546588, "rewards/accuracy_reward": 0.3431122386828065, "rewards/semantic_entropy_math_reward": 0.4446063945069909, "step": 188 }, { "completion_length": 946.2767677307129, "epoch": 0.24627412655753725, "grad_norm": 0.010896523483097553, "kl": 0.39837646484375, "learning_rate": 9.363869771614615e-06, "loss": 0.0159, "reward": 0.6652696877717972, "reward_std": 0.33899027574807405, "rewards/accuracy_reward": 0.29209183249622583, "rewards/semantic_entropy_math_reward": 0.3731778487563133, "step": 189 }, { "completion_length": 928.4145202636719, "epoch": 0.2475771642641909, "grad_norm": 0.02287294715642929, "kl": 0.415771484375, "learning_rate": 9.35271234170309e-06, "loss": 0.0166, "reward": 0.6279154475778341, "reward_std": 0.29416987765580416, "rewards/accuracy_reward": 0.27040815772488713, "rewards/semantic_entropy_math_reward": 0.3575072707608342, "step": 190 }, { "completion_length": 827.9030532836914, "epoch": 0.24888020197084454, "grad_norm": 0.018528223037719727, "kl": 0.29278564453125, "learning_rate": 9.341464679750669e-06, "loss": 0.0117, "reward": 0.7835276890546083, "reward_std": 0.3102852921001613, "rewards/accuracy_reward": 0.35969387041404843, "rewards/semantic_entropy_math_reward": 0.42383380234241486, "step": 191 }, { "completion_length": 659.3813591003418, "epoch": 0.25018323967749817, "grad_norm": 0.04590199142694473, "kl": 0.14002227783203125, "learning_rate": 9.330127018922195e-06, "loss": 0.0056, "reward": 1.020408146083355, "reward_std": 0.28293703217059374, "rewards/accuracy_reward": 0.4438775386661291, "rewards/semantic_entropy_math_reward": 0.5765306036919355, "step": 192 }, { "completion_length": 676.0114631652832, "epoch": 0.2514862773841518, "grad_norm": 0.01132090575993061, "kl": 0.124298095703125, "learning_rate": 9.318699594248192e-06, "loss": 0.005, "reward": 1.1732871271669865, "reward_std": 0.29400589130818844, "rewards/accuracy_reward": 0.5267857033759356, "rewards/semantic_entropy_math_reward": 0.6465014424175024, "step": 193 }, { "completion_length": 742.0369815826416, "epoch": 0.25278931509080543, "grad_norm": 0.014985255897045135, "kl": 0.15325927734375, "learning_rate": 9.307182642620001e-06, "loss": 0.0061, "reward": 1.0814504213631153, "reward_std": 0.23736674501560628, "rewards/accuracy_reward": 0.45790815725922585, "rewards/semantic_entropy_math_reward": 0.6235422510653734, "step": 194 }, { "completion_length": 672.864782333374, "epoch": 0.25409235279745906, "grad_norm": 0.006124918349087238, "kl": 0.116668701171875, "learning_rate": 9.295576402784858e-06, "loss": 0.0047, "reward": 1.2483600601553917, "reward_std": 0.20744502503657714, "rewards/accuracy_reward": 0.5676020234823227, "rewards/semantic_entropy_math_reward": 0.6807579919695854, "step": 195 }, { "completion_length": 675.103307723999, "epoch": 0.2553953905041127, "grad_norm": 0.014673289842903614, "kl": 0.140838623046875, "learning_rate": 9.283881115340957e-06, "loss": 0.0056, "reward": 1.1621719971299171, "reward_std": 0.2733052815310657, "rewards/accuracy_reward": 0.5153061170130968, "rewards/semantic_entropy_math_reward": 0.6468658819794655, "step": 196 }, { "completion_length": 707.6339149475098, "epoch": 0.2566984282107663, "grad_norm": 0.007380802650004625, "kl": 0.15087890625, "learning_rate": 9.272097022732444e-06, "loss": 0.006, "reward": 1.1036807261407375, "reward_std": 0.2723709491547197, "rewards/accuracy_reward": 0.4757652971893549, "rewards/semantic_entropy_math_reward": 0.6279154419898987, "step": 197 }, { "completion_length": 732.8915672302246, "epoch": 0.25800146591741996, "grad_norm": 0.005643486976623535, "kl": 0.1605072021484375, "learning_rate": 9.260224369244414e-06, "loss": 0.0064, "reward": 1.1782069951295853, "reward_std": 0.24145470699295402, "rewards/accuracy_reward": 0.5229591727256775, "rewards/semantic_entropy_math_reward": 0.6552478093653917, "step": 198 }, { "completion_length": 810.7703838348389, "epoch": 0.25930450362407365, "grad_norm": 0.008388770744204521, "kl": 0.1985015869140625, "learning_rate": 9.248263400997826e-06, "loss": 0.0079, "reward": 1.0490160323679447, "reward_std": 0.2694431249983609, "rewards/accuracy_reward": 0.4655612139031291, "rewards/semantic_entropy_math_reward": 0.5834547970443964, "step": 199 }, { "completion_length": 771.8073883056641, "epoch": 0.2606075413307273, "grad_norm": 0.009374907240271568, "kl": 0.2144317626953125, "learning_rate": 9.236214365944418e-06, "loss": 0.0086, "reward": 1.0131195411086082, "reward_std": 0.3031772803515196, "rewards/accuracy_reward": 0.4515306046232581, "rewards/semantic_entropy_math_reward": 0.5615889132022858, "step": 200 }, { "completion_length": 888.7461624145508, "epoch": 0.2619105790373809, "grad_norm": 0.009298073127865791, "kl": 0.3131103515625, "learning_rate": 9.224077513861556e-06, "loss": 0.0125, "reward": 0.8573250602930784, "reward_std": 0.2961276541464031, "rewards/accuracy_reward": 0.3762755040079355, "rewards/semantic_entropy_math_reward": 0.48104955069720745, "step": 201 }, { "completion_length": 984.1389999389648, "epoch": 0.26321361674403454, "grad_norm": 0.009315317496657372, "kl": 0.3388671875, "learning_rate": 9.211853096347059e-06, "loss": 0.0136, "reward": 0.7221209928393364, "reward_std": 0.3085676170885563, "rewards/accuracy_reward": 0.2997448928654194, "rewards/semantic_entropy_math_reward": 0.42237607575953007, "step": 202 }, { "completion_length": 785.2270278930664, "epoch": 0.26451665445068817, "grad_norm": 0.007693823426961899, "kl": 0.243255615234375, "learning_rate": 9.199541366813984e-06, "loss": 0.0097, "reward": 0.8866618033498526, "reward_std": 0.30504412204027176, "rewards/accuracy_reward": 0.38775509246625006, "rewards/semantic_entropy_math_reward": 0.4989066794514656, "step": 203 }, { "completion_length": 917.8214073181152, "epoch": 0.2658196921573418, "grad_norm": 0.012152981013059616, "kl": 0.41571044921875, "learning_rate": 9.18714258048537e-06, "loss": 0.0166, "reward": 0.6989795733243227, "reward_std": 0.3281674478203058, "rewards/accuracy_reward": 0.30357142456341535, "rewards/semantic_entropy_math_reward": 0.3954081516712904, "step": 204 }, { "completion_length": 893.0548362731934, "epoch": 0.26712272986399543, "grad_norm": 0.011389151215553284, "kl": 0.4075927734375, "learning_rate": 9.174656994388957e-06, "loss": 0.0163, "reward": 0.7057215757668018, "reward_std": 0.26561030512675643, "rewards/accuracy_reward": 0.2997448956593871, "rewards/semantic_entropy_math_reward": 0.4059766624122858, "step": 205 }, { "completion_length": 966.760181427002, "epoch": 0.26842576757064907, "grad_norm": 0.01193250436335802, "kl": 0.53192138671875, "learning_rate": 9.16208486735184e-06, "loss": 0.0213, "reward": 0.608053925447166, "reward_std": 0.2656254874309525, "rewards/accuracy_reward": 0.2691326476633549, "rewards/semantic_entropy_math_reward": 0.3389212694019079, "step": 206 }, { "completion_length": 975.917064666748, "epoch": 0.2697288052773027, "grad_norm": 0.0132168373093009, "kl": 0.5753173828125, "learning_rate": 9.149426459995127e-06, "loss": 0.023, "reward": 0.5061953347176313, "reward_std": 0.21771353296935558, "rewards/accuracy_reward": 0.21683673199731857, "rewards/semantic_entropy_math_reward": 0.28935858234763145, "step": 207 }, { "completion_length": 879.3915634155273, "epoch": 0.27103184298395633, "grad_norm": 0.029789483174681664, "kl": 0.4744873046875, "learning_rate": 9.136682034728508e-06, "loss": 0.019, "reward": 0.5475583160296082, "reward_std": 0.25839823111891747, "rewards/accuracy_reward": 0.24362244422081858, "rewards/semantic_entropy_math_reward": 0.30393584445118904, "step": 208 }, { "completion_length": 745.2920761108398, "epoch": 0.27233488069060996, "grad_norm": 0.02544447034597397, "kl": 0.4510498046875, "learning_rate": 9.123851855744842e-06, "loss": 0.0181, "reward": 0.44132653810083866, "reward_std": 0.24406332708895206, "rewards/accuracy_reward": 0.18367346562445164, "rewards/semantic_entropy_math_reward": 0.2576530510559678, "step": 209 }, { "completion_length": 683.6696281433105, "epoch": 0.27363791839726365, "grad_norm": 0.018655141815543175, "kl": 0.4075927734375, "learning_rate": 9.110936189014668e-06, "loss": 0.0163, "reward": 0.5041909739375114, "reward_std": 0.30082805175334215, "rewards/accuracy_reward": 0.22576530324295163, "rewards/semantic_entropy_math_reward": 0.27842564787715673, "step": 210 }, { "completion_length": 845.8596801757812, "epoch": 0.2749409561039173, "grad_norm": 0.01907339319586754, "kl": 0.43994140625, "learning_rate": 9.097935302280682e-06, "loss": 0.0176, "reward": 0.4688411084935069, "reward_std": 0.24678992317058146, "rewards/accuracy_reward": 0.20280611934140325, "rewards/semantic_entropy_math_reward": 0.2660349737852812, "step": 211 }, { "completion_length": 885.5165672302246, "epoch": 0.2762439938105709, "grad_norm": 0.013909995555877686, "kl": 0.460693359375, "learning_rate": 9.08484946505221e-06, "loss": 0.0184, "reward": 0.5127551052719355, "reward_std": 0.31098792888224125, "rewards/accuracy_reward": 0.23979591438546777, "rewards/semantic_entropy_math_reward": 0.27295917458832264, "step": 212 }, { "completion_length": 1026.4170837402344, "epoch": 0.27754703151722454, "grad_norm": 0.024753449484705925, "kl": 0.47998046875, "learning_rate": 9.0716789485996e-06, "loss": 0.0192, "reward": 0.4948979653418064, "reward_std": 0.26596762728877366, "rewards/accuracy_reward": 0.21173469291534275, "rewards/semantic_entropy_math_reward": 0.28316326066851616, "step": 213 }, { "completion_length": 1187.3163032531738, "epoch": 0.27885006922387817, "grad_norm": 0.019836200401186943, "kl": 0.60009765625, "learning_rate": 9.058424025948609e-06, "loss": 0.024, "reward": 0.3478498673066497, "reward_std": 0.18847463186830282, "rewards/accuracy_reward": 0.12372448807582259, "rewards/semantic_entropy_math_reward": 0.2241253610700369, "step": 214 }, { "completion_length": 1080.1020011901855, "epoch": 0.2801531069305318, "grad_norm": 0.019383462145924568, "kl": 0.5845947265625, "learning_rate": 9.045084971874738e-06, "loss": 0.0234, "reward": 0.3770043794065714, "reward_std": 0.2038659998215735, "rewards/accuracy_reward": 0.1466836710460484, "rewards/semantic_entropy_math_reward": 0.23032070137560368, "step": 215 }, { "completion_length": 801.683666229248, "epoch": 0.28145614463718543, "grad_norm": 0.029816076159477234, "kl": 0.42236328125, "learning_rate": 9.03166206289754e-06, "loss": 0.0169, "reward": 0.4369533499702811, "reward_std": 0.2877592113800347, "rewards/accuracy_reward": 0.18877550726756454, "rewards/semantic_entropy_math_reward": 0.24817784130573273, "step": 216 }, { "completion_length": 807.4183464050293, "epoch": 0.28275918234383907, "grad_norm": 0.03795782849192619, "kl": 0.480712890625, "learning_rate": 9.018155577274891e-06, "loss": 0.0192, "reward": 0.3022959250956774, "reward_std": 0.23153662728145719, "rewards/accuracy_reward": 0.11607142619322985, "rewards/semantic_entropy_math_reward": 0.18622449226677418, "step": 217 }, { "completion_length": 704.5816230773926, "epoch": 0.2840622200504927, "grad_norm": 0.05182367190718651, "kl": 0.4271240234375, "learning_rate": 9.004565794997209e-06, "loss": 0.0171, "reward": 0.4528061356395483, "reward_std": 0.33819583617150784, "rewards/accuracy_reward": 0.207908159121871, "rewards/semantic_entropy_math_reward": 0.24489795416593552, "step": 218 }, { "completion_length": 614.1147766113281, "epoch": 0.28536525775714633, "grad_norm": 0.060941848903894424, "kl": 0.2412109375, "learning_rate": 8.990892997781661e-06, "loss": 0.0096, "reward": 0.8545918464660645, "reward_std": 0.43180758878588676, "rewards/accuracy_reward": 0.4005101937800646, "rewards/semantic_entropy_math_reward": 0.45408161357045174, "step": 219 }, { "completion_length": 647.7180938720703, "epoch": 0.28666829546379996, "grad_norm": 0.024072663858532906, "kl": 0.21038818359375, "learning_rate": 8.977137469066321e-06, "loss": 0.0084, "reward": 0.9593658708035946, "reward_std": 0.34469017502851784, "rewards/accuracy_reward": 0.4196428470313549, "rewards/semantic_entropy_math_reward": 0.5397230237722397, "step": 220 }, { "completion_length": 609.443868637085, "epoch": 0.28797133317045365, "grad_norm": 0.020358948037028313, "kl": 0.129730224609375, "learning_rate": 8.963299494004292e-06, "loss": 0.0052, "reward": 1.057397935539484, "reward_std": 0.2887597740627825, "rewards/accuracy_reward": 0.43749999068677425, "rewards/semantic_entropy_math_reward": 0.6198979448527098, "step": 221 }, { "completion_length": 544.8635120391846, "epoch": 0.2892743708771073, "grad_norm": 0.009551750496029854, "kl": 0.075653076171875, "learning_rate": 8.949379359457795e-06, "loss": 0.003, "reward": 1.1594387702643871, "reward_std": 0.22266102768480778, "rewards/accuracy_reward": 0.48596938233822584, "rewards/semantic_entropy_math_reward": 0.6734693832695484, "step": 222 }, { "completion_length": 546.9017734527588, "epoch": 0.2905774085837609, "grad_norm": 0.007235002238303423, "kl": 0.0719757080078125, "learning_rate": 8.935377353992222e-06, "loss": 0.0029, "reward": 1.278243437409401, "reward_std": 0.18785033567110077, "rewards/accuracy_reward": 0.5242346841841936, "rewards/semantic_entropy_math_reward": 0.7540087513625622, "step": 223 }, { "completion_length": 527.7027931213379, "epoch": 0.29188044629041454, "grad_norm": 0.009255875833332539, "kl": 0.056182861328125, "learning_rate": 8.921293767870157e-06, "loss": 0.0022, "reward": 1.1880466267466545, "reward_std": 0.20094707608222961, "rewards/accuracy_reward": 0.49489795230329037, "rewards/semantic_entropy_math_reward": 0.6931486818939447, "step": 224 }, { "completion_length": 457.13647270202637, "epoch": 0.2931834839970682, "grad_norm": 0.007916360162198544, "kl": 0.0529327392578125, "learning_rate": 8.907128893045359e-06, "loss": 0.0021, "reward": 1.3616982363164425, "reward_std": 0.17598704434931278, "rewards/accuracy_reward": 0.5880101956427097, "rewards/semantic_entropy_math_reward": 0.7736880145967007, "step": 225 }, { "completion_length": 555.8443775177002, "epoch": 0.2944865217037218, "grad_norm": 0.008898014202713966, "kl": 0.05047607421875, "learning_rate": 8.892883023156703e-06, "loss": 0.002, "reward": 1.1475947462022305, "reward_std": 0.2232738654129207, "rewards/accuracy_reward": 0.4668367262929678, "rewards/semantic_entropy_math_reward": 0.6807580031454563, "step": 226 }, { "completion_length": 560.799732208252, "epoch": 0.29578955941037544, "grad_norm": 0.007520818617194891, "kl": 0.04727935791015625, "learning_rate": 8.8785564535221e-06, "loss": 0.0019, "reward": 1.172376073896885, "reward_std": 0.22141354344785213, "rewards/accuracy_reward": 0.49999999068677425, "rewards/semantic_entropy_math_reward": 0.6723760813474655, "step": 227 }, { "completion_length": 596.4106998443604, "epoch": 0.29709259711702907, "grad_norm": 0.005198197904974222, "kl": 0.045074462890625, "learning_rate": 8.86414948113237e-06, "loss": 0.0018, "reward": 1.2208454497158527, "reward_std": 0.19248850626172498, "rewards/accuracy_reward": 0.49999998696148396, "rewards/semantic_entropy_math_reward": 0.7208454608917236, "step": 228 }, { "completion_length": 611.8137607574463, "epoch": 0.2983956348236827, "grad_norm": 0.006569060496985912, "kl": 0.04053497314453125, "learning_rate": 8.849662404645097e-06, "loss": 0.0016, "reward": 1.1729227229952812, "reward_std": 0.19189510913565755, "rewards/accuracy_reward": 0.48086733650416136, "rewards/semantic_entropy_math_reward": 0.6920553836971521, "step": 229 }, { "completion_length": 612.554838180542, "epoch": 0.29969867253033633, "grad_norm": 0.007635242771357298, "kl": 0.0565643310546875, "learning_rate": 8.835095524378413e-06, "loss": 0.0023, "reward": 1.1787536330521107, "reward_std": 0.19869894511066377, "rewards/accuracy_reward": 0.48341835755854845, "rewards/semantic_entropy_math_reward": 0.6953352577984333, "step": 230 }, { "completion_length": 633.1300868988037, "epoch": 0.30100171023698996, "grad_norm": 0.005504283122718334, "kl": 0.05850982666015625, "learning_rate": 8.820449142304805e-06, "loss": 0.0023, "reward": 1.1858600415289402, "reward_std": 0.2176875094883144, "rewards/accuracy_reward": 0.5076530538499355, "rewards/semantic_entropy_math_reward": 0.6782069858163595, "step": 231 }, { "completion_length": 676.4196357727051, "epoch": 0.3023047479436436, "grad_norm": 0.005811599548906088, "kl": 0.05633544921875, "learning_rate": 8.805723562044825e-06, "loss": 0.0023, "reward": 1.1568877398967743, "reward_std": 0.260601872112602, "rewards/accuracy_reward": 0.4834183566272259, "rewards/semantic_entropy_math_reward": 0.6734693758189678, "step": 232 }, { "completion_length": 631.2423324584961, "epoch": 0.3036077856502973, "grad_norm": 0.007872718386352062, "kl": 0.07729339599609375, "learning_rate": 8.790919088860815e-06, "loss": 0.0031, "reward": 1.1752915494143963, "reward_std": 0.2167319026775658, "rewards/accuracy_reward": 0.4948979504406452, "rewards/semantic_entropy_math_reward": 0.680393585935235, "step": 233 }, { "completion_length": 714.4158020019531, "epoch": 0.3049108233569509, "grad_norm": 0.008222694508731365, "kl": 0.084075927734375, "learning_rate": 8.776036029650573e-06, "loss": 0.0034, "reward": 1.188046634197235, "reward_std": 0.26914450991898775, "rewards/accuracy_reward": 0.5204081451520324, "rewards/semantic_entropy_math_reward": 0.6676384601742029, "step": 234 }, { "completion_length": 716.6300868988037, "epoch": 0.30621386106360454, "grad_norm": 0.005068425554782152, "kl": 0.08300018310546875, "learning_rate": 8.76107469294099e-06, "loss": 0.0033, "reward": 1.1251822225749493, "reward_std": 0.24908931832760572, "rewards/accuracy_reward": 0.4859693832695484, "rewards/semantic_entropy_math_reward": 0.6392127983272076, "step": 235 }, { "completion_length": 809.0420684814453, "epoch": 0.3075168987702582, "grad_norm": 0.005393457133322954, "kl": 0.09809112548828125, "learning_rate": 8.746035388881655e-06, "loss": 0.0039, "reward": 1.09365888312459, "reward_std": 0.2978491976391524, "rewards/accuracy_reward": 0.46173468697816133, "rewards/semantic_entropy_math_reward": 0.6319241859018803, "step": 236 }, { "completion_length": 828.0663185119629, "epoch": 0.3088199364769118, "grad_norm": 0.005808001384139061, "kl": 0.11925506591796875, "learning_rate": 8.730918429238429e-06, "loss": 0.0048, "reward": 1.070153046399355, "reward_std": 0.2782040755264461, "rewards/accuracy_reward": 0.45535713620483875, "rewards/semantic_entropy_math_reward": 0.614795908331871, "step": 237 }, { "completion_length": 796.0969276428223, "epoch": 0.31012297418356544, "grad_norm": 0.005220044869929552, "kl": 0.128173828125, "learning_rate": 8.715724127386971e-06, "loss": 0.0051, "reward": 1.1650874465703964, "reward_std": 0.261374767171219, "rewards/accuracy_reward": 0.5025510173290968, "rewards/semantic_entropy_math_reward": 0.6625364292412996, "step": 238 }, { "completion_length": 760.284423828125, "epoch": 0.31142601189021907, "grad_norm": 0.00538071570917964, "kl": 0.111114501953125, "learning_rate": 8.70045279830626e-06, "loss": 0.0044, "reward": 1.1386661753058434, "reward_std": 0.28979678824543953, "rewards/accuracy_reward": 0.4961734600365162, "rewards/semantic_entropy_math_reward": 0.6424926947802305, "step": 239 }, { "completion_length": 870.5318756103516, "epoch": 0.3127290495968727, "grad_norm": 0.005206842441111803, "kl": 0.13873291015625, "learning_rate": 8.685104758572047e-06, "loss": 0.0056, "reward": 1.0601311959326267, "reward_std": 0.220444563543424, "rewards/accuracy_reward": 0.4489795807749033, "rewards/semantic_entropy_math_reward": 0.6111516039818525, "step": 240 }, { "completion_length": 891.0178413391113, "epoch": 0.31403208730352633, "grad_norm": 0.005222520790994167, "kl": 0.1482696533203125, "learning_rate": 8.669680326350303e-06, "loss": 0.0059, "reward": 1.0763483867049217, "reward_std": 0.23950046487152576, "rewards/accuracy_reward": 0.4477040786296129, "rewards/semantic_entropy_math_reward": 0.6286443118005991, "step": 241 }, { "completion_length": 842.7079048156738, "epoch": 0.31533512501017996, "grad_norm": 0.005921407137066126, "kl": 0.1538848876953125, "learning_rate": 8.65417982139062e-06, "loss": 0.0062, "reward": 1.0216836724430323, "reward_std": 0.30130245443433523, "rewards/accuracy_reward": 0.4426020358223468, "rewards/semantic_entropy_math_reward": 0.5790816191583872, "step": 242 }, { "completion_length": 794.5369644165039, "epoch": 0.3166381627168336, "grad_norm": 0.012830501422286034, "kl": 0.12078857421875, "learning_rate": 8.638603565019588e-06, "loss": 0.0048, "reward": 1.170371726155281, "reward_std": 0.30472523951902986, "rewards/accuracy_reward": 0.5267857015132904, "rewards/semantic_entropy_math_reward": 0.6435859836637974, "step": 243 }, { "completion_length": 857.7359504699707, "epoch": 0.3179412004234873, "grad_norm": 0.005284575745463371, "kl": 0.15008544921875, "learning_rate": 8.622951880134122e-06, "loss": 0.006, "reward": 1.1069606319069862, "reward_std": 0.27644803654402494, "rewards/accuracy_reward": 0.47831631638109684, "rewards/semantic_entropy_math_reward": 0.6286443080753088, "step": 244 }, { "completion_length": 800.9068717956543, "epoch": 0.3192442381301409, "grad_norm": 0.004598068539053202, "kl": 0.10028076171875, "learning_rate": 8.60722509119478e-06, "loss": 0.004, "reward": 1.1639941558241844, "reward_std": 0.23298410302959383, "rewards/accuracy_reward": 0.5051020309329033, "rewards/semantic_entropy_math_reward": 0.6588921211659908, "step": 245 }, { "completion_length": 826.6096725463867, "epoch": 0.32054727583679454, "grad_norm": 0.005362240131944418, "kl": 0.1144256591796875, "learning_rate": 8.59142352421903e-06, "loss": 0.0046, "reward": 1.0258746184408665, "reward_std": 0.255880287848413, "rewards/accuracy_reward": 0.4005101975053549, "rewards/semantic_entropy_math_reward": 0.6253644172102213, "step": 246 }, { "completion_length": 787.1798286437988, "epoch": 0.3218503135434482, "grad_norm": 0.0043987673707306385, "kl": 0.06555938720703125, "learning_rate": 8.575547506774498e-06, "loss": 0.0026, "reward": 1.1424926929175854, "reward_std": 0.23990340175805613, "rewards/accuracy_reward": 0.47193876118399203, "rewards/semantic_entropy_math_reward": 0.6705539021641016, "step": 247 }, { "completion_length": 772.6045799255371, "epoch": 0.3231533512501018, "grad_norm": 0.0046552796848118305, "kl": 0.08348846435546875, "learning_rate": 8.559597367972168e-06, "loss": 0.0033, "reward": 1.1353862918913364, "reward_std": 0.24413436278700829, "rewards/accuracy_reward": 0.4936224352568388, "rewards/semantic_entropy_math_reward": 0.6417638286948204, "step": 248 }, { "completion_length": 800.883918762207, "epoch": 0.32445638895675544, "grad_norm": 0.006352246273308992, "kl": 0.07598114013671875, "learning_rate": 8.543573438459573e-06, "loss": 0.003, "reward": 1.0737973656505346, "reward_std": 0.22448929399251938, "rewards/accuracy_reward": 0.44005101290531456, "rewards/semantic_entropy_math_reward": 0.6337463315576315, "step": 249 }, { "completion_length": 729.0599365234375, "epoch": 0.32575942666340907, "grad_norm": 0.004705470521003008, "kl": 0.05321502685546875, "learning_rate": 8.527476050413922e-06, "loss": 0.0021, "reward": 1.189322154968977, "reward_std": 0.2236394747160375, "rewards/accuracy_reward": 0.49107142037246376, "rewards/semantic_entropy_math_reward": 0.6982507184147835, "step": 250 }, { "completion_length": 770.9017677307129, "epoch": 0.3270624643700627, "grad_norm": 0.004741177428513765, "kl": 0.07711029052734375, "learning_rate": 8.511305537535238e-06, "loss": 0.0031, "reward": 1.1561588533222675, "reward_std": 0.23179244669154286, "rewards/accuracy_reward": 0.4961734591051936, "rewards/semantic_entropy_math_reward": 0.6599853988736868, "step": 251 }, { "completion_length": 710.3328914642334, "epoch": 0.32836550207671633, "grad_norm": 0.004981675185263157, "kl": 0.0583953857421875, "learning_rate": 8.49506223503941e-06, "loss": 0.0023, "reward": 1.1632653139531612, "reward_std": 0.21966229984536767, "rewards/accuracy_reward": 0.47704080236144364, "rewards/semantic_entropy_math_reward": 0.6862244680523872, "step": 252 }, { "completion_length": 719.469367980957, "epoch": 0.32966853978336996, "grad_norm": 0.004019387997686863, "kl": 0.047283172607421875, "learning_rate": 8.47874647965128e-06, "loss": 0.0019, "reward": 1.285532083362341, "reward_std": 0.1838363332208246, "rewards/accuracy_reward": 0.5369897847995162, "rewards/semantic_entropy_math_reward": 0.748542258515954, "step": 253 }, { "completion_length": 756.3622303009033, "epoch": 0.3309715774900236, "grad_norm": 0.004174725618213415, "kl": 0.06235504150390625, "learning_rate": 8.462358609597629e-06, "loss": 0.0025, "reward": 1.178024772554636, "reward_std": 0.2410548785701394, "rewards/accuracy_reward": 0.5012755012139678, "rewards/semantic_entropy_math_reward": 0.6767492610961199, "step": 254 }, { "completion_length": 728.9094181060791, "epoch": 0.3322746151966773, "grad_norm": 0.005024634301662445, "kl": 0.0721282958984375, "learning_rate": 8.445898964600188e-06, "loss": 0.0029, "reward": 1.2527332156896591, "reward_std": 0.24679026007652283, "rewards/accuracy_reward": 0.5573979467153549, "rewards/semantic_entropy_math_reward": 0.6953352596610785, "step": 255 }, { "completion_length": 809.057373046875, "epoch": 0.3335776529033309, "grad_norm": 0.0046451762318611145, "kl": 0.07041549682617188, "learning_rate": 8.429367885868582e-06, "loss": 0.0028, "reward": 1.1133381873369217, "reward_std": 0.269217541674152, "rewards/accuracy_reward": 0.46683673560619354, "rewards/semantic_entropy_math_reward": 0.6465014219284058, "step": 256 }, { "completion_length": 793.135181427002, "epoch": 0.33488069060998454, "grad_norm": 0.00446203863248229, "kl": 0.08423995971679688, "learning_rate": 8.412765716093273e-06, "loss": 0.0034, "reward": 1.1091472320258617, "reward_std": 0.23562164371833205, "rewards/accuracy_reward": 0.47576529905200005, "rewards/semantic_entropy_math_reward": 0.6333819087594748, "step": 257 }, { "completion_length": 706.4578952789307, "epoch": 0.3361837283166382, "grad_norm": 0.0048806979320943356, "kl": 0.07009124755859375, "learning_rate": 8.396092799438429e-06, "loss": 0.0028, "reward": 1.22849852591753, "reward_std": 0.23840252216905355, "rewards/accuracy_reward": 0.5382652934640646, "rewards/semantic_entropy_math_reward": 0.6902332156896591, "step": 258 }, { "completion_length": 800.0025405883789, "epoch": 0.3374867660232918, "grad_norm": 0.004317475948482752, "kl": 0.0771942138671875, "learning_rate": 8.379349481534822e-06, "loss": 0.0031, "reward": 1.1579810567200184, "reward_std": 0.24627113435417414, "rewards/accuracy_reward": 0.49362243339419365, "rewards/semantic_entropy_math_reward": 0.6643585860729218, "step": 259 }, { "completion_length": 791.4017677307129, "epoch": 0.33878980372994544, "grad_norm": 0.00391523540019989, "kl": 0.098907470703125, "learning_rate": 8.362536109472637e-06, "loss": 0.004, "reward": 1.1661807522177696, "reward_std": 0.2090360161382705, "rewards/accuracy_reward": 0.47704080399125814, "rewards/semantic_entropy_math_reward": 0.6891399249434471, "step": 260 }, { "completion_length": 688.3214111328125, "epoch": 0.34009284143659907, "grad_norm": 0.0048033446073532104, "kl": 0.0616912841796875, "learning_rate": 8.345653031794292e-06, "loss": 0.0025, "reward": 1.229956265538931, "reward_std": 0.22136465786024928, "rewards/accuracy_reward": 0.5280612129718065, "rewards/semantic_entropy_math_reward": 0.701895035803318, "step": 261 }, { "completion_length": 770.1262588500977, "epoch": 0.3413958791432527, "grad_norm": 0.00493368087336421, "kl": 0.0781402587890625, "learning_rate": 8.328700598487203e-06, "loss": 0.0031, "reward": 1.206450443714857, "reward_std": 0.2774555936921388, "rewards/accuracy_reward": 0.506377543322742, "rewards/semantic_entropy_math_reward": 0.7000728696584702, "step": 262 }, { "completion_length": 729.6951370239258, "epoch": 0.34269891684990633, "grad_norm": 0.005018001422286034, "kl": 0.08380126953125, "learning_rate": 8.31167916097654e-06, "loss": 0.0034, "reward": 1.1672740429639816, "reward_std": 0.24183082999661565, "rewards/accuracy_reward": 0.48214284889400005, "rewards/semantic_entropy_math_reward": 0.6851311884820461, "step": 263 }, { "completion_length": 763.8048305511475, "epoch": 0.34400195455655996, "grad_norm": 0.004851920064538717, "kl": 0.093902587890625, "learning_rate": 8.294589072117925e-06, "loss": 0.0038, "reward": 1.0617711395025253, "reward_std": 0.2621227176859975, "rewards/accuracy_reward": 0.4272959107765928, "rewards/semantic_entropy_math_reward": 0.6344752013683319, "step": 264 }, { "completion_length": 769.2155380249023, "epoch": 0.3453049922632136, "grad_norm": 0.004559279419481754, "kl": 0.09972381591796875, "learning_rate": 8.277430686190137e-06, "loss": 0.004, "reward": 1.196792982518673, "reward_std": 0.25737720262259245, "rewards/accuracy_reward": 0.5025510117411613, "rewards/semantic_entropy_math_reward": 0.6942419540137053, "step": 265 }, { "completion_length": 762.5535545349121, "epoch": 0.3466080299698672, "grad_norm": 0.004530859179794788, "kl": 0.1021728515625, "learning_rate": 8.260204358887753e-06, "loss": 0.0041, "reward": 1.1506924107670784, "reward_std": 0.21813430613838136, "rewards/accuracy_reward": 0.4732142728753388, "rewards/semantic_entropy_math_reward": 0.6774781048297882, "step": 266 }, { "completion_length": 777.035701751709, "epoch": 0.3479110676765209, "grad_norm": 0.00493161054328084, "kl": 0.113983154296875, "learning_rate": 8.24291044731378e-06, "loss": 0.0046, "reward": 1.1229956038296223, "reward_std": 0.29484259663149714, "rewards/accuracy_reward": 0.48596938140690327, "rewards/semantic_entropy_math_reward": 0.6370262447744608, "step": 267 }, { "completion_length": 839.7244720458984, "epoch": 0.34921410538317454, "grad_norm": 0.004290192387998104, "kl": 0.14404296875, "learning_rate": 8.225549309972256e-06, "loss": 0.0058, "reward": 1.1472303085029125, "reward_std": 0.29779061023145914, "rewards/accuracy_reward": 0.5178571287542582, "rewards/semantic_entropy_math_reward": 0.6293731685727835, "step": 268 }, { "completion_length": 742.8839149475098, "epoch": 0.3505171430898282, "grad_norm": 0.005196585785597563, "kl": 0.103973388671875, "learning_rate": 8.208121306760806e-06, "loss": 0.0042, "reward": 1.089103501290083, "reward_std": 0.2884269617497921, "rewards/accuracy_reward": 0.4630101900547743, "rewards/semantic_entropy_math_reward": 0.6260932739824057, "step": 269 }, { "completion_length": 784.2244682312012, "epoch": 0.3518201807964818, "grad_norm": 0.004381937440484762, "kl": 0.12152099609375, "learning_rate": 8.190626798963198e-06, "loss": 0.0049, "reward": 1.1525145918130875, "reward_std": 0.24618042167276144, "rewards/accuracy_reward": 0.4834183556959033, "rewards/semantic_entropy_math_reward": 0.6690962053835392, "step": 270 }, { "completion_length": 740.7066135406494, "epoch": 0.35312321850313544, "grad_norm": 0.004495648667216301, "kl": 0.102752685546875, "learning_rate": 8.173066149241839e-06, "loss": 0.0041, "reward": 1.1479591503739357, "reward_std": 0.2497478062286973, "rewards/accuracy_reward": 0.4795918297022581, "rewards/semantic_entropy_math_reward": 0.6683673337101936, "step": 271 }, { "completion_length": 837.3099327087402, "epoch": 0.35442625620978907, "grad_norm": 0.004835852887481451, "kl": 0.157562255859375, "learning_rate": 8.155439721630265e-06, "loss": 0.0063, "reward": 1.0705174822360277, "reward_std": 0.2746472479775548, "rewards/accuracy_reward": 0.4579081516712904, "rewards/semantic_entropy_math_reward": 0.6126093063503504, "step": 272 }, { "completion_length": 760.9438667297363, "epoch": 0.3557292939164427, "grad_norm": 0.0052245366387069225, "kl": 0.120208740234375, "learning_rate": 8.137747881525593e-06, "loss": 0.0048, "reward": 1.0816326551139355, "reward_std": 0.2729792189784348, "rewards/accuracy_reward": 0.4209183622151613, "rewards/semantic_entropy_math_reward": 0.6607142686843872, "step": 273 }, { "completion_length": 778.9885101318359, "epoch": 0.35703233162309633, "grad_norm": 0.013116881251335144, "kl": 0.13796234130859375, "learning_rate": 8.119990995680942e-06, "loss": 0.0055, "reward": 1.1310130842030048, "reward_std": 0.28320479579269886, "rewards/accuracy_reward": 0.48341835383325815, "rewards/semantic_entropy_math_reward": 0.6475947350263596, "step": 274 }, { "completion_length": 768.2193603515625, "epoch": 0.35833536932974996, "grad_norm": 0.004733406938612461, "kl": 0.11692047119140625, "learning_rate": 8.102169432197842e-06, "loss": 0.0047, "reward": 1.233600553125143, "reward_std": 0.29512258525937796, "rewards/accuracy_reward": 0.556122437119484, "rewards/semantic_entropy_math_reward": 0.6774781197309494, "step": 275 }, { "completion_length": 766.4936027526855, "epoch": 0.3596384070364036, "grad_norm": 0.005097323562949896, "kl": 0.1425933837890625, "learning_rate": 8.084283560518584e-06, "loss": 0.0057, "reward": 1.1638119518756866, "reward_std": 0.30734471417963505, "rewards/accuracy_reward": 0.5165816284716129, "rewards/semantic_entropy_math_reward": 0.6472303103655577, "step": 276 }, { "completion_length": 674.0956478118896, "epoch": 0.3609414447430572, "grad_norm": 0.005301058758050203, "kl": 0.11962890625, "learning_rate": 8.066333751418582e-06, "loss": 0.0048, "reward": 1.218294445425272, "reward_std": 0.2879829863086343, "rewards/accuracy_reward": 0.5331632504239678, "rewards/semantic_entropy_math_reward": 0.6851311735808849, "step": 277 }, { "completion_length": 708.2053337097168, "epoch": 0.3622444824497109, "grad_norm": 0.00503549724817276, "kl": 0.112457275390625, "learning_rate": 8.048320376998675e-06, "loss": 0.0045, "reward": 1.1827623844146729, "reward_std": 0.28464424330741167, "rewards/accuracy_reward": 0.506377543322742, "rewards/semantic_entropy_math_reward": 0.6763848178088665, "step": 278 }, { "completion_length": 672.3060989379883, "epoch": 0.36354752015636455, "grad_norm": 0.0052053555846214294, "kl": 0.09027099609375, "learning_rate": 8.030243810677408e-06, "loss": 0.0036, "reward": 1.1740160509943962, "reward_std": 0.28792761266231537, "rewards/accuracy_reward": 0.48086733743548393, "rewards/semantic_entropy_math_reward": 0.6931486614048481, "step": 279 }, { "completion_length": 747.3443756103516, "epoch": 0.3648505578630182, "grad_norm": 0.004773185588419437, "kl": 0.131561279296875, "learning_rate": 8.012104427183313e-06, "loss": 0.0053, "reward": 1.1018585786223412, "reward_std": 0.2507549161091447, "rewards/accuracy_reward": 0.45025509409606457, "rewards/semantic_entropy_math_reward": 0.6516034975647926, "step": 280 }, { "completion_length": 807.0293235778809, "epoch": 0.3661535955696718, "grad_norm": 0.004495895002037287, "kl": 0.146209716796875, "learning_rate": 7.993902602547113e-06, "loss": 0.0058, "reward": 1.0625000037252903, "reward_std": 0.2648404398933053, "rewards/accuracy_reward": 0.4553571343421936, "rewards/semantic_entropy_math_reward": 0.6071428414434195, "step": 281 }, { "completion_length": 794.4910507202148, "epoch": 0.36745663327632544, "grad_norm": 0.005112153012305498, "kl": 0.180755615234375, "learning_rate": 7.97563871409395e-06, "loss": 0.0072, "reward": 1.1390306204557419, "reward_std": 0.2928127693012357, "rewards/accuracy_reward": 0.5063775451853871, "rewards/semantic_entropy_math_reward": 0.6326530519872904, "step": 282 }, { "completion_length": 745.3392715454102, "epoch": 0.36875967098297907, "grad_norm": 0.0051488918252289295, "kl": 0.1558990478515625, "learning_rate": 7.957313140435545e-06, "loss": 0.0062, "reward": 1.1406705603003502, "reward_std": 0.27150456327944994, "rewards/accuracy_reward": 0.48469386994838715, "rewards/semantic_entropy_math_reward": 0.6559766568243504, "step": 283 }, { "completion_length": 742.4923324584961, "epoch": 0.3700627086896327, "grad_norm": 0.004638804588466883, "kl": 0.14400482177734375, "learning_rate": 7.938926261462366e-06, "loss": 0.0058, "reward": 1.1789358370006084, "reward_std": 0.30263456981629133, "rewards/accuracy_reward": 0.5280612111091614, "rewards/semantic_entropy_math_reward": 0.6508746258914471, "step": 284 }, { "completion_length": 752.2104396820068, "epoch": 0.37136574639628633, "grad_norm": 0.008125707507133484, "kl": 0.1697845458984375, "learning_rate": 7.920478458335738e-06, "loss": 0.0068, "reward": 1.1785714104771614, "reward_std": 0.30591345205903053, "rewards/accuracy_reward": 0.5127550903707743, "rewards/semantic_entropy_math_reward": 0.6658163238316774, "step": 285 }, { "completion_length": 798.1696243286133, "epoch": 0.37266878410293997, "grad_norm": 0.004430694505572319, "kl": 0.1587371826171875, "learning_rate": 7.901970113479956e-06, "loss": 0.0064, "reward": 1.1335641220211983, "reward_std": 0.2652543308213353, "rewards/accuracy_reward": 0.491071417927742, "rewards/semantic_entropy_math_reward": 0.6424927078187466, "step": 286 }, { "completion_length": 802.0203857421875, "epoch": 0.3739718218095936, "grad_norm": 0.004951073322445154, "kl": 0.143280029296875, "learning_rate": 7.883401610574338e-06, "loss": 0.0057, "reward": 1.1372084431350231, "reward_std": 0.32817965373396873, "rewards/accuracy_reward": 0.49617345724254847, "rewards/semantic_entropy_math_reward": 0.6410349681973457, "step": 287 }, { "completion_length": 796.922176361084, "epoch": 0.37527485951624723, "grad_norm": 0.004892925266176462, "kl": 0.1321258544921875, "learning_rate": 7.86477333454529e-06, "loss": 0.0053, "reward": 1.0134839490056038, "reward_std": 0.28645317582413554, "rewards/accuracy_reward": 0.41326529684010893, "rewards/semantic_entropy_math_reward": 0.6002186480909586, "step": 288 }, { "completion_length": 792.8239669799805, "epoch": 0.3765778972229009, "grad_norm": 0.004779390525072813, "kl": 0.1277008056640625, "learning_rate": 7.84608567155832e-06, "loss": 0.0051, "reward": 1.017857126891613, "reward_std": 0.2807872658595443, "rewards/accuracy_reward": 0.40816325740888715, "rewards/semantic_entropy_math_reward": 0.6096938643604517, "step": 289 }, { "completion_length": 784.3010063171387, "epoch": 0.37788093492955455, "grad_norm": 0.0040627773851156235, "kl": 0.155120849609375, "learning_rate": 7.82733900901003e-06, "loss": 0.0062, "reward": 1.1226311959326267, "reward_std": 0.22459624242037535, "rewards/accuracy_reward": 0.4528061132878065, "rewards/semantic_entropy_math_reward": 0.6698250491172075, "step": 290 }, { "completion_length": 711.9170761108398, "epoch": 0.3791839726362082, "grad_norm": 0.0047273775562644005, "kl": 0.125, "learning_rate": 7.808533735520087e-06, "loss": 0.005, "reward": 1.118804655969143, "reward_std": 0.2664653924293816, "rewards/accuracy_reward": 0.4566326439380646, "rewards/semantic_entropy_math_reward": 0.6621719859540462, "step": 291 }, { "completion_length": 758.7563648223877, "epoch": 0.3804870103428618, "grad_norm": 0.0049894098192453384, "kl": 0.123138427734375, "learning_rate": 7.789670240923169e-06, "loss": 0.0049, "reward": 1.0338921342045069, "reward_std": 0.2879991205409169, "rewards/accuracy_reward": 0.4132653013803065, "rewards/semantic_entropy_math_reward": 0.6206268090754747, "step": 292 }, { "completion_length": 711.4821262359619, "epoch": 0.38179004804951544, "grad_norm": 0.005104078911244869, "kl": 0.129364013671875, "learning_rate": 7.770748916260875e-06, "loss": 0.0052, "reward": 1.1317419670522213, "reward_std": 0.32502430863678455, "rewards/accuracy_reward": 0.4706632550805807, "rewards/semantic_entropy_math_reward": 0.661078717559576, "step": 293 }, { "completion_length": 767.3252372741699, "epoch": 0.38309308575616907, "grad_norm": 0.005806850269436836, "kl": 0.1323089599609375, "learning_rate": 7.751770153773635e-06, "loss": 0.0053, "reward": 1.1477769687771797, "reward_std": 0.2937973318621516, "rewards/accuracy_reward": 0.49872448295354843, "rewards/semantic_entropy_math_reward": 0.6490524597465992, "step": 294 }, { "completion_length": 707.1811084747314, "epoch": 0.3843961234628227, "grad_norm": 0.0432785265147686, "kl": 0.13897705078125, "learning_rate": 7.732734346892561e-06, "loss": 0.0056, "reward": 1.090378973633051, "reward_std": 0.24874793831259012, "rewards/accuracy_reward": 0.42346938140690327, "rewards/semantic_entropy_math_reward": 0.6669096127152443, "step": 295 }, { "completion_length": 710.6862106323242, "epoch": 0.38569916116947633, "grad_norm": 0.004688500426709652, "kl": 0.1250457763671875, "learning_rate": 7.71364189023131e-06, "loss": 0.005, "reward": 1.003462091088295, "reward_std": 0.2558277491480112, "rewards/accuracy_reward": 0.36862243991345167, "rewards/semantic_entropy_math_reward": 0.6348396372050047, "step": 296 }, { "completion_length": 670.9400424957275, "epoch": 0.38700219887612997, "grad_norm": 0.0045872884802520275, "kl": 0.1184844970703125, "learning_rate": 7.69449317957788e-06, "loss": 0.0047, "reward": 1.2192055061459541, "reward_std": 0.26538855489343405, "rewards/accuracy_reward": 0.5242346795275807, "rewards/semantic_entropy_math_reward": 0.6949708200991154, "step": 297 }, { "completion_length": 813.4451370239258, "epoch": 0.3883052365827836, "grad_norm": 0.004734583664685488, "kl": 0.141021728515625, "learning_rate": 7.675288611886423e-06, "loss": 0.0056, "reward": 0.9610058218240738, "reward_std": 0.24671304458752275, "rewards/accuracy_reward": 0.385204071062617, "rewards/semantic_entropy_math_reward": 0.5758017227053642, "step": 298 }, { "completion_length": 778.8890075683594, "epoch": 0.38960827428943723, "grad_norm": 0.005164034198969603, "kl": 0.1640167236328125, "learning_rate": 7.656028585269017e-06, "loss": 0.0066, "reward": 0.9329445995390415, "reward_std": 0.28441152768209577, "rewards/accuracy_reward": 0.3545918297022581, "rewards/semantic_entropy_math_reward": 0.5783527437597513, "step": 299 }, { "completion_length": 822.7920837402344, "epoch": 0.39091131199609086, "grad_norm": 0.004954595118761063, "kl": 0.14105224609375, "learning_rate": 7.636713498987405e-06, "loss": 0.0056, "reward": 0.89048832654953, "reward_std": 0.2517870641313493, "rewards/accuracy_reward": 0.3099489724263549, "rewards/semantic_entropy_math_reward": 0.5805393364280462, "step": 300 }, { "completion_length": 744.9311065673828, "epoch": 0.39221434970274455, "grad_norm": 0.004590762313455343, "kl": 0.1204071044921875, "learning_rate": 7.617343753444714e-06, "loss": 0.0048, "reward": 1.0444606356322765, "reward_std": 0.26932541467249393, "rewards/accuracy_reward": 0.4132653009146452, "rewards/semantic_entropy_math_reward": 0.6311953160911798, "step": 301 }, { "completion_length": 771.7397804260254, "epoch": 0.3935173874093982, "grad_norm": 0.005378189962357283, "kl": 0.137451171875, "learning_rate": 7.597919750177168e-06, "loss": 0.0055, "reward": 1.0331632606685162, "reward_std": 0.2658298793248832, "rewards/accuracy_reward": 0.42091835592873394, "rewards/semantic_entropy_math_reward": 0.6122448649257421, "step": 302 }, { "completion_length": 800.6020240783691, "epoch": 0.3948204251160518, "grad_norm": 0.005507489666342735, "kl": 0.1767578125, "learning_rate": 7.5784418918457605e-06, "loss": 0.0071, "reward": 1.0021866001188755, "reward_std": 0.330348739400506, "rewards/accuracy_reward": 0.4387755002826452, "rewards/semantic_entropy_math_reward": 0.5634110569953918, "step": 303 }, { "completion_length": 754.959171295166, "epoch": 0.39612346282270544, "grad_norm": 0.004219656344503164, "kl": 0.14666748046875, "learning_rate": 7.5589105822278944e-06, "loss": 0.0059, "reward": 1.1178935803472996, "reward_std": 0.26517650997266173, "rewards/accuracy_reward": 0.46045916771981865, "rewards/semantic_entropy_math_reward": 0.65743438154459, "step": 304 }, { "completion_length": 781.5420837402344, "epoch": 0.3974265005293591, "grad_norm": 0.005092747509479523, "kl": 0.178436279296875, "learning_rate": 7.539326226209032e-06, "loss": 0.0071, "reward": 1.0327988117933273, "reward_std": 0.2570009892806411, "rewards/accuracy_reward": 0.4260204015299678, "rewards/semantic_entropy_math_reward": 0.6067784074693918, "step": 305 }, { "completion_length": 824.6976928710938, "epoch": 0.3987295382360127, "grad_norm": 0.005433813203126192, "kl": 0.203704833984375, "learning_rate": 7.519689229774282e-06, "loss": 0.0082, "reward": 0.970663258805871, "reward_std": 0.2942730891518295, "rewards/accuracy_reward": 0.4094387674704194, "rewards/semantic_entropy_math_reward": 0.5612244829535484, "step": 306 }, { "completion_length": 756.457893371582, "epoch": 0.40003257594266634, "grad_norm": 0.006119919009506702, "kl": 0.173980712890625, "learning_rate": 7.500000000000001e-06, "loss": 0.007, "reward": 1.013666184619069, "reward_std": 0.275299983099103, "rewards/accuracy_reward": 0.4068877464160323, "rewards/semantic_entropy_math_reward": 0.6067783869802952, "step": 307 }, { "completion_length": 734.204065322876, "epoch": 0.40133561364931997, "grad_norm": 0.0051275622099637985, "kl": 0.1623687744140625, "learning_rate": 7.4802589450453415e-06, "loss": 0.0065, "reward": 1.0368075892329216, "reward_std": 0.2358871614560485, "rewards/accuracy_reward": 0.3775510126724839, "rewards/semantic_entropy_math_reward": 0.6592565290629864, "step": 308 }, { "completion_length": 814.2295799255371, "epoch": 0.4026386513559736, "grad_norm": 0.004981225822120905, "kl": 0.17822265625, "learning_rate": 7.4604664741437975e-06, "loss": 0.0071, "reward": 0.9593658894300461, "reward_std": 0.27833351865410805, "rewards/accuracy_reward": 0.3482142807915807, "rewards/semantic_entropy_math_reward": 0.6111515872180462, "step": 309 }, { "completion_length": 762.7805976867676, "epoch": 0.40394168906262723, "grad_norm": 0.004184551537036896, "kl": 0.159210205078125, "learning_rate": 7.440622997594718e-06, "loss": 0.0064, "reward": 1.0628644116222858, "reward_std": 0.24738109577447176, "rewards/accuracy_reward": 0.4247448928654194, "rewards/semantic_entropy_math_reward": 0.6381195206195116, "step": 310 }, { "completion_length": 838.6887588500977, "epoch": 0.40524472676928086, "grad_norm": 0.00516744889318943, "kl": 0.21356201171875, "learning_rate": 7.420728926754803e-06, "loss": 0.0085, "reward": 0.8974125329405069, "reward_std": 0.2636064914986491, "rewards/accuracy_reward": 0.35076529858633876, "rewards/semantic_entropy_math_reward": 0.54664720967412, "step": 311 }, { "completion_length": 728.7346725463867, "epoch": 0.40654776447593455, "grad_norm": 0.005465599708259106, "kl": 0.163177490234375, "learning_rate": 7.400784674029579e-06, "loss": 0.0065, "reward": 1.0670553781092167, "reward_std": 0.2703804336488247, "rewards/accuracy_reward": 0.4234693795442581, "rewards/semantic_entropy_math_reward": 0.643585991114378, "step": 312 }, { "completion_length": 753.8749866485596, "epoch": 0.4078508021825882, "grad_norm": 0.005277744494378567, "kl": 0.17803955078125, "learning_rate": 7.380790652864842e-06, "loss": 0.0071, "reward": 0.9447886310517788, "reward_std": 0.2373075796640478, "rewards/accuracy_reward": 0.36352040176279843, "rewards/semantic_entropy_math_reward": 0.5812682118266821, "step": 313 }, { "completion_length": 742.6657943725586, "epoch": 0.4091538398892418, "grad_norm": 0.0049532316625118256, "kl": 0.182525634765625, "learning_rate": 7.360747277738094e-06, "loss": 0.0073, "reward": 0.9174562282860279, "reward_std": 0.27921572024933994, "rewards/accuracy_reward": 0.33545918157324195, "rewards/semantic_entropy_math_reward": 0.581997063010931, "step": 314 }, { "completion_length": 731.9489688873291, "epoch": 0.41045687759589544, "grad_norm": 0.0056826770305633545, "kl": 0.176788330078125, "learning_rate": 7.340654964149947e-06, "loss": 0.0071, "reward": 0.9748542215675116, "reward_std": 0.24793488811701536, "rewards/accuracy_reward": 0.3928571343421936, "rewards/semantic_entropy_math_reward": 0.5819970574229956, "step": 315 }, { "completion_length": 729.8443756103516, "epoch": 0.4117599153025491, "grad_norm": 0.005787604954093695, "kl": 0.165496826171875, "learning_rate": 7.320514128615511e-06, "loss": 0.0066, "reward": 0.9495262131094933, "reward_std": 0.23626167979091406, "rewards/accuracy_reward": 0.3252550937468186, "rewards/semantic_entropy_math_reward": 0.6242711190134287, "step": 316 }, { "completion_length": 767.3137645721436, "epoch": 0.4130629530092027, "grad_norm": 0.004826774820685387, "kl": 0.157501220703125, "learning_rate": 7.300325188655762e-06, "loss": 0.0063, "reward": 0.9701166041195393, "reward_std": 0.26514851208776236, "rewards/accuracy_reward": 0.35969387274235487, "rewards/semantic_entropy_math_reward": 0.610422732308507, "step": 317 }, { "completion_length": 728.8086585998535, "epoch": 0.41436599071585634, "grad_norm": 0.004576473962515593, "kl": 0.158447265625, "learning_rate": 7.280088562788879e-06, "loss": 0.0063, "reward": 0.9088921286165714, "reward_std": 0.2492459244094789, "rewards/accuracy_reward": 0.31632652413100004, "rewards/semantic_entropy_math_reward": 0.5925655923783779, "step": 318 }, { "completion_length": 697.3201370239258, "epoch": 0.41566902842250997, "grad_norm": 0.005138779990375042, "kl": 0.136993408203125, "learning_rate": 7.259804670521579e-06, "loss": 0.0055, "reward": 0.9819606579840183, "reward_std": 0.26612218702211976, "rewards/accuracy_reward": 0.3635203999001533, "rewards/semantic_entropy_math_reward": 0.6184402201324701, "step": 319 }, { "completion_length": 740.6479415893555, "epoch": 0.4169720661291636, "grad_norm": 0.005470127798616886, "kl": 0.172393798828125, "learning_rate": 7.2394739323404105e-06, "loss": 0.0069, "reward": 1.042456267401576, "reward_std": 0.244871623814106, "rewards/accuracy_reward": 0.422193868085742, "rewards/semantic_entropy_math_reward": 0.6202623955905437, "step": 320 }, { "completion_length": 662.2614631652832, "epoch": 0.41827510383581723, "grad_norm": 0.008797611109912395, "kl": 0.1490631103515625, "learning_rate": 7.219096769703045e-06, "loss": 0.006, "reward": 1.097303207963705, "reward_std": 0.24035303108394146, "rewards/accuracy_reward": 0.40816325694322586, "rewards/semantic_entropy_math_reward": 0.6891399379819632, "step": 321 }, { "completion_length": 723.7601890563965, "epoch": 0.41957814154247086, "grad_norm": 0.005943264812231064, "kl": 0.155609130859375, "learning_rate": 7.198673605029529e-06, "loss": 0.0062, "reward": 0.9877915419638157, "reward_std": 0.2582882810384035, "rewards/accuracy_reward": 0.3737244801595807, "rewards/semantic_entropy_math_reward": 0.6140670273452997, "step": 322 }, { "completion_length": 700.8367233276367, "epoch": 0.42088117924912455, "grad_norm": 0.01175774447619915, "kl": 0.181488037109375, "learning_rate": 7.178204861693546e-06, "loss": 0.0073, "reward": 1.050838191062212, "reward_std": 0.29096414986997843, "rewards/accuracy_reward": 0.42984692472964525, "rewards/semantic_entropy_math_reward": 0.6209912169724703, "step": 323 }, { "completion_length": 708.9999923706055, "epoch": 0.4221842169557782, "grad_norm": 0.005239454098045826, "kl": 0.1979827880859375, "learning_rate": 7.15769096401362e-06, "loss": 0.0079, "reward": 1.0209548026323318, "reward_std": 0.28547285636886954, "rewards/accuracy_reward": 0.41964284516870975, "rewards/semantic_entropy_math_reward": 0.6013119388371706, "step": 324 }, { "completion_length": 754.9476928710938, "epoch": 0.4234872546624318, "grad_norm": 0.0072799003683030605, "kl": 0.209259033203125, "learning_rate": 7.137132337244329e-06, "loss": 0.0084, "reward": 1.0193148739635944, "reward_std": 0.3032051585614681, "rewards/accuracy_reward": 0.4030612148344517, "rewards/semantic_entropy_math_reward": 0.6162536256015301, "step": 325 }, { "completion_length": 778.6734466552734, "epoch": 0.42479029236908544, "grad_norm": 0.0061063156463205814, "kl": 0.264007568359375, "learning_rate": 7.116529407567489e-06, "loss": 0.0106, "reward": 0.9836005717515945, "reward_std": 0.2910753316245973, "rewards/accuracy_reward": 0.39030611887574196, "rewards/semantic_entropy_math_reward": 0.5932944603264332, "step": 326 }, { "completion_length": 783.8354396820068, "epoch": 0.4260933300757391, "grad_norm": 0.01212493609637022, "kl": 0.287841796875, "learning_rate": 7.095882602083321e-06, "loss": 0.0115, "reward": 0.9894314967095852, "reward_std": 0.32045542215928435, "rewards/accuracy_reward": 0.43112243991345167, "rewards/semantic_entropy_math_reward": 0.5583090130239725, "step": 327 }, { "completion_length": 784.8711547851562, "epoch": 0.4273963677823927, "grad_norm": 0.007912764325737953, "kl": 0.31365966796875, "learning_rate": 7.075192348801591e-06, "loss": 0.0125, "reward": 0.9028790034353733, "reward_std": 0.3331639338284731, "rewards/accuracy_reward": 0.35076529905200005, "rewards/semantic_entropy_math_reward": 0.5521136932075024, "step": 328 }, { "completion_length": 815.9719200134277, "epoch": 0.42869940548904634, "grad_norm": 0.008721559308469296, "kl": 0.299224853515625, "learning_rate": 7.054459076632742e-06, "loss": 0.012, "reward": 0.9271137025207281, "reward_std": 0.3191894362680614, "rewards/accuracy_reward": 0.38775509828701615, "rewards/semantic_entropy_math_reward": 0.5393585935235023, "step": 329 }, { "completion_length": 846.8813591003418, "epoch": 0.43000244319569997, "grad_norm": 0.007729404140263796, "kl": 0.327301025390625, "learning_rate": 7.033683215379002e-06, "loss": 0.0131, "reward": 0.9610058143734932, "reward_std": 0.32442838652059436, "rewards/accuracy_reward": 0.4107142807915807, "rewards/semantic_entropy_math_reward": 0.5502915252000093, "step": 330 }, { "completion_length": 782.1938591003418, "epoch": 0.4313054809023536, "grad_norm": 0.011318473145365715, "kl": 0.306884765625, "learning_rate": 7.012865195725473e-06, "loss": 0.0123, "reward": 1.0495626758784056, "reward_std": 0.2743197735399008, "rewards/accuracy_reward": 0.44897957891225815, "rewards/semantic_entropy_math_reward": 0.6005830746144056, "step": 331 }, { "completion_length": 744.6683540344238, "epoch": 0.43260851860900723, "grad_norm": 0.02737046405673027, "kl": 0.33233642578125, "learning_rate": 6.9920054492312086e-06, "loss": 0.0133, "reward": 1.0054664798080921, "reward_std": 0.31095796590670943, "rewards/accuracy_reward": 0.4183673420920968, "rewards/semantic_entropy_math_reward": 0.5870991088449955, "step": 332 }, { "completion_length": 664.8252410888672, "epoch": 0.43391155631566086, "grad_norm": 0.01112570334225893, "kl": 0.2249755859375, "learning_rate": 6.971104408320253e-06, "loss": 0.009, "reward": 1.1521501429378986, "reward_std": 0.2594457087107003, "rewards/accuracy_reward": 0.48341835383325815, "rewards/semantic_entropy_math_reward": 0.6687317583709955, "step": 333 }, { "completion_length": 703.9578971862793, "epoch": 0.4352145940223145, "grad_norm": 0.013256323523819447, "kl": 0.1408538818359375, "learning_rate": 6.950162506272697e-06, "loss": 0.0056, "reward": 1.1045918203890324, "reward_std": 0.1868606056086719, "rewards/accuracy_reward": 0.4438775386661291, "rewards/semantic_entropy_math_reward": 0.6607142612338066, "step": 334 }, { "completion_length": 587.2767734527588, "epoch": 0.4365176317289682, "grad_norm": 0.02699277736246586, "kl": 0.092132568359375, "learning_rate": 6.9291801772156775e-06, "loss": 0.0037, "reward": 1.2363337986171246, "reward_std": 0.17960269388277084, "rewards/accuracy_reward": 0.49617345351725817, "rewards/semantic_entropy_math_reward": 0.7401603311300278, "step": 335 }, { "completion_length": 616.2525386810303, "epoch": 0.4378206694356218, "grad_norm": 0.015168532729148865, "kl": 0.10162353515625, "learning_rate": 6.9081578561143924e-06, "loss": 0.0041, "reward": 1.1598031744360924, "reward_std": 0.19833671813830733, "rewards/accuracy_reward": 0.45790814980864525, "rewards/semantic_entropy_math_reward": 0.7018950246274471, "step": 336 }, { "completion_length": 604.4476947784424, "epoch": 0.43912370714227544, "grad_norm": 0.02215934544801712, "kl": 0.084259033203125, "learning_rate": 6.887095978763072e-06, "loss": 0.0034, "reward": 1.1936953105032444, "reward_std": 0.1879709882196039, "rewards/accuracy_reward": 0.47321427054703236, "rewards/semantic_entropy_math_reward": 0.7204810455441475, "step": 337 }, { "completion_length": 653.2538185119629, "epoch": 0.4404267448489291, "grad_norm": 0.01574917882680893, "kl": 0.12286376953125, "learning_rate": 6.865994981775958e-06, "loss": 0.0049, "reward": 1.089103490114212, "reward_std": 0.21896941494196653, "rewards/accuracy_reward": 0.41964284982532263, "rewards/semantic_entropy_math_reward": 0.669460641220212, "step": 338 }, { "completion_length": 666.8392715454102, "epoch": 0.4417297825555827, "grad_norm": 0.023268572986125946, "kl": 0.15814208984375, "learning_rate": 6.844855302578236e-06, "loss": 0.0063, "reward": 1.060860052704811, "reward_std": 0.22332853684201837, "rewards/accuracy_reward": 0.41581631696317345, "rewards/semantic_entropy_math_reward": 0.6450437251478434, "step": 339 }, { "completion_length": 662.7755012512207, "epoch": 0.44303282026223634, "grad_norm": 0.027622457593679428, "kl": 0.16668701171875, "learning_rate": 6.823677379396984e-06, "loss": 0.0067, "reward": 1.0557580329477787, "reward_std": 0.22784481919370592, "rewards/accuracy_reward": 0.39285713620483875, "rewards/semantic_entropy_math_reward": 0.6629008613526821, "step": 340 }, { "completion_length": 713.0867137908936, "epoch": 0.44433585796888997, "grad_norm": 0.015035735443234444, "kl": 0.156341552734375, "learning_rate": 6.802461651252073e-06, "loss": 0.0063, "reward": 1.0513848178088665, "reward_std": 0.1969891453627497, "rewards/accuracy_reward": 0.40561223891563714, "rewards/semantic_entropy_math_reward": 0.6457725670188665, "step": 341 }, { "completion_length": 620.2448844909668, "epoch": 0.4456388956755436, "grad_norm": 0.009398394264280796, "kl": 0.12908935546875, "learning_rate": 6.781208557947085e-06, "loss": 0.0052, "reward": 1.191326517611742, "reward_std": 0.19692257582210004, "rewards/accuracy_reward": 0.4923469298519194, "rewards/semantic_entropy_math_reward": 0.6989795733243227, "step": 342 }, { "completion_length": 706.7512645721436, "epoch": 0.44694193338219723, "grad_norm": 0.007799746003001928, "kl": 0.13372802734375, "learning_rate": 6.759918540060173e-06, "loss": 0.0054, "reward": 1.108600553125143, "reward_std": 0.20946968789212406, "rewards/accuracy_reward": 0.4668367290869355, "rewards/semantic_entropy_math_reward": 0.6417638510465622, "step": 343 }, { "completion_length": 647.5969276428223, "epoch": 0.44824497108885086, "grad_norm": 0.009406307712197304, "kl": 0.1243896484375, "learning_rate": 6.738592038934946e-06, "loss": 0.005, "reward": 1.1477769501507282, "reward_std": 0.20305118709802628, "rewards/accuracy_reward": 0.44260202813893557, "rewards/semantic_entropy_math_reward": 0.7051749154925346, "step": 344 }, { "completion_length": 666.746166229248, "epoch": 0.4495480087955045, "grad_norm": 0.0069907004944980145, "kl": 0.128631591796875, "learning_rate": 6.717229496671307e-06, "loss": 0.0051, "reward": 1.0213192272931337, "reward_std": 0.1917688064277172, "rewards/accuracy_reward": 0.37372448353562504, "rewards/semantic_entropy_math_reward": 0.6475947443395853, "step": 345 }, { "completion_length": 694.2346744537354, "epoch": 0.4508510465021582, "grad_norm": 0.013775353319942951, "kl": 0.12335205078125, "learning_rate": 6.6958313561163046e-06, "loss": 0.0049, "reward": 1.0471938513219357, "reward_std": 0.19519828539341688, "rewards/accuracy_reward": 0.35841835755854845, "rewards/semantic_entropy_math_reward": 0.6887755058705807, "step": 346 }, { "completion_length": 673.9362106323242, "epoch": 0.4521540842088118, "grad_norm": 0.017340529710054398, "kl": 0.14471435546875, "learning_rate": 6.674398060854931e-06, "loss": 0.0058, "reward": 1.1082361228764057, "reward_std": 0.21337410528212786, "rewards/accuracy_reward": 0.4489795807749033, "rewards/semantic_entropy_math_reward": 0.6592565551400185, "step": 347 }, { "completion_length": 680.1543197631836, "epoch": 0.45345712191546544, "grad_norm": 0.00639822194352746, "kl": 0.1414794921875, "learning_rate": 6.652930055200948e-06, "loss": 0.0057, "reward": 1.1355685032904148, "reward_std": 0.20548085123300552, "rewards/accuracy_reward": 0.4540816219523549, "rewards/semantic_entropy_math_reward": 0.6814868617802858, "step": 348 }, { "completion_length": 694.6096820831299, "epoch": 0.4547601596221191, "grad_norm": 0.00481455959379673, "kl": 0.117584228515625, "learning_rate": 6.631427784187658e-06, "loss": 0.0047, "reward": 1.0703352391719818, "reward_std": 0.19507367815822363, "rewards/accuracy_reward": 0.41581631917506456, "rewards/semantic_entropy_math_reward": 0.6545189265161753, "step": 349 }, { "completion_length": 668.033145904541, "epoch": 0.4560631973287727, "grad_norm": 0.006901232060045004, "kl": 0.105194091796875, "learning_rate": 6.609891693558692e-06, "loss": 0.0042, "reward": 1.1191690862178802, "reward_std": 0.23781136283650994, "rewards/accuracy_reward": 0.4336734637618065, "rewards/semantic_entropy_math_reward": 0.6854956150054932, "step": 350 }, { "completion_length": 670.559928894043, "epoch": 0.45736623503542634, "grad_norm": 0.004967196378856897, "kl": 0.10520172119140625, "learning_rate": 6.588322229758764e-06, "loss": 0.0042, "reward": 1.0422740429639816, "reward_std": 0.18865250423550606, "rewards/accuracy_reward": 0.40816326159983873, "rewards/semantic_entropy_math_reward": 0.6341107655316591, "step": 351 }, { "completion_length": 697.409423828125, "epoch": 0.45866927274207997, "grad_norm": 0.009297888725996017, "kl": 0.0989532470703125, "learning_rate": 6.566719839924412e-06, "loss": 0.004, "reward": 0.9881559312343597, "reward_std": 0.19892455800436437, "rewards/accuracy_reward": 0.3354591755196452, "rewards/semantic_entropy_math_reward": 0.6526967883110046, "step": 352 }, { "completion_length": 681.5484504699707, "epoch": 0.4599723104487336, "grad_norm": 0.011291270144283772, "kl": 0.124664306640625, "learning_rate": 6.545084971874738e-06, "loss": 0.005, "reward": 1.0266034733504057, "reward_std": 0.22961137327365577, "rewards/accuracy_reward": 0.4056122414767742, "rewards/semantic_entropy_math_reward": 0.6209912523627281, "step": 353 }, { "completion_length": 712.748706817627, "epoch": 0.46127534815538723, "grad_norm": 0.006487871985882521, "kl": 0.138763427734375, "learning_rate": 6.523418074102117e-06, "loss": 0.0056, "reward": 1.0151238851249218, "reward_std": 0.23282003263011575, "rewards/accuracy_reward": 0.4043367262929678, "rewards/semantic_entropy_math_reward": 0.6107871551066637, "step": 354 }, { "completion_length": 758.9208965301514, "epoch": 0.46257838586204086, "grad_norm": 0.006745281163603067, "kl": 0.18731689453125, "learning_rate": 6.501719595762903e-06, "loss": 0.0075, "reward": 0.985422745347023, "reward_std": 0.2376852766610682, "rewards/accuracy_reward": 0.3698979504406452, "rewards/semantic_entropy_math_reward": 0.6155247502028942, "step": 355 }, { "completion_length": 775.320125579834, "epoch": 0.4638814235686945, "grad_norm": 0.00966943521052599, "kl": 0.23651123046875, "learning_rate": 6.479989986668118e-06, "loss": 0.0095, "reward": 0.8801020383834839, "reward_std": 0.2274622900877148, "rewards/accuracy_reward": 0.30867346189916134, "rewards/semantic_entropy_math_reward": 0.5714285597205162, "step": 356 }, { "completion_length": 935.6951370239258, "epoch": 0.4651844612753482, "grad_norm": 0.006680449470877647, "kl": 0.358795166015625, "learning_rate": 6.458229697274125e-06, "loss": 0.0143, "reward": 0.7964650057256222, "reward_std": 0.2290822328068316, "rewards/accuracy_reward": 0.3278061165474355, "rewards/semantic_entropy_math_reward": 0.4686588738113642, "step": 357 }, { "completion_length": 1016.021671295166, "epoch": 0.4664874989820018, "grad_norm": 0.01612725853919983, "kl": 0.433349609375, "learning_rate": 6.436439178673296e-06, "loss": 0.0173, "reward": 0.6778425611555576, "reward_std": 0.25770668243058026, "rewards/accuracy_reward": 0.2704081607516855, "rewards/semantic_entropy_math_reward": 0.4074343852698803, "step": 358 }, { "completion_length": 995.1058578491211, "epoch": 0.46779053668865545, "grad_norm": 0.010848271660506725, "kl": 0.4808349609375, "learning_rate": 6.41461888258465e-06, "loss": 0.0192, "reward": 0.7011661734431982, "reward_std": 0.24520536826457828, "rewards/accuracy_reward": 0.28061223891563714, "rewards/semantic_entropy_math_reward": 0.4205539273098111, "step": 359 }, { "completion_length": 909.246150970459, "epoch": 0.4690935743953091, "grad_norm": 0.02019350789487362, "kl": 0.38128662109375, "learning_rate": 6.392769261344502e-06, "loss": 0.0152, "reward": 0.8669825121760368, "reward_std": 0.3195679443888366, "rewards/accuracy_reward": 0.387755099684, "rewards/semantic_entropy_math_reward": 0.4792273994535208, "step": 360 }, { "completion_length": 877.6033020019531, "epoch": 0.4703966121019627, "grad_norm": 0.010997808538377285, "kl": 0.3323974609375, "learning_rate": 6.370890767897078e-06, "loss": 0.0133, "reward": 0.78334547765553, "reward_std": 0.2672122800722718, "rewards/accuracy_reward": 0.3150510189589113, "rewards/semantic_entropy_math_reward": 0.468294445425272, "step": 361 }, { "completion_length": 802.4808502197266, "epoch": 0.47169964980861634, "grad_norm": 0.017504433169960976, "kl": 0.30340576171875, "learning_rate": 6.348983855785122e-06, "loss": 0.0121, "reward": 0.8822886385023594, "reward_std": 0.293777440674603, "rewards/accuracy_reward": 0.37755101174116135, "rewards/semantic_entropy_math_reward": 0.5047375988215208, "step": 362 }, { "completion_length": 693.9145317077637, "epoch": 0.47300268751526997, "grad_norm": 0.007121639791876078, "kl": 0.223480224609375, "learning_rate": 6.3270489791405055e-06, "loss": 0.0089, "reward": 0.9996355399489403, "reward_std": 0.2865983408410102, "rewards/accuracy_reward": 0.41071427799761295, "rewards/semantic_entropy_math_reward": 0.5889212917536497, "step": 363 }, { "completion_length": 773.8010063171387, "epoch": 0.4743057252219236, "grad_norm": 0.0100267194211483, "kl": 0.29107666015625, "learning_rate": 6.305086592674802e-06, "loss": 0.0116, "reward": 0.7882653027772903, "reward_std": 0.243139767087996, "rewards/accuracy_reward": 0.2704081569099799, "rewards/semantic_entropy_math_reward": 0.5178571380674839, "step": 364 }, { "completion_length": 762.0101852416992, "epoch": 0.47560876292857723, "grad_norm": 0.013310786336660385, "kl": 0.26739501953125, "learning_rate": 6.283097151669869e-06, "loss": 0.0107, "reward": 0.9378644358366728, "reward_std": 0.27988837263546884, "rewards/accuracy_reward": 0.386479583568871, "rewards/semantic_entropy_math_reward": 0.5513848420232534, "step": 365 }, { "completion_length": 703.7946243286133, "epoch": 0.47691180063523086, "grad_norm": 0.010405374690890312, "kl": 0.231689453125, "learning_rate": 6.261081111968403e-06, "loss": 0.0093, "reward": 1.0863702706992626, "reward_std": 0.2347999606281519, "rewards/accuracy_reward": 0.4464285597205162, "rewards/semantic_entropy_math_reward": 0.6399416662752628, "step": 366 }, { "completion_length": 743.6300926208496, "epoch": 0.4782148383418845, "grad_norm": 0.008521927520632744, "kl": 0.2628173828125, "learning_rate": 6.2390389299645e-06, "loss": 0.0105, "reward": 0.9150874614715576, "reward_std": 0.28459055768325925, "rewards/accuracy_reward": 0.3469387721270323, "rewards/semantic_entropy_math_reward": 0.568148672580719, "step": 367 }, { "completion_length": 686.1989631652832, "epoch": 0.4795178760485381, "grad_norm": 0.021317537873983383, "kl": 0.219085693359375, "learning_rate": 6.216971062594179e-06, "loss": 0.0088, "reward": 1.0564868673682213, "reward_std": 0.28004669956862926, "rewards/accuracy_reward": 0.4387755049392581, "rewards/semantic_entropy_math_reward": 0.6177113559097052, "step": 368 }, { "completion_length": 692.7959098815918, "epoch": 0.4808209137551918, "grad_norm": 0.009187164716422558, "kl": 0.2064208984375, "learning_rate": 6.1948779673259256e-06, "loss": 0.0083, "reward": 1.0730684883892536, "reward_std": 0.18420136778149754, "rewards/accuracy_reward": 0.41964284516870975, "rewards/semantic_entropy_math_reward": 0.6534256488084793, "step": 369 }, { "completion_length": 631.3252372741699, "epoch": 0.48212395146184545, "grad_norm": 0.10186277329921722, "kl": 0.2082061767578125, "learning_rate": 6.172760102151195e-06, "loss": 0.0083, "reward": 1.1198979429900646, "reward_std": 0.23107408173382282, "rewards/accuracy_reward": 0.4566326439380646, "rewards/semantic_entropy_math_reward": 0.6632653027772903, "step": 370 }, { "completion_length": 610.815034866333, "epoch": 0.4834269891684991, "grad_norm": 0.007722198031842709, "kl": 0.1451416015625, "learning_rate": 6.1506179255749335e-06, "loss": 0.0058, "reward": 1.0604956075549126, "reward_std": 0.19448005210142583, "rewards/accuracy_reward": 0.39795917295850813, "rewards/semantic_entropy_math_reward": 0.662536446005106, "step": 371 }, { "completion_length": 793.1657981872559, "epoch": 0.4847300268751527, "grad_norm": 0.006315342616289854, "kl": 0.25958251953125, "learning_rate": 6.128451896606054e-06, "loss": 0.0104, "reward": 0.9323979634791613, "reward_std": 0.21719268709421158, "rewards/accuracy_reward": 0.36352040292695165, "rewards/semantic_entropy_math_reward": 0.5688775293529034, "step": 372 }, { "completion_length": 690.2474308013916, "epoch": 0.48603306458180634, "grad_norm": 0.013134906999766827, "kl": 0.2006683349609375, "learning_rate": 6.106262474747939e-06, "loss": 0.008, "reward": 0.9775874614715576, "reward_std": 0.2343920525163412, "rewards/accuracy_reward": 0.36607141653075814, "rewards/semantic_entropy_math_reward": 0.6115160137414932, "step": 373 }, { "completion_length": 677.0637607574463, "epoch": 0.48733610228845997, "grad_norm": 0.006674555595964193, "kl": 0.2024078369140625, "learning_rate": 6.084050119988905e-06, "loss": 0.0081, "reward": 1.058673445135355, "reward_std": 0.19721634639427066, "rewards/accuracy_reward": 0.4158163173124194, "rewards/semantic_entropy_math_reward": 0.6428571417927742, "step": 374 }, { "completion_length": 730.6772766113281, "epoch": 0.4886391399951136, "grad_norm": 0.007163270376622677, "kl": 0.22808837890625, "learning_rate": 6.061815292792666e-06, "loss": 0.0091, "reward": 1.0151239112019539, "reward_std": 0.21142640919424593, "rewards/accuracy_reward": 0.4221938671544194, "rewards/semantic_entropy_math_reward": 0.5929300095885992, "step": 375 }, { "completion_length": 793.7231979370117, "epoch": 0.48994217770176723, "grad_norm": 0.007969381287693977, "kl": 0.31280517578125, "learning_rate": 6.039558454088796e-06, "loss": 0.0125, "reward": 0.9903425239026546, "reward_std": 0.20892217522487044, "rewards/accuracy_reward": 0.40178570058196783, "rewards/semantic_entropy_math_reward": 0.5885568298399448, "step": 376 }, { "completion_length": 750.977029800415, "epoch": 0.49124521540842087, "grad_norm": 0.006262959912419319, "kl": 0.28521728515625, "learning_rate": 6.0172800652631706e-06, "loss": 0.0114, "reward": 0.9484329484403133, "reward_std": 0.20858398941345513, "rewards/accuracy_reward": 0.3737244801595807, "rewards/semantic_entropy_math_reward": 0.574708441272378, "step": 377 }, { "completion_length": 753.7576370239258, "epoch": 0.4925482531150745, "grad_norm": 0.014709383249282837, "kl": 0.274505615234375, "learning_rate": 5.994980588148391e-06, "loss": 0.011, "reward": 0.9293002746999264, "reward_std": 0.2171371802687645, "rewards/accuracy_reward": 0.3698979504406452, "rewards/semantic_entropy_math_reward": 0.55940230935812, "step": 378 }, { "completion_length": 648.5956497192383, "epoch": 0.49385129082172813, "grad_norm": 0.00798421073704958, "kl": 0.2484588623046875, "learning_rate": 5.972660485014231e-06, "loss": 0.0099, "reward": 1.0158527456223965, "reward_std": 0.22296473383903503, "rewards/accuracy_reward": 0.42219386901706457, "rewards/semantic_entropy_math_reward": 0.5936588682234287, "step": 379 }, { "completion_length": 795.1045722961426, "epoch": 0.4951543285283818, "grad_norm": 0.006285274866968393, "kl": 0.3153076171875, "learning_rate": 5.950320218558037e-06, "loss": 0.0126, "reward": 0.7689504306763411, "reward_std": 0.23351129214279354, "rewards/accuracy_reward": 0.30612244131043553, "rewards/semantic_entropy_math_reward": 0.4628279712051153, "step": 380 }, { "completion_length": 773.4209060668945, "epoch": 0.49645736623503545, "grad_norm": 0.011034843511879444, "kl": 0.354248046875, "learning_rate": 5.927960251895146e-06, "loss": 0.0142, "reward": 0.7804300040006638, "reward_std": 0.21417786623351276, "rewards/accuracy_reward": 0.31505101080983877, "rewards/semantic_entropy_math_reward": 0.46537899132817984, "step": 381 }, { "completion_length": 805.7193717956543, "epoch": 0.4977604039416891, "grad_norm": 0.015468765050172806, "kl": 0.36944580078125, "learning_rate": 5.905581048549279e-06, "loss": 0.0148, "reward": 0.7091836724430323, "reward_std": 0.2868799832649529, "rewards/accuracy_reward": 0.3061224455013871, "rewards/semantic_entropy_math_reward": 0.40306120552122593, "step": 382 }, { "completion_length": 817.8596744537354, "epoch": 0.4990634416483427, "grad_norm": 0.00837157666683197, "kl": 0.38714599609375, "learning_rate": 5.883183072442938e-06, "loss": 0.0155, "reward": 0.7122813314199448, "reward_std": 0.2295974986627698, "rewards/accuracy_reward": 0.2920918306335807, "rewards/semantic_entropy_math_reward": 0.42018949054181576, "step": 383 }, { "completion_length": 811.5127372741699, "epoch": 0.5003664793549963, "grad_norm": 0.01828381046652794, "kl": 0.3365478515625, "learning_rate": 5.860766787887781e-06, "loss": 0.0135, "reward": 0.7170189386233687, "reward_std": 0.28463717154227197, "rewards/accuracy_reward": 0.2920918306335807, "rewards/semantic_entropy_math_reward": 0.4249271061271429, "step": 384 }, { "completion_length": 827.4897842407227, "epoch": 0.50166951706165, "grad_norm": 0.021149517968297005, "kl": 0.35382080078125, "learning_rate": 5.838332659575005e-06, "loss": 0.0141, "reward": 0.6670918315649033, "reward_std": 0.2682491969317198, "rewards/accuracy_reward": 0.26403061067685485, "rewards/semantic_entropy_math_reward": 0.4030611990019679, "step": 385 }, { "completion_length": 815.8341636657715, "epoch": 0.5029725547683036, "grad_norm": 0.013593657873570919, "kl": 0.35308837890625, "learning_rate": 5.815881152565712e-06, "loss": 0.0141, "reward": 0.8161443080753088, "reward_std": 0.2971952431835234, "rewards/accuracy_reward": 0.3482142761349678, "rewards/semantic_entropy_math_reward": 0.46793001145124435, "step": 386 }, { "completion_length": 849.9464111328125, "epoch": 0.5042755924749572, "grad_norm": 0.010619286447763443, "kl": 0.36865234375, "learning_rate": 5.793412732281258e-06, "loss": 0.0147, "reward": 0.7507288716733456, "reward_std": 0.28731415374204516, "rewards/accuracy_reward": 0.30102040036581457, "rewards/semantic_entropy_math_reward": 0.4497084440663457, "step": 387 }, { "completion_length": 854.6760025024414, "epoch": 0.5055786301816109, "grad_norm": 0.011452032253146172, "kl": 0.40301513671875, "learning_rate": 5.7709278644936164e-06, "loss": 0.0161, "reward": 0.7875364422798157, "reward_std": 0.32328713731840253, "rewards/accuracy_reward": 0.3137755049392581, "rewards/semantic_entropy_math_reward": 0.47376092802733183, "step": 388 }, { "completion_length": 830.1364593505859, "epoch": 0.5068816678882645, "grad_norm": 0.0211899783462286, "kl": 0.3828125, "learning_rate": 5.7484270153157215e-06, "loss": 0.0153, "reward": 0.8214285671710968, "reward_std": 0.3227659852709621, "rewards/accuracy_reward": 0.3367346879094839, "rewards/semantic_entropy_math_reward": 0.4846938643604517, "step": 389 }, { "completion_length": 889.7512626647949, "epoch": 0.5081847055949181, "grad_norm": 0.01221123244613409, "kl": 0.44732666015625, "learning_rate": 5.725910651191798e-06, "loss": 0.0179, "reward": 0.8524052277207375, "reward_std": 0.3005325470585376, "rewards/accuracy_reward": 0.33673468697816133, "rewards/semantic_entropy_math_reward": 0.5156705342233181, "step": 390 }, { "completion_length": 857.6823806762695, "epoch": 0.5094877433015718, "grad_norm": 0.02253851108253002, "kl": 0.589111328125, "learning_rate": 5.703379238887703e-06, "loss": 0.0236, "reward": 0.8591472413390875, "reward_std": 0.280006037093699, "rewards/accuracy_reward": 0.325255096424371, "rewards/semantic_entropy_math_reward": 0.5338921099901199, "step": 391 }, { "completion_length": 831.548454284668, "epoch": 0.5107907810082254, "grad_norm": 0.05936218798160553, "kl": 0.6256103515625, "learning_rate": 5.680833245481234e-06, "loss": 0.025, "reward": 0.9243804756551981, "reward_std": 0.274040000513196, "rewards/accuracy_reward": 0.373724477365613, "rewards/semantic_entropy_math_reward": 0.5506559703499079, "step": 392 }, { "completion_length": 745.0714111328125, "epoch": 0.512093818714879, "grad_norm": 0.00972607359290123, "kl": 0.49920654296875, "learning_rate": 5.6582731383524625e-06, "loss": 0.02, "reward": 0.9850583076477051, "reward_std": 0.28676118375733495, "rewards/accuracy_reward": 0.3928571380674839, "rewards/semantic_entropy_math_reward": 0.5922011565417051, "step": 393 }, { "completion_length": 802.0012550354004, "epoch": 0.5133968564215327, "grad_norm": 0.027455151081085205, "kl": 0.416015625, "learning_rate": 5.63569938517404e-06, "loss": 0.0166, "reward": 0.9023323617875576, "reward_std": 0.2589769000187516, "rewards/accuracy_reward": 0.3520408091135323, "rewards/semantic_entropy_math_reward": 0.5502915307879448, "step": 394 }, { "completion_length": 744.3405380249023, "epoch": 0.5146998941281863, "grad_norm": 0.02062767744064331, "kl": 0.307373046875, "learning_rate": 5.613112453901493e-06, "loss": 0.0123, "reward": 0.9759475104510784, "reward_std": 0.30136603210121393, "rewards/accuracy_reward": 0.395408159121871, "rewards/semantic_entropy_math_reward": 0.5805393438786268, "step": 395 }, { "completion_length": 790.3073863983154, "epoch": 0.5160029318348399, "grad_norm": 0.030654992908239365, "kl": 0.329132080078125, "learning_rate": 5.590512812763541e-06, "loss": 0.0132, "reward": 0.9728498328477144, "reward_std": 0.2543048169463873, "rewards/accuracy_reward": 0.39923468697816133, "rewards/semantic_entropy_math_reward": 0.5736151579767466, "step": 396 }, { "completion_length": 736.2219200134277, "epoch": 0.5173059695414937, "grad_norm": 0.02136358991265297, "kl": 0.2410888671875, "learning_rate": 5.567900930252375e-06, "loss": 0.0096, "reward": 0.9285714197903872, "reward_std": 0.2620608005672693, "rewards/accuracy_reward": 0.3418367290869355, "rewards/semantic_entropy_math_reward": 0.5867346897721291, "step": 397 }, { "completion_length": 701.8328800201416, "epoch": 0.5186090072481473, "grad_norm": 0.017228906974196434, "kl": 0.224029541015625, "learning_rate": 5.5452772751139496e-06, "loss": 0.009, "reward": 1.0010932721197605, "reward_std": 0.27315305219963193, "rewards/accuracy_reward": 0.3979591750539839, "rewards/semantic_entropy_math_reward": 0.6031340770423412, "step": 398 }, { "completion_length": 656.6288204193115, "epoch": 0.5199120449548009, "grad_norm": 0.018306978046894073, "kl": 0.1906890869140625, "learning_rate": 5.522642316338268e-06, "loss": 0.0076, "reward": 1.0404518730938435, "reward_std": 0.2153743339003995, "rewards/accuracy_reward": 0.3979591727256775, "rewards/semantic_entropy_math_reward": 0.6424926910549402, "step": 399 }, { "completion_length": 676.8048324584961, "epoch": 0.5212150826614546, "grad_norm": 0.02183961123228073, "kl": 0.20745849609375, "learning_rate": 5.49999652314966e-06, "loss": 0.0083, "reward": 1.1049562525004148, "reward_std": 0.2719402725342661, "rewards/accuracy_reward": 0.4744897894561291, "rewards/semantic_entropy_math_reward": 0.6304664611816406, "step": 400 }, { "completion_length": 640.260196685791, "epoch": 0.5225181203681082, "grad_norm": 0.014690287411212921, "kl": 0.1846466064453125, "learning_rate": 5.477340364997051e-06, "loss": 0.0074, "reward": 1.0621355772018433, "reward_std": 0.29634056612849236, "rewards/accuracy_reward": 0.41454080678522587, "rewards/semantic_entropy_math_reward": 0.6475947424769402, "step": 401 }, { "completion_length": 629.8902969360352, "epoch": 0.5238211580747618, "grad_norm": 0.0134364552795887, "kl": 0.1791534423828125, "learning_rate": 5.454674311544236e-06, "loss": 0.0072, "reward": 1.101311944425106, "reward_std": 0.25405340315774083, "rewards/accuracy_reward": 0.4464285559952259, "rewards/semantic_entropy_math_reward": 0.6548833716660738, "step": 402 }, { "completion_length": 712.8188571929932, "epoch": 0.5251241957814154, "grad_norm": 0.011339854449033737, "kl": 0.2296142578125, "learning_rate": 5.431998832660136e-06, "loss": 0.0092, "reward": 0.9241982512176037, "reward_std": 0.28033996117301285, "rewards/accuracy_reward": 0.3698979541659355, "rewards/semantic_entropy_math_reward": 0.5543002691119909, "step": 403 }, { "completion_length": 714.3826351165771, "epoch": 0.5264272334880691, "grad_norm": 0.025039412081241608, "kl": 0.253021240234375, "learning_rate": 5.409314398409067e-06, "loss": 0.0101, "reward": 0.9713921211659908, "reward_std": 0.24509224016219378, "rewards/accuracy_reward": 0.3941326541826129, "rewards/semantic_entropy_math_reward": 0.5772594716399908, "step": 404 }, { "completion_length": 707.5254974365234, "epoch": 0.5277302711947227, "grad_norm": 0.010884922929108143, "kl": 0.250274658203125, "learning_rate": 5.386621479040985e-06, "loss": 0.01, "reward": 1.009839627891779, "reward_std": 0.30727002397179604, "rewards/accuracy_reward": 0.4209183566272259, "rewards/semantic_entropy_math_reward": 0.5889212712645531, "step": 405 }, { "completion_length": 742.8864707946777, "epoch": 0.5290333089013763, "grad_norm": 0.015817036852240562, "kl": 0.249359130859375, "learning_rate": 5.363920544981749e-06, "loss": 0.01, "reward": 0.96574342623353, "reward_std": 0.2910738668870181, "rewards/accuracy_reward": 0.3928571343421936, "rewards/semantic_entropy_math_reward": 0.5728862769901752, "step": 406 }, { "completion_length": 738.665807723999, "epoch": 0.53033634660803, "grad_norm": 0.008892969228327274, "kl": 0.250640869140625, "learning_rate": 5.341212066823356e-06, "loss": 0.01, "reward": 1.0258746333420277, "reward_std": 0.30396523512899876, "rewards/accuracy_reward": 0.42346938140690327, "rewards/semantic_entropy_math_reward": 0.6024052370339632, "step": 407 }, { "completion_length": 768.020393371582, "epoch": 0.5316393843146836, "grad_norm": 0.010070595890283585, "kl": 0.25, "learning_rate": 5.3184965153142e-06, "loss": 0.01, "reward": 0.9708454869687557, "reward_std": 0.3446971047669649, "rewards/accuracy_reward": 0.40306121576577425, "rewards/semantic_entropy_math_reward": 0.5677842423319817, "step": 408 }, { "completion_length": 743.5344276428223, "epoch": 0.5329424220213372, "grad_norm": 0.009442967362701893, "kl": 0.306640625, "learning_rate": 5.295774361349299e-06, "loss": 0.0123, "reward": 0.9260203987360001, "reward_std": 0.2897274415008724, "rewards/accuracy_reward": 0.3520408058539033, "rewards/semantic_entropy_math_reward": 0.5739795789122581, "step": 409 }, { "completion_length": 788.4094200134277, "epoch": 0.5342454597279909, "grad_norm": 0.01579575426876545, "kl": 0.287109375, "learning_rate": 5.27304607596055e-06, "loss": 0.0115, "reward": 1.0420918241143227, "reward_std": 0.3600159287452698, "rewards/accuracy_reward": 0.4553571380674839, "rewards/semantic_entropy_math_reward": 0.5867346879094839, "step": 410 }, { "completion_length": 696.8418197631836, "epoch": 0.5355484974346445, "grad_norm": 0.00899480190128088, "kl": 0.2646484375, "learning_rate": 5.250312130306946e-06, "loss": 0.0106, "reward": 1.0914722681045532, "reward_std": 0.31279311887919903, "rewards/accuracy_reward": 0.4693877389654517, "rewards/semantic_entropy_math_reward": 0.622084541246295, "step": 411 }, { "completion_length": 819.5879936218262, "epoch": 0.5368515351412981, "grad_norm": 0.011177831329405308, "kl": 0.373291015625, "learning_rate": 5.227572995664819e-06, "loss": 0.0149, "reward": 0.8733600527048111, "reward_std": 0.33729337900877, "rewards/accuracy_reward": 0.3686224427074194, "rewards/semantic_entropy_math_reward": 0.5047375932335854, "step": 412 }, { "completion_length": 789.5025367736816, "epoch": 0.5381545728479518, "grad_norm": 0.012133578769862652, "kl": 0.32177734375, "learning_rate": 5.204829143418072e-06, "loss": 0.0129, "reward": 0.9347667433321476, "reward_std": 0.2945194048807025, "rewards/accuracy_reward": 0.40306121576577425, "rewards/semantic_entropy_math_reward": 0.5317055340856314, "step": 413 }, { "completion_length": 759.4617118835449, "epoch": 0.5394576105546054, "grad_norm": 0.010312288999557495, "kl": 0.28411865234375, "learning_rate": 5.182081045048404e-06, "loss": 0.0114, "reward": 0.9098031893372536, "reward_std": 0.27254815865308046, "rewards/accuracy_reward": 0.35076529812067747, "rewards/semantic_entropy_math_reward": 0.5590378772467375, "step": 414 }, { "completion_length": 765.6211624145508, "epoch": 0.540760648261259, "grad_norm": 0.008033771999180317, "kl": 0.30169677734375, "learning_rate": 5.159329172125533e-06, "loss": 0.0121, "reward": 0.9154519066214561, "reward_std": 0.30716718919575214, "rewards/accuracy_reward": 0.36989795276895165, "rewards/semantic_entropy_math_reward": 0.5455539207905531, "step": 415 }, { "completion_length": 737.8456478118896, "epoch": 0.5420636859679127, "grad_norm": 0.008785370737314224, "kl": 0.26043701171875, "learning_rate": 5.136573996297431e-06, "loss": 0.0104, "reward": 0.9582725893706083, "reward_std": 0.25230830255895853, "rewards/accuracy_reward": 0.38137754483614117, "rewards/semantic_entropy_math_reward": 0.5768950134515762, "step": 416 }, { "completion_length": 730.3711471557617, "epoch": 0.5433667236745663, "grad_norm": 0.008652962744235992, "kl": 0.2668914794921875, "learning_rate": 5.113815989280528e-06, "loss": 0.0107, "reward": 0.9865160416811705, "reward_std": 0.29250738490372896, "rewards/accuracy_reward": 0.3852040730416775, "rewards/semantic_entropy_math_reward": 0.601311944425106, "step": 417 }, { "completion_length": 678.7002487182617, "epoch": 0.5446697613812199, "grad_norm": 0.011162396520376205, "kl": 0.2384033203125, "learning_rate": 5.091055622849958e-06, "loss": 0.0095, "reward": 1.081632662564516, "reward_std": 0.2634318256750703, "rewards/accuracy_reward": 0.46938775293529034, "rewards/semantic_entropy_math_reward": 0.6122448910027742, "step": 418 }, { "completion_length": 735.9999885559082, "epoch": 0.5459727990878736, "grad_norm": 0.008636132813990116, "kl": 0.2808837890625, "learning_rate": 5.068293368829755e-06, "loss": 0.0112, "reward": 0.9378644339740276, "reward_std": 0.2595589440315962, "rewards/accuracy_reward": 0.35331631964072585, "rewards/semantic_entropy_math_reward": 0.5845480803400278, "step": 419 }, { "completion_length": 750.395393371582, "epoch": 0.5472758367945273, "grad_norm": 0.010147650726139545, "kl": 0.28436279296875, "learning_rate": 5.045529699083092e-06, "loss": 0.0114, "reward": 0.9876093231141567, "reward_std": 0.27308015106245875, "rewards/accuracy_reward": 0.408163258805871, "rewards/semantic_entropy_math_reward": 0.57944605499506, "step": 420 }, { "completion_length": 777.7563533782959, "epoch": 0.5485788745011809, "grad_norm": 0.010801425203680992, "kl": 0.35076904296875, "learning_rate": 5.022765085502478e-06, "loss": 0.014, "reward": 0.8644314743578434, "reward_std": 0.27991672046482563, "rewards/accuracy_reward": 0.3214285634458065, "rewards/semantic_entropy_math_reward": 0.543002899736166, "step": 421 }, { "completion_length": 626.5127429962158, "epoch": 0.5498819122078346, "grad_norm": 0.014374594204127789, "kl": 0.24139404296875, "learning_rate": 5e-06, "loss": 0.0097, "reward": 1.0840014554560184, "reward_std": 0.3029053374193609, "rewards/accuracy_reward": 0.45535712968558073, "rewards/semantic_entropy_math_reward": 0.6286443080753088, "step": 422 }, { "completion_length": 868.706615447998, "epoch": 0.5511849499144882, "grad_norm": 0.029794251546263695, "kl": 0.50103759765625, "learning_rate": 4.977234914497522e-06, "loss": 0.02, "reward": 0.8817419707775116, "reward_std": 0.35746728256344795, "rewards/accuracy_reward": 0.38137754518538713, "rewards/semantic_entropy_math_reward": 0.5003644302487373, "step": 423 }, { "completion_length": 888.5561103820801, "epoch": 0.5524879876211418, "grad_norm": 0.011106643825769424, "kl": 0.4971923828125, "learning_rate": 4.9544703009169115e-06, "loss": 0.0199, "reward": 0.8223396427929401, "reward_std": 0.30733291525393724, "rewards/accuracy_reward": 0.32015305757522583, "rewards/semantic_entropy_math_reward": 0.5021865721791983, "step": 424 }, { "completion_length": 857.4923286437988, "epoch": 0.5537910253277954, "grad_norm": 0.013995586894452572, "kl": 0.5228271484375, "learning_rate": 4.931706631170246e-06, "loss": 0.0209, "reward": 0.8396501149982214, "reward_std": 0.3221485884860158, "rewards/accuracy_reward": 0.3724489724263549, "rewards/semantic_entropy_math_reward": 0.46720114909112453, "step": 425 }, { "completion_length": 899.155590057373, "epoch": 0.5550940630344491, "grad_norm": 0.010864965617656708, "kl": 0.51416015625, "learning_rate": 4.9089443771500435e-06, "loss": 0.0206, "reward": 0.8126822169870138, "reward_std": 0.3215222926810384, "rewards/accuracy_reward": 0.34948979411274195, "rewards/semantic_entropy_math_reward": 0.4631924070417881, "step": 426 }, { "completion_length": 829.6670722961426, "epoch": 0.5563971007411027, "grad_norm": 0.029301879927515984, "kl": 0.447021484375, "learning_rate": 4.886184010719472e-06, "loss": 0.0179, "reward": 0.8210641238838434, "reward_std": 0.3493126221001148, "rewards/accuracy_reward": 0.32142856589052826, "rewards/semantic_entropy_math_reward": 0.4996355567127466, "step": 427 }, { "completion_length": 883.7448806762695, "epoch": 0.5577001384477563, "grad_norm": 0.02421894297003746, "kl": 0.45355224609375, "learning_rate": 4.863426003702572e-06, "loss": 0.0182, "reward": 0.8217929806560278, "reward_std": 0.34441763535141945, "rewards/accuracy_reward": 0.3392857098951936, "rewards/semantic_entropy_math_reward": 0.48250727728009224, "step": 428 }, { "completion_length": 876.2168235778809, "epoch": 0.55900317615441, "grad_norm": 0.021665673702955246, "kl": 0.46954345703125, "learning_rate": 4.840670827874468e-06, "loss": 0.0188, "reward": 0.6926020458340645, "reward_std": 0.3228567922487855, "rewards/accuracy_reward": 0.2691326488275081, "rewards/semantic_entropy_math_reward": 0.4234693609178066, "step": 429 }, { "completion_length": 837.947681427002, "epoch": 0.5603062138610636, "grad_norm": 0.013625122606754303, "kl": 0.38629150390625, "learning_rate": 4.817918954951598e-06, "loss": 0.0155, "reward": 0.7793367225676775, "reward_std": 0.2986125349998474, "rewards/accuracy_reward": 0.3073979541659355, "rewards/semantic_entropy_math_reward": 0.4719387460500002, "step": 430 }, { "completion_length": 731.4017715454102, "epoch": 0.5616092515677172, "grad_norm": 0.009045498445630074, "kl": 0.30010986328125, "learning_rate": 4.795170856581929e-06, "loss": 0.012, "reward": 0.9037900809198618, "reward_std": 0.304488108959049, "rewards/accuracy_reward": 0.3724489687010646, "rewards/semantic_entropy_math_reward": 0.531341090798378, "step": 431 }, { "completion_length": 722.5165672302246, "epoch": 0.5629122892743709, "grad_norm": 0.029921459034085274, "kl": 0.283782958984375, "learning_rate": 4.772427004335183e-06, "loss": 0.0114, "reward": 0.9482507295906544, "reward_std": 0.2863430380821228, "rewards/accuracy_reward": 0.37755101174116135, "rewards/semantic_entropy_math_reward": 0.5706996899098158, "step": 432 }, { "completion_length": 828.3239593505859, "epoch": 0.5642153269810245, "grad_norm": 0.01935972273349762, "kl": 0.38665771484375, "learning_rate": 4.749687869693056e-06, "loss": 0.0155, "reward": 0.8678935877978802, "reward_std": 0.4000359959900379, "rewards/accuracy_reward": 0.38647958170622587, "rewards/semantic_entropy_math_reward": 0.481413995847106, "step": 433 }, { "completion_length": 872.4591674804688, "epoch": 0.5655183646876781, "grad_norm": 0.009145833551883698, "kl": 0.40277099609375, "learning_rate": 4.7269539240394505e-06, "loss": 0.0161, "reward": 0.8972303308546543, "reward_std": 0.33448066376149654, "rewards/accuracy_reward": 0.39795916993170977, "rewards/semantic_entropy_math_reward": 0.49927111715078354, "step": 434 }, { "completion_length": 976.6759986877441, "epoch": 0.5668214023943318, "grad_norm": 0.01429939828813076, "kl": 0.46466064453125, "learning_rate": 4.7042256386507e-06, "loss": 0.0186, "reward": 0.7840743325650692, "reward_std": 0.37142589315772057, "rewards/accuracy_reward": 0.36096938513219357, "rewards/semantic_entropy_math_reward": 0.42310494370758533, "step": 435 }, { "completion_length": 791.7334022521973, "epoch": 0.5681244401009854, "grad_norm": 0.014761151745915413, "kl": 0.3704833984375, "learning_rate": 4.681503484685803e-06, "loss": 0.0148, "reward": 0.9274781439453363, "reward_std": 0.361936641857028, "rewards/accuracy_reward": 0.40816325694322586, "rewards/semantic_entropy_math_reward": 0.5193148525431752, "step": 436 }, { "completion_length": 880.3073768615723, "epoch": 0.569427477807639, "grad_norm": 0.009491129778325558, "kl": 0.39361572265625, "learning_rate": 4.6587879331766465e-06, "loss": 0.0158, "reward": 0.7884475160390139, "reward_std": 0.3169987094588578, "rewards/accuracy_reward": 0.31760203617159277, "rewards/semantic_entropy_math_reward": 0.4708454655483365, "step": 437 }, { "completion_length": 736.4056015014648, "epoch": 0.5707305155142927, "grad_norm": 0.00868231151252985, "kl": 0.31585693359375, "learning_rate": 4.636079455018253e-06, "loss": 0.0126, "reward": 0.9129008427262306, "reward_std": 0.30288167390972376, "rewards/accuracy_reward": 0.35714284982532263, "rewards/semantic_entropy_math_reward": 0.5557580217719078, "step": 438 }, { "completion_length": 814.2359504699707, "epoch": 0.5720335532209463, "grad_norm": 0.010362574830651283, "kl": 0.33740234375, "learning_rate": 4.613378520959016e-06, "loss": 0.0135, "reward": 0.9096209928393364, "reward_std": 0.2665034346282482, "rewards/accuracy_reward": 0.3571428470313549, "rewards/semantic_entropy_math_reward": 0.5524780973792076, "step": 439 }, { "completion_length": 734.9668273925781, "epoch": 0.5733365909275999, "grad_norm": 0.008001998998224735, "kl": 0.274078369140625, "learning_rate": 4.5906856015909365e-06, "loss": 0.011, "reward": 0.9907069839537144, "reward_std": 0.29019191302359104, "rewards/accuracy_reward": 0.4145408123731613, "rewards/semantic_entropy_math_reward": 0.5761661659926176, "step": 440 }, { "completion_length": 748.7448806762695, "epoch": 0.5746396286342536, "grad_norm": 0.008587611839175224, "kl": 0.282470703125, "learning_rate": 4.568001167339866e-06, "loss": 0.0113, "reward": 1.0227769613265991, "reward_std": 0.3082156986929476, "rewards/accuracy_reward": 0.44515305291861296, "rewards/semantic_entropy_math_reward": 0.5776238963007927, "step": 441 }, { "completion_length": 721.5548267364502, "epoch": 0.5759426663409073, "grad_norm": 0.01077638566493988, "kl": 0.2650146484375, "learning_rate": 4.545325688455766e-06, "loss": 0.0106, "reward": 0.9072521775960922, "reward_std": 0.2683902089484036, "rewards/accuracy_reward": 0.3380102012306452, "rewards/semantic_entropy_math_reward": 0.569241963326931, "step": 442 }, { "completion_length": 725.9936084747314, "epoch": 0.5772457040475609, "grad_norm": 0.010454374365508556, "kl": 0.27093505859375, "learning_rate": 4.52265963500295e-06, "loss": 0.0108, "reward": 0.933673482388258, "reward_std": 0.2592790825292468, "rewards/accuracy_reward": 0.38520407234318554, "rewards/semantic_entropy_math_reward": 0.5484693665057421, "step": 443 }, { "completion_length": 717.6989631652832, "epoch": 0.5785487417542146, "grad_norm": 0.0060407305136322975, "kl": 0.251617431640625, "learning_rate": 4.500003476850341e-06, "loss": 0.0101, "reward": 0.945335254073143, "reward_std": 0.2371021592989564, "rewards/accuracy_reward": 0.3673469312489033, "rewards/semantic_entropy_math_reward": 0.5779883172363043, "step": 444 }, { "completion_length": 715.7678413391113, "epoch": 0.5798517794608682, "grad_norm": 0.006333528086543083, "kl": 0.25640869140625, "learning_rate": 4.477357683661734e-06, "loss": 0.0103, "reward": 0.9666544944047928, "reward_std": 0.2585125626064837, "rewards/accuracy_reward": 0.3839285634458065, "rewards/semantic_entropy_math_reward": 0.5827259384095669, "step": 445 }, { "completion_length": 673.3354454040527, "epoch": 0.5811548171675218, "grad_norm": 0.006936109159141779, "kl": 0.2347412109375, "learning_rate": 4.454722724886051e-06, "loss": 0.0094, "reward": 1.0451894700527191, "reward_std": 0.2828494496643543, "rewards/accuracy_reward": 0.4234693804755807, "rewards/semantic_entropy_math_reward": 0.6217201054096222, "step": 446 }, { "completion_length": 647.9629936218262, "epoch": 0.5824578548741755, "grad_norm": 0.011997463181614876, "kl": 0.2271728515625, "learning_rate": 4.432099069747625e-06, "loss": 0.0091, "reward": 1.046282798051834, "reward_std": 0.3061999995261431, "rewards/accuracy_reward": 0.4489795872941613, "rewards/semantic_entropy_math_reward": 0.5973031930625439, "step": 447 }, { "completion_length": 660.0841732025146, "epoch": 0.5837608925808291, "grad_norm": 0.008362640626728535, "kl": 0.227325439453125, "learning_rate": 4.40948718723646e-06, "loss": 0.0091, "reward": 1.100218653678894, "reward_std": 0.27638205233961344, "rewards/accuracy_reward": 0.4591836631298065, "rewards/semantic_entropy_math_reward": 0.6410349886864424, "step": 448 }, { "completion_length": 757.071418762207, "epoch": 0.5850639302874827, "grad_norm": 0.008867011405527592, "kl": 0.279510498046875, "learning_rate": 4.386887546098509e-06, "loss": 0.0112, "reward": 0.999088928103447, "reward_std": 0.28076145239174366, "rewards/accuracy_reward": 0.39158162963576615, "rewards/semantic_entropy_math_reward": 0.6075072661042213, "step": 449 }, { "completion_length": 723.6402854919434, "epoch": 0.5863669679941363, "grad_norm": 0.007995868101716042, "kl": 0.3203125, "learning_rate": 4.364300614825963e-06, "loss": 0.0128, "reward": 0.9963556788861752, "reward_std": 0.30449158139526844, "rewards/accuracy_reward": 0.43622448202222586, "rewards/semantic_entropy_math_reward": 0.5601311754435301, "step": 450 }, { "completion_length": 819.0356941223145, "epoch": 0.58767000570079, "grad_norm": 0.007579697296023369, "kl": 0.36376953125, "learning_rate": 4.341726861647537e-06, "loss": 0.0146, "reward": 0.8602405153214931, "reward_std": 0.2306123892776668, "rewards/accuracy_reward": 0.3405612171627581, "rewards/semantic_entropy_math_reward": 0.5196792725473642, "step": 451 }, { "completion_length": 753.8673248291016, "epoch": 0.5889730434074436, "grad_norm": 0.010023192502558231, "kl": 0.34869384765625, "learning_rate": 4.319166754518768e-06, "loss": 0.014, "reward": 0.9624635465443134, "reward_std": 0.3025930430740118, "rewards/accuracy_reward": 0.41836734174285084, "rewards/semantic_entropy_math_reward": 0.5440962053835392, "step": 452 }, { "completion_length": 840.7869701385498, "epoch": 0.5902760811140972, "grad_norm": 0.010087518021464348, "kl": 0.38787841796875, "learning_rate": 4.296620761112299e-06, "loss": 0.0155, "reward": 0.8919460643082857, "reward_std": 0.3130454979836941, "rewards/accuracy_reward": 0.3992346851155162, "rewards/semantic_entropy_math_reward": 0.4927113652229309, "step": 453 }, { "completion_length": 883.2423305511475, "epoch": 0.5915791188207509, "grad_norm": 0.014301802963018417, "kl": 0.49176025390625, "learning_rate": 4.274089348808202e-06, "loss": 0.0197, "reward": 0.781341103836894, "reward_std": 0.2542855523061007, "rewards/accuracy_reward": 0.33673469070345163, "rewards/semantic_entropy_math_reward": 0.44460639357566833, "step": 454 }, { "completion_length": 909.8303451538086, "epoch": 0.5928821565274045, "grad_norm": 0.00975614134222269, "kl": 0.4764404296875, "learning_rate": 4.251572984684281e-06, "loss": 0.019, "reward": 0.6938775461167097, "reward_std": 0.31052700616419315, "rewards/accuracy_reward": 0.275510196108371, "rewards/semantic_entropy_math_reward": 0.4183673318475485, "step": 455 }, { "completion_length": 833.7193756103516, "epoch": 0.5941851942340581, "grad_norm": 0.008694404736161232, "kl": 0.4830322265625, "learning_rate": 4.229072135506384e-06, "loss": 0.0193, "reward": 0.6107871737331152, "reward_std": 0.2868787511251867, "rewards/accuracy_reward": 0.24489795602858067, "rewards/semantic_entropy_math_reward": 0.36588919814676046, "step": 456 }, { "completion_length": 776.2601928710938, "epoch": 0.5954882319407118, "grad_norm": 0.02165721170604229, "kl": 0.401123046875, "learning_rate": 4.206587267718743e-06, "loss": 0.0161, "reward": 0.7199343983083963, "reward_std": 0.3161720233038068, "rewards/accuracy_reward": 0.29209183249622583, "rewards/semantic_entropy_math_reward": 0.4278425555676222, "step": 457 }, { "completion_length": 789.6683464050293, "epoch": 0.5967912696473654, "grad_norm": 0.01711021549999714, "kl": 0.455078125, "learning_rate": 4.18411884743429e-06, "loss": 0.0182, "reward": 0.6352040860801935, "reward_std": 0.3071645484305918, "rewards/accuracy_reward": 0.24744897428900003, "rewards/semantic_entropy_math_reward": 0.38775508664548397, "step": 458 }, { "completion_length": 854.2078895568848, "epoch": 0.598094307354019, "grad_norm": 0.011356550268828869, "kl": 0.46820068359375, "learning_rate": 4.161667340424996e-06, "loss": 0.0187, "reward": 0.6878644395619631, "reward_std": 0.30793760903179646, "rewards/accuracy_reward": 0.2895408123731613, "rewards/semantic_entropy_math_reward": 0.39832358434796333, "step": 459 }, { "completion_length": 819.8226776123047, "epoch": 0.5993973450606727, "grad_norm": 0.018367163836956024, "kl": 0.4984130859375, "learning_rate": 4.139233212112221e-06, "loss": 0.0199, "reward": 0.6222667591646314, "reward_std": 0.28496866673231125, "rewards/accuracy_reward": 0.2614795877598226, "rewards/semantic_entropy_math_reward": 0.360787158831954, "step": 460 }, { "completion_length": 783.8673286437988, "epoch": 0.6007003827673263, "grad_norm": 0.013151396997272968, "kl": 0.4486083984375, "learning_rate": 4.116816927557063e-06, "loss": 0.018, "reward": 0.706268223002553, "reward_std": 0.3382634464651346, "rewards/accuracy_reward": 0.30867346515879035, "rewards/semantic_entropy_math_reward": 0.39759473502635956, "step": 461 }, { "completion_length": 836.7129955291748, "epoch": 0.6020034204739799, "grad_norm": 0.012311790138483047, "kl": 0.43829345703125, "learning_rate": 4.094418951450721e-06, "loss": 0.0175, "reward": 0.7405247874557972, "reward_std": 0.31488014571368694, "rewards/accuracy_reward": 0.3086734665557742, "rewards/semantic_entropy_math_reward": 0.4318513087928295, "step": 462 }, { "completion_length": 852.6453914642334, "epoch": 0.6033064581806336, "grad_norm": 0.01740703545510769, "kl": 0.4949951171875, "learning_rate": 4.072039748104856e-06, "loss": 0.0198, "reward": 0.7804300338029861, "reward_std": 0.32809497602283955, "rewards/accuracy_reward": 0.3609693832695484, "rewards/semantic_entropy_math_reward": 0.41946063190698624, "step": 463 }, { "completion_length": 958.5688552856445, "epoch": 0.6046094958872872, "grad_norm": 0.014113862067461014, "kl": 0.609619140625, "learning_rate": 4.0496797814419655e-06, "loss": 0.0244, "reward": 0.7091836649924517, "reward_std": 0.35923834051936865, "rewards/accuracy_reward": 0.334183671977371, "rewards/semantic_entropy_math_reward": 0.3749999813735485, "step": 464 }, { "completion_length": 1035.6211433410645, "epoch": 0.6059125335939409, "grad_norm": 0.018726006150245667, "kl": 0.662353515625, "learning_rate": 4.0273395149857705e-06, "loss": 0.0265, "reward": 0.5956632662564516, "reward_std": 0.2877348540350795, "rewards/accuracy_reward": 0.23086734279058874, "rewards/semantic_entropy_math_reward": 0.364795901812613, "step": 465 }, { "completion_length": 1106.3354263305664, "epoch": 0.6072155713005946, "grad_norm": 0.032633353024721146, "kl": 0.801513671875, "learning_rate": 4.0050194118516095e-06, "loss": 0.0321, "reward": 0.5672376118600368, "reward_std": 0.2542789396829903, "rewards/accuracy_reward": 0.23596938361879438, "rewards/semantic_entropy_math_reward": 0.3312682118266821, "step": 466 }, { "completion_length": 1099.7372093200684, "epoch": 0.6085186090072482, "grad_norm": 0.025146355852484703, "kl": 0.75390625, "learning_rate": 3.982719934736832e-06, "loss": 0.0302, "reward": 0.6519679334014654, "reward_std": 0.3058506823144853, "rewards/accuracy_reward": 0.29846938140690327, "rewards/semantic_entropy_math_reward": 0.3534985352307558, "step": 467 }, { "completion_length": 1159.7844047546387, "epoch": 0.6098216467139018, "grad_norm": 0.031599290668964386, "kl": 0.76611328125, "learning_rate": 3.960441545911205e-06, "loss": 0.0307, "reward": 0.5366253647953272, "reward_std": 0.28160625556483865, "rewards/accuracy_reward": 0.23086734395474195, "rewards/semantic_entropy_math_reward": 0.3057580189779401, "step": 468 }, { "completion_length": 964.5242004394531, "epoch": 0.6111246844205555, "grad_norm": 0.02900148183107376, "kl": 0.6328125, "learning_rate": 3.9381847072073346e-06, "loss": 0.0253, "reward": 0.6304664704948664, "reward_std": 0.29618403734639287, "rewards/accuracy_reward": 0.27295917738229036, "rewards/semantic_entropy_math_reward": 0.357507286593318, "step": 469 }, { "completion_length": 972.2499771118164, "epoch": 0.6124277221272091, "grad_norm": 0.016321644186973572, "kl": 0.64923095703125, "learning_rate": 3.915949880011096e-06, "loss": 0.026, "reward": 0.5974854175001383, "reward_std": 0.2926543587818742, "rewards/accuracy_reward": 0.2512755058705807, "rewards/semantic_entropy_math_reward": 0.34620990604162216, "step": 470 }, { "completion_length": 829.1198883056641, "epoch": 0.6137307598338627, "grad_norm": 0.012250939384102821, "kl": 0.4737548828125, "learning_rate": 3.893737525252063e-06, "loss": 0.0189, "reward": 0.7172011416405439, "reward_std": 0.3767399461939931, "rewards/accuracy_reward": 0.32142856530845165, "rewards/semantic_entropy_math_reward": 0.3957725968211889, "step": 471 }, { "completion_length": 769.1759986877441, "epoch": 0.6150337975405163, "grad_norm": 0.020389674231410027, "kl": 0.38525390625, "learning_rate": 3.871548103393947e-06, "loss": 0.0154, "reward": 0.8263483978807926, "reward_std": 0.34209100902080536, "rewards/accuracy_reward": 0.3533163187094033, "rewards/semantic_entropy_math_reward": 0.47303204983472824, "step": 472 }, { "completion_length": 755.7933464050293, "epoch": 0.61633683524717, "grad_norm": 0.03599300980567932, "kl": 0.32598876953125, "learning_rate": 3.849382074425069e-06, "loss": 0.013, "reward": 0.8429300338029861, "reward_std": 0.3253574660047889, "rewards/accuracy_reward": 0.34438774548470974, "rewards/semantic_entropy_math_reward": 0.49854227527976036, "step": 473 }, { "completion_length": 824.5663070678711, "epoch": 0.6176398729538236, "grad_norm": 0.008328622207045555, "kl": 0.33404541015625, "learning_rate": 3.827239897848805e-06, "loss": 0.0134, "reward": 0.7966472283005714, "reward_std": 0.299846522975713, "rewards/accuracy_reward": 0.32908162963576615, "rewards/semantic_entropy_math_reward": 0.46756558306515217, "step": 474 }, { "completion_length": 608.5408020019531, "epoch": 0.6189429106604772, "grad_norm": 0.013201562687754631, "kl": 0.2508544921875, "learning_rate": 3.805122032674077e-06, "loss": 0.01, "reward": 1.0464650020003319, "reward_std": 0.3086475534364581, "rewards/accuracy_reward": 0.45280610769987106, "rewards/semantic_entropy_math_reward": 0.5936588756740093, "step": 475 }, { "completion_length": 671.0714149475098, "epoch": 0.6202459483671309, "grad_norm": 0.00984184443950653, "kl": 0.25787353515625, "learning_rate": 3.7830289374058214e-06, "loss": 0.0103, "reward": 1.0696064084768295, "reward_std": 0.29869078751653433, "rewards/accuracy_reward": 0.47193876653909683, "rewards/semantic_entropy_math_reward": 0.5976676195859909, "step": 476 }, { "completion_length": 685.1823883056641, "epoch": 0.6215489860737845, "grad_norm": 0.0073153055272996426, "kl": 0.258697509765625, "learning_rate": 3.7609610700355014e-06, "loss": 0.0103, "reward": 1.0949343964457512, "reward_std": 0.2991542494855821, "rewards/accuracy_reward": 0.47066325787454844, "rewards/semantic_entropy_math_reward": 0.6242711283266544, "step": 477 }, { "completion_length": 605.1300849914551, "epoch": 0.6228520237804381, "grad_norm": 0.005829769652336836, "kl": 0.183807373046875, "learning_rate": 3.7389188880315962e-06, "loss": 0.0073, "reward": 1.2554664388298988, "reward_std": 0.25220690947026014, "rewards/accuracy_reward": 0.5433673392981291, "rewards/semantic_entropy_math_reward": 0.7120990920811892, "step": 478 }, { "completion_length": 603.2385177612305, "epoch": 0.6241550614870918, "grad_norm": 0.006107708904892206, "kl": 0.164093017578125, "learning_rate": 3.7169028483301333e-06, "loss": 0.0066, "reward": 1.1084183640778065, "reward_std": 0.2863908736035228, "rewards/accuracy_reward": 0.4655612148344517, "rewards/semantic_entropy_math_reward": 0.6428571324795485, "step": 479 }, { "completion_length": 573.8545780181885, "epoch": 0.6254580991937454, "grad_norm": 0.00693461624905467, "kl": 0.154266357421875, "learning_rate": 3.6949134073251993e-06, "loss": 0.0062, "reward": 1.1266399249434471, "reward_std": 0.24997530621476471, "rewards/accuracy_reward": 0.4630101975053549, "rewards/semantic_entropy_math_reward": 0.6636297330260277, "step": 480 }, { "completion_length": 573.4630012512207, "epoch": 0.626761136900399, "grad_norm": 0.005996524356305599, "kl": 0.1615753173828125, "learning_rate": 3.6729510208594954e-06, "loss": 0.0065, "reward": 1.2044460587203503, "reward_std": 0.22758202208206058, "rewards/accuracy_reward": 0.49744897056370974, "rewards/semantic_entropy_math_reward": 0.7069970704615116, "step": 481 }, { "completion_length": 579.3188648223877, "epoch": 0.6280641746070527, "grad_norm": 0.005521233659237623, "kl": 0.1152191162109375, "learning_rate": 3.6510161442148783e-06, "loss": 0.0046, "reward": 1.1302842386066914, "reward_std": 0.22607160033658147, "rewards/accuracy_reward": 0.43749998905695975, "rewards/semantic_entropy_math_reward": 0.6927842535078526, "step": 482 }, { "completion_length": 597.9732055664062, "epoch": 0.6293672123137063, "grad_norm": 0.005309538450092077, "kl": 0.1484832763671875, "learning_rate": 3.6291092321029244e-06, "loss": 0.0059, "reward": 1.1984329260885715, "reward_std": 0.2491335468366742, "rewards/accuracy_reward": 0.5063775405287743, "rewards/semantic_entropy_math_reward": 0.6920553874224424, "step": 483 }, { "completion_length": 567.8226890563965, "epoch": 0.6306702500203599, "grad_norm": 0.008105354383587837, "kl": 0.152862548828125, "learning_rate": 3.6072307386554983e-06, "loss": 0.0061, "reward": 1.0779883116483688, "reward_std": 0.23816610360518098, "rewards/accuracy_reward": 0.4387755021452904, "rewards/semantic_entropy_math_reward": 0.6392128262668848, "step": 484 }, { "completion_length": 622.4004993438721, "epoch": 0.6319732877270136, "grad_norm": 0.006110004149377346, "kl": 0.1293182373046875, "learning_rate": 3.58538111741535e-06, "loss": 0.0052, "reward": 1.0215014480054379, "reward_std": 0.27007982693612576, "rewards/accuracy_reward": 0.4081632620655, "rewards/semantic_entropy_math_reward": 0.6133381742984056, "step": 485 }, { "completion_length": 636.7971839904785, "epoch": 0.6332763254336672, "grad_norm": 0.017499709501862526, "kl": 0.1669921875, "learning_rate": 3.5635608213267063e-06, "loss": 0.0067, "reward": 1.0892857164144516, "reward_std": 0.2554079801775515, "rewards/accuracy_reward": 0.43367346189916134, "rewards/semantic_entropy_math_reward": 0.6556122284382582, "step": 486 }, { "completion_length": 588.7321300506592, "epoch": 0.6345793631403208, "grad_norm": 0.004545371048152447, "kl": 0.1056671142578125, "learning_rate": 3.5417703027258752e-06, "loss": 0.0042, "reward": 1.1413993909955025, "reward_std": 0.20089067751541734, "rewards/accuracy_reward": 0.4311224389821291, "rewards/semantic_entropy_math_reward": 0.7102769501507282, "step": 487 }, { "completion_length": 634.6071357727051, "epoch": 0.6358824008469746, "grad_norm": 0.006895666476339102, "kl": 0.133056640625, "learning_rate": 3.5200100133318836e-06, "loss": 0.0053, "reward": 1.0949343666434288, "reward_std": 0.24994314461946487, "rewards/accuracy_reward": 0.4349489714950323, "rewards/semantic_entropy_math_reward": 0.6599853970110416, "step": 488 }, { "completion_length": 605.1734600067139, "epoch": 0.6371854385536282, "grad_norm": 0.005687220022082329, "kl": 0.1224517822265625, "learning_rate": 3.4982804042370977e-06, "loss": 0.0049, "reward": 1.1929664723575115, "reward_std": 0.28191871475428343, "rewards/accuracy_reward": 0.503826517611742, "rewards/semantic_entropy_math_reward": 0.6891399398446083, "step": 489 }, { "completion_length": 584.5318698883057, "epoch": 0.6384884762602818, "grad_norm": 0.10970409959554672, "kl": 0.2203369140625, "learning_rate": 3.476581925897885e-06, "loss": 0.0088, "reward": 1.0945699661970139, "reward_std": 0.26680323388427496, "rewards/accuracy_reward": 0.424744886928238, "rewards/semantic_entropy_math_reward": 0.6698250640183687, "step": 490 }, { "completion_length": 618.9374923706055, "epoch": 0.6397915139669355, "grad_norm": 0.00716626551002264, "kl": 0.1537322998046875, "learning_rate": 3.4549150281252635e-06, "loss": 0.0062, "reward": 1.0965743213891983, "reward_std": 0.27518802881240845, "rewards/accuracy_reward": 0.46173468325287104, "rewards/semantic_entropy_math_reward": 0.6348396353423595, "step": 491 }, { "completion_length": 650.2857036590576, "epoch": 0.6410945516735891, "grad_norm": 0.0068465289659798145, "kl": 0.1469268798828125, "learning_rate": 3.4332801600755895e-06, "loss": 0.0059, "reward": 1.0732507482171059, "reward_std": 0.2783243739977479, "rewards/accuracy_reward": 0.4617346888408065, "rewards/semantic_entropy_math_reward": 0.6115160044282675, "step": 492 }, { "completion_length": 597.9731979370117, "epoch": 0.6423975893802427, "grad_norm": 0.005418149288743734, "kl": 0.14202880859375, "learning_rate": 3.4116777702412374e-06, "loss": 0.0057, "reward": 1.2037172056734562, "reward_std": 0.21637548180297017, "rewards/accuracy_reward": 0.477040808647871, "rewards/semantic_entropy_math_reward": 0.7266763728111982, "step": 493 }, { "completion_length": 625.3341751098633, "epoch": 0.6437006270868963, "grad_norm": 0.005324145313352346, "kl": 0.135284423828125, "learning_rate": 3.39010830644131e-06, "loss": 0.0054, "reward": 1.0242346823215485, "reward_std": 0.23252867441624403, "rewards/accuracy_reward": 0.4119897857308388, "rewards/semantic_entropy_math_reward": 0.612244876101613, "step": 494 }, { "completion_length": 639.5752353668213, "epoch": 0.64500366479355, "grad_norm": 0.0071450877003371716, "kl": 0.165374755859375, "learning_rate": 3.3685722158123435e-06, "loss": 0.0066, "reward": 1.074890647083521, "reward_std": 0.31771638337522745, "rewards/accuracy_reward": 0.4553571348078549, "rewards/semantic_entropy_math_reward": 0.6195335201919079, "step": 495 }, { "completion_length": 669.7091751098633, "epoch": 0.6463067025002036, "grad_norm": 0.005726813338696957, "kl": 0.1680908203125, "learning_rate": 3.3470699447990527e-06, "loss": 0.0067, "reward": 1.0604956075549126, "reward_std": 0.32736846897751093, "rewards/accuracy_reward": 0.47704081051051617, "rewards/semantic_entropy_math_reward": 0.5834548026323318, "step": 496 }, { "completion_length": 738.274227142334, "epoch": 0.6476097402068572, "grad_norm": 0.00598530750721693, "kl": 0.1893310546875, "learning_rate": 3.3256019391450696e-06, "loss": 0.0076, "reward": 0.9260204061865807, "reward_std": 0.29533028043806553, "rewards/accuracy_reward": 0.35459183249622583, "rewards/semantic_entropy_math_reward": 0.5714285653084517, "step": 497 }, { "completion_length": 676.584171295166, "epoch": 0.6489127779135109, "grad_norm": 0.00559025164693594, "kl": 0.18865966796875, "learning_rate": 3.3041686438836984e-06, "loss": 0.0075, "reward": 1.05375362560153, "reward_std": 0.3234766758978367, "rewards/accuracy_reward": 0.442602033726871, "rewards/semantic_entropy_math_reward": 0.6111515872180462, "step": 498 }, { "completion_length": 693.3277912139893, "epoch": 0.6502158156201645, "grad_norm": 0.005245016422122717, "kl": 0.171966552734375, "learning_rate": 3.2827705033286937e-06, "loss": 0.0069, "reward": 1.0987609215080738, "reward_std": 0.3140406091697514, "rewards/accuracy_reward": 0.4897959064692259, "rewards/semantic_entropy_math_reward": 0.6089650001376867, "step": 499 }, { "completion_length": 693.9974422454834, "epoch": 0.6515188533268181, "grad_norm": 0.0048330407589674, "kl": 0.1983795166015625, "learning_rate": 3.261407961065056e-06, "loss": 0.0079, "reward": 1.114249262958765, "reward_std": 0.2661195080727339, "rewards/accuracy_reward": 0.47831631638109684, "rewards/semantic_entropy_math_reward": 0.6359329335391521, "step": 500 }, { "completion_length": 698.6734619140625, "epoch": 0.6528218910334718, "grad_norm": 0.005367304664105177, "kl": 0.205596923828125, "learning_rate": 3.2400814599398283e-06, "loss": 0.0082, "reward": 1.0630466155707836, "reward_std": 0.2560729794204235, "rewards/accuracy_reward": 0.4438775423914194, "rewards/semantic_entropy_math_reward": 0.6191690731793642, "step": 501 }, { "completion_length": 745.2882499694824, "epoch": 0.6541249287401254, "grad_norm": 0.0048609632067382336, "kl": 0.2061767578125, "learning_rate": 3.2187914420529176e-06, "loss": 0.0082, "reward": 1.0041909478604794, "reward_std": 0.28918249253183603, "rewards/accuracy_reward": 0.42219387367367744, "rewards/semantic_entropy_math_reward": 0.5819970760494471, "step": 502 }, { "completion_length": 677.503812789917, "epoch": 0.655427966446779, "grad_norm": 0.00859370268881321, "kl": 0.1769561767578125, "learning_rate": 3.197538348747927e-06, "loss": 0.0071, "reward": 1.1033163107931614, "reward_std": 0.31454986706376076, "rewards/accuracy_reward": 0.4706632625311613, "rewards/semantic_entropy_math_reward": 0.6326530501246452, "step": 503 }, { "completion_length": 819.3775291442871, "epoch": 0.6567310041534327, "grad_norm": 0.02574523724615574, "kl": 0.32281494140625, "learning_rate": 3.176322620603018e-06, "loss": 0.0129, "reward": 0.9812317714095116, "reward_std": 0.286828659940511, "rewards/accuracy_reward": 0.4272959092631936, "rewards/semantic_entropy_math_reward": 0.5539358370006084, "step": 504 }, { "completion_length": 756.3571319580078, "epoch": 0.6580340418600863, "grad_norm": 0.005080169532448053, "kl": 0.21142578125, "learning_rate": 3.1551446974217643e-06, "loss": 0.0085, "reward": 0.9475218839943409, "reward_std": 0.2367838229984045, "rewards/accuracy_reward": 0.3877550926990807, "rewards/semantic_entropy_math_reward": 0.5597667563706636, "step": 505 }, { "completion_length": 687.2359504699707, "epoch": 0.6593370795667399, "grad_norm": 0.006239553447812796, "kl": 0.194488525390625, "learning_rate": 3.1340050182240438e-06, "loss": 0.0078, "reward": 1.1581632681190968, "reward_std": 0.268083474598825, "rewards/accuracy_reward": 0.5127550885081291, "rewards/semantic_entropy_math_reward": 0.645408146083355, "step": 506 }, { "completion_length": 782.9272727966309, "epoch": 0.6606401172733936, "grad_norm": 0.00490299379453063, "kl": 0.208282470703125, "learning_rate": 3.1129040212369286e-06, "loss": 0.0083, "reward": 0.9925291500985622, "reward_std": 0.25848277437034994, "rewards/accuracy_reward": 0.4502550968900323, "rewards/semantic_entropy_math_reward": 0.5422740392386913, "step": 507 }, { "completion_length": 668.0777893066406, "epoch": 0.6619431549800472, "grad_norm": 0.011372133158147335, "kl": 0.1784820556640625, "learning_rate": 3.091842143885609e-06, "loss": 0.0071, "reward": 1.0453717149794102, "reward_std": 0.31044185534119606, "rewards/accuracy_reward": 0.44515305012464523, "rewards/semantic_entropy_math_reward": 0.6002186518162489, "step": 508 }, { "completion_length": 691.622428894043, "epoch": 0.6632461926867008, "grad_norm": 0.005250839050859213, "kl": 0.18084716796875, "learning_rate": 3.070819822784323e-06, "loss": 0.0072, "reward": 1.0563046485185623, "reward_std": 0.2532790410332382, "rewards/accuracy_reward": 0.4502550922334194, "rewards/semantic_entropy_math_reward": 0.606049558147788, "step": 509 }, { "completion_length": 626.5650291442871, "epoch": 0.6645492303933546, "grad_norm": 0.008940630592405796, "kl": 0.159637451171875, "learning_rate": 3.0498374937273052e-06, "loss": 0.0064, "reward": 1.1115160211920738, "reward_std": 0.32732198666781187, "rewards/accuracy_reward": 0.4795918231830001, "rewards/semantic_entropy_math_reward": 0.6319241896271706, "step": 510 }, { "completion_length": 603.891565322876, "epoch": 0.6658522681000082, "grad_norm": 0.006209446582943201, "kl": 0.1728363037109375, "learning_rate": 3.028895591679748e-06, "loss": 0.0069, "reward": 1.1291909664869308, "reward_std": 0.25621695816516876, "rewards/accuracy_reward": 0.49872447550296783, "rewards/semantic_entropy_math_reward": 0.6304664518684149, "step": 511 }, { "completion_length": 617.848201751709, "epoch": 0.6671553058066618, "grad_norm": 0.00871723610907793, "kl": 0.1777191162109375, "learning_rate": 3.007994550768793e-06, "loss": 0.0071, "reward": 1.1093294396996498, "reward_std": 0.25273076817393303, "rewards/accuracy_reward": 0.4872448891401291, "rewards/semantic_entropy_math_reward": 0.6220845486968756, "step": 512 }, { "completion_length": 649.9834098815918, "epoch": 0.6684583435133155, "grad_norm": 0.030551278963685036, "kl": 0.1942138671875, "learning_rate": 2.987134804274526e-06, "loss": 0.0078, "reward": 0.9730320516973734, "reward_std": 0.26811313163489103, "rewards/accuracy_reward": 0.41071427892893553, "rewards/semantic_entropy_math_reward": 0.5623177755624056, "step": 513 }, { "completion_length": 596.0356998443604, "epoch": 0.6697613812199691, "grad_norm": 0.17326271533966064, "kl": 0.1730804443359375, "learning_rate": 2.966316784621e-06, "loss": 0.0069, "reward": 1.1656341068446636, "reward_std": 0.2616207399405539, "rewards/accuracy_reward": 0.5063775386661291, "rewards/semantic_entropy_math_reward": 0.6592565532773733, "step": 514 }, { "completion_length": 663.3303489685059, "epoch": 0.6710644189266227, "grad_norm": 0.007033300120383501, "kl": 0.220672607421875, "learning_rate": 2.9455409233672594e-06, "loss": 0.0088, "reward": 1.1038629598915577, "reward_std": 0.24969761539250612, "rewards/accuracy_reward": 0.477040808647871, "rewards/semantic_entropy_math_reward": 0.6268221493810415, "step": 515 }, { "completion_length": 675.3226947784424, "epoch": 0.6723674566332764, "grad_norm": 0.013932161964476109, "kl": 0.248992919921875, "learning_rate": 2.92480765119841e-06, "loss": 0.01, "reward": 1.0475582852959633, "reward_std": 0.2256002565845847, "rewards/accuracy_reward": 0.4553571343421936, "rewards/semantic_entropy_math_reward": 0.5922011435031891, "step": 516 }, { "completion_length": 711.7742233276367, "epoch": 0.67367049433993, "grad_norm": 0.008508083410561085, "kl": 0.26519775390625, "learning_rate": 2.9041173979166813e-06, "loss": 0.0106, "reward": 0.9876093398779631, "reward_std": 0.2467394908890128, "rewards/accuracy_reward": 0.41836733650416136, "rewards/semantic_entropy_math_reward": 0.5692419707775116, "step": 517 }, { "completion_length": 818.5650329589844, "epoch": 0.6749735320465836, "grad_norm": 0.008160092867910862, "kl": 0.29156494140625, "learning_rate": 2.883470592432512e-06, "loss": 0.0117, "reward": 0.7926384713500738, "reward_std": 0.24330945871770382, "rewards/accuracy_reward": 0.3265306062530726, "rewards/semantic_entropy_math_reward": 0.4661078657954931, "step": 518 }, { "completion_length": 743.4757556915283, "epoch": 0.6762765697532372, "grad_norm": 0.006833754479885101, "kl": 0.250640869140625, "learning_rate": 2.862867662755672e-06, "loss": 0.01, "reward": 0.965378999710083, "reward_std": 0.2578243245370686, "rewards/accuracy_reward": 0.41326529905200005, "rewards/semantic_entropy_math_reward": 0.5521136801689863, "step": 519 }, { "completion_length": 772.0905456542969, "epoch": 0.6775796074598909, "grad_norm": 0.0067017278634011745, "kl": 0.2637939453125, "learning_rate": 2.84230903598638e-06, "loss": 0.0106, "reward": 0.9819606281816959, "reward_std": 0.24308219738304615, "rewards/accuracy_reward": 0.4298469293862581, "rewards/semantic_entropy_math_reward": 0.5521136801689863, "step": 520 }, { "completion_length": 718.0726947784424, "epoch": 0.6788826451665445, "grad_norm": 0.007203311193734407, "kl": 0.212432861328125, "learning_rate": 2.8217951383064546e-06, "loss": 0.0085, "reward": 1.0420918129384518, "reward_std": 0.21660362696275115, "rewards/accuracy_reward": 0.43494897056370974, "rewards/semantic_entropy_math_reward": 0.6071428507566452, "step": 521 }, { "completion_length": 646.729570388794, "epoch": 0.6801856828731981, "grad_norm": 0.005152995232492685, "kl": 0.179840087890625, "learning_rate": 2.8013263949704706e-06, "loss": 0.0072, "reward": 1.1069606430828571, "reward_std": 0.22414773213677108, "rewards/accuracy_reward": 0.49362243711948395, "rewards/semantic_entropy_math_reward": 0.6133381761610508, "step": 522 }, { "completion_length": 699.3915634155273, "epoch": 0.6814887205798518, "grad_norm": 0.008858767338097095, "kl": 0.199951171875, "learning_rate": 2.7809032302969587e-06, "loss": 0.008, "reward": 1.0433673150837421, "reward_std": 0.2991886865347624, "rewards/accuracy_reward": 0.4617346916347742, "rewards/semantic_entropy_math_reward": 0.5816326402127743, "step": 523 }, { "completion_length": 687.3201332092285, "epoch": 0.6827917582865054, "grad_norm": 0.006325020454823971, "kl": 0.169891357421875, "learning_rate": 2.760526067659591e-06, "loss": 0.0068, "reward": 0.9859693832695484, "reward_std": 0.25481247482821345, "rewards/accuracy_reward": 0.41454081051051617, "rewards/semantic_entropy_math_reward": 0.571428569033742, "step": 524 }, { "completion_length": 703.3188648223877, "epoch": 0.684094795993159, "grad_norm": 0.007596628274768591, "kl": 0.1878662109375, "learning_rate": 2.740195329478424e-06, "loss": 0.0075, "reward": 0.9748542010784149, "reward_std": 0.2659454667009413, "rewards/accuracy_reward": 0.41071427892893553, "rewards/semantic_entropy_math_reward": 0.5641399137675762, "step": 525 }, { "completion_length": 683.8647785186768, "epoch": 0.6853978336998127, "grad_norm": 0.006465795915573835, "kl": 0.170196533203125, "learning_rate": 2.7199114372111224e-06, "loss": 0.0068, "reward": 1.1082361303269863, "reward_std": 0.21344353118911386, "rewards/accuracy_reward": 0.4668367253616452, "rewards/semantic_entropy_math_reward": 0.6413994003087282, "step": 526 }, { "completion_length": 639.9502449035645, "epoch": 0.6867008714064663, "grad_norm": 0.007306201383471489, "kl": 0.144927978515625, "learning_rate": 2.6996748113442397e-06, "loss": 0.0058, "reward": 1.1348396390676498, "reward_std": 0.18287338595837355, "rewards/accuracy_reward": 0.4668367231497541, "rewards/semantic_entropy_math_reward": 0.6680029276758432, "step": 527 }, { "completion_length": 638.0344200134277, "epoch": 0.6880039091131199, "grad_norm": 0.005182657856494188, "kl": 0.15069580078125, "learning_rate": 2.6794858713844895e-06, "loss": 0.006, "reward": 1.1198979392647743, "reward_std": 0.2267061648890376, "rewards/accuracy_reward": 0.4897959101945162, "rewards/semantic_entropy_math_reward": 0.6301020104438066, "step": 528 }, { "completion_length": 662.2805976867676, "epoch": 0.6893069468197736, "grad_norm": 0.006166698411107063, "kl": 0.1282806396484375, "learning_rate": 2.659345035850055e-06, "loss": 0.0051, "reward": 1.2049926966428757, "reward_std": 0.2516007013618946, "rewards/accuracy_reward": 0.5318877445533872, "rewards/semantic_entropy_math_reward": 0.6731049586087465, "step": 529 }, { "completion_length": 607.9693756103516, "epoch": 0.6906099845264272, "grad_norm": 0.007003019098192453, "kl": 0.155426025390625, "learning_rate": 2.6392527222619078e-06, "loss": 0.0062, "reward": 1.1678207069635391, "reward_std": 0.2332695524673909, "rewards/accuracy_reward": 0.5140305999666452, "rewards/semantic_entropy_math_reward": 0.6537900734692812, "step": 530 }, { "completion_length": 619.5777893066406, "epoch": 0.6919130222330808, "grad_norm": 0.011368455365300179, "kl": 0.146240234375, "learning_rate": 2.619209347135159e-06, "loss": 0.0059, "reward": 1.1809402517974377, "reward_std": 0.24746417393907905, "rewards/accuracy_reward": 0.5216836724430323, "rewards/semantic_entropy_math_reward": 0.6592565458267927, "step": 531 }, { "completion_length": 746.7716693878174, "epoch": 0.6932160599397345, "grad_norm": 0.007051791995763779, "kl": 0.2078399658203125, "learning_rate": 2.599215325970423e-06, "loss": 0.0083, "reward": 1.0320699512958527, "reward_std": 0.22767273988574743, "rewards/accuracy_reward": 0.4260204005986452, "rewards/semantic_entropy_math_reward": 0.6060495357960463, "step": 532 }, { "completion_length": 822.9310989379883, "epoch": 0.6945190976463882, "grad_norm": 0.027321763336658478, "kl": 0.245391845703125, "learning_rate": 2.5792710732452e-06, "loss": 0.0098, "reward": 0.922011636197567, "reward_std": 0.17606556764803827, "rewards/accuracy_reward": 0.3673469345085323, "rewards/semantic_entropy_math_reward": 0.5546647142618895, "step": 533 }, { "completion_length": 764.6989669799805, "epoch": 0.6958221353530418, "grad_norm": 0.007104664575308561, "kl": 0.21771240234375, "learning_rate": 2.559377002405285e-06, "loss": 0.0087, "reward": 0.9686588943004608, "reward_std": 0.28758456464856863, "rewards/accuracy_reward": 0.41071427799761295, "rewards/semantic_entropy_math_reward": 0.5579445976763964, "step": 534 }, { "completion_length": 766.0969276428223, "epoch": 0.6971251730596955, "grad_norm": 0.007235643453896046, "kl": 0.211334228515625, "learning_rate": 2.539533525856205e-06, "loss": 0.0085, "reward": 0.9879737608134747, "reward_std": 0.251090289093554, "rewards/accuracy_reward": 0.4081632513552904, "rewards/semantic_entropy_math_reward": 0.5798104722052813, "step": 535 }, { "completion_length": 726.7270355224609, "epoch": 0.6984282107663491, "grad_norm": 0.007906638085842133, "kl": 0.236480712890625, "learning_rate": 2.5197410549546598e-06, "loss": 0.0095, "reward": 1.010204080492258, "reward_std": 0.3336473312228918, "rewards/accuracy_reward": 0.43367346096783876, "rewards/semantic_entropy_math_reward": 0.5765306130051613, "step": 536 }, { "completion_length": 775.0650215148926, "epoch": 0.6997312484730027, "grad_norm": 0.010167106054723263, "kl": 0.26153564453125, "learning_rate": 2.5000000000000015e-06, "loss": 0.0105, "reward": 0.9606413915753365, "reward_std": 0.26847128942608833, "rewards/accuracy_reward": 0.40051019191741943, "rewards/semantic_entropy_math_reward": 0.5601311773061752, "step": 537 }, { "completion_length": 901.8813591003418, "epoch": 0.7010342861796564, "grad_norm": 0.011023694649338722, "kl": 0.36883544921875, "learning_rate": 2.4803107702257196e-06, "loss": 0.0147, "reward": 0.7693148627877235, "reward_std": 0.28571518138051033, "rewards/accuracy_reward": 0.3265306046232581, "rewards/semantic_entropy_math_reward": 0.4427842441946268, "step": 538 }, { "completion_length": 951.9706344604492, "epoch": 0.70233732388631, "grad_norm": 0.032441698014736176, "kl": 0.45880126953125, "learning_rate": 2.4606737737909696e-06, "loss": 0.0184, "reward": 0.7465378977358341, "reward_std": 0.322434832341969, "rewards/accuracy_reward": 0.32270407350733876, "rewards/semantic_entropy_math_reward": 0.4238338079303503, "step": 539 }, { "completion_length": 948.7027931213379, "epoch": 0.7036403615929636, "grad_norm": 0.013429205864667892, "kl": 0.4698486328125, "learning_rate": 2.4410894177721055e-06, "loss": 0.0188, "reward": 0.6958819199353456, "reward_std": 0.28599664429202676, "rewards/accuracy_reward": 0.3124999972060323, "rewards/semantic_entropy_math_reward": 0.3833818892017007, "step": 540 }, { "completion_length": 811.6683578491211, "epoch": 0.7049433992996172, "grad_norm": 0.017050035297870636, "kl": 0.400634765625, "learning_rate": 2.4215581081542416e-06, "loss": 0.016, "reward": 0.8852040693163872, "reward_std": 0.28038213634863496, "rewards/accuracy_reward": 0.39540815725922585, "rewards/semantic_entropy_math_reward": 0.48979588970541954, "step": 541 }, { "completion_length": 823.8583908081055, "epoch": 0.7062464370062709, "grad_norm": 0.009615703485906124, "kl": 0.385986328125, "learning_rate": 2.4020802498228333e-06, "loss": 0.0154, "reward": 0.8185131177306175, "reward_std": 0.2385545412544161, "rewards/accuracy_reward": 0.3392857088474557, "rewards/semantic_entropy_math_reward": 0.47922738641500473, "step": 542 }, { "completion_length": 703.1198902130127, "epoch": 0.7075494747129245, "grad_norm": 0.020303990691900253, "kl": 0.315399169921875, "learning_rate": 2.382656246555289e-06, "loss": 0.0126, "reward": 0.975218653678894, "reward_std": 0.30419396329671144, "rewards/accuracy_reward": 0.4158163210377097, "rewards/semantic_entropy_math_reward": 0.5594023130834103, "step": 543 }, { "completion_length": 681.1581535339355, "epoch": 0.7088525124195781, "grad_norm": 0.012283287942409515, "kl": 0.3436279296875, "learning_rate": 2.363286501012597e-06, "loss": 0.0137, "reward": 0.965378999710083, "reward_std": 0.2923779543489218, "rewards/accuracy_reward": 0.4030612176284194, "rewards/semantic_entropy_math_reward": 0.5623177755624056, "step": 544 }, { "completion_length": 647.7971858978271, "epoch": 0.7101555501262318, "grad_norm": 0.012344375252723694, "kl": 0.370941162109375, "learning_rate": 2.3439714147309845e-06, "loss": 0.0148, "reward": 0.9356778599321842, "reward_std": 0.31589669082313776, "rewards/accuracy_reward": 0.3941326439380646, "rewards/semantic_entropy_math_reward": 0.5415451619774103, "step": 545 }, { "completion_length": 624.709171295166, "epoch": 0.7114585878328854, "grad_norm": 0.019473157823085785, "kl": 0.41986083984375, "learning_rate": 2.3247113881135784e-06, "loss": 0.0168, "reward": 1.046282798051834, "reward_std": 0.33738357143010944, "rewards/accuracy_reward": 0.4668367188423872, "rewards/semantic_entropy_math_reward": 0.5794460587203503, "step": 546 }, { "completion_length": 598.6556015014648, "epoch": 0.712761625539539, "grad_norm": 0.017985127866268158, "kl": 0.379150390625, "learning_rate": 2.3055068204221226e-06, "loss": 0.0152, "reward": 1.0657798573374748, "reward_std": 0.3035286469385028, "rewards/accuracy_reward": 0.4579081553965807, "rewards/semantic_entropy_math_reward": 0.6078717205673456, "step": 547 }, { "completion_length": 558.3124885559082, "epoch": 0.7140646632461927, "grad_norm": 0.015661166980862617, "kl": 0.31695556640625, "learning_rate": 2.286358109768693e-06, "loss": 0.0127, "reward": 1.093294434249401, "reward_std": 0.30558070726692677, "rewards/accuracy_reward": 0.4693877501413226, "rewards/semantic_entropy_math_reward": 0.6239067036658525, "step": 548 }, { "completion_length": 581.8877391815186, "epoch": 0.7153677009528463, "grad_norm": 0.013467922806739807, "kl": 0.34124755859375, "learning_rate": 2.267265653107439e-06, "loss": 0.0137, "reward": 1.063957691192627, "reward_std": 0.312665224308148, "rewards/accuracy_reward": 0.4451530547812581, "rewards/semantic_entropy_math_reward": 0.6188046652823687, "step": 549 }, { "completion_length": 544.6198902130127, "epoch": 0.7166707386594999, "grad_norm": 0.01607655920088291, "kl": 0.27484130859375, "learning_rate": 2.248229846226366e-06, "loss": 0.011, "reward": 1.1266399249434471, "reward_std": 0.2719636783003807, "rewards/accuracy_reward": 0.49872447457164526, "rewards/semantic_entropy_math_reward": 0.6279154419898987, "step": 550 }, { "completion_length": 630.6313591003418, "epoch": 0.7179737763661536, "grad_norm": 0.012101009488105774, "kl": 0.352294921875, "learning_rate": 2.229251083739127e-06, "loss": 0.0141, "reward": 0.9928935766220093, "reward_std": 0.31344955042004585, "rewards/accuracy_reward": 0.4477040749043226, "rewards/semantic_entropy_math_reward": 0.5451894886791706, "step": 551 }, { "completion_length": 621.7334079742432, "epoch": 0.7192768140728072, "grad_norm": 0.008948387578129768, "kl": 0.3243408203125, "learning_rate": 2.2103297590768334e-06, "loss": 0.013, "reward": 1.0677842386066914, "reward_std": 0.23075262550264597, "rewards/accuracy_reward": 0.4464285643771291, "rewards/semantic_entropy_math_reward": 0.621355663985014, "step": 552 }, { "completion_length": 594.735954284668, "epoch": 0.7205798517794608, "grad_norm": 0.013390601612627506, "kl": 0.31689453125, "learning_rate": 2.191466264479915e-06, "loss": 0.0127, "reward": 0.966654509305954, "reward_std": 0.30670192558318377, "rewards/accuracy_reward": 0.381377543322742, "rewards/semantic_entropy_math_reward": 0.5852769538760185, "step": 553 }, { "completion_length": 620.9808521270752, "epoch": 0.7218828894861145, "grad_norm": 0.011634264141321182, "kl": 0.27801513671875, "learning_rate": 2.172660990989971e-06, "loss": 0.0111, "reward": 1.008017461746931, "reward_std": 0.26077887578867376, "rewards/accuracy_reward": 0.40561224054545164, "rewards/semantic_entropy_math_reward": 0.6024052388966084, "step": 554 }, { "completion_length": 608.8175926208496, "epoch": 0.7231859271927681, "grad_norm": 0.01886271871626377, "kl": 0.24444580078125, "learning_rate": 2.153914328441681e-06, "loss": 0.0098, "reward": 0.9821428656578064, "reward_std": 0.30726899579167366, "rewards/accuracy_reward": 0.40816325787454844, "rewards/semantic_entropy_math_reward": 0.5739795789122581, "step": 555 }, { "completion_length": 592.5943641662598, "epoch": 0.7244889648994218, "grad_norm": 0.05285423621535301, "kl": 0.2325897216796875, "learning_rate": 2.1352266654547127e-06, "loss": 0.0093, "reward": 1.1044096238911152, "reward_std": 0.28743922617286444, "rewards/accuracy_reward": 0.4757652971893549, "rewards/semantic_entropy_math_reward": 0.6286443043500185, "step": 556 }, { "completion_length": 507.7946319580078, "epoch": 0.7257920026060755, "grad_norm": 0.025362776592373848, "kl": 0.2216796875, "learning_rate": 2.1165983894256647e-06, "loss": 0.0089, "reward": 0.9730320647358894, "reward_std": 0.25302859814837575, "rewards/accuracy_reward": 0.38265305664390326, "rewards/semantic_entropy_math_reward": 0.5903789959847927, "step": 557 }, { "completion_length": 467.910701751709, "epoch": 0.7270950403127291, "grad_norm": 0.027268249541521072, "kl": 0.214630126953125, "learning_rate": 2.098029886520046e-06, "loss": 0.0086, "reward": 1.1831268109381199, "reward_std": 0.29964877339079976, "rewards/accuracy_reward": 0.5191326439380646, "rewards/semantic_entropy_math_reward": 0.6639941520988941, "step": 558 }, { "completion_length": 523.5675926208496, "epoch": 0.7283980780193827, "grad_norm": 0.020804887637495995, "kl": 0.221588134765625, "learning_rate": 2.0795215416642604e-06, "loss": 0.0089, "reward": 1.1004009060561657, "reward_std": 0.2511235764250159, "rewards/accuracy_reward": 0.4579081553965807, "rewards/semantic_entropy_math_reward": 0.6424927040934563, "step": 559 }, { "completion_length": 510.02805519104004, "epoch": 0.7297011157260364, "grad_norm": 0.030523886904120445, "kl": 0.238739013671875, "learning_rate": 2.061073738537635e-06, "loss": 0.0095, "reward": 1.1253644116222858, "reward_std": 0.26053296914324164, "rewards/accuracy_reward": 0.46938774455338717, "rewards/semantic_entropy_math_reward": 0.6559766549617052, "step": 560 }, { "completion_length": 555.0777969360352, "epoch": 0.73100415343269, "grad_norm": 0.02239506132900715, "kl": 0.230255126953125, "learning_rate": 2.042686859564455e-06, "loss": 0.0092, "reward": 1.0663265064358711, "reward_std": 0.2536613019183278, "rewards/accuracy_reward": 0.4311224455013871, "rewards/semantic_entropy_math_reward": 0.6352040730416775, "step": 561 }, { "completion_length": 570.0063667297363, "epoch": 0.7323071911393436, "grad_norm": 0.02290128916501999, "kl": 0.217132568359375, "learning_rate": 2.0243612859060526e-06, "loss": 0.0087, "reward": 1.1559766568243504, "reward_std": 0.2729125013574958, "rewards/accuracy_reward": 0.492346927523613, "rewards/semantic_entropy_math_reward": 0.6636297330260277, "step": 562 }, { "completion_length": 535.5446300506592, "epoch": 0.7336102288459972, "grad_norm": 0.015912175178527832, "kl": 0.2119140625, "learning_rate": 2.0060973974528873e-06, "loss": 0.0085, "reward": 1.215196792036295, "reward_std": 0.27952827024273574, "rewards/accuracy_reward": 0.5369897857308388, "rewards/semantic_entropy_math_reward": 0.6782069783657789, "step": 563 }, { "completion_length": 556.818865776062, "epoch": 0.7349132665526509, "grad_norm": 0.03764716908335686, "kl": 0.251953125, "learning_rate": 1.9878955728166894e-06, "loss": 0.0101, "reward": 1.0512026399374008, "reward_std": 0.23065149353351444, "rewards/accuracy_reward": 0.44515305291861296, "rewards/semantic_entropy_math_reward": 0.606049558147788, "step": 564 }, { "completion_length": 418.04718017578125, "epoch": 0.7362163042593045, "grad_norm": 0.032305777072906494, "kl": 0.2591552734375, "learning_rate": 1.9697561893225925e-06, "loss": 0.0104, "reward": 1.1211734600365162, "reward_std": 0.22499306593090296, "rewards/accuracy_reward": 0.4502550885081291, "rewards/semantic_entropy_math_reward": 0.6709183678030968, "step": 565 }, { "completion_length": 339.99106788635254, "epoch": 0.7375193419659581, "grad_norm": 0.05866134911775589, "kl": 0.30908203125, "learning_rate": 1.9516796230013275e-06, "loss": 0.0124, "reward": 1.113702617585659, "reward_std": 0.25871386053040624, "rewards/accuracy_reward": 0.4540816228836775, "rewards/semantic_entropy_math_reward": 0.6596209779381752, "step": 566 }, { "completion_length": 294.84310722351074, "epoch": 0.7388223796726118, "grad_norm": 0.0660272017121315, "kl": 0.343505859375, "learning_rate": 1.933666248581418e-06, "loss": 0.0137, "reward": 1.0468294471502304, "reward_std": 0.22743334900587797, "rewards/accuracy_reward": 0.3839285677531734, "rewards/semantic_entropy_math_reward": 0.6629008650779724, "step": 567 }, { "completion_length": 346.30228996276855, "epoch": 0.7401254173792654, "grad_norm": 0.08696349710226059, "kl": 0.32806396484375, "learning_rate": 1.9157164394814177e-06, "loss": 0.0131, "reward": 1.0428207144141197, "reward_std": 0.25299584632739425, "rewards/accuracy_reward": 0.39413264265749604, "rewards/semantic_entropy_math_reward": 0.6486880220472813, "step": 568 }, { "completion_length": 366.12626552581787, "epoch": 0.741428455085919, "grad_norm": 0.3502238094806671, "kl": 0.33148193359375, "learning_rate": 1.8978305678021598e-06, "loss": 0.0133, "reward": 1.119169108569622, "reward_std": 0.24155067326501012, "rewards/accuracy_reward": 0.41581631638109684, "rewards/semantic_entropy_math_reward": 0.7033527493476868, "step": 569 }, { "completion_length": 487.18494415283203, "epoch": 0.7427314927925727, "grad_norm": 0.029559634625911713, "kl": 0.276824951171875, "learning_rate": 1.8800090043190577e-06, "loss": 0.0111, "reward": 1.0792638398706913, "reward_std": 0.21825964516028762, "rewards/accuracy_reward": 0.40943876653909683, "rewards/semantic_entropy_math_reward": 0.6698250509798527, "step": 570 }, { "completion_length": 563.3749866485596, "epoch": 0.7440345304992263, "grad_norm": 0.045156899839639664, "kl": 0.28387451171875, "learning_rate": 1.862252118474409e-06, "loss": 0.0113, "reward": 1.0819970518350601, "reward_std": 0.20300109079107642, "rewards/accuracy_reward": 0.43112243991345167, "rewards/semantic_entropy_math_reward": 0.6508746109902859, "step": 571 }, { "completion_length": 526.4999847412109, "epoch": 0.7453375682058799, "grad_norm": 0.04475090280175209, "kl": 0.27496337890625, "learning_rate": 1.8445602783697375e-06, "loss": 0.011, "reward": 1.0577623546123505, "reward_std": 0.22694664704613388, "rewards/accuracy_reward": 0.3966836668550968, "rewards/semantic_entropy_math_reward": 0.6610786858946085, "step": 572 }, { "completion_length": 605.0446281433105, "epoch": 0.7466406059125336, "grad_norm": 0.024774570018053055, "kl": 0.34423828125, "learning_rate": 1.8269338507581629e-06, "loss": 0.0138, "reward": 0.918913971632719, "reward_std": 0.24625895079225302, "rewards/accuracy_reward": 0.3507652999833226, "rewards/semantic_entropy_math_reward": 0.5681486632674932, "step": 573 }, { "completion_length": 574.3010063171387, "epoch": 0.7479436436191872, "grad_norm": 0.037353966385126114, "kl": 0.365997314453125, "learning_rate": 1.8093732010368032e-06, "loss": 0.0146, "reward": 1.014212816953659, "reward_std": 0.2004528963007033, "rewards/accuracy_reward": 0.39540815725922585, "rewards/semantic_entropy_math_reward": 0.618804644793272, "step": 574 }, { "completion_length": 487.14667320251465, "epoch": 0.7492466813258408, "grad_norm": 0.041989631950855255, "kl": 0.36181640625, "learning_rate": 1.7918786932391945e-06, "loss": 0.0145, "reward": 1.0371720008552074, "reward_std": 0.17290423763915896, "rewards/accuracy_reward": 0.36734693218022585, "rewards/semantic_entropy_math_reward": 0.6698250640183687, "step": 575 }, { "completion_length": 352.76529693603516, "epoch": 0.7505497190324945, "grad_norm": 0.044953107833862305, "kl": 0.44354248046875, "learning_rate": 1.7744506900277464e-06, "loss": 0.0177, "reward": 1.016034983098507, "reward_std": 0.2133513499284163, "rewards/accuracy_reward": 0.32653060322627425, "rewards/semantic_entropy_math_reward": 0.6895043477416039, "step": 576 }, { "completion_length": 208.85458612442017, "epoch": 0.7518527567391481, "grad_norm": 0.0612008310854435, "kl": 0.5162353515625, "learning_rate": 1.7570895526862202e-06, "loss": 0.0207, "reward": 1.0901967696845531, "reward_std": 0.1308414033264853, "rewards/accuracy_reward": 0.30994897428900003, "rewards/semantic_entropy_math_reward": 0.7802478112280369, "step": 577 }, { "completion_length": 274.7066259384155, "epoch": 0.7531557944458018, "grad_norm": 0.1157035231590271, "kl": 0.44384765625, "learning_rate": 1.739795641112248e-06, "loss": 0.0177, "reward": 1.0892857238650322, "reward_std": 0.18080554471816868, "rewards/accuracy_reward": 0.33418366592377424, "rewards/semantic_entropy_math_reward": 0.7551020123064518, "step": 578 }, { "completion_length": 300.0880012512207, "epoch": 0.7544588321524555, "grad_norm": 0.08109962195158005, "kl": 0.4688720703125, "learning_rate": 1.7225693138098647e-06, "loss": 0.0187, "reward": 1.1525145657360554, "reward_std": 0.14735824265517294, "rewards/accuracy_reward": 0.3788265250623226, "rewards/semantic_entropy_math_reward": 0.773688055574894, "step": 579 }, { "completion_length": 297.2550950050354, "epoch": 0.7557618698591091, "grad_norm": 0.03772146627306938, "kl": 0.36279296875, "learning_rate": 1.7054109278820757e-06, "loss": 0.0145, "reward": 1.0956632532179356, "reward_std": 0.1659138685790822, "rewards/accuracy_reward": 0.3278061179444194, "rewards/semantic_entropy_math_reward": 0.7678571194410324, "step": 580 }, { "completion_length": 376.165807723999, "epoch": 0.7570649075657627, "grad_norm": 0.029041562229394913, "kl": 0.35699462890625, "learning_rate": 1.688320839023463e-06, "loss": 0.0143, "reward": 1.0900145433843136, "reward_std": 0.20950425369665027, "rewards/accuracy_reward": 0.3775510136038065, "rewards/semantic_entropy_math_reward": 0.7124635428190231, "step": 581 }, { "completion_length": 362.228307723999, "epoch": 0.7583679452724164, "grad_norm": 0.03911459818482399, "kl": 0.281341552734375, "learning_rate": 1.6712994015127976e-06, "loss": 0.0112, "reward": 1.13629737123847, "reward_std": 0.16711020795628428, "rewards/accuracy_reward": 0.38520407024770975, "rewards/semantic_entropy_math_reward": 0.7510932832956314, "step": 582 }, { "completion_length": 367.87499046325684, "epoch": 0.75967098297907, "grad_norm": 0.03136748820543289, "kl": 0.3541259765625, "learning_rate": 1.6543469682057105e-06, "loss": 0.0142, "reward": 1.078899409621954, "reward_std": 0.18274073861539364, "rewards/accuracy_reward": 0.345663258805871, "rewards/semantic_entropy_math_reward": 0.7332361191511154, "step": 583 }, { "completion_length": 360.51657581329346, "epoch": 0.7609740206857236, "grad_norm": 0.09674406796693802, "kl": 0.298583984375, "learning_rate": 1.6374638905273643e-06, "loss": 0.0119, "reward": 1.1361151337623596, "reward_std": 0.1828280050540343, "rewards/accuracy_reward": 0.3966836677864194, "rewards/semantic_entropy_math_reward": 0.7394314575940371, "step": 584 }, { "completion_length": 362.76275157928467, "epoch": 0.7622770583923772, "grad_norm": 0.11406248062849045, "kl": 0.3017730712890625, "learning_rate": 1.6206505184651793e-06, "loss": 0.0121, "reward": 1.1208090223371983, "reward_std": 0.20348022785037756, "rewards/accuracy_reward": 0.38137754425406456, "rewards/semantic_entropy_math_reward": 0.739431481808424, "step": 585 }, { "completion_length": 435.2308568954468, "epoch": 0.7635800960990309, "grad_norm": 0.02736966870725155, "kl": 0.2808685302734375, "learning_rate": 1.603907200561572e-06, "loss": 0.0112, "reward": 1.1596209667623043, "reward_std": 0.1956160650588572, "rewards/accuracy_reward": 0.42857142351567745, "rewards/semantic_entropy_math_reward": 0.7310495525598526, "step": 586 }, { "completion_length": 454.2882571220398, "epoch": 0.7648831338056845, "grad_norm": 0.027656100690364838, "kl": 0.3634033203125, "learning_rate": 1.5872342839067305e-06, "loss": 0.0145, "reward": 1.0754373036324978, "reward_std": 0.18778375268448144, "rewards/accuracy_reward": 0.37755101453512907, "rewards/semantic_entropy_math_reward": 0.6978862825781107, "step": 587 }, { "completion_length": 379.5459108352661, "epoch": 0.7661861715123381, "grad_norm": 0.03290018439292908, "kl": 0.33905029296875, "learning_rate": 1.5706321141314179e-06, "loss": 0.0136, "reward": 1.0665087178349495, "reward_std": 0.1908516800031066, "rewards/accuracy_reward": 0.340561218559742, "rewards/semantic_entropy_math_reward": 0.7259474918246269, "step": 588 }, { "completion_length": 382.5089182853699, "epoch": 0.7674892092189918, "grad_norm": 0.07391561567783356, "kl": 0.4368896484375, "learning_rate": 1.5541010353998132e-06, "loss": 0.0175, "reward": 1.0444606468081474, "reward_std": 0.16344475653022528, "rewards/accuracy_reward": 0.32908162218518555, "rewards/semantic_entropy_math_reward": 0.7153790071606636, "step": 589 }, { "completion_length": 315.21300315856934, "epoch": 0.7687922469256454, "grad_norm": 0.05292685329914093, "kl": 0.429443359375, "learning_rate": 1.5376413904023723e-06, "loss": 0.0172, "reward": 1.1118804439902306, "reward_std": 0.1601015015039593, "rewards/accuracy_reward": 0.3392857064027339, "rewards/semantic_entropy_math_reward": 0.7725947499275208, "step": 590 }, { "completion_length": 367.31886863708496, "epoch": 0.770095284632299, "grad_norm": 0.05790814757347107, "kl": 0.3692626953125, "learning_rate": 1.5212535203487227e-06, "loss": 0.0148, "reward": 1.1561588793992996, "reward_std": 0.19502084515988827, "rewards/accuracy_reward": 0.4298469293862581, "rewards/semantic_entropy_math_reward": 0.7263119332492352, "step": 591 }, { "completion_length": 298.876268863678, "epoch": 0.7713983223389527, "grad_norm": 0.04841747507452965, "kl": 0.40899658203125, "learning_rate": 1.5049377649605906e-06, "loss": 0.0164, "reward": 1.1667274050414562, "reward_std": 0.1857784865424037, "rewards/accuracy_reward": 0.38392855878919363, "rewards/semantic_entropy_math_reward": 0.7827988304197788, "step": 592 }, { "completion_length": 351.63775062561035, "epoch": 0.7727013600456063, "grad_norm": 0.039819974452257156, "kl": 0.38494873046875, "learning_rate": 1.4886944624647647e-06, "loss": 0.0154, "reward": 1.140852764248848, "reward_std": 0.21200390602461994, "rewards/accuracy_reward": 0.4119897875934839, "rewards/semantic_entropy_math_reward": 0.7288629561662674, "step": 593 }, { "completion_length": 309.96810364723206, "epoch": 0.7740043977522599, "grad_norm": 0.042601507157087326, "kl": 0.39892578125, "learning_rate": 1.4725239495860772e-06, "loss": 0.016, "reward": 1.1816690564155579, "reward_std": 0.14667326491326094, "rewards/accuracy_reward": 0.36862243711948395, "rewards/semantic_entropy_math_reward": 0.8130466304719448, "step": 594 }, { "completion_length": 376.6122360229492, "epoch": 0.7753074354589136, "grad_norm": 0.060134854167699814, "kl": 0.4835205078125, "learning_rate": 1.4564265615404289e-06, "loss": 0.0193, "reward": 1.0455539263784885, "reward_std": 0.17887542070820928, "rewards/accuracy_reward": 0.28826530056539923, "rewards/semantic_entropy_math_reward": 0.7572886161506176, "step": 595 }, { "completion_length": 396.1811103820801, "epoch": 0.7766104731655672, "grad_norm": 0.06609591841697693, "kl": 0.40057373046875, "learning_rate": 1.4404026320278318e-06, "loss": 0.016, "reward": 1.1045918092131615, "reward_std": 0.21589082339778543, "rewards/accuracy_reward": 0.400510192848742, "rewards/semantic_entropy_math_reward": 0.7040816284716129, "step": 596 }, { "completion_length": 372.4374928474426, "epoch": 0.7779135108722208, "grad_norm": 0.050282400101423264, "kl": 0.463623046875, "learning_rate": 1.4244524932255026e-06, "loss": 0.0185, "reward": 0.93476677313447, "reward_std": 0.18496204540133476, "rewards/accuracy_reward": 0.23214285355061293, "rewards/semantic_entropy_math_reward": 0.702623886987567, "step": 597 }, { "completion_length": 342.07652378082275, "epoch": 0.7792165485788745, "grad_norm": 0.040057145059108734, "kl": 0.454345703125, "learning_rate": 1.40857647578097e-06, "loss": 0.0182, "reward": 1.1004008911550045, "reward_std": 0.20259357197210193, "rewards/accuracy_reward": 0.34821427427232265, "rewards/semantic_entropy_math_reward": 0.7521865777671337, "step": 598 }, { "completion_length": 400.2423391342163, "epoch": 0.7805195862855281, "grad_norm": 0.050290465354919434, "kl": 0.396484375, "learning_rate": 1.3927749088052218e-06, "loss": 0.0159, "reward": 1.0360786989331245, "reward_std": 0.26113759470172226, "rewards/accuracy_reward": 0.3749999962747097, "rewards/semantic_entropy_math_reward": 0.661078717559576, "step": 599 }, { "completion_length": 407.16835498809814, "epoch": 0.7818226239921817, "grad_norm": 0.041269827634096146, "kl": 0.42626953125, "learning_rate": 1.3770481198658803e-06, "loss": 0.0171, "reward": 0.9737609215080738, "reward_std": 0.1884641689248383, "rewards/accuracy_reward": 0.2474489777814597, "rewards/semantic_entropy_math_reward": 0.7263119406998158, "step": 600 }, { "completion_length": 418.0803527832031, "epoch": 0.7831256616988355, "grad_norm": 0.03117123804986477, "kl": 0.43231201171875, "learning_rate": 1.361396434980413e-06, "loss": 0.0173, "reward": 1.0009110756218433, "reward_std": 0.23456275137141347, "rewards/accuracy_reward": 0.32015305617824197, "rewards/semantic_entropy_math_reward": 0.6807580124586821, "step": 601 }, { "completion_length": 362.0497364997864, "epoch": 0.7844286994054891, "grad_norm": 0.03975040838122368, "kl": 0.37957763671875, "learning_rate": 1.3458201786093795e-06, "loss": 0.0152, "reward": 1.036260947585106, "reward_std": 0.187728947494179, "rewards/accuracy_reward": 0.3048469293862581, "rewards/semantic_entropy_math_reward": 0.7314139828085899, "step": 602 }, { "completion_length": 340.35713291168213, "epoch": 0.7857317371121427, "grad_norm": 0.041815172880887985, "kl": 0.318603515625, "learning_rate": 1.3303196736496987e-06, "loss": 0.0127, "reward": 1.1131559386849403, "reward_std": 0.20154618099331856, "rewards/accuracy_reward": 0.38647958915680647, "rewards/semantic_entropy_math_reward": 0.7266763746738434, "step": 603 }, { "completion_length": 284.1658134460449, "epoch": 0.7870347748187964, "grad_norm": 0.05044187977910042, "kl": 0.35516357421875, "learning_rate": 1.3148952414279542e-06, "loss": 0.0142, "reward": 1.124271146953106, "reward_std": 0.18857199093326926, "rewards/accuracy_reward": 0.35714284889400005, "rewards/semantic_entropy_math_reward": 0.7671282477676868, "step": 604 }, { "completion_length": 289.3086676597595, "epoch": 0.78833781252545, "grad_norm": 0.1562122404575348, "kl": 0.623046875, "learning_rate": 1.2995472016937405e-06, "loss": 0.0249, "reward": 1.2479956075549126, "reward_std": 0.18065414763987064, "rewards/accuracy_reward": 0.4451530510559678, "rewards/semantic_entropy_math_reward": 0.8028425313532352, "step": 605 }, { "completion_length": 349.39412117004395, "epoch": 0.7896408502321036, "grad_norm": 0.04871983453631401, "kl": 0.362548828125, "learning_rate": 1.2842758726130283e-06, "loss": 0.0145, "reward": 1.1004008911550045, "reward_std": 0.24142802599817514, "rewards/accuracy_reward": 0.3864795845001936, "rewards/semantic_entropy_math_reward": 0.7139212675392628, "step": 606 }, { "completion_length": 469.8290767669678, "epoch": 0.7909438879387573, "grad_norm": 0.028862085193395615, "kl": 0.3682861328125, "learning_rate": 1.2690815707615727e-06, "loss": 0.0147, "reward": 1.0677842609584332, "reward_std": 0.27761211479082704, "rewards/accuracy_reward": 0.40051019843667746, "rewards/semantic_entropy_math_reward": 0.6672740392386913, "step": 607 }, { "completion_length": 590.3724365234375, "epoch": 0.7922469256454109, "grad_norm": 0.03419097140431404, "kl": 0.5047607421875, "learning_rate": 1.2539646111183452e-06, "loss": 0.0202, "reward": 0.8802842646837234, "reward_std": 0.3137570391409099, "rewards/accuracy_reward": 0.3099489724263549, "rewards/semantic_entropy_math_reward": 0.5703352764248848, "step": 608 }, { "completion_length": 603.7818794250488, "epoch": 0.7935499633520645, "grad_norm": 0.03210240229964256, "kl": 0.5252685546875, "learning_rate": 1.2389253070590118e-06, "loss": 0.021, "reward": 0.8862973675131798, "reward_std": 0.3080208618193865, "rewards/accuracy_reward": 0.31632652319967747, "rewards/semantic_entropy_math_reward": 0.5699708368629217, "step": 609 }, { "completion_length": 558.1900424957275, "epoch": 0.7948530010587181, "grad_norm": 0.03380637988448143, "kl": 0.4759521484375, "learning_rate": 1.2239639703494282e-06, "loss": 0.019, "reward": 0.9677478075027466, "reward_std": 0.30004962626844645, "rewards/accuracy_reward": 0.3660714216530323, "rewards/semantic_entropy_math_reward": 0.6016763653606176, "step": 610 }, { "completion_length": 491.76912689208984, "epoch": 0.7961560387653718, "grad_norm": 0.032054197043180466, "kl": 0.50555419921875, "learning_rate": 1.209080911139187e-06, "loss": 0.0202, "reward": 0.9247448965907097, "reward_std": 0.27889097575098276, "rewards/accuracy_reward": 0.3022959118243307, "rewards/semantic_entropy_math_reward": 0.6224489752203226, "step": 611 }, { "completion_length": 447.65432929992676, "epoch": 0.7974590764720254, "grad_norm": 0.0636380985379219, "kl": 0.4744873046875, "learning_rate": 1.194276437955177e-06, "loss": 0.019, "reward": 1.0018221251666546, "reward_std": 0.2524149240925908, "rewards/accuracy_reward": 0.35459182504564524, "rewards/semantic_entropy_math_reward": 0.647230314090848, "step": 612 }, { "completion_length": 338.26402378082275, "epoch": 0.798762114178679, "grad_norm": 0.02368706464767456, "kl": 0.3944091796875, "learning_rate": 1.1795508576951958e-06, "loss": 0.0158, "reward": 1.0974854342639446, "reward_std": 0.2424869371461682, "rewards/accuracy_reward": 0.36862243991345167, "rewards/semantic_entropy_math_reward": 0.728862963616848, "step": 613 }, { "completion_length": 367.5318775177002, "epoch": 0.8000651518853327, "grad_norm": 0.029344232752919197, "kl": 0.41790771484375, "learning_rate": 1.1649044756215872e-06, "loss": 0.0167, "reward": 0.9746720008552074, "reward_std": 0.21145512093789876, "rewards/accuracy_reward": 0.28188774711452425, "rewards/semantic_entropy_math_reward": 0.6927842423319817, "step": 614 }, { "completion_length": 373.7576484680176, "epoch": 0.8013681895919863, "grad_norm": 0.026323076337575912, "kl": 0.423828125, "learning_rate": 1.1503375953549046e-06, "loss": 0.017, "reward": 1.0118440128862858, "reward_std": 0.25336172385141253, "rewards/accuracy_reward": 0.3303571380674839, "rewards/semantic_entropy_math_reward": 0.6814868580549955, "step": 615 }, { "completion_length": 283.0982093811035, "epoch": 0.8026712272986399, "grad_norm": 0.0518302321434021, "kl": 0.4051513671875, "learning_rate": 1.1358505188676288e-06, "loss": 0.0162, "reward": 1.1572521515190601, "reward_std": 0.18953745206817985, "rewards/accuracy_reward": 0.3864795807749033, "rewards/semantic_entropy_math_reward": 0.770772609859705, "step": 616 }, { "completion_length": 340.0369815826416, "epoch": 0.8039742650052936, "grad_norm": 0.032579850405454636, "kl": 0.3560791015625, "learning_rate": 1.1214435464779006e-06, "loss": 0.0142, "reward": 1.071610763669014, "reward_std": 0.23610352934338152, "rewards/accuracy_reward": 0.3660714225843549, "rewards/semantic_entropy_math_reward": 0.7055393308401108, "step": 617 }, { "completion_length": 283.2283134460449, "epoch": 0.8052773027119472, "grad_norm": 0.02413102425634861, "kl": 0.40386962890625, "learning_rate": 1.1071169768432983e-06, "loss": 0.0162, "reward": 1.107689518481493, "reward_std": 0.18413462536409497, "rewards/accuracy_reward": 0.3380101970396936, "rewards/semantic_entropy_math_reward": 0.7696792930364609, "step": 618 }, { "completion_length": 312.3635153770447, "epoch": 0.8065803404186008, "grad_norm": 0.01991398259997368, "kl": 0.33349609375, "learning_rate": 1.0928711069546434e-06, "loss": 0.0133, "reward": 1.184220101684332, "reward_std": 0.15371649456210434, "rewards/accuracy_reward": 0.3839285634458065, "rewards/semantic_entropy_math_reward": 0.8002915605902672, "step": 619 }, { "completion_length": 281.18111515045166, "epoch": 0.8078833781252545, "grad_norm": 0.037673208862543106, "kl": 0.38360595703125, "learning_rate": 1.0787062321298441e-06, "loss": 0.0153, "reward": 1.1352040767669678, "reward_std": 0.16719181410735473, "rewards/accuracy_reward": 0.36989794950932264, "rewards/semantic_entropy_math_reward": 0.7653061039745808, "step": 620 }, { "completion_length": 370.61223793029785, "epoch": 0.8091864158319081, "grad_norm": 0.03418143466114998, "kl": 0.421875, "learning_rate": 1.0646226460077797e-06, "loss": 0.0169, "reward": 1.079446043819189, "reward_std": 0.20678971777670085, "rewards/accuracy_reward": 0.38265305291861296, "rewards/semantic_entropy_math_reward": 0.6967930011451244, "step": 621 }, { "completion_length": 330.9477014541626, "epoch": 0.8104894535385617, "grad_norm": 0.153985396027565, "kl": 0.568603515625, "learning_rate": 1.050620640542208e-06, "loss": 0.0228, "reward": 1.1333819068968296, "reward_std": 0.21255871118046343, "rewards/accuracy_reward": 0.3903061142191291, "rewards/semantic_entropy_math_reward": 0.7430757731199265, "step": 622 }, { "completion_length": 417.5663118362427, "epoch": 0.8117924912452154, "grad_norm": 0.027394482865929604, "kl": 0.3516845703125, "learning_rate": 1.0367005059957097e-06, "loss": 0.014, "reward": 1.1193513050675392, "reward_std": 0.23229796160012484, "rewards/accuracy_reward": 0.38392856251448393, "rewards/semantic_entropy_math_reward": 0.7354227378964424, "step": 623 }, { "completion_length": 453.4566249847412, "epoch": 0.8130955289518691, "grad_norm": 0.01650371588766575, "kl": 0.32598876953125, "learning_rate": 1.0228625309336793e-06, "loss": 0.013, "reward": 1.11916908249259, "reward_std": 0.2623719316907227, "rewards/accuracy_reward": 0.40306121576577425, "rewards/semantic_entropy_math_reward": 0.7161078527569771, "step": 624 }, { "completion_length": 519.9910593032837, "epoch": 0.8143985666585227, "grad_norm": 0.03104584477841854, "kl": 0.2882080078125, "learning_rate": 1.0091070022183386e-06, "loss": 0.0115, "reward": 1.1559766754508018, "reward_std": 0.26669816416688263, "rewards/accuracy_reward": 0.4897959101945162, "rewards/semantic_entropy_math_reward": 0.666180744767189, "step": 625 }, { "completion_length": 591.5076389312744, "epoch": 0.8157016043651764, "grad_norm": 0.017405111342668533, "kl": 0.38946533203125, "learning_rate": 9.954342050027922e-07, "loss": 0.0156, "reward": 1.0114795602858067, "reward_std": 0.2892504008486867, "rewards/accuracy_reward": 0.38903060369193554, "rewards/semantic_entropy_math_reward": 0.6224489621818066, "step": 626 }, { "completion_length": 651.4374828338623, "epoch": 0.81700464207183, "grad_norm": 0.022734420374035835, "kl": 0.44012451171875, "learning_rate": 9.81844422725109e-07, "loss": 0.0176, "reward": 0.8609693758189678, "reward_std": 0.28162418492138386, "rewards/accuracy_reward": 0.32015305757522583, "rewards/semantic_entropy_math_reward": 0.540816318243742, "step": 627 }, { "completion_length": 597.8928413391113, "epoch": 0.8183076797784836, "grad_norm": 0.027012916281819344, "kl": 0.408447265625, "learning_rate": 9.683379371024598e-07, "loss": 0.0163, "reward": 0.9549927078187466, "reward_std": 0.3158495416864753, "rewards/accuracy_reward": 0.3737244810909033, "rewards/semantic_entropy_math_reward": 0.5812682118266821, "step": 628 }, { "completion_length": 649.4030494689941, "epoch": 0.8196107174851373, "grad_norm": 0.025382481515407562, "kl": 0.4549560546875, "learning_rate": 9.549150281252633e-07, "loss": 0.0182, "reward": 0.9384110644459724, "reward_std": 0.35815113317221403, "rewards/accuracy_reward": 0.39795917458832264, "rewards/semantic_entropy_math_reward": 0.5404518805444241, "step": 629 }, { "completion_length": 580.4081497192383, "epoch": 0.8209137551917909, "grad_norm": 0.023639794439077377, "kl": 0.3726806640625, "learning_rate": 9.415759740513935e-07, "loss": 0.0149, "reward": 0.9996355548501015, "reward_std": 0.2600903743878007, "rewards/accuracy_reward": 0.3877551006153226, "rewards/semantic_entropy_math_reward": 0.6118804588913918, "step": 630 }, { "completion_length": 575.3405437469482, "epoch": 0.8222167928984445, "grad_norm": 0.013850666582584381, "kl": 0.36126708984375, "learning_rate": 9.283210514004009e-07, "loss": 0.0144, "reward": 1.0260568670928478, "reward_std": 0.2677850485779345, "rewards/accuracy_reward": 0.3839285634458065, "rewards/semantic_entropy_math_reward": 0.6421282533556223, "step": 631 }, { "completion_length": 524.933666229248, "epoch": 0.8235198306050981, "grad_norm": 0.03527293726801872, "kl": 0.37152099609375, "learning_rate": 9.151505349477901e-07, "loss": 0.0149, "reward": 0.9883381724357605, "reward_std": 0.269354238640517, "rewards/accuracy_reward": 0.35204081144183874, "rewards/semantic_entropy_math_reward": 0.6362973619252443, "step": 632 }, { "completion_length": 564.161979675293, "epoch": 0.8248228683117518, "grad_norm": 0.01822865754365921, "kl": 0.35162353515625, "learning_rate": 9.020646977193176e-07, "loss": 0.0141, "reward": 0.9921647235751152, "reward_std": 0.2643348714336753, "rewards/accuracy_reward": 0.3813775428570807, "rewards/semantic_entropy_math_reward": 0.6107871439307928, "step": 633 }, { "completion_length": 496.9094352722168, "epoch": 0.8261259060184054, "grad_norm": 0.03390026092529297, "kl": 0.3707275390625, "learning_rate": 8.890638109853339e-07, "loss": 0.0148, "reward": 1.1020408011972904, "reward_std": 0.2521671140566468, "rewards/accuracy_reward": 0.40816325903870165, "rewards/semantic_entropy_math_reward": 0.6938775554299355, "step": 634 }, { "completion_length": 514.464277267456, "epoch": 0.827428943725059, "grad_norm": 0.05904604494571686, "kl": 0.36944580078125, "learning_rate": 8.761481442551573e-07, "loss": 0.0148, "reward": 1.003462079912424, "reward_std": 0.30850899964571, "rewards/accuracy_reward": 0.38137754576746374, "rewards/semantic_entropy_math_reward": 0.6220845431089401, "step": 635 }, { "completion_length": 426.13264179229736, "epoch": 0.8287319814317127, "grad_norm": 0.041935548186302185, "kl": 0.42755126953125, "learning_rate": 8.633179652714919e-07, "loss": 0.0171, "reward": 1.0337099060416222, "reward_std": 0.21100471052341163, "rewards/accuracy_reward": 0.3456632597371936, "rewards/semantic_entropy_math_reward": 0.6880466565489769, "step": 636 }, { "completion_length": 422.9196357727051, "epoch": 0.8300350191383663, "grad_norm": 0.02546284720301628, "kl": 0.41717529296875, "learning_rate": 8.505735400048748e-07, "loss": 0.0167, "reward": 1.107507299631834, "reward_std": 0.21986527368426323, "rewards/accuracy_reward": 0.34693876560777426, "rewards/semantic_entropy_math_reward": 0.7605684921145439, "step": 637 }, { "completion_length": 454.71044921875, "epoch": 0.8313380568450199, "grad_norm": 0.03483239933848381, "kl": 0.2786407470703125, "learning_rate": 8.379151326481588e-07, "loss": 0.0111, "reward": 1.1672740429639816, "reward_std": 0.24700368754565716, "rewards/accuracy_reward": 0.45153060276061296, "rewards/semantic_entropy_math_reward": 0.7157434336841106, "step": 638 }, { "completion_length": 402.9132537841797, "epoch": 0.8326410945516736, "grad_norm": 0.0319095179438591, "kl": 0.3465576171875, "learning_rate": 8.253430056110451e-07, "loss": 0.0139, "reward": 1.15196792781353, "reward_std": 0.17017819953616709, "rewards/accuracy_reward": 0.4336734591051936, "rewards/semantic_entropy_math_reward": 0.7182944342494011, "step": 639 }, { "completion_length": 406.01402282714844, "epoch": 0.8339441322583272, "grad_norm": 0.0387706495821476, "kl": 0.3958740234375, "learning_rate": 8.128574195146305e-07, "loss": 0.0158, "reward": 1.0956632643938065, "reward_std": 0.20970030257012695, "rewards/accuracy_reward": 0.3890306046232581, "rewards/semantic_entropy_math_reward": 0.706632636487484, "step": 640 }, { "completion_length": 446.9923343658447, "epoch": 0.8352471699649808, "grad_norm": 0.03349936380982399, "kl": 0.37689208984375, "learning_rate": 8.004586331860176e-07, "loss": 0.0151, "reward": 1.0949343740940094, "reward_std": 0.20607998478226364, "rewards/accuracy_reward": 0.37882652785629034, "rewards/semantic_entropy_math_reward": 0.7161078657954931, "step": 641 }, { "completion_length": 379.9591751098633, "epoch": 0.8365502076716345, "grad_norm": 0.05373205617070198, "kl": 0.44476318359375, "learning_rate": 7.881469036529427e-07, "loss": 0.0178, "reward": 1.0863702520728111, "reward_std": 0.19910921528935432, "rewards/accuracy_reward": 0.33673468709457666, "rewards/semantic_entropy_math_reward": 0.7496355473995209, "step": 642 }, { "completion_length": 345.74999475479126, "epoch": 0.8378532453782881, "grad_norm": 0.01885698363184929, "kl": 0.402587890625, "learning_rate": 7.759224861384446e-07, "loss": 0.0161, "reward": 1.228862963616848, "reward_std": 0.16080106468871236, "rewards/accuracy_reward": 0.42857141606509686, "rewards/semantic_entropy_math_reward": 0.800291508436203, "step": 643 }, { "completion_length": 427.4642791748047, "epoch": 0.8391562830849417, "grad_norm": 0.04977795109152794, "kl": 0.3905029296875, "learning_rate": 7.637856340555822e-07, "loss": 0.0156, "reward": 1.1149781346321106, "reward_std": 0.20377160818316042, "rewards/accuracy_reward": 0.39158162102103233, "rewards/semantic_entropy_math_reward": 0.7233964949846268, "step": 644 }, { "completion_length": 354.3265218734741, "epoch": 0.8404593207915954, "grad_norm": 0.06010611727833748, "kl": 0.445098876953125, "learning_rate": 7.517365990021757e-07, "loss": 0.0178, "reward": 1.0980320498347282, "reward_std": 0.21962121210526675, "rewards/accuracy_reward": 0.34693876933306456, "rewards/semantic_entropy_math_reward": 0.7510932981967926, "step": 645 }, { "completion_length": 328.0535650253296, "epoch": 0.8417623584982491, "grad_norm": 0.040210895240306854, "kl": 0.379638671875, "learning_rate": 7.397756307555886e-07, "loss": 0.0152, "reward": 1.1109693720936775, "reward_std": 0.19025816256180406, "rewards/accuracy_reward": 0.3584183622151613, "rewards/semantic_entropy_math_reward": 0.752551008015871, "step": 646 }, { "completion_length": 400.6798391342163, "epoch": 0.8430653962049027, "grad_norm": 0.04031793028116226, "kl": 0.372314453125, "learning_rate": 7.279029772675572e-07, "loss": 0.0149, "reward": 1.145043734461069, "reward_std": 0.2088110044132918, "rewards/accuracy_reward": 0.41581632383167744, "rewards/semantic_entropy_math_reward": 0.7292274013161659, "step": 647 }, { "completion_length": 395.78442764282227, "epoch": 0.8443684339115564, "grad_norm": 0.026767419651150703, "kl": 0.42462158203125, "learning_rate": 7.161188846590455e-07, "loss": 0.017, "reward": 1.1086005680263042, "reward_std": 0.15781450062058866, "rewards/accuracy_reward": 0.3596938671544194, "rewards/semantic_entropy_math_reward": 0.7489067167043686, "step": 648 }, { "completion_length": 435.71682357788086, "epoch": 0.84567147161821, "grad_norm": 0.047037623822689056, "kl": 0.395751953125, "learning_rate": 7.044235972151431e-07, "loss": 0.0158, "reward": 1.086370263248682, "reward_std": 0.22188947489485145, "rewards/accuracy_reward": 0.3647959139198065, "rewards/semantic_entropy_math_reward": 0.7215743288397789, "step": 649 }, { "completion_length": 368.5918254852295, "epoch": 0.8469745093248636, "grad_norm": 0.0330636091530323, "kl": 0.3455810546875, "learning_rate": 6.928173573800007e-07, "loss": 0.0138, "reward": 1.2407069653272629, "reward_std": 0.18592529953457415, "rewards/accuracy_reward": 0.44770407397300005, "rewards/semantic_entropy_math_reward": 0.7930028922855854, "step": 650 }, { "completion_length": 371.6033124923706, "epoch": 0.8482775470315173, "grad_norm": 0.038935303688049316, "kl": 0.36468505859375, "learning_rate": 6.813004057518091e-07, "loss": 0.0146, "reward": 1.1572521589696407, "reward_std": 0.20256502274423838, "rewards/accuracy_reward": 0.3992346879094839, "rewards/semantic_entropy_math_reward": 0.7580174766480923, "step": 651 }, { "completion_length": 404.85203647613525, "epoch": 0.8495805847381709, "grad_norm": 0.049969255924224854, "kl": 0.3448486328125, "learning_rate": 6.698729810778065e-07, "loss": 0.0138, "reward": 1.1115160137414932, "reward_std": 0.1803172673098743, "rewards/accuracy_reward": 0.38775509409606457, "rewards/semantic_entropy_math_reward": 0.7237609326839447, "step": 652 }, { "completion_length": 406.1198959350586, "epoch": 0.8508836224448245, "grad_norm": 0.0458567775785923, "kl": 0.35931396484375, "learning_rate": 6.585353202493322e-07, "loss": 0.0144, "reward": 1.1188046336174011, "reward_std": 0.2137998074758798, "rewards/accuracy_reward": 0.3749999925494194, "rewards/semantic_entropy_math_reward": 0.7438046466559172, "step": 653 }, { "completion_length": 365.6288161277771, "epoch": 0.8521866601514781, "grad_norm": 0.03025432489812374, "kl": 0.3153076171875, "learning_rate": 6.472876582969101e-07, "loss": 0.0126, "reward": 1.2337828017771244, "reward_std": 0.16135288658551872, "rewards/accuracy_reward": 0.44005101174116135, "rewards/semantic_entropy_math_reward": 0.793731763958931, "step": 654 }, { "completion_length": 431.18749809265137, "epoch": 0.8534896978581318, "grad_norm": 0.07424527406692505, "kl": 0.345062255859375, "learning_rate": 6.36130228385386e-07, "loss": 0.0138, "reward": 1.1864066869020462, "reward_std": 0.19669072842225432, "rewards/accuracy_reward": 0.4196428491268307, "rewards/semantic_entropy_math_reward": 0.7667638212442398, "step": 655 }, { "completion_length": 397.48851585388184, "epoch": 0.8547927355647854, "grad_norm": 0.07442077249288559, "kl": 0.40008544921875, "learning_rate": 6.250632618090868e-07, "loss": 0.016, "reward": 1.0728862881660461, "reward_std": 0.185965295182541, "rewards/accuracy_reward": 0.31122448528185487, "rewards/semantic_entropy_math_reward": 0.7616617977619171, "step": 656 }, { "completion_length": 467.08162117004395, "epoch": 0.856095773271439, "grad_norm": 0.04169654846191406, "kl": 0.308837890625, "learning_rate": 6.140869879870287e-07, "loss": 0.0124, "reward": 1.1370262205600739, "reward_std": 0.18776633380912244, "rewards/accuracy_reward": 0.4158163205720484, "rewards/semantic_entropy_math_reward": 0.7212099023163319, "step": 657 }, { "completion_length": 411.7193794250488, "epoch": 0.8573988109780927, "grad_norm": 0.011574473232030869, "kl": 0.291259765625, "learning_rate": 6.032016344581598e-07, "loss": 0.0117, "reward": 1.2093658931553364, "reward_std": 0.21189526095986366, "rewards/accuracy_reward": 0.42474488355219364, "rewards/semantic_entropy_math_reward": 0.7846209742128849, "step": 658 }, { "completion_length": 424.3915729522705, "epoch": 0.8587018486847463, "grad_norm": 0.03387419879436493, "kl": 0.262664794921875, "learning_rate": 5.924074268766422e-07, "loss": 0.0105, "reward": 1.2383381612598896, "reward_std": 0.19120207405649126, "rewards/accuracy_reward": 0.4668367253616452, "rewards/semantic_entropy_math_reward": 0.7715014442801476, "step": 659 }, { "completion_length": 431.9681062698364, "epoch": 0.8600048863913999, "grad_norm": 0.037805065512657166, "kl": 0.31048583984375, "learning_rate": 5.817045890071793e-07, "loss": 0.0124, "reward": 1.1741982102394104, "reward_std": 0.16903570503927767, "rewards/accuracy_reward": 0.41071427799761295, "rewards/semantic_entropy_math_reward": 0.7634839229285717, "step": 660 }, { "completion_length": 410.6849412918091, "epoch": 0.8613079240980536, "grad_norm": 0.03823651000857353, "kl": 0.251678466796875, "learning_rate": 5.710933427203736e-07, "loss": 0.0101, "reward": 1.2572886236011982, "reward_std": 0.20483915554359555, "rewards/accuracy_reward": 0.46938774548470974, "rewards/semantic_entropy_math_reward": 0.7879008669406176, "step": 661 }, { "completion_length": 416.8928508758545, "epoch": 0.8626109618047072, "grad_norm": 0.05192481353878975, "kl": 0.26739501953125, "learning_rate": 5.60573907988124e-07, "loss": 0.0107, "reward": 1.2323250472545624, "reward_std": 0.20163663325365633, "rewards/accuracy_reward": 0.463010192848742, "rewards/semantic_entropy_math_reward": 0.7693148627877235, "step": 662 }, { "completion_length": 445.3290710449219, "epoch": 0.8639139995113608, "grad_norm": 0.04189416766166687, "kl": 0.312042236328125, "learning_rate": 5.501465028790726e-07, "loss": 0.0125, "reward": 1.1137026064097881, "reward_std": 0.189966871868819, "rewards/accuracy_reward": 0.4005101937800646, "rewards/semantic_entropy_math_reward": 0.7131923995912075, "step": 663 }, { "completion_length": 406.13136863708496, "epoch": 0.8652170372180145, "grad_norm": 0.054233673959970474, "kl": 0.281646728515625, "learning_rate": 5.398113435540797e-07, "loss": 0.0113, "reward": 1.2776967734098434, "reward_std": 0.16192793939262629, "rewards/accuracy_reward": 0.48979590460658073, "rewards/semantic_entropy_math_reward": 0.7879008688032627, "step": 664 }, { "completion_length": 420.1453971862793, "epoch": 0.8665200749246681, "grad_norm": 0.05208450183272362, "kl": 0.359619140625, "learning_rate": 5.295686442617442e-07, "loss": 0.0144, "reward": 1.1548833921551704, "reward_std": 0.21648172801360488, "rewards/accuracy_reward": 0.4005101975053549, "rewards/semantic_entropy_math_reward": 0.754373162984848, "step": 665 }, { "completion_length": 389.8469285964966, "epoch": 0.8678231126313217, "grad_norm": 0.027671653777360916, "kl": 0.29913330078125, "learning_rate": 5.194186173339599e-07, "loss": 0.012, "reward": 1.1763848215341568, "reward_std": 0.18030581763014197, "rewards/accuracy_reward": 0.4183673383668065, "rewards/semantic_entropy_math_reward": 0.7580174561589956, "step": 666 }, { "completion_length": 417.1568794250488, "epoch": 0.8691261503379754, "grad_norm": 0.07873902469873428, "kl": 0.31817626953125, "learning_rate": 5.0936147318152e-07, "loss": 0.0127, "reward": 1.1472303047776222, "reward_std": 0.18318506353534758, "rewards/accuracy_reward": 0.3852040721103549, "rewards/semantic_entropy_math_reward": 0.7620262280106544, "step": 667 }, { "completion_length": 369.3851947784424, "epoch": 0.870429188044629, "grad_norm": 0.05243045464158058, "kl": 0.295196533203125, "learning_rate": 4.993974202897456e-07, "loss": 0.0118, "reward": 1.14613701030612, "reward_std": 0.18234298407332972, "rewards/accuracy_reward": 0.3954081544652581, "rewards/semantic_entropy_math_reward": 0.7507288344204426, "step": 668 }, { "completion_length": 363.0497341156006, "epoch": 0.8717322257512827, "grad_norm": 0.1095355749130249, "kl": 0.2958984375, "learning_rate": 4.89526665214174e-07, "loss": 0.0118, "reward": 1.2485422492027283, "reward_std": 0.17711640358902514, "rewards/accuracy_reward": 0.4515306055545807, "rewards/semantic_entropy_math_reward": 0.79701167345047, "step": 669 }, { "completion_length": 357.57014656066895, "epoch": 0.8730352634579364, "grad_norm": 0.19667263329029083, "kl": 0.3148193359375, "learning_rate": 4.797494125762686e-07, "loss": 0.0126, "reward": 1.1539722867310047, "reward_std": 0.2037108545191586, "rewards/accuracy_reward": 0.39923468977212906, "rewards/semantic_entropy_math_reward": 0.7547375932335854, "step": 670 }, { "completion_length": 354.2767791748047, "epoch": 0.87433830116459, "grad_norm": 0.03188151493668556, "kl": 0.2900390625, "learning_rate": 4.7006586505918273e-07, "loss": 0.0116, "reward": 1.126093301922083, "reward_std": 0.19438390037976205, "rewards/accuracy_reward": 0.3647959101945162, "rewards/semantic_entropy_math_reward": 0.761297382414341, "step": 671 }, { "completion_length": 281.7971920967102, "epoch": 0.8756413388712436, "grad_norm": 0.06938362866640091, "kl": 0.3271484375, "learning_rate": 4.604762234035548e-07, "loss": 0.0131, "reward": 1.2246719896793365, "reward_std": 0.1467125858180225, "rewards/accuracy_reward": 0.41709182877093554, "rewards/semantic_entropy_math_reward": 0.8075801618397236, "step": 672 }, { "completion_length": 275.4617290496826, "epoch": 0.8769443765778973, "grad_norm": 0.0804184079170227, "kl": 0.43194580078125, "learning_rate": 4.5098068640335003e-07, "loss": 0.0173, "reward": 1.1554299890995026, "reward_std": 0.13515388022642583, "rewards/accuracy_reward": 0.3456632560119033, "rewards/semantic_entropy_math_reward": 0.8097667321562767, "step": 673 }, { "completion_length": 253.93494367599487, "epoch": 0.8782474142845509, "grad_norm": 0.07285462319850922, "kl": 0.395751953125, "learning_rate": 4.4157945090173294e-07, "loss": 0.0158, "reward": 1.1550655774772167, "reward_std": 0.1761651874985546, "rewards/accuracy_reward": 0.3737244822550565, "rewards/semantic_entropy_math_reward": 0.781341090798378, "step": 674 }, { "completion_length": 232.26785492897034, "epoch": 0.8795504519912045, "grad_norm": 0.1681598573923111, "kl": 0.54461669921875, "learning_rate": 4.322727117869951e-07, "loss": 0.0218, "reward": 1.203534971922636, "reward_std": 0.15214054519310594, "rewards/accuracy_reward": 0.3609693795442581, "rewards/semantic_entropy_math_reward": 0.8425655961036682, "step": 675 }, { "completion_length": 281.43749237060547, "epoch": 0.8808534896978581, "grad_norm": 0.03161332383751869, "kl": 0.32342529296875, "learning_rate": 4.230606619885108e-07, "loss": 0.0129, "reward": 1.2751457691192627, "reward_std": 0.15857776370830834, "rewards/accuracy_reward": 0.4515305981040001, "rewards/semantic_entropy_math_reward": 0.8236151412129402, "step": 676 }, { "completion_length": 374.45790004730225, "epoch": 0.8821565274045118, "grad_norm": 0.04998277500271797, "kl": 0.41162109375, "learning_rate": 4.139434924727359e-07, "loss": 0.0165, "reward": 1.1206268072128296, "reward_std": 0.2200465016067028, "rewards/accuracy_reward": 0.38520407769829035, "rewards/semantic_entropy_math_reward": 0.7354227304458618, "step": 677 }, { "completion_length": 543.3609638214111, "epoch": 0.8834595651111654, "grad_norm": 0.03607329726219177, "kl": 0.521728515625, "learning_rate": 4.049213922392509e-07, "loss": 0.0209, "reward": 0.9921647384762764, "reward_std": 0.22189455549232662, "rewards/accuracy_reward": 0.32270407397300005, "rewards/semantic_entropy_math_reward": 0.6694606374949217, "step": 678 }, { "completion_length": 610.0675945281982, "epoch": 0.884762602817819, "grad_norm": 0.03998032212257385, "kl": 0.5587158203125, "learning_rate": 3.9599454831684646e-07, "loss": 0.0223, "reward": 1.0278790071606636, "reward_std": 0.24171342654153705, "rewards/accuracy_reward": 0.3431122391484678, "rewards/semantic_entropy_math_reward": 0.6847667545080185, "step": 679 }, { "completion_length": 652.0012702941895, "epoch": 0.8860656405244727, "grad_norm": 0.035478148609399796, "kl": 0.5806884765625, "learning_rate": 3.8716314575964197e-07, "loss": 0.0232, "reward": 1.0133017338812351, "reward_std": 0.2808587457984686, "rewards/accuracy_reward": 0.353316318243742, "rewards/semantic_entropy_math_reward": 0.6599854156374931, "step": 680 }, { "completion_length": 619.2703952789307, "epoch": 0.8873686782311263, "grad_norm": 0.058793697506189346, "kl": 0.6444091796875, "learning_rate": 3.7842736764324705e-07, "loss": 0.0258, "reward": 1.02241250872612, "reward_std": 0.23398004146292806, "rewards/accuracy_reward": 0.34311223961412907, "rewards/semantic_entropy_math_reward": 0.6793002896010876, "step": 681 }, { "completion_length": 395.042085647583, "epoch": 0.8886717159377799, "grad_norm": 0.04328228533267975, "kl": 0.53875732421875, "learning_rate": 3.697873950609737e-07, "loss": 0.0215, "reward": 1.0617711320519447, "reward_std": 0.21476724895182997, "rewards/accuracy_reward": 0.36607142072170973, "rewards/semantic_entropy_math_reward": 0.6956996973603964, "step": 682 }, { "completion_length": 366.2538185119629, "epoch": 0.8899747536444336, "grad_norm": 0.04645731300115585, "kl": 0.46856689453125, "learning_rate": 3.612434071200771e-07, "loss": 0.0187, "reward": 1.0586734637618065, "reward_std": 0.2551836888305843, "rewards/accuracy_reward": 0.36479591205716133, "rewards/semantic_entropy_math_reward": 0.6938775330781937, "step": 683 }, { "completion_length": 295.38902282714844, "epoch": 0.8912777913510872, "grad_norm": 0.04322868585586548, "kl": 0.438873291015625, "learning_rate": 3.5279558093804456e-07, "loss": 0.0175, "reward": 1.0912900939583778, "reward_std": 0.23786525335162878, "rewards/accuracy_reward": 0.37117346189916134, "rewards/semantic_entropy_math_reward": 0.7201165985316038, "step": 684 }, { "completion_length": 292.8813705444336, "epoch": 0.8925808290577408, "grad_norm": 0.06398846209049225, "kl": 0.4605712890625, "learning_rate": 3.4444409163892076e-07, "loss": 0.0184, "reward": 1.0880101956427097, "reward_std": 0.22160307061858475, "rewards/accuracy_reward": 0.3380101975053549, "rewards/semantic_entropy_math_reward": 0.7499999739229679, "step": 685 }, { "completion_length": 228.51402473449707, "epoch": 0.8938838667643945, "grad_norm": 0.05024726688861847, "kl": 0.49658203125, "learning_rate": 3.361891123496824e-07, "loss": 0.0199, "reward": 1.1978862471878529, "reward_std": 0.18422120437026024, "rewards/accuracy_reward": 0.4030612148344517, "rewards/semantic_entropy_math_reward": 0.7948250621557236, "step": 686 }, { "completion_length": 306.61606550216675, "epoch": 0.8951869044710481, "grad_norm": 0.04674625024199486, "kl": 0.42791748046875, "learning_rate": 3.2803081419664483e-07, "loss": 0.0171, "reward": 1.098760899156332, "reward_std": 0.2297251671552658, "rewards/accuracy_reward": 0.3622448919340968, "rewards/semantic_entropy_math_reward": 0.7365160137414932, "step": 687 }, { "completion_length": 265.5624942779541, "epoch": 0.8964899421777017, "grad_norm": 0.029866818338632584, "kl": 0.40057373046875, "learning_rate": 3.1996936630191876e-07, "loss": 0.016, "reward": 1.1936952993273735, "reward_std": 0.16354057961143553, "rewards/accuracy_reward": 0.4221938649425283, "rewards/semantic_entropy_math_reward": 0.7715014405548573, "step": 688 }, { "completion_length": 350.4527988433838, "epoch": 0.8977929798843554, "grad_norm": 0.03606593608856201, "kl": 0.35064697265625, "learning_rate": 3.1200493577989875e-07, "loss": 0.014, "reward": 1.1036807335913181, "reward_std": 0.18493338662665337, "rewards/accuracy_reward": 0.37372448295354843, "rewards/semantic_entropy_math_reward": 0.7299562580883503, "step": 689 }, { "completion_length": 352.2704076766968, "epoch": 0.899096017591009, "grad_norm": 0.025935035198926926, "kl": 0.3753662109375, "learning_rate": 3.041376877338059e-07, "loss": 0.015, "reward": 1.1260932683944702, "reward_std": 0.22720066783949733, "rewards/accuracy_reward": 0.4107142793945968, "rewards/semantic_entropy_math_reward": 0.715378999710083, "step": 690 }, { "completion_length": 384.17218685150146, "epoch": 0.9003990552976627, "grad_norm": 0.02440270222723484, "kl": 0.39837646484375, "learning_rate": 2.9636778525225897e-07, "loss": 0.0159, "reward": 1.148505810648203, "reward_std": 0.2703684773296118, "rewards/accuracy_reward": 0.4502550922334194, "rewards/semantic_entropy_math_reward": 0.6982507258653641, "step": 691 }, { "completion_length": 399.5880002975464, "epoch": 0.9017020930043164, "grad_norm": 0.0358339361846447, "kl": 0.364990234375, "learning_rate": 2.88695389405898e-07, "loss": 0.0146, "reward": 1.0929300151765347, "reward_std": 0.24717829283326864, "rewards/accuracy_reward": 0.40816326159983873, "rewards/semantic_entropy_math_reward": 0.6847667396068573, "step": 692 }, { "completion_length": 350.663254737854, "epoch": 0.90300513071097, "grad_norm": 0.031924594193696976, "kl": 0.360107421875, "learning_rate": 2.8112065924404075e-07, "loss": 0.0144, "reward": 1.1375728771090508, "reward_std": 0.23678708774968982, "rewards/accuracy_reward": 0.42219386994838715, "rewards/semantic_entropy_math_reward": 0.7153789959847927, "step": 693 }, { "completion_length": 426.64157485961914, "epoch": 0.9043081684176236, "grad_norm": 0.09235311299562454, "kl": 0.404052734375, "learning_rate": 2.7364375179139147e-07, "loss": 0.0162, "reward": 1.0337098874151707, "reward_std": 0.2390512810088694, "rewards/accuracy_reward": 0.3482142803259194, "rewards/semantic_entropy_math_reward": 0.6854956038296223, "step": 694 }, { "completion_length": 417.71172428131104, "epoch": 0.9056112061242773, "grad_norm": 0.018458684906363487, "kl": 0.383056640625, "learning_rate": 2.662648220447811e-07, "loss": 0.0153, "reward": 1.1208090297877789, "reward_std": 0.21627305913716555, "rewards/accuracy_reward": 0.40433672443032265, "rewards/semantic_entropy_math_reward": 0.7164722699671984, "step": 695 }, { "completion_length": 458.40687370300293, "epoch": 0.9069142438309309, "grad_norm": 0.019566472619771957, "kl": 0.352783203125, "learning_rate": 2.5898402296995584e-07, "loss": 0.0141, "reward": 1.2224854305386543, "reward_std": 0.261710612103343, "rewards/accuracy_reward": 0.49362243711948395, "rewards/semantic_entropy_math_reward": 0.7288629710674286, "step": 696 }, { "completion_length": 450.03953075408936, "epoch": 0.9082172815375845, "grad_norm": 0.028319351375102997, "kl": 0.37744140625, "learning_rate": 2.518015054984041e-07, "loss": 0.0151, "reward": 1.0987609438598156, "reward_std": 0.24412001110613346, "rewards/accuracy_reward": 0.41071427799761295, "rewards/semantic_entropy_math_reward": 0.6880466528236866, "step": 697 }, { "completion_length": 459.8698902130127, "epoch": 0.9095203192442382, "grad_norm": 0.09039890766143799, "kl": 0.37261962890625, "learning_rate": 2.447174185242324e-07, "loss": 0.0149, "reward": 1.023870225995779, "reward_std": 0.2071858572307974, "rewards/accuracy_reward": 0.34566325787454844, "rewards/semantic_entropy_math_reward": 0.6782069765031338, "step": 698 }, { "completion_length": 561.855863571167, "epoch": 0.9108233569508918, "grad_norm": 0.10793410986661911, "kl": 0.42462158203125, "learning_rate": 2.377319089010749e-07, "loss": 0.017, "reward": 0.9542638435959816, "reward_std": 0.23002642020583153, "rewards/accuracy_reward": 0.32525509560946375, "rewards/semantic_entropy_math_reward": 0.6290087215602398, "step": 699 }, { "completion_length": 500.6747398376465, "epoch": 0.9121263946575454, "grad_norm": 0.024128131568431854, "kl": 0.34368896484375, "learning_rate": 2.3084512143905057e-07, "loss": 0.0137, "reward": 1.2172011658549309, "reward_std": 0.2632435173727572, "rewards/accuracy_reward": 0.48979590833187103, "rewards/semantic_entropy_math_reward": 0.7274052519351244, "step": 700 }, { "completion_length": 535.8022842407227, "epoch": 0.913429432364199, "grad_norm": 0.024771742522716522, "kl": 0.41070556640625, "learning_rate": 2.240571989017598e-07, "loss": 0.0164, "reward": 0.9859693795442581, "reward_std": 0.2209851387888193, "rewards/accuracy_reward": 0.3329081553965807, "rewards/semantic_entropy_math_reward": 0.6530611887574196, "step": 701 }, { "completion_length": 492.42728328704834, "epoch": 0.9147324700708527, "grad_norm": 0.029699552804231644, "kl": 0.3704833984375, "learning_rate": 2.1736828200332628e-07, "loss": 0.0148, "reward": 1.0393586084246635, "reward_std": 0.21769557683728635, "rewards/accuracy_reward": 0.35714284982532263, "rewards/semantic_entropy_math_reward": 0.6822157315909863, "step": 702 }, { "completion_length": 574.6836624145508, "epoch": 0.9160355077775063, "grad_norm": 0.018987800925970078, "kl": 0.37677001953125, "learning_rate": 2.107785094054804e-07, "loss": 0.0151, "reward": 1.0590379014611244, "reward_std": 0.2792482681106776, "rewards/accuracy_reward": 0.39795917831361294, "rewards/semantic_entropy_math_reward": 0.6610787026584148, "step": 703 }, { "completion_length": 594.7869815826416, "epoch": 0.9173385454841599, "grad_norm": 0.016190655529499054, "kl": 0.40179443359375, "learning_rate": 2.0428801771468388e-07, "loss": 0.0161, "reward": 1.0404518991708755, "reward_std": 0.30567876109853387, "rewards/accuracy_reward": 0.4030612139031291, "rewards/semantic_entropy_math_reward": 0.6373906545341015, "step": 704 }, { "completion_length": 512.0459098815918, "epoch": 0.9186415831908136, "grad_norm": 0.03197417035698891, "kl": 0.368560791015625, "learning_rate": 1.978969414792975e-07, "loss": 0.0147, "reward": 1.052842564880848, "reward_std": 0.27527198754251003, "rewards/accuracy_reward": 0.3954081581905484, "rewards/semantic_entropy_math_reward": 0.6574343796819448, "step": 705 }, { "completion_length": 432.4783058166504, "epoch": 0.9199446208974672, "grad_norm": 0.12664413452148438, "kl": 0.44183349609375, "learning_rate": 1.9160541318679227e-07, "loss": 0.0177, "reward": 1.0433673523366451, "reward_std": 0.1849078651284799, "rewards/accuracy_reward": 0.3061224431730807, "rewards/semantic_entropy_math_reward": 0.7372448816895485, "step": 706 }, { "completion_length": 399.7257614135742, "epoch": 0.9212476586041208, "grad_norm": 0.028620820492506027, "kl": 0.319793701171875, "learning_rate": 1.8541356326100436e-07, "loss": 0.0128, "reward": 1.1876821778714657, "reward_std": 0.183109519071877, "rewards/accuracy_reward": 0.42346937768161297, "rewards/semantic_entropy_math_reward": 0.7642128169536591, "step": 707 }, { "completion_length": 380.3686180114746, "epoch": 0.9225506963107745, "grad_norm": 0.023331135511398315, "kl": 0.3931884765625, "learning_rate": 1.793215200594284e-07, "loss": 0.0157, "reward": 1.1198979206383228, "reward_std": 0.1656974998768419, "rewards/accuracy_reward": 0.34948978619650006, "rewards/semantic_entropy_math_reward": 0.7704081386327744, "step": 708 }, { "completion_length": 386.7257614135742, "epoch": 0.9238537340174281, "grad_norm": 0.01822568289935589, "kl": 0.3607177734375, "learning_rate": 1.7332940987056014e-07, "loss": 0.0144, "reward": 1.1454081647098064, "reward_std": 0.2311251019127667, "rewards/accuracy_reward": 0.4132652971893549, "rewards/semantic_entropy_math_reward": 0.7321428321301937, "step": 709 }, { "completion_length": 382.3724412918091, "epoch": 0.9251567717240817, "grad_norm": 0.01877155341207981, "kl": 0.31597900390625, "learning_rate": 1.6743735691127639e-07, "loss": 0.0126, "reward": 1.1762026250362396, "reward_std": 0.1784958445932716, "rewards/accuracy_reward": 0.4272959055379033, "rewards/semantic_entropy_math_reward": 0.7489067055284977, "step": 710 }, { "completion_length": 377.8558597564697, "epoch": 0.9264598094307354, "grad_norm": 0.015497390180826187, "kl": 0.35791015625, "learning_rate": 1.6164548332426033e-07, "loss": 0.0143, "reward": 1.2443512976169586, "reward_std": 0.21314028184860945, "rewards/accuracy_reward": 0.46811223588883877, "rewards/semantic_entropy_math_reward": 0.7762390300631523, "step": 711 }, { "completion_length": 364.33800506591797, "epoch": 0.927762847137389, "grad_norm": 0.03393126279115677, "kl": 0.363006591796875, "learning_rate": 1.559539091754686e-07, "loss": 0.0145, "reward": 1.174198243767023, "reward_std": 0.2167317953426391, "rewards/accuracy_reward": 0.4260204010643065, "rewards/semantic_entropy_math_reward": 0.7481778301298618, "step": 712 }, { "completion_length": 341.5867266654968, "epoch": 0.9290658848440426, "grad_norm": 0.02347736805677414, "kl": 0.3837890625, "learning_rate": 1.5036275245164377e-07, "loss": 0.0153, "reward": 1.1973396390676498, "reward_std": 0.2168941011186689, "rewards/accuracy_reward": 0.4017857061699033, "rewards/semantic_entropy_math_reward": 0.7955539412796497, "step": 713 }, { "completion_length": 368.2104539871216, "epoch": 0.9303689225506964, "grad_norm": 0.015018362551927567, "kl": 0.3460693359375, "learning_rate": 1.4487212905786973e-07, "loss": 0.0138, "reward": 1.1062317676842213, "reward_std": 0.1761375303613022, "rewards/accuracy_reward": 0.35076530056539923, "rewards/semantic_entropy_math_reward": 0.7554664611816406, "step": 714 }, { "completion_length": 332.2181043624878, "epoch": 0.93167196025735, "grad_norm": 0.034039661288261414, "kl": 0.32257080078125, "learning_rate": 1.3948215281516352e-07, "loss": 0.0129, "reward": 1.2540087439119816, "reward_std": 0.19711918849498034, "rewards/accuracy_reward": 0.44897958589717746, "rewards/semantic_entropy_math_reward": 0.8050291463732719, "step": 715 }, { "completion_length": 335.7793302536011, "epoch": 0.9329749979640036, "grad_norm": 0.01781829446554184, "kl": 0.34783935546875, "learning_rate": 1.341929354581234e-07, "loss": 0.0139, "reward": 1.172376062721014, "reward_std": 0.17551544518209994, "rewards/accuracy_reward": 0.40561223309487104, "rewards/semantic_entropy_math_reward": 0.7667638324201107, "step": 716 }, { "completion_length": 364.9693775177002, "epoch": 0.9342780356706573, "grad_norm": 0.026781508699059486, "kl": 0.3289794921875, "learning_rate": 1.2900458663260506e-07, "loss": 0.0132, "reward": 1.2128279618918896, "reward_std": 0.2522245158907026, "rewards/accuracy_reward": 0.464285708963871, "rewards/semantic_entropy_math_reward": 0.7485422659665346, "step": 717 }, { "completion_length": 322.6173405647278, "epoch": 0.9355810733773109, "grad_norm": 0.019278842955827713, "kl": 0.362060546875, "learning_rate": 1.2391721389345468e-07, "loss": 0.0145, "reward": 1.1924198046326637, "reward_std": 0.17253582226112485, "rewards/accuracy_reward": 0.3852040732745081, "rewards/semantic_entropy_math_reward": 0.8072157204151154, "step": 718 }, { "completion_length": 348.52550506591797, "epoch": 0.9368841110839645, "grad_norm": 0.025921957567334175, "kl": 0.38519287109375, "learning_rate": 1.1893092270227724e-07, "loss": 0.0154, "reward": 1.0865524522960186, "reward_std": 0.15436905319802463, "rewards/accuracy_reward": 0.31760203279554844, "rewards/semantic_entropy_math_reward": 0.768950417637825, "step": 719 }, { "completion_length": 377.4732027053833, "epoch": 0.9381871487906182, "grad_norm": 0.01881706900894642, "kl": 0.3355712890625, "learning_rate": 1.1404581642524782e-07, "loss": 0.0134, "reward": 1.1519679129123688, "reward_std": 0.19073297455906868, "rewards/accuracy_reward": 0.39285713247954845, "rewards/semantic_entropy_math_reward": 0.7591107562184334, "step": 720 }, { "completion_length": 376.54208183288574, "epoch": 0.9394901864972718, "grad_norm": 0.024892186746001244, "kl": 0.36529541015625, "learning_rate": 1.0926199633097156e-07, "loss": 0.0146, "reward": 1.068695306777954, "reward_std": 0.21568013355135918, "rewards/accuracy_reward": 0.31505101546645164, "rewards/semantic_entropy_math_reward": 0.7536443024873734, "step": 721 }, { "completion_length": 390.8469305038452, "epoch": 0.9407932242039254, "grad_norm": 0.04719565808773041, "kl": 0.39141845703125, "learning_rate": 1.0457956158838545e-07, "loss": 0.0157, "reward": 1.0645043551921844, "reward_std": 0.21989482874050736, "rewards/accuracy_reward": 0.33673469070345163, "rewards/semantic_entropy_math_reward": 0.7277696654200554, "step": 722 }, { "completion_length": 395.5331554412842, "epoch": 0.942096261910579, "grad_norm": 0.04518326371908188, "kl": 0.35357666015625, "learning_rate": 9.999860926469928e-08, "loss": 0.0141, "reward": 1.1485058069229126, "reward_std": 0.20885607041418552, "rewards/accuracy_reward": 0.40178570710122585, "rewards/semantic_entropy_math_reward": 0.746720090508461, "step": 723 }, { "completion_length": 383.1938705444336, "epoch": 0.9433992996172327, "grad_norm": 0.028134629130363464, "kl": 0.34954833984375, "learning_rate": 9.551923432338406e-08, "loss": 0.014, "reward": 1.1603498347103596, "reward_std": 0.19691818160936236, "rewards/accuracy_reward": 0.3928571336437017, "rewards/semantic_entropy_math_reward": 0.7674927115440369, "step": 724 }, { "completion_length": 364.84948444366455, "epoch": 0.9447023373238863, "grad_norm": 0.018599722534418106, "kl": 0.3402099609375, "learning_rate": 9.114152962220734e-08, "loss": 0.0136, "reward": 1.2108235955238342, "reward_std": 0.20205550221726298, "rewards/accuracy_reward": 0.42474489472806454, "rewards/semantic_entropy_math_reward": 0.7860786989331245, "step": 725 }, { "completion_length": 364.55228996276855, "epoch": 0.9460053750305399, "grad_norm": 0.03085581213235855, "kl": 0.38922119140625, "learning_rate": 8.686558591130157e-08, "loss": 0.0156, "reward": 1.1297375932335854, "reward_std": 0.174534130259417, "rewards/accuracy_reward": 0.3571428507566452, "rewards/semantic_entropy_math_reward": 0.7725947313010693, "step": 726 }, { "completion_length": 331.56887340545654, "epoch": 0.9473084127371936, "grad_norm": 0.018940677866339684, "kl": 0.31597900390625, "learning_rate": 8.269149183128988e-08, "loss": 0.0126, "reward": 1.2487244792282581, "reward_std": 0.19634207640774548, "rewards/accuracy_reward": 0.43494896683841944, "rewards/semantic_entropy_math_reward": 0.8137755170464516, "step": 727 }, { "completion_length": 390.30101108551025, "epoch": 0.9486114504438472, "grad_norm": 0.015773901715874672, "kl": 0.3446044921875, "learning_rate": 7.861933391144272e-08, "loss": 0.0138, "reward": 1.1452259123325348, "reward_std": 0.1720047469716519, "rewards/accuracy_reward": 0.3584183594211936, "rewards/semantic_entropy_math_reward": 0.7868075370788574, "step": 728 }, { "completion_length": 416.3418302536011, "epoch": 0.9499144881505008, "grad_norm": 0.027443690225481987, "kl": 0.35955810546875, "learning_rate": 7.464919656788804e-08, "loss": 0.0144, "reward": 1.1603498347103596, "reward_std": 0.21259729354642332, "rewards/accuracy_reward": 0.39030611608177423, "rewards/semantic_entropy_math_reward": 0.7700437158346176, "step": 729 }, { "completion_length": 404.1964178085327, "epoch": 0.9512175258571545, "grad_norm": 0.024271821603178978, "kl": 0.405029296875, "learning_rate": 7.078116210185892e-08, "loss": 0.0162, "reward": 1.0931122601032257, "reward_std": 0.21530530892778188, "rewards/accuracy_reward": 0.37117346189916134, "rewards/semantic_entropy_math_reward": 0.7219387553632259, "step": 730 }, { "completion_length": 363.2869825363159, "epoch": 0.9525205635638081, "grad_norm": 0.4443020224571228, "kl": 1.28289794921875, "learning_rate": 6.701531069799039e-08, "loss": 0.0513, "reward": 1.1561588682234287, "reward_std": 0.18145866435952485, "rewards/accuracy_reward": 0.3635204011807218, "rewards/semantic_entropy_math_reward": 0.7926384471356869, "step": 731 }, { "completion_length": 464.9553508758545, "epoch": 0.9538236012704617, "grad_norm": 0.01929629221558571, "kl": 0.4041748046875, "learning_rate": 6.335172042265192e-08, "loss": 0.0162, "reward": 1.0462828017771244, "reward_std": 0.26719582057558, "rewards/accuracy_reward": 0.3698979504406452, "rewards/semantic_entropy_math_reward": 0.676384836435318, "step": 732 }, { "completion_length": 477.80738830566406, "epoch": 0.9551266389771154, "grad_norm": 0.018624667078256607, "kl": 0.444580078125, "learning_rate": 5.97904672223354e-08, "loss": 0.0178, "reward": 1.0624999813735485, "reward_std": 0.2284381533972919, "rewards/accuracy_reward": 0.3635204015299678, "rewards/semantic_entropy_math_reward": 0.6989795845001936, "step": 733 }, { "completion_length": 626.9923362731934, "epoch": 0.956429676683769, "grad_norm": 0.034819867461919785, "kl": 0.630615234375, "learning_rate": 5.633162492207633e-08, "loss": 0.0252, "reward": 0.855867350474, "reward_std": 0.270494430558756, "rewards/accuracy_reward": 0.28188774827867746, "rewards/semantic_entropy_math_reward": 0.5739795695990324, "step": 734 }, { "completion_length": 695.4017696380615, "epoch": 0.9577327143904226, "grad_norm": 0.03701328858733177, "kl": 0.6246337890625, "learning_rate": 5.2975265223925155e-08, "loss": 0.025, "reward": 0.8205174840986729, "reward_std": 0.2966882986947894, "rewards/accuracy_reward": 0.3048469349741936, "rewards/semantic_entropy_math_reward": 0.5156705398112535, "step": 735 }, { "completion_length": 718.0216674804688, "epoch": 0.9590357520970763, "grad_norm": 0.030487217009067535, "kl": 0.6962890625, "learning_rate": 4.9721457705459995e-08, "loss": 0.0279, "reward": 0.7680393755435944, "reward_std": 0.2891839537769556, "rewards/accuracy_reward": 0.27168366638943553, "rewards/semantic_entropy_math_reward": 0.49635567888617516, "step": 736 }, { "completion_length": 665.5395336151123, "epoch": 0.96033878980373, "grad_norm": 0.033213261514902115, "kl": 0.7098388671875, "learning_rate": 4.657026981834623e-08, "loss": 0.0284, "reward": 0.7443513162434101, "reward_std": 0.30231898650527, "rewards/accuracy_reward": 0.2691326504573226, "rewards/semantic_entropy_math_reward": 0.47521864995360374, "step": 737 }, { "completion_length": 417.9936180114746, "epoch": 0.9616418275103836, "grad_norm": 0.04055352136492729, "kl": 0.5357666015625, "learning_rate": 4.3521766886936434e-08, "loss": 0.0214, "reward": 0.9487973675131798, "reward_std": 0.326946537476033, "rewards/accuracy_reward": 0.3456632592715323, "rewards/semantic_entropy_math_reward": 0.6031340956687927, "step": 738 }, { "completion_length": 418.23213291168213, "epoch": 0.9629448652170373, "grad_norm": 0.02809285558760166, "kl": 0.5797119140625, "learning_rate": 4.057601210691542e-08, "loss": 0.0232, "reward": 0.8955903630703688, "reward_std": 0.283821654622443, "rewards/accuracy_reward": 0.3022959107765928, "rewards/semantic_entropy_math_reward": 0.5932944398373365, "step": 739 }, { "completion_length": 370.7040796279907, "epoch": 0.9642479029236909, "grad_norm": 0.031031547114253044, "kl": 0.6207275390625, "learning_rate": 3.773306654399234e-08, "loss": 0.0248, "reward": 0.922376099973917, "reward_std": 0.2904593222774565, "rewards/accuracy_reward": 0.31887754483614117, "rewards/semantic_entropy_math_reward": 0.6034985426813364, "step": 740 }, { "completion_length": 325.73723888397217, "epoch": 0.9655509406303445, "grad_norm": 0.03169599547982216, "kl": 0.55706787109375, "learning_rate": 3.4992989132634495e-08, "loss": 0.0223, "reward": 1.050838179886341, "reward_std": 0.26356734801083803, "rewards/accuracy_reward": 0.3839285662397742, "rewards/semantic_entropy_math_reward": 0.666909595951438, "step": 741 }, { "completion_length": 301.1568784713745, "epoch": 0.9668539783369982, "grad_norm": 0.03818634897470474, "kl": 0.5819091796875, "learning_rate": 3.235583667484443e-08, "loss": 0.0233, "reward": 0.8522230312228203, "reward_std": 0.23672135127708316, "rewards/accuracy_reward": 0.24872448574751616, "rewards/semantic_entropy_math_reward": 0.6034985259175301, "step": 742 }, { "completion_length": 299.5777950286865, "epoch": 0.9681570160436518, "grad_norm": 0.03915174677968025, "kl": 0.4954833984375, "learning_rate": 2.9821663838981994e-08, "loss": 0.0198, "reward": 1.0643221624195576, "reward_std": 0.2860506591387093, "rewards/accuracy_reward": 0.37372448202222586, "rewards/semantic_entropy_math_reward": 0.690597653388977, "step": 743 }, { "completion_length": 317.92091178894043, "epoch": 0.9694600537503054, "grad_norm": 0.022805264219641685, "kl": 0.4864501953125, "learning_rate": 2.7390523158633552e-08, "loss": 0.0194, "reward": 1.0027332194149494, "reward_std": 0.26014196826145053, "rewards/accuracy_reward": 0.31249999126885086, "rewards/semantic_entropy_math_reward": 0.6902332380414009, "step": 744 }, { "completion_length": 251.1849398612976, "epoch": 0.970763091456959, "grad_norm": 0.03166572377085686, "kl": 0.47314453125, "learning_rate": 2.5062465031520656e-08, "loss": 0.0189, "reward": 1.117164708673954, "reward_std": 0.18831918947398663, "rewards/accuracy_reward": 0.3533163210377097, "rewards/semantic_entropy_math_reward": 0.7638483792543411, "step": 745 }, { "completion_length": 228.0408103466034, "epoch": 0.9720661291636127, "grad_norm": 0.02740345150232315, "kl": 0.537109375, "learning_rate": 2.283753771845587e-08, "loss": 0.0215, "reward": 1.0770772583782673, "reward_std": 0.23393970844335854, "rewards/accuracy_reward": 0.312499993480742, "rewards/semantic_entropy_math_reward": 0.7645772509276867, "step": 746 }, { "completion_length": 274.1045842766762, "epoch": 0.9733691668702663, "grad_norm": 0.0755334198474884, "kl": 0.5225830078125, "learning_rate": 2.0715787342343586e-08, "loss": 0.0209, "reward": 1.0522959046065807, "reward_std": 0.20147397881373763, "rewards/accuracy_reward": 0.33545917831361294, "rewards/semantic_entropy_math_reward": 0.7168367225676775, "step": 747 }, { "completion_length": 284.74999237060547, "epoch": 0.9746722045769199, "grad_norm": 0.06074034422636032, "kl": 0.6005859375, "learning_rate": 1.8697257887221876e-08, "loss": 0.024, "reward": 1.0280612222850323, "reward_std": 0.22796658985316753, "rewards/accuracy_reward": 0.30357142305001616, "rewards/semantic_entropy_math_reward": 0.7244897894561291, "step": 748 }, { "completion_length": 256.82524967193604, "epoch": 0.9759752422835736, "grad_norm": 0.03348409757018089, "kl": 0.5262451171875, "learning_rate": 1.6781991197352133e-08, "loss": 0.021, "reward": 1.0317055210471153, "reward_std": 0.18445400358177722, "rewards/accuracy_reward": 0.2729591761017218, "rewards/semantic_entropy_math_reward": 0.7587463371455669, "step": 749 }, { "completion_length": 296.5293328166008, "epoch": 0.9772782799902272, "grad_norm": 0.030276097357273102, "kl": 0.50927734375, "learning_rate": 1.4970026976351416e-08, "loss": 0.0203, "reward": 1.1426748931407928, "reward_std": 0.22198078641667962, "rewards/accuracy_reward": 0.3864795856643468, "rewards/semantic_entropy_math_reward": 0.756195317953825, "step": 750 }, { "completion_length": 328.72703075408936, "epoch": 0.9785813176968808, "grad_norm": 0.03804737702012062, "kl": 0.503173828125, "learning_rate": 1.326140278636756e-08, "loss": 0.0201, "reward": 1.0348031856119633, "reward_std": 0.23460589349269867, "rewards/accuracy_reward": 0.3227040767669678, "rewards/semantic_entropy_math_reward": 0.7120991088449955, "step": 751 }, { "completion_length": 370.77932929992676, "epoch": 0.9798843554035345, "grad_norm": 0.040832389146089554, "kl": 0.456787109375, "learning_rate": 1.1656154047303691e-08, "loss": 0.0183, "reward": 1.1013119369745255, "reward_std": 0.2191525532398373, "rewards/accuracy_reward": 0.38265305291861296, "rewards/semantic_entropy_math_reward": 0.7186588644981384, "step": 752 }, { "completion_length": 349.68493843078613, "epoch": 0.9811873931101881, "grad_norm": 0.036864735186100006, "kl": 0.5096435546875, "learning_rate": 1.0154314036083247e-08, "loss": 0.0204, "reward": 1.0036442950367928, "reward_std": 0.2143091990146786, "rewards/accuracy_reward": 0.3112244834192097, "rewards/semantic_entropy_math_reward": 0.6924198046326637, "step": 753 }, { "completion_length": 359.1989736557007, "epoch": 0.9824904308168417, "grad_norm": 0.035298023372888565, "kl": 0.45697021484375, "learning_rate": 8.755913885956647e-09, "loss": 0.0183, "reward": 1.152150135487318, "reward_std": 0.2608576063066721, "rewards/accuracy_reward": 0.4298469293862581, "rewards/semantic_entropy_math_reward": 0.7223031967878342, "step": 754 }, { "completion_length": 329.1925935745239, "epoch": 0.9837934685234954, "grad_norm": 0.04450774937868118, "kl": 0.519287109375, "learning_rate": 7.460982585860144e-09, "loss": 0.0208, "reward": 1.1395772360265255, "reward_std": 0.19313901104032993, "rewards/accuracy_reward": 0.3928571355063468, "rewards/semantic_entropy_math_reward": 0.7467201128602028, "step": 755 }, { "completion_length": 371.588002204895, "epoch": 0.985096506230149, "grad_norm": 0.038371238857507706, "kl": 0.582275390625, "learning_rate": 6.269546979813524e-09, "loss": 0.0233, "reward": 1.0309766568243504, "reward_std": 0.27720969961956143, "rewards/accuracy_reward": 0.31887754483614117, "rewards/semantic_entropy_math_reward": 0.7120991200208664, "step": 756 }, { "completion_length": 397.43111419677734, "epoch": 0.9863995439368026, "grad_norm": 0.03375720977783203, "kl": 0.4869384765625, "learning_rate": 5.181631766362216e-09, "loss": 0.0195, "reward": 1.1197157353162766, "reward_std": 0.24281214643269777, "rewards/accuracy_reward": 0.40688774548470974, "rewards/semantic_entropy_math_reward": 0.7128279507160187, "step": 757 }, { "completion_length": 350.55611658096313, "epoch": 0.9877025816434563, "grad_norm": 0.0337342731654644, "kl": 0.48828125, "learning_rate": 4.197259498067707e-09, "loss": 0.0195, "reward": 1.2038994170725346, "reward_std": 0.21908019436523318, "rewards/accuracy_reward": 0.43749999068677425, "rewards/semantic_entropy_math_reward": 0.7663994058966637, "step": 758 }, { "completion_length": 301.17474126815796, "epoch": 0.98900561935011, "grad_norm": 0.02405785769224167, "kl": 0.5045166015625, "learning_rate": 3.3164505810373558e-09, "loss": 0.0202, "reward": 1.2337827868759632, "reward_std": 0.20697664888575673, "rewards/accuracy_reward": 0.4145408058539033, "rewards/semantic_entropy_math_reward": 0.8192419894039631, "step": 759 }, { "completion_length": 348.72065353393555, "epoch": 0.9903086570567636, "grad_norm": 0.034918904304504395, "kl": 0.52392578125, "learning_rate": 2.539223274504732e-09, "loss": 0.021, "reward": 1.0419096313416958, "reward_std": 0.21935143927112222, "rewards/accuracy_reward": 0.3188775428570807, "rewards/semantic_entropy_math_reward": 0.7230320554226637, "step": 760 }, { "completion_length": 441.2538146972656, "epoch": 0.9916116947634173, "grad_norm": 0.03681718185544014, "kl": 0.5216064453125, "learning_rate": 1.865593690446588e-09, "loss": 0.0209, "reward": 1.0167638398706913, "reward_std": 0.2226292029954493, "rewards/accuracy_reward": 0.33928571082651615, "rewards/semantic_entropy_math_reward": 0.6774781346321106, "step": 761 }, { "completion_length": 424.39412021636963, "epoch": 0.9929147324700709, "grad_norm": 0.05879458039999008, "kl": 0.45819091796875, "learning_rate": 1.2955757932542334e-09, "loss": 0.0183, "reward": 1.073979601264, "reward_std": 0.2877105185762048, "rewards/accuracy_reward": 0.3979591731913388, "rewards/semantic_entropy_math_reward": 0.676020385697484, "step": 762 }, { "completion_length": 330.93367052078247, "epoch": 0.9942177701767245, "grad_norm": 0.06197572126984596, "kl": 0.5130615234375, "learning_rate": 8.291813994387721e-10, "loss": 0.0205, "reward": 1.1383017376065254, "reward_std": 0.1970637736376375, "rewards/accuracy_reward": 0.38137754052877426, "rewards/semantic_entropy_math_reward": 0.7569241635501385, "step": 763 }, { "completion_length": 381.992338180542, "epoch": 0.9955208078833782, "grad_norm": 0.04618186131119728, "kl": 0.52978515625, "learning_rate": 4.664201773896259e-10, "loss": 0.0212, "reward": 1.1086005717515945, "reward_std": 0.22527359519153833, "rewards/accuracy_reward": 0.36479591589886695, "rewards/semantic_entropy_math_reward": 0.7438046373426914, "step": 764 }, { "completion_length": 292.90178060531616, "epoch": 0.9968238455900318, "grad_norm": 0.03945469111204147, "kl": 0.508544921875, "learning_rate": 2.0729964717414174e-10, "loss": 0.0203, "reward": 1.1352040618658066, "reward_std": 0.26001801155507565, "rewards/accuracy_reward": 0.3724489761516452, "rewards/semantic_entropy_math_reward": 0.7627551183104515, "step": 765 }, { "completion_length": 358.8558578491211, "epoch": 0.9981268832966854, "grad_norm": 0.04740915447473526, "kl": 0.5386962890625, "learning_rate": 5.182518037827322e-11, "loss": 0.0215, "reward": 1.0920189395546913, "reward_std": 0.17419996298849583, "rewards/accuracy_reward": 0.30994897056370974, "rewards/semantic_entropy_math_reward": 0.7820699512958527, "step": 766 }, { "completion_length": 318.0637722015381, "epoch": 0.999429921003339, "grad_norm": 0.04322025552392006, "kl": 0.5421142578125, "learning_rate": 0.0, "loss": 0.0217, "reward": 1.18166908249259, "reward_std": 0.22505978774279356, "rewards/accuracy_reward": 0.39158162474632263, "rewards/semantic_entropy_math_reward": 0.7900874391198158, "step": 767 }, { "epoch": 0.999429921003339, "step": 767, "total_flos": 0.0, "train_loss": 0.0007264473253309804, "train_runtime": 13704.5305, "train_samples_per_second": 6.272, "train_steps_per_second": 0.056 } ], "logging_steps": 1, "max_steps": 767, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }