{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9983500824958752, "eval_steps": 100, "global_step": 416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 579.5451488494873, "epoch": 0.0023998800059997, "grad_norm": 0.023609351366758347, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.4967548958957195, "reward_std": 0.5467220321297646, "rewards/semantic_entropy_math_reward": -1.4967548958957195, "step": 1 }, { "completion_length": 555.8125, "epoch": 0.0047997600119994, "grad_norm": 0.023468418046832085, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2350558526813984, "reward_std": 0.47530375327914953, "rewards/semantic_entropy_math_reward": -1.2350558526813984, "step": 2 }, { "completion_length": 593.9114570617676, "epoch": 0.0071996400179991, "grad_norm": 0.020463040098547935, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1972830928862095, "reward_std": 0.41670812107622623, "rewards/semantic_entropy_math_reward": -1.1972830928862095, "step": 3 }, { "completion_length": 534.1493110656738, "epoch": 0.0095995200239988, "grad_norm": 0.02234843000769615, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2397769559174776, "reward_std": 0.45711124083027244, "rewards/semantic_entropy_math_reward": -1.2397769559174776, "step": 4 }, { "completion_length": 557.0486125946045, "epoch": 0.0119994000299985, "grad_norm": 0.022175000980496407, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0718507505953312, "reward_std": 0.40449281968176365, "rewards/semantic_entropy_math_reward": -1.0718507505953312, "step": 5 }, { "completion_length": 532.0104236602783, "epoch": 0.0143992800359982, "grad_norm": 0.023260876536369324, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2149364296346903, "reward_std": 0.5302340695634484, "rewards/semantic_entropy_math_reward": -1.2149364296346903, "step": 6 }, { "completion_length": 585.7517433166504, "epoch": 0.0167991600419979, "grad_norm": 0.02340966835618019, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.164964143652469, "reward_std": 0.4511878960765898, "rewards/semantic_entropy_math_reward": -1.164964143652469, "step": 7 }, { "completion_length": 610.2152843475342, "epoch": 0.0191990400479976, "grad_norm": 0.025966230779886246, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1972486525774002, "reward_std": 0.4742407090961933, "rewards/semantic_entropy_math_reward": -1.1972486525774002, "step": 8 }, { "completion_length": 661.5781269073486, "epoch": 0.0215989200539973, "grad_norm": 0.019320230931043625, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0158138242550194, "reward_std": 0.3856792887672782, "rewards/semantic_entropy_math_reward": -1.0158138242550194, "step": 9 }, { "completion_length": 553.5677089691162, "epoch": 0.023998800059997, "grad_norm": 0.02017727680504322, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2822860479354858, "reward_std": 0.5186142511665821, "rewards/semantic_entropy_math_reward": -1.2822860479354858, "step": 10 }, { "completion_length": 689.8593769073486, "epoch": 0.0263986800659967, "grad_norm": 0.016234688460826874, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1487259063869715, "reward_std": 0.39036796567961574, "rewards/semantic_entropy_math_reward": -1.1487259063869715, "step": 11 }, { "completion_length": 613.6718845367432, "epoch": 0.0287985600719964, "grad_norm": 0.01803727075457573, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0811139764264226, "reward_std": 0.4259672285988927, "rewards/semantic_entropy_math_reward": -1.0811139764264226, "step": 12 }, { "completion_length": 620.7552165985107, "epoch": 0.0311984400779961, "grad_norm": 0.015589025802910328, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.9830317534506321, "reward_std": 0.36907480750232935, "rewards/semantic_entropy_math_reward": -0.9830317534506321, "step": 13 }, { "completion_length": 657.2968921661377, "epoch": 0.0335983200839958, "grad_norm": 0.016835488379001617, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1987630133517087, "reward_std": 0.5184819111600518, "rewards/semantic_entropy_math_reward": -1.1987630133517087, "step": 14 }, { "completion_length": 610.79514503479, "epoch": 0.0359982000899955, "grad_norm": 0.0162822213023901, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0888072960078716, "reward_std": 0.39932171534746885, "rewards/semantic_entropy_math_reward": -1.0888072960078716, "step": 15 }, { "completion_length": 563.517370223999, "epoch": 0.0383980800959952, "grad_norm": 0.019534002989530563, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.09929908066988, "reward_std": 0.41265817545354366, "rewards/semantic_entropy_math_reward": -1.09929908066988, "step": 16 }, { "completion_length": 637.720495223999, "epoch": 0.0407979601019949, "grad_norm": 0.01697668805718422, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1787171624600887, "reward_std": 0.428286274895072, "rewards/semantic_entropy_math_reward": -1.1787171624600887, "step": 17 }, { "completion_length": 621.3993148803711, "epoch": 0.0431978401079946, "grad_norm": 0.017321443185210228, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2735732290893793, "reward_std": 0.43550457525998354, "rewards/semantic_entropy_math_reward": -1.2735732290893793, "step": 18 }, { "completion_length": 586.9704875946045, "epoch": 0.0455977201139943, "grad_norm": 0.02127678506076336, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2064109002240002, "reward_std": 0.4663712582550943, "rewards/semantic_entropy_math_reward": -1.2064109002240002, "step": 19 }, { "completion_length": 626.7257080078125, "epoch": 0.047997600119994, "grad_norm": 0.016206126660108566, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.152345221489668, "reward_std": 0.39662991324439645, "rewards/semantic_entropy_math_reward": -1.152345221489668, "step": 20 }, { "completion_length": 583.7916679382324, "epoch": 0.0503974801259937, "grad_norm": 0.019242137670516968, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1798701924271882, "reward_std": 0.4554295316338539, "rewards/semantic_entropy_math_reward": -1.1798701924271882, "step": 21 }, { "completion_length": 587.8559036254883, "epoch": 0.0527973601319934, "grad_norm": 0.020106367766857147, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.211361431516707, "reward_std": 0.5777098424732685, "rewards/semantic_entropy_math_reward": -1.211361431516707, "step": 22 }, { "completion_length": 613.4635486602783, "epoch": 0.0551972401379931, "grad_norm": 0.019849685952067375, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.3117065764963627, "reward_std": 0.45542027335613966, "rewards/semantic_entropy_math_reward": -1.3117065764963627, "step": 23 }, { "completion_length": 609.819450378418, "epoch": 0.0575971201439928, "grad_norm": 0.019817881286144257, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2621403858065605, "reward_std": 0.4839176032692194, "rewards/semantic_entropy_math_reward": -1.2621403858065605, "step": 24 }, { "completion_length": 730.8524436950684, "epoch": 0.0599970001499925, "grad_norm": 0.014975810423493385, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2369180954992771, "reward_std": 0.43273931834846735, "rewards/semantic_entropy_math_reward": -1.2369180954992771, "step": 25 }, { "completion_length": 594.288200378418, "epoch": 0.0623968801559922, "grad_norm": 0.01721040904521942, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1034620627760887, "reward_std": 0.4124642931856215, "rewards/semantic_entropy_math_reward": -1.1034620627760887, "step": 26 }, { "completion_length": 553.0868148803711, "epoch": 0.0647967601619919, "grad_norm": 0.019415754824876785, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.130167681723833, "reward_std": 0.5143290963023901, "rewards/semantic_entropy_math_reward": -1.130167681723833, "step": 27 }, { "completion_length": 653.2829856872559, "epoch": 0.0671966401679916, "grad_norm": 0.017851749435067177, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.989163676276803, "reward_std": 0.4082563756965101, "rewards/semantic_entropy_math_reward": -0.989163676276803, "step": 28 }, { "completion_length": 605.6371650695801, "epoch": 0.0695965201739913, "grad_norm": 0.023129364475607872, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.4734891951084137, "reward_std": 0.5043481979519129, "rewards/semantic_entropy_math_reward": -1.4734891951084137, "step": 29 }, { "completion_length": 654.7951488494873, "epoch": 0.071996400179991, "grad_norm": 0.020585162565112114, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.37895911000669, "reward_std": 0.4850574918091297, "rewards/semantic_entropy_math_reward": -1.37895911000669, "step": 30 }, { "completion_length": 601.3628540039062, "epoch": 0.0743962801859907, "grad_norm": 0.02046247385442257, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0621395409107208, "reward_std": 0.4105358109809458, "rewards/semantic_entropy_math_reward": -1.0621395409107208, "step": 31 }, { "completion_length": 656.0902843475342, "epoch": 0.0767961601919904, "grad_norm": 0.021095257252454758, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.328016122803092, "reward_std": 0.4542691232636571, "rewards/semantic_entropy_math_reward": -1.328016122803092, "step": 32 }, { "completion_length": 585.4427185058594, "epoch": 0.0791960401979901, "grad_norm": 0.01907271519303322, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0881526842713356, "reward_std": 0.4317492740228772, "rewards/semantic_entropy_math_reward": -1.0881526842713356, "step": 33 }, { "completion_length": 617.6753482818604, "epoch": 0.0815959202039898, "grad_norm": 0.022275349125266075, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2841743230819702, "reward_std": 0.47124351374804974, "rewards/semantic_entropy_math_reward": -1.2841743230819702, "step": 34 }, { "completion_length": 591.3628520965576, "epoch": 0.08399580020998951, "grad_norm": 0.021941347047686577, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0643355981446803, "reward_std": 0.4094805922359228, "rewards/semantic_entropy_math_reward": -1.0643355981446803, "step": 35 }, { "completion_length": 519.338544845581, "epoch": 0.0863956802159892, "grad_norm": 0.026442214846611023, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.9271132331341505, "reward_std": 0.4420350408181548, "rewards/semantic_entropy_math_reward": -0.9271132331341505, "step": 36 }, { "completion_length": 628.0937614440918, "epoch": 0.0887955602219889, "grad_norm": 0.022040951997041702, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0198373273015022, "reward_std": 0.41784033365547657, "rewards/semantic_entropy_math_reward": -1.0198373273015022, "step": 37 }, { "completion_length": 608.0572986602783, "epoch": 0.0911954402279886, "grad_norm": 0.026434265077114105, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.3598360251635313, "reward_std": 0.41981533356010914, "rewards/semantic_entropy_math_reward": -1.3598360251635313, "step": 38 }, { "completion_length": 626.1857681274414, "epoch": 0.0935953202339883, "grad_norm": 0.042337119579315186, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.3184345178306103, "reward_std": 0.48870162200182676, "rewards/semantic_entropy_math_reward": -1.3184345178306103, "step": 39 }, { "completion_length": 627.5381984710693, "epoch": 0.095995200239988, "grad_norm": 0.03428055718541145, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.3468139134347439, "reward_std": 0.4769498906098306, "rewards/semantic_entropy_math_reward": -1.3468139134347439, "step": 40 }, { "completion_length": 633.9496536254883, "epoch": 0.0983950802459877, "grad_norm": 0.030928973108530045, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.132771894801408, "reward_std": 0.5021341033279896, "rewards/semantic_entropy_math_reward": -1.132771894801408, "step": 41 }, { "completion_length": 569.9253540039062, "epoch": 0.1007949602519874, "grad_norm": 0.026956375688314438, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1975932940840721, "reward_std": 0.467874969355762, "rewards/semantic_entropy_math_reward": -1.1975932940840721, "step": 42 }, { "completion_length": 608.6944522857666, "epoch": 0.1031948402579871, "grad_norm": 0.0328996405005455, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2175438068807125, "reward_std": 0.4515871210023761, "rewards/semantic_entropy_math_reward": -1.2175438068807125, "step": 43 }, { "completion_length": 619.1527767181396, "epoch": 0.1055947202639868, "grad_norm": 0.03257475048303604, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2264893371611834, "reward_std": 0.4431668370962143, "rewards/semantic_entropy_math_reward": -1.2264893371611834, "step": 44 }, { "completion_length": 562.6493148803711, "epoch": 0.1079946002699865, "grad_norm": 0.03283218294382095, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0574581907130778, "reward_std": 0.48819410149008036, "rewards/semantic_entropy_math_reward": -1.0574581907130778, "step": 45 }, { "completion_length": 597.8142395019531, "epoch": 0.1103944802759862, "grad_norm": 0.03596136346459389, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.3064512498676777, "reward_std": 0.5052206655964255, "rewards/semantic_entropy_math_reward": -1.3064512498676777, "step": 46 }, { "completion_length": 559.2951469421387, "epoch": 0.1127943602819859, "grad_norm": 0.04295654594898224, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2744261305779219, "reward_std": 0.4529256196692586, "rewards/semantic_entropy_math_reward": -1.2744261305779219, "step": 47 }, { "completion_length": 574.0347270965576, "epoch": 0.1151942402879856, "grad_norm": 0.037087395787239075, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1887072566896677, "reward_std": 0.5219112485647202, "rewards/semantic_entropy_math_reward": -1.1887072566896677, "step": 48 }, { "completion_length": 576.9548645019531, "epoch": 0.1175941202939853, "grad_norm": 0.0523596853017807, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2185893571004272, "reward_std": 0.5283103645779192, "rewards/semantic_entropy_math_reward": -1.2185893571004272, "step": 49 }, { "completion_length": 587.7326488494873, "epoch": 0.119994000299985, "grad_norm": 0.07026118040084839, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.424642201513052, "reward_std": 0.5118397288024426, "rewards/semantic_entropy_math_reward": -1.424642201513052, "step": 50 }, { "completion_length": 607.8958377838135, "epoch": 0.1223938803059847, "grad_norm": 0.11793287098407745, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2733404748141766, "reward_std": 0.4436110374517739, "rewards/semantic_entropy_math_reward": -1.2733404748141766, "step": 51 }, { "completion_length": 592.35764503479, "epoch": 0.1247937603119844, "grad_norm": 0.2582877576351166, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.28734240680933, "reward_std": 0.43458423344418406, "rewards/semantic_entropy_math_reward": -1.28734240680933, "step": 52 }, { "completion_length": 591.2170181274414, "epoch": 0.1271936403179841, "grad_norm": 0.9619891047477722, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.335528964176774, "reward_std": 0.480956160929054, "rewards/semantic_entropy_math_reward": -1.335528964176774, "step": 53 }, { "completion_length": 601.8802165985107, "epoch": 0.1295935203239838, "grad_norm": 1.464898705482483, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0449805338867009, "reward_std": 0.3896405389532447, "rewards/semantic_entropy_math_reward": -1.0449805338867009, "step": 54 }, { "completion_length": 745.2829933166504, "epoch": 0.1319934003299835, "grad_norm": 2.908984422683716, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.8210013713687658, "reward_std": 0.28385873371735215, "rewards/semantic_entropy_math_reward": -0.8210013713687658, "step": 55 }, { "completion_length": 888.5954971313477, "epoch": 0.1343932803359832, "grad_norm": 2.303511381149292, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.28092469088733196, "reward_std": 0.07076533045619726, "rewards/semantic_entropy_math_reward": -0.28092469088733196, "step": 56 }, { "completion_length": 901.1823043823242, "epoch": 0.1367931603419829, "grad_norm": 1.215346336364746, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.24043410643935204, "reward_std": 0.0535881663672626, "rewards/semantic_entropy_math_reward": -0.24043410643935204, "step": 57 }, { "completion_length": 945.3715286254883, "epoch": 0.1391930403479826, "grad_norm": 0.24487844109535217, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.04936212673783302, "reward_std": 0.005620982963591814, "rewards/semantic_entropy_math_reward": -0.04936212673783302, "step": 58 }, { "completion_length": 958.8437576293945, "epoch": 0.1415929203539823, "grad_norm": 0.3599923551082611, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.0954090766608715, "reward_std": 0.015972360502928495, "rewards/semantic_entropy_math_reward": -0.0954090766608715, "step": 59 }, { "completion_length": 919.9392471313477, "epoch": 0.143992800359982, "grad_norm": 0.6720292568206787, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.04936212673783302, "reward_std": 0.005620982963591814, "rewards/semantic_entropy_math_reward": -0.04936212673783302, "step": 60 }, { "completion_length": 872.0069389343262, "epoch": 0.14639268036598171, "grad_norm": 0.7852513790130615, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.24349546059966087, "reward_std": 0.03283530939370394, "rewards/semantic_entropy_math_reward": -0.24349546059966087, "step": 61 }, { "completion_length": 885.3871574401855, "epoch": 0.1487925603719814, "grad_norm": 0.051829516887664795, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.04936212673783302, "reward_std": 0.005620982963591814, "rewards/semantic_entropy_math_reward": -0.04936212673783302, "step": 62 }, { "completion_length": 852.8298721313477, "epoch": 0.1511924403779811, "grad_norm": 0.0997152253985405, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.1974485069513321, "reward_std": 0.022483931854367256, "rewards/semantic_entropy_math_reward": -0.1974485069513321, "step": 63 }, { "completion_length": 666.3194484710693, "epoch": 0.1535923203839808, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 64 }, { "completion_length": 453.2257013320923, "epoch": 0.1559922003899805, "grad_norm": 0.12662464380264282, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.48036056384444237, "reward_std": 0.07513140747323632, "rewards/semantic_entropy_math_reward": -0.48036056384444237, "step": 65 }, { "completion_length": 428.5538215637207, "epoch": 0.1583920803959802, "grad_norm": 0.11950503289699554, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.3143479097634554, "reward_std": 0.06879218481481075, "rewards/semantic_entropy_math_reward": -0.3143479097634554, "step": 66 }, { "completion_length": 372.9236145019531, "epoch": 0.1607919604019799, "grad_norm": 1.306576132774353, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2195042371749878, "reward_std": 0.30455823382362723, "rewards/semantic_entropy_math_reward": -1.2195042371749878, "step": 67 }, { "completion_length": 405.38021087646484, "epoch": 0.1631918404079796, "grad_norm": 3.1170105934143066, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.187372762709856, "reward_std": 0.4900816101580858, "rewards/semantic_entropy_math_reward": -1.187372762709856, "step": 68 }, { "completion_length": 401.2066020965576, "epoch": 0.1655917204139793, "grad_norm": 1.0041663646697998, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.7445017509162426, "reward_std": 0.19908591220155358, "rewards/semantic_entropy_math_reward": -0.7445017509162426, "step": 69 }, { "completion_length": 368.0833320617676, "epoch": 0.16799160041997901, "grad_norm": 0.262119323015213, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.25673690997064114, "reward_std": 0.0700853606685996, "rewards/semantic_entropy_math_reward": -0.25673690997064114, "step": 70 }, { "completion_length": 295.34201431274414, "epoch": 0.1703914804259787, "grad_norm": 0.4609036445617676, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.18782146647572517, "reward_std": 0.030400068033486605, "rewards/semantic_entropy_math_reward": -0.18782146647572517, "step": 71 }, { "completion_length": 362.2968807220459, "epoch": 0.1727913604319784, "grad_norm": 0.1498037576675415, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.20401418767869473, "reward_std": 0.0741230258718133, "rewards/semantic_entropy_math_reward": -0.20401418767869473, "step": 72 }, { "completion_length": 398.3211898803711, "epoch": 0.1751912404379781, "grad_norm": 0.20637626945972443, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.33969277516007423, "reward_std": 0.08184323832392693, "rewards/semantic_entropy_math_reward": -0.33969277516007423, "step": 73 }, { "completion_length": 364.69270610809326, "epoch": 0.1775911204439778, "grad_norm": 0.3202667236328125, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.5564979244954884, "reward_std": 0.17949713906273246, "rewards/semantic_entropy_math_reward": -0.5564979244954884, "step": 74 }, { "completion_length": 398.5538263320923, "epoch": 0.1799910004499775, "grad_norm": 0.4271727502346039, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.5178813878446817, "reward_std": 0.19856971083208919, "rewards/semantic_entropy_math_reward": -0.5178813878446817, "step": 75 }, { "completion_length": 328.8177137374878, "epoch": 0.1823908804559772, "grad_norm": 1.2274174690246582, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.730956120416522, "reward_std": 0.17488946160301566, "rewards/semantic_entropy_math_reward": -0.730956120416522, "step": 76 }, { "completion_length": 435.6927089691162, "epoch": 0.1847907604619769, "grad_norm": 1.0755140781402588, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0360593143850565, "reward_std": 0.31746397400274873, "rewards/semantic_entropy_math_reward": -1.0360593143850565, "step": 77 }, { "completion_length": 396.92187881469727, "epoch": 0.1871906404679766, "grad_norm": 0.49259528517723083, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0349559504538774, "reward_std": 0.30292816972360015, "rewards/semantic_entropy_math_reward": -1.0349559504538774, "step": 78 }, { "completion_length": 303.45312213897705, "epoch": 0.18959052047397632, "grad_norm": 0.49153009057044983, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.5245293974876404, "reward_std": 0.18332215631380677, "rewards/semantic_entropy_math_reward": -0.5245293974876404, "step": 79 }, { "completion_length": 241.1944465637207, "epoch": 0.191990400479976, "grad_norm": 1.309309482574463, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.7875159978866577, "reward_std": 0.2404517256654799, "rewards/semantic_entropy_math_reward": -0.7875159978866577, "step": 80 }, { "completion_length": 106.01736164093018, "epoch": 0.1943902804859757, "grad_norm": 1.159730076789856, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.3929096572101116, "reward_std": 0.10203729756176472, "rewards/semantic_entropy_math_reward": -0.3929096572101116, "step": 81 }, { "completion_length": 64.85590314865112, "epoch": 0.1967901604919754, "grad_norm": 2.622030258178711, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.7893553748726845, "reward_std": 0.1765661663375795, "rewards/semantic_entropy_math_reward": -0.7893553748726845, "step": 82 }, { "completion_length": 233.77778005599976, "epoch": 0.1991900404979751, "grad_norm": 6.741254806518555, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.8342948034405708, "reward_std": 0.30480349250137806, "rewards/semantic_entropy_math_reward": -0.8342948034405708, "step": 83 }, { "completion_length": 252.50521039962769, "epoch": 0.2015899205039748, "grad_norm": 6.380768299102783, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1298357415944338, "reward_std": 0.4231584039516747, "rewards/semantic_entropy_math_reward": -1.1298357415944338, "step": 84 }, { "completion_length": 85.42708444595337, "epoch": 0.2039898005099745, "grad_norm": 1.255273461341858, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.9342016261070967, "reward_std": 0.37867429945617914, "rewards/semantic_entropy_math_reward": -0.9342016261070967, "step": 85 }, { "completion_length": 83.58854204416275, "epoch": 0.2063896805159742, "grad_norm": 1.491475224494934, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1672791484743357, "reward_std": 0.411985841114074, "rewards/semantic_entropy_math_reward": -1.1672791484743357, "step": 86 }, { "completion_length": 147.82638984918594, "epoch": 0.2087895605219739, "grad_norm": 1.796517252922058, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.186328262090683, "reward_std": 0.5614343388006091, "rewards/semantic_entropy_math_reward": -1.186328262090683, "step": 87 }, { "completion_length": 266.8281271457672, "epoch": 0.2111894405279736, "grad_norm": 0.8293857574462891, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.7227916922420263, "reward_std": 0.5203290600329638, "rewards/semantic_entropy_math_reward": -0.7227916922420263, "step": 88 }, { "completion_length": 419.9079918861389, "epoch": 0.2135893205339733, "grad_norm": 1.5567965507507324, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.6697393516078591, "reward_std": 0.3471951074898243, "rewards/semantic_entropy_math_reward": -0.6697393516078591, "step": 89 }, { "completion_length": 848.0347213745117, "epoch": 0.215989200539973, "grad_norm": 0.7054896354675293, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.4150713551789522, "reward_std": 0.18047819379717112, "rewards/semantic_entropy_math_reward": -0.4150713551789522, "step": 90 }, { "completion_length": 508.8593807220459, "epoch": 0.2183890805459727, "grad_norm": 3.7869491577148438, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.4939053989946842, "reward_std": 0.6185994260013103, "rewards/semantic_entropy_math_reward": -1.4939053989946842, "step": 91 }, { "completion_length": 589.859375, "epoch": 0.2207889605519724, "grad_norm": 0.05026252567768097, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.0941957477480173, "reward_std": 0.5725127439945936, "rewards/semantic_entropy_math_reward": -1.0941957477480173, "step": 92 }, { "completion_length": 605.2968769073486, "epoch": 0.2231888405579721, "grad_norm": 0.056097887456417084, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.2516395896673203, "reward_std": 0.6146153416484594, "rewards/semantic_entropy_math_reward": -1.2516395896673203, "step": 93 }, { "completion_length": 539.8055553436279, "epoch": 0.2255887205639718, "grad_norm": 0.07102109491825104, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.312748797237873, "reward_std": 0.5412911372259259, "rewards/semantic_entropy_math_reward": -1.312748797237873, "step": 94 }, { "completion_length": 502.1197929382324, "epoch": 0.2279886005699715, "grad_norm": 0.06010741740465164, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.9112661816179752, "reward_std": 0.5259749758988619, "rewards/semantic_entropy_math_reward": -0.9112661816179752, "step": 95 }, { "completion_length": 546.5416793823242, "epoch": 0.2303884805759712, "grad_norm": 0.05066002905368805, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.128136644139886, "reward_std": 0.5719058271497488, "rewards/semantic_entropy_math_reward": -1.128136644139886, "step": 96 }, { "completion_length": 492.8194522857666, "epoch": 0.2327883605819709, "grad_norm": 0.0559186227619648, "learning_rate": 1e-06, "loss": -0.0, "reward": -1.1546523049473763, "reward_std": 0.5111725647002459, "rewards/semantic_entropy_math_reward": -1.1546523049473763, "step": 97 }, { "completion_length": 473.44097900390625, "epoch": 0.2351882405879706, "grad_norm": 0.05957731232047081, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.9957732241600752, "reward_std": 0.5401759054511786, "rewards/semantic_entropy_math_reward": -0.9957732241600752, "step": 98 }, { "completion_length": 489.8177146911621, "epoch": 0.2375881205939703, "grad_norm": 0.06533516943454742, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.8187971916049719, "reward_std": 0.4784417259506881, "rewards/semantic_entropy_math_reward": -0.8187971916049719, "step": 99 }, { "completion_length": 571.5191040039062, "epoch": 0.23998800059997, "grad_norm": 0.25294971466064453, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.9735191389918327, "reward_std": 0.6345395464450121, "rewards/semantic_entropy_math_reward": -0.9735191389918327, "step": 100 }, { "completion_length": 400.6996593475342, "epoch": 0.2423878806059697, "grad_norm": 0.3172270357608795, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.6215638350695372, "reward_std": 0.38559078332036734, "rewards/semantic_entropy_math_reward": -0.6215638350695372, "step": 101 }, { "completion_length": 238.6892409324646, "epoch": 0.2447877606119694, "grad_norm": 1.6551789045333862, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.39494438795372844, "reward_std": 0.3323068944737315, "rewards/semantic_entropy_math_reward": -0.39494438795372844, "step": 102 }, { "completion_length": 10.09375, "epoch": 0.2471876406179691, "grad_norm": 0.430303156375885, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.0410688160918653, "reward_std": 0.07188181672245264, "rewards/semantic_entropy_math_reward": -0.0410688160918653, "step": 103 }, { "completion_length": 10.078125059604645, "epoch": 0.2495875206239688, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 104 }, { "completion_length": 10.062499940395355, "epoch": 0.2519874006299685, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 105 }, { "completion_length": 10.045138835906982, "epoch": 0.2543872806359682, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 106 }, { "completion_length": 10.057291746139526, "epoch": 0.2567871606419679, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 107 }, { "completion_length": 10.008680582046509, "epoch": 0.2591870406479676, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 108 }, { "completion_length": 10.036458432674408, "epoch": 0.2615869206539673, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 109 }, { "completion_length": 10.032986104488373, "epoch": 0.263986800659967, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 110 }, { "completion_length": 10.013888835906982, "epoch": 0.2663866806659667, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 111 }, { "completion_length": 10.052083313465118, "epoch": 0.2687865606719664, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 112 }, { "completion_length": 10.013888895511627, "epoch": 0.2711864406779661, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 113 }, { "completion_length": 10.020833313465118, "epoch": 0.2735863206839658, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 114 }, { "completion_length": 10.019097208976746, "epoch": 0.2759862006899655, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 115 }, { "completion_length": 10.071180582046509, "epoch": 0.2783860806959652, "grad_norm": 0.2876928746700287, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.011715315282344818, "reward_std": 0.010351377539336681, "rewards/semantic_entropy_math_reward": -0.011715315282344818, "step": 116 }, { "completion_length": 10.093750059604645, "epoch": 0.2807859607019649, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 117 }, { "completion_length": 10.124999940395355, "epoch": 0.2831858407079646, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 118 }, { "completion_length": 10.423611223697662, "epoch": 0.2855857207139643, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 119 }, { "completion_length": 10.47569453716278, "epoch": 0.287985600719964, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 120 }, { "completion_length": 10.59375011920929, "epoch": 0.2903854807259637, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 121 }, { "completion_length": 10.640625178813934, "epoch": 0.29278536073196343, "grad_norm": 0.23409874737262726, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.005975749809294939, "reward_std": 0.014421098865568638, "rewards/semantic_entropy_math_reward": -0.005975749809294939, "step": 122 }, { "completion_length": 10.890625178813934, "epoch": 0.2951852407379631, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 123 }, { "completion_length": 10.881944596767426, "epoch": 0.2975851207439628, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 124 }, { "completion_length": 10.994791686534882, "epoch": 0.2999850007499625, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 125 }, { "completion_length": 10.973958432674408, "epoch": 0.3023848807559622, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 126 }, { "completion_length": 10.987847208976746, "epoch": 0.3047847607619619, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 127 }, { "completion_length": 10.996527791023254, "epoch": 0.3071846407679616, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 128 }, { "completion_length": 10.984375, "epoch": 0.3095845207739613, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 129 }, { "completion_length": 10.998263895511627, "epoch": 0.311984400779961, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 130 }, { "completion_length": 10.993055582046509, "epoch": 0.3143842807859607, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 131 }, { "completion_length": 11.0, "epoch": 0.3167841607919604, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 132 }, { "completion_length": 11.01909726858139, "epoch": 0.3191840407979601, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 133 }, { "completion_length": 11.0, "epoch": 0.3215839208039598, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 134 }, { "completion_length": 11.0, "epoch": 0.3239838008099595, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 135 }, { "completion_length": 10.994791686534882, "epoch": 0.3263836808159592, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 136 }, { "completion_length": 10.998263895511627, "epoch": 0.3287835608219589, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 137 }, { "completion_length": 11.0, "epoch": 0.3311834408279586, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 138 }, { "completion_length": 10.996527791023254, "epoch": 0.3335833208339583, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 139 }, { "completion_length": 10.998263895511627, "epoch": 0.33598320083995803, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 140 }, { "completion_length": 11.0, "epoch": 0.3383830808459577, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 141 }, { "completion_length": 10.996527791023254, "epoch": 0.3407829608519574, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 142 }, { "completion_length": 11.0, "epoch": 0.3431828408579571, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 143 }, { "completion_length": 11.017361164093018, "epoch": 0.3455827208639568, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 144 }, { "completion_length": 10.998263895511627, "epoch": 0.3479826008699565, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 145 }, { "completion_length": 10.996527791023254, "epoch": 0.3503824808759562, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 146 }, { "completion_length": 11.0, "epoch": 0.3527823608819559, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 147 }, { "completion_length": 11.0, "epoch": 0.3551822408879556, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 148 }, { "completion_length": 11.0, "epoch": 0.3575821208939553, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 149 }, { "completion_length": 10.998263895511627, "epoch": 0.359982000899955, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 150 }, { "completion_length": 11.0, "epoch": 0.3623818809059547, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 151 }, { "completion_length": 11.0, "epoch": 0.3647817609119544, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 152 }, { "completion_length": 10.994791686534882, "epoch": 0.3671816409179541, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 153 }, { "completion_length": 10.984375, "epoch": 0.3695815209239538, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 154 }, { "completion_length": 11.0, "epoch": 0.3719814009299535, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 155 }, { "completion_length": 10.998263895511627, "epoch": 0.3743812809359532, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 156 }, { "completion_length": 10.993055582046509, "epoch": 0.3767811609419529, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 157 }, { "completion_length": 11.0, "epoch": 0.37918104094795263, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 158 }, { "completion_length": 11.013888895511627, "epoch": 0.3815809209539523, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 159 }, { "completion_length": 11.0, "epoch": 0.383980800959952, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 160 }, { "completion_length": 11.0, "epoch": 0.3863806809659517, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 161 }, { "completion_length": 11.0, "epoch": 0.3887805609719514, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 162 }, { "completion_length": 10.996527791023254, "epoch": 0.3911804409779511, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 163 }, { "completion_length": 11.0, "epoch": 0.3935803209839508, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 164 }, { "completion_length": 11.024305582046509, "epoch": 0.3959802009899505, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 165 }, { "completion_length": 10.996527791023254, "epoch": 0.3983800809959502, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 166 }, { "completion_length": 10.996527791023254, "epoch": 0.4007799610019499, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 167 }, { "completion_length": 11.0, "epoch": 0.4031798410079496, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 168 }, { "completion_length": 10.998263895511627, "epoch": 0.4055797210139493, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 169 }, { "completion_length": 10.994791686534882, "epoch": 0.407979601019949, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 170 }, { "completion_length": 11.0, "epoch": 0.4103794810259487, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 171 }, { "completion_length": 11.0, "epoch": 0.4127793610319484, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 172 }, { "completion_length": 10.998263895511627, "epoch": 0.4151792410379481, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 173 }, { "completion_length": 11.0, "epoch": 0.4175791210439478, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 174 }, { "completion_length": 11.0, "epoch": 0.4199790010499475, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 175 }, { "completion_length": 11.0, "epoch": 0.4223788810559472, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 176 }, { "completion_length": 10.998263895511627, "epoch": 0.4247787610619469, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 177 }, { "completion_length": 11.0, "epoch": 0.4271786410679466, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 178 }, { "completion_length": 10.996527791023254, "epoch": 0.4295785210739463, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 179 }, { "completion_length": 10.998263895511627, "epoch": 0.431978401079946, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 180 }, { "completion_length": 10.998263895511627, "epoch": 0.4343782810859457, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 181 }, { "completion_length": 10.980902791023254, "epoch": 0.4367781610919454, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 182 }, { "completion_length": 11.0, "epoch": 0.4391780410979451, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 183 }, { "completion_length": 11.0, "epoch": 0.4415779211039448, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 184 }, { "completion_length": 11.0, "epoch": 0.4439778011099445, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 185 }, { "completion_length": 10.986111104488373, "epoch": 0.4463776811159442, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 186 }, { "completion_length": 10.994791686534882, "epoch": 0.4487775611219439, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 187 }, { "completion_length": 10.998263895511627, "epoch": 0.4511774411279436, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 188 }, { "completion_length": 10.984375, "epoch": 0.4535773211339433, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 189 }, { "completion_length": 10.996527791023254, "epoch": 0.455977201139943, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 190 }, { "completion_length": 10.998263895511627, "epoch": 0.4583770811459427, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 191 }, { "completion_length": 11.0, "epoch": 0.4607769611519424, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 192 }, { "completion_length": 11.0, "epoch": 0.4631768411579421, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 193 }, { "completion_length": 11.0, "epoch": 0.4655767211639418, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 194 }, { "completion_length": 11.0, "epoch": 0.4679766011699415, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 195 }, { "completion_length": 10.989583313465118, "epoch": 0.4703764811759412, "grad_norm": 0.07069958746433258, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.005975749809294939, "reward_std": 0.014421098865568638, "rewards/semantic_entropy_math_reward": -0.005975749809294939, "step": 196 }, { "completion_length": 10.998263895511627, "epoch": 0.4727763611819409, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 197 }, { "completion_length": 11.0, "epoch": 0.4751762411879406, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 198 }, { "completion_length": 11.0, "epoch": 0.47757612119394033, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 199 }, { "completion_length": 11.0, "epoch": 0.47997600119994, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 200 }, { "completion_length": 10.996527791023254, "epoch": 0.4823758812059397, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 201 }, { "completion_length": 10.998263895511627, "epoch": 0.4847757612119394, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 202 }, { "completion_length": 11.0, "epoch": 0.4871756412179391, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 203 }, { "completion_length": 11.0, "epoch": 0.4895755212239388, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 204 }, { "completion_length": 10.987847208976746, "epoch": 0.4919754012299385, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 205 }, { "completion_length": 11.0, "epoch": 0.4943752812359382, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 206 }, { "completion_length": 11.0, "epoch": 0.4967751612419379, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 207 }, { "completion_length": 11.0, "epoch": 0.4991750412479376, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 208 }, { "completion_length": 10.996527791023254, "epoch": 0.5015749212539373, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 209 }, { "completion_length": 10.998263895511627, "epoch": 0.503974801259937, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 210 }, { "completion_length": 10.980902791023254, "epoch": 0.5063746812659367, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 211 }, { "completion_length": 11.0, "epoch": 0.5087745612719364, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 212 }, { "completion_length": 11.0, "epoch": 0.5111744412779361, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 213 }, { "completion_length": 11.0, "epoch": 0.5135743212839358, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 214 }, { "completion_length": 10.996527791023254, "epoch": 0.5159742012899355, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 215 }, { "completion_length": 11.0, "epoch": 0.5183740812959352, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 216 }, { "completion_length": 11.0, "epoch": 0.5207739613019349, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 217 }, { "completion_length": 10.996527791023254, "epoch": 0.5231738413079347, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 218 }, { "completion_length": 10.996527791023254, "epoch": 0.5255737213139343, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 219 }, { "completion_length": 10.996527791023254, "epoch": 0.527973601319934, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 220 }, { "completion_length": 11.020833313465118, "epoch": 0.5303734813259336, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 221 }, { "completion_length": 11.0, "epoch": 0.5327733613319334, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 222 }, { "completion_length": 10.994791686534882, "epoch": 0.5351732413379331, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 223 }, { "completion_length": 11.0, "epoch": 0.5375731213439328, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 224 }, { "completion_length": 11.0, "epoch": 0.5399730013499325, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 225 }, { "completion_length": 11.0, "epoch": 0.5423728813559322, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 226 }, { "completion_length": 11.0, "epoch": 0.5447727613619319, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 227 }, { "completion_length": 11.0, "epoch": 0.5471726413679316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 228 }, { "completion_length": 10.98784726858139, "epoch": 0.5495725213739313, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 229 }, { "completion_length": 11.0, "epoch": 0.551972401379931, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 230 }, { "completion_length": 10.996527791023254, "epoch": 0.5543722813859308, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 231 }, { "completion_length": 11.0, "epoch": 0.5567721613919304, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 232 }, { "completion_length": 11.0, "epoch": 0.5591720413979301, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 233 }, { "completion_length": 11.0, "epoch": 0.5615719214039298, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 234 }, { "completion_length": 11.0, "epoch": 0.5639718014099295, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 235 }, { "completion_length": 10.998263895511627, "epoch": 0.5663716814159292, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 236 }, { "completion_length": 10.998263895511627, "epoch": 0.5687715614219289, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 237 }, { "completion_length": 10.993055582046509, "epoch": 0.5711714414279286, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 238 }, { "completion_length": 11.0, "epoch": 0.5735713214339283, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 239 }, { "completion_length": 10.991319477558136, "epoch": 0.575971201439928, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 240 }, { "completion_length": 11.0, "epoch": 0.5783710814459277, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 241 }, { "completion_length": 11.0, "epoch": 0.5807709614519274, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 242 }, { "completion_length": 11.022569477558136, "epoch": 0.5831708414579271, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 243 }, { "completion_length": 11.0, "epoch": 0.5855707214639269, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 244 }, { "completion_length": 11.0, "epoch": 0.5879706014699265, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 245 }, { "completion_length": 11.0, "epoch": 0.5903704814759262, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 246 }, { "completion_length": 10.996527791023254, "epoch": 0.5927703614819259, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 247 }, { "completion_length": 10.998263895511627, "epoch": 0.5951702414879256, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 248 }, { "completion_length": 10.996527791023254, "epoch": 0.5975701214939253, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 249 }, { "completion_length": 10.991319477558136, "epoch": 0.599970001499925, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 250 }, { "completion_length": 11.0, "epoch": 0.6023698815059247, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 251 }, { "completion_length": 11.0, "epoch": 0.6047697615119244, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 252 }, { "completion_length": 11.0, "epoch": 0.6071696415179241, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 253 }, { "completion_length": 10.994791686534882, "epoch": 0.6095695215239239, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 254 }, { "completion_length": 11.0, "epoch": 0.6119694015299235, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 255 }, { "completion_length": 11.0, "epoch": 0.6143692815359232, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 256 }, { "completion_length": 10.987847208976746, "epoch": 0.6167691615419229, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 257 }, { "completion_length": 10.996527791023254, "epoch": 0.6191690415479226, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 258 }, { "completion_length": 10.996527791023254, "epoch": 0.6215689215539223, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 259 }, { "completion_length": 11.012152791023254, "epoch": 0.623968801559922, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 260 }, { "completion_length": 10.998263895511627, "epoch": 0.6263686815659217, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 261 }, { "completion_length": 10.998263895511627, "epoch": 0.6287685615719214, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 262 }, { "completion_length": 10.996527791023254, "epoch": 0.6311684415779211, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 263 }, { "completion_length": 11.0, "epoch": 0.6335683215839208, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 264 }, { "completion_length": 10.998263895511627, "epoch": 0.6359682015899205, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 265 }, { "completion_length": 11.0, "epoch": 0.6383680815959202, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 266 }, { "completion_length": 11.0, "epoch": 0.64076796160192, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 267 }, { "completion_length": 10.996527791023254, "epoch": 0.6431678416079196, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 268 }, { "completion_length": 11.0, "epoch": 0.6455677216139193, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 269 }, { "completion_length": 11.0, "epoch": 0.647967601619919, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 270 }, { "completion_length": 10.998263895511627, "epoch": 0.6503674816259187, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 271 }, { "completion_length": 11.0, "epoch": 0.6527673616319184, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 272 }, { "completion_length": 11.0, "epoch": 0.6551672416379181, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 273 }, { "completion_length": 10.984375059604645, "epoch": 0.6575671216439178, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 274 }, { "completion_length": 11.0, "epoch": 0.6599670016499175, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 275 }, { "completion_length": 11.0, "epoch": 0.6623668816559172, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 276 }, { "completion_length": 11.0, "epoch": 0.664766761661917, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 277 }, { "completion_length": 10.998263895511627, "epoch": 0.6671666416679166, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 278 }, { "completion_length": 11.0, "epoch": 0.6695665216739163, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 279 }, { "completion_length": 11.0, "epoch": 0.6719664016799161, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 280 }, { "completion_length": 10.994791686534882, "epoch": 0.6743662816859157, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 281 }, { "completion_length": 10.996527791023254, "epoch": 0.6767661616919154, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 282 }, { "completion_length": 11.0, "epoch": 0.6791660416979151, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 283 }, { "completion_length": 10.980902791023254, "epoch": 0.6815659217039148, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 284 }, { "completion_length": 11.0, "epoch": 0.6839658017099145, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 285 }, { "completion_length": 10.998263895511627, "epoch": 0.6863656817159142, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 286 }, { "completion_length": 10.998263895511627, "epoch": 0.6887655617219139, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 287 }, { "completion_length": 11.0, "epoch": 0.6911654417279136, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 288 }, { "completion_length": 10.994791686534882, "epoch": 0.6935653217339133, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 289 }, { "completion_length": 10.998263895511627, "epoch": 0.695965201739913, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 290 }, { "completion_length": 11.0, "epoch": 0.6983650817459127, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 291 }, { "completion_length": 11.0, "epoch": 0.7007649617519124, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 292 }, { "completion_length": 11.0, "epoch": 0.703164841757912, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 293 }, { "completion_length": 10.987847208976746, "epoch": 0.7055647217639118, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 294 }, { "completion_length": 10.994791686534882, "epoch": 0.7079646017699115, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 295 }, { "completion_length": 11.0, "epoch": 0.7103644817759112, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 296 }, { "completion_length": 11.0, "epoch": 0.7127643617819109, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 297 }, { "completion_length": 11.0, "epoch": 0.7151642417879106, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 298 }, { "completion_length": 10.996527791023254, "epoch": 0.7175641217939103, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 299 }, { "completion_length": 11.0, "epoch": 0.71996400179991, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 300 }, { "completion_length": 11.0, "epoch": 0.7223638818059097, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 301 }, { "completion_length": 11.0, "epoch": 0.7247637618119094, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 302 }, { "completion_length": 11.0, "epoch": 0.7271636418179092, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 303 }, { "completion_length": 11.0, "epoch": 0.7295635218239088, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 304 }, { "completion_length": 10.998263895511627, "epoch": 0.7319634018299085, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 305 }, { "completion_length": 11.0, "epoch": 0.7343632818359082, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 306 }, { "completion_length": 11.0, "epoch": 0.7367631618419079, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 307 }, { "completion_length": 10.996527791023254, "epoch": 0.7391630418479076, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 308 }, { "completion_length": 10.993055582046509, "epoch": 0.7415629218539073, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 309 }, { "completion_length": 11.0, "epoch": 0.743962801859907, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 310 }, { "completion_length": 11.0, "epoch": 0.7463626818659067, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 311 }, { "completion_length": 11.0, "epoch": 0.7487625618719064, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 312 }, { "completion_length": 11.0, "epoch": 0.7511624418779062, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 313 }, { "completion_length": 10.998263895511627, "epoch": 0.7535623218839058, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 314 }, { "completion_length": 10.993055582046509, "epoch": 0.7559622018899055, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 315 }, { "completion_length": 10.996527791023254, "epoch": 0.7583620818959053, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 316 }, { "completion_length": 10.998263895511627, "epoch": 0.7607619619019049, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 317 }, { "completion_length": 11.0, "epoch": 0.7631618419079046, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 318 }, { "completion_length": 10.998263895511627, "epoch": 0.7655617219139043, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 319 }, { "completion_length": 11.0, "epoch": 0.767961601919904, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 320 }, { "completion_length": 10.993055582046509, "epoch": 0.7703614819259037, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 321 }, { "completion_length": 11.0, "epoch": 0.7727613619319034, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 322 }, { "completion_length": 10.979166686534882, "epoch": 0.7751612419379031, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 323 }, { "completion_length": 11.0, "epoch": 0.7775611219439028, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 324 }, { "completion_length": 10.994791686534882, "epoch": 0.7799610019499025, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 325 }, { "completion_length": 10.991319477558136, "epoch": 0.7823608819559023, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 326 }, { "completion_length": 11.0, "epoch": 0.7847607619619019, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 327 }, { "completion_length": 11.0, "epoch": 0.7871606419679016, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 328 }, { "completion_length": 11.0, "epoch": 0.7895605219739013, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 329 }, { "completion_length": 11.0, "epoch": 0.791960401979901, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 330 }, { "completion_length": 10.993055582046509, "epoch": 0.7943602819859007, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 331 }, { "completion_length": 10.98784726858139, "epoch": 0.7967601619919004, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 332 }, { "completion_length": 11.0, "epoch": 0.7991600419979001, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 333 }, { "completion_length": 11.0, "epoch": 0.8015599220038998, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 334 }, { "completion_length": 10.996527791023254, "epoch": 0.8039598020098995, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 335 }, { "completion_length": 10.998263895511627, "epoch": 0.8063596820158992, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 336 }, { "completion_length": 10.998263895511627, "epoch": 0.8087595620218989, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 337 }, { "completion_length": 10.998263895511627, "epoch": 0.8111594420278986, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 338 }, { "completion_length": 10.996527791023254, "epoch": 0.8135593220338984, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 339 }, { "completion_length": 11.0, "epoch": 0.815959202039898, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 340 }, { "completion_length": 11.0, "epoch": 0.8183590820458977, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 341 }, { "completion_length": 11.0, "epoch": 0.8207589620518974, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 342 }, { "completion_length": 11.0, "epoch": 0.8231588420578971, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 343 }, { "completion_length": 11.0, "epoch": 0.8255587220638968, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 344 }, { "completion_length": 10.998263895511627, "epoch": 0.8279586020698965, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 345 }, { "completion_length": 10.998263895511627, "epoch": 0.8303584820758962, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 346 }, { "completion_length": 11.0, "epoch": 0.8327583620818959, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 347 }, { "completion_length": 11.0, "epoch": 0.8351582420878956, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 348 }, { "completion_length": 11.0, "epoch": 0.8375581220938954, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 349 }, { "completion_length": 10.998263895511627, "epoch": 0.839958002099895, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 350 }, { "completion_length": 10.979166686534882, "epoch": 0.8423578821058947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 351 }, { "completion_length": 11.0, "epoch": 0.8447577621118944, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 352 }, { "completion_length": 11.0, "epoch": 0.8471576421178941, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 353 }, { "completion_length": 11.0, "epoch": 0.8495575221238938, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 354 }, { "completion_length": 11.0, "epoch": 0.8519574021298935, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 355 }, { "completion_length": 11.0, "epoch": 0.8543572821358932, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 356 }, { "completion_length": 11.0, "epoch": 0.8567571621418929, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 357 }, { "completion_length": 11.0, "epoch": 0.8591570421478926, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 358 }, { "completion_length": 10.998263895511627, "epoch": 0.8615569221538923, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 359 }, { "completion_length": 11.0, "epoch": 0.863956802159892, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 360 }, { "completion_length": 11.0, "epoch": 0.8663566821658917, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 361 }, { "completion_length": 11.022569477558136, "epoch": 0.8687565621718915, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 362 }, { "completion_length": 11.0, "epoch": 0.8711564421778911, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 363 }, { "completion_length": 10.998263895511627, "epoch": 0.8735563221838908, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 364 }, { "completion_length": 10.998263895511627, "epoch": 0.8759562021898905, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 365 }, { "completion_length": 11.0, "epoch": 0.8783560821958902, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 366 }, { "completion_length": 11.0, "epoch": 0.8807559622018899, "grad_norm": 0.02790955826640129, "learning_rate": 1e-06, "loss": -0.0, "reward": -0.005975749343633652, "reward_std": 0.014421098865568638, "rewards/semantic_entropy_math_reward": -0.005975749343633652, "step": 367 }, { "completion_length": 11.0, "epoch": 0.8831558422078896, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 368 }, { "completion_length": 11.0, "epoch": 0.8855557222138893, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 369 }, { "completion_length": 11.0, "epoch": 0.887955602219889, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 370 }, { "completion_length": 11.0, "epoch": 0.8903554822258887, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 371 }, { "completion_length": 11.0, "epoch": 0.8927553622318884, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 372 }, { "completion_length": 10.996527791023254, "epoch": 0.8951552422378881, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 373 }, { "completion_length": 11.0, "epoch": 0.8975551222438878, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 374 }, { "completion_length": 10.996527791023254, "epoch": 0.8999550022498876, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 375 }, { "completion_length": 11.0, "epoch": 0.9023548822558872, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 376 }, { "completion_length": 10.998263895511627, "epoch": 0.9047547622618869, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 377 }, { "completion_length": 11.0, "epoch": 0.9071546422678866, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 378 }, { "completion_length": 11.0, "epoch": 0.9095545222738863, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 379 }, { "completion_length": 11.0, "epoch": 0.911954402279886, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 380 }, { "completion_length": 11.0, "epoch": 0.9143542822858857, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 381 }, { "completion_length": 10.998263895511627, "epoch": 0.9167541622918854, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 382 }, { "completion_length": 11.0, "epoch": 0.9191540422978851, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 383 }, { "completion_length": 10.993055582046509, "epoch": 0.9215539223038848, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 384 }, { "completion_length": 10.979166686534882, "epoch": 0.9239538023098846, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 385 }, { "completion_length": 10.994791686534882, "epoch": 0.9263536823158842, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 386 }, { "completion_length": 10.996527791023254, "epoch": 0.9287535623218839, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 387 }, { "completion_length": 10.998263895511627, "epoch": 0.9311534423278836, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 388 }, { "completion_length": 10.996527791023254, "epoch": 0.9335533223338833, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 389 }, { "completion_length": 11.0, "epoch": 0.935953202339883, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 390 }, { "completion_length": 10.998263895511627, "epoch": 0.9383530823458827, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 391 }, { "completion_length": 11.0, "epoch": 0.9407529623518824, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 392 }, { "completion_length": 10.998263895511627, "epoch": 0.9431528423578821, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 393 }, { "completion_length": 11.0, "epoch": 0.9455527223638818, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 394 }, { "completion_length": 10.998263895511627, "epoch": 0.9479526023698815, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 395 }, { "completion_length": 11.0, "epoch": 0.9503524823758812, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 396 }, { "completion_length": 10.996527791023254, "epoch": 0.9527523623818809, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 397 }, { "completion_length": 11.0, "epoch": 0.9551522423878807, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 398 }, { "completion_length": 11.0, "epoch": 0.9575521223938803, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 399 }, { "completion_length": 10.996527791023254, "epoch": 0.95995200239988, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 400 }, { "completion_length": 11.0, "epoch": 0.9623518824058797, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 401 }, { "completion_length": 10.998263895511627, "epoch": 0.9647517624118794, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 402 }, { "completion_length": 10.998263895511627, "epoch": 0.9671516424178791, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 403 }, { "completion_length": 10.996527791023254, "epoch": 0.9695515224238788, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 404 }, { "completion_length": 10.994791686534882, "epoch": 0.9719514024298785, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 405 }, { "completion_length": 11.038194477558136, "epoch": 0.9743512824358782, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 406 }, { "completion_length": 11.0, "epoch": 0.9767511624418779, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 407 }, { "completion_length": 11.0, "epoch": 0.9791510424478777, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 408 }, { "completion_length": 10.998263895511627, "epoch": 0.9815509224538773, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 409 }, { "completion_length": 11.0, "epoch": 0.983950802459877, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 410 }, { "completion_length": 10.998263895511627, "epoch": 0.9863506824658768, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 411 }, { "completion_length": 11.0, "epoch": 0.9887505624718764, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 412 }, { "completion_length": 11.0, "epoch": 0.9911504424778761, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 413 }, { "completion_length": 11.0, "epoch": 0.9935503224838758, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 414 }, { "completion_length": 11.0, "epoch": 0.9959502024898755, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 415 }, { "completion_length": 11.0, "epoch": 0.9983500824958752, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/semantic_entropy_math_reward": 0.0, "step": 416 }, { "epoch": 0.9983500824958752, "step": 416, "total_flos": 0.0, "train_loss": -8.8036603985719e-09, "train_runtime": 28166.6905, "train_samples_per_second": 0.71, "train_steps_per_second": 0.015 } ], "logging_steps": 1, "max_steps": 416, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }