{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 626.9428821563721, "epoch": 0.010666666666666666, "grad_norm": 3.135220183260807, "kl": 0.00012028217315673828, "learning_rate": 3.1914893617021275e-07, "loss": 0.0, "reward": 1.1311939403414726, "reward_std": 0.9199116222560406, "rewards/accuracy_reward": 0.5946428846567869, "rewards/cosine_scaled_reward": 0.28655105652287605, "rewards/format_reward": 0.00714285746216774, "rewards/reasoning_steps_reward": 0.2428571599535644, "step": 5 }, { "completion_length": 610.6661018371582, "epoch": 0.021333333333333333, "grad_norm": 5.7259660177649625, "kl": 0.0001917600631713867, "learning_rate": 6.382978723404255e-07, "loss": 0.0, "reward": 1.2825231447815895, "reward_std": 0.8677008092403412, "rewards/accuracy_reward": 0.6535714611411094, "rewards/cosine_scaled_reward": 0.33728500208817425, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.28988096918910744, "step": 10 }, { "completion_length": 609.1125274658203, "epoch": 0.032, "grad_norm": 1.0344908113531213, "kl": 0.00023615360260009766, "learning_rate": 9.574468085106384e-07, "loss": 0.0, "reward": 1.3210706368088723, "reward_std": 0.8099735792726278, "rewards/accuracy_reward": 0.6767857499420643, "rewards/cosine_scaled_reward": 0.3460705999750644, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.29642859203740957, "step": 15 }, { "completion_length": 611.9785972595215, "epoch": 0.042666666666666665, "grad_norm": 1.2756183417905018, "kl": 0.0007787942886352539, "learning_rate": 1.276595744680851e-06, "loss": 0.0, "reward": 1.2778369441628457, "reward_std": 0.7753925062716007, "rewards/accuracy_reward": 0.6625000327825546, "rewards/cosine_scaled_reward": 0.35878926496952773, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.2547619211487472, "step": 20 }, { "completion_length": 644.8321746826172, "epoch": 0.05333333333333334, "grad_norm": 12.666142241785211, "kl": 0.002669668197631836, "learning_rate": 1.5957446808510639e-06, "loss": 0.0001, "reward": 1.2453887628391385, "reward_std": 0.7985292233526706, "rewards/accuracy_reward": 0.641071455925703, "rewards/cosine_scaled_reward": 0.34419824378564956, "rewards/format_reward": 0.00357142873108387, "rewards/reasoning_steps_reward": 0.25654763616621495, "step": 25 }, { "completion_length": 664.1107467651367, "epoch": 0.064, "grad_norm": 2.039644155087791, "kl": 0.0030992507934570314, "learning_rate": 1.9148936170212767e-06, "loss": 0.0001, "reward": 1.3286324340850115, "reward_std": 0.7313086107373238, "rewards/accuracy_reward": 0.6696428894996643, "rewards/cosine_scaled_reward": 0.3649419266730547, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.2922619292512536, "step": 30 }, { "completion_length": 646.792887878418, "epoch": 0.07466666666666667, "grad_norm": 1.110778707013103, "kl": 0.004967975616455078, "learning_rate": 2.2340425531914894e-06, "loss": 0.0002, "reward": 1.2785038705915213, "reward_std": 0.732379237562418, "rewards/accuracy_reward": 0.6642857423052192, "rewards/cosine_scaled_reward": 0.335646699834615, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.276785734295845, "step": 35 }, { "completion_length": 657.1143142700196, "epoch": 0.08533333333333333, "grad_norm": 0.9844730773757054, "kl": 0.014885807037353515, "learning_rate": 2.553191489361702e-06, "loss": 0.0006, "reward": 1.5404972655698657, "reward_std": 0.6586654607206583, "rewards/accuracy_reward": 0.7446428902447224, "rewards/cosine_scaled_reward": 0.4643067395314574, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.329761927947402, "step": 40 }, { "completion_length": 617.8857398986817, "epoch": 0.096, "grad_norm": 0.7427114343652937, "kl": 0.010532951354980469, "learning_rate": 2.872340425531915e-06, "loss": 0.0004, "reward": 1.3987550295889377, "reward_std": 0.6983755130320788, "rewards/accuracy_reward": 0.7196428939700127, "rewards/cosine_scaled_reward": 0.39756449486594647, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.28154764492064716, "step": 45 }, { "completion_length": 658.5303855895996, "epoch": 0.10666666666666667, "grad_norm": 1.6404392956435, "kl": 0.00817718505859375, "learning_rate": 2.9996241442585123e-06, "loss": 0.0003, "reward": 1.5072809681296349, "reward_std": 0.7857246264815331, "rewards/accuracy_reward": 0.7125000357627869, "rewards/cosine_scaled_reward": 0.41680470630526545, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.37797621842473744, "step": 50 }, { "completion_length": 633.0089584350586, "epoch": 0.11733333333333333, "grad_norm": 1.4925006192982901, "kl": 0.005891990661621094, "learning_rate": 2.9973279301399446e-06, "loss": 0.0002, "reward": 1.3728282183408738, "reward_std": 0.7632799297571182, "rewards/accuracy_reward": 0.6642857462167739, "rewards/cosine_scaled_reward": 0.3627091235946864, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3458333555608988, "step": 55 }, { "completion_length": 612.4803871154785, "epoch": 0.128, "grad_norm": 0.6602171934295183, "kl": 0.005200958251953125, "learning_rate": 2.992947502998804e-06, "loss": 0.0002, "reward": 1.550386269390583, "reward_std": 0.7166643626987934, "rewards/accuracy_reward": 0.7267857551574707, "rewards/cosine_scaled_reward": 0.40752905812114476, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.4160714574158192, "step": 60 }, { "completion_length": 637.0893135070801, "epoch": 0.13866666666666666, "grad_norm": 0.7395014891892754, "kl": 0.005736923217773438, "learning_rate": 2.9864889601923268e-06, "loss": 0.0002, "reward": 1.548742873966694, "reward_std": 0.7250724889338016, "rewards/accuracy_reward": 0.6928571745753288, "rewards/cosine_scaled_reward": 0.40052852691151203, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.4553571745753288, "step": 65 }, { "completion_length": 624.2607398986817, "epoch": 0.14933333333333335, "grad_norm": 1.3446269002594227, "kl": 0.00745697021484375, "learning_rate": 2.977961291721137e-06, "loss": 0.0003, "reward": 1.726818972826004, "reward_std": 0.7127795048058033, "rewards/accuracy_reward": 0.7500000286847353, "rewards/cosine_scaled_reward": 0.4518189021851867, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.5250000316649676, "step": 70 }, { "completion_length": 582.6161026000976, "epoch": 0.16, "grad_norm": 1.6339747436033418, "kl": 0.01014862060546875, "learning_rate": 2.9673763677155655e-06, "loss": 0.0004, "reward": 1.7601879060268402, "reward_std": 0.6540568165481091, "rewards/accuracy_reward": 0.7571428894996644, "rewards/cosine_scaled_reward": 0.44530690894462166, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.5577381297945976, "step": 75 }, { "completion_length": 604.8535972595215, "epoch": 0.17066666666666666, "grad_norm": 0.5255090188980472, "kl": 0.011274337768554688, "learning_rate": 2.9547489219129666e-06, "loss": 0.0005, "reward": 1.8713448494672775, "reward_std": 0.6472877942025661, "rewards/accuracy_reward": 0.7910714626312256, "rewards/cosine_scaled_reward": 0.46479713870212436, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.6136905208230019, "step": 80 }, { "completion_length": 652.2661026000976, "epoch": 0.18133333333333335, "grad_norm": 0.9972915077162576, "kl": 0.030218505859375, "learning_rate": 2.9400965311490175e-06, "loss": 0.0012, "reward": 1.8985693082213402, "reward_std": 0.6914638102054596, "rewards/accuracy_reward": 0.7303571823984385, "rewards/cosine_scaled_reward": 0.4414263550657779, "rewards/format_reward": 0.00714285746216774, "rewards/reasoning_steps_reward": 0.7196429148316383, "step": 85 }, { "completion_length": 639.8696754455566, "epoch": 0.192, "grad_norm": 0.6349460842058491, "kl": 0.021417236328125, "learning_rate": 2.9234395908915565e-06, "loss": 0.0009, "reward": 1.8425026133656501, "reward_std": 0.6659979414194822, "rewards/accuracy_reward": 0.6660714538767933, "rewards/cosine_scaled_reward": 0.37762160785496235, "rewards/format_reward": 0.008928571827709675, "rewards/reasoning_steps_reward": 0.7898810163140297, "step": 90 }, { "completion_length": 653.3714622497558, "epoch": 0.20266666666666666, "grad_norm": 0.7068217863673739, "kl": 0.0266326904296875, "learning_rate": 2.904801286851009e-06, "loss": 0.0011, "reward": 2.013348326086998, "reward_std": 0.6553943566977978, "rewards/accuracy_reward": 0.7017857439815998, "rewards/cosine_scaled_reward": 0.41513395444490014, "rewards/format_reward": 0.06785714626312256, "rewards/reasoning_steps_reward": 0.8285714961588383, "step": 95 }, { "completion_length": 677.8946754455567, "epoch": 0.21333333333333335, "grad_norm": 2.9570623395086475, "kl": 0.0402923583984375, "learning_rate": 2.884207562706925e-06, "loss": 0.0016, "reward": 2.5119238168001177, "reward_std": 0.8375062063336373, "rewards/accuracy_reward": 0.7125000339001417, "rewards/cosine_scaled_reward": 0.4250189420999959, "rewards/format_reward": 0.5553571665659547, "rewards/reasoning_steps_reward": 0.8190476760268212, "step": 100 }, { "epoch": 0.21333333333333335, "eval_completion_length": 717.3023177124023, "eval_kl": 0.045336328125, "eval_loss": 0.0018938997527584434, "eval_reward": 2.535167279243469, "eval_reward_std": 0.8789399468839169, "eval_rewards/accuracy_reward": 0.5740285987168551, "eval_rewards/cosine_scaled_reward": 0.29340527021205964, "eval_rewards/format_reward": 0.8470000272035598, "eval_rewards/reasoning_steps_reward": 0.8207333937764167, "eval_runtime": 34268.8498, "eval_samples_per_second": 0.146, "eval_steps_per_second": 0.01, "step": 100 }, { "completion_length": 724.1053909301758, "epoch": 0.224, "grad_norm": 1.1690466064365364, "kl": 0.1235015869140625, "learning_rate": 2.8616870839955444e-06, "loss": 0.0049, "reward": 2.752804014086723, "reward_std": 0.8537621341645718, "rewards/accuracy_reward": 0.6767857506871223, "rewards/cosine_scaled_reward": 0.4045896364772489, "rewards/format_reward": 0.8553571686148643, "rewards/reasoning_steps_reward": 0.816071481257677, "step": 105 }, { "completion_length": 768.4839645385742, "epoch": 0.23466666666666666, "grad_norm": 14.95234115514868, "kl": 0.0612457275390625, "learning_rate": 2.837271198208662e-06, "loss": 0.0025, "reward": 2.6233972758054733, "reward_std": 0.8962277337908745, "rewards/accuracy_reward": 0.6214286016300321, "rewards/cosine_scaled_reward": 0.340063818404451, "rewards/format_reward": 0.8839285984635353, "rewards/reasoning_steps_reward": 0.7779762476682663, "step": 110 }, { "completion_length": 731.7893173217774, "epoch": 0.24533333333333332, "grad_norm": 0.7880946164162693, "kl": 0.163909912109375, "learning_rate": 2.8109938911593322e-06, "loss": 0.0066, "reward": 2.6163390249013903, "reward_std": 0.9033554136753082, "rewards/accuracy_reward": 0.6589286033064127, "rewards/cosine_scaled_reward": 0.3818151192739606, "rewards/format_reward": 0.7267857484519482, "rewards/reasoning_steps_reward": 0.8488095819950103, "step": 115 }, { "completion_length": 759.9339622497558, "epoch": 0.256, "grad_norm": 1.7990639203312269, "kl": 0.042535400390625, "learning_rate": 2.7828917396751474e-06, "loss": 0.0017, "reward": 2.6672952473163605, "reward_std": 0.8769947469234467, "rewards/accuracy_reward": 0.6535714585334063, "rewards/cosine_scaled_reward": 0.384557047416456, "rewards/format_reward": 0.7732143148779869, "rewards/reasoning_steps_reward": 0.8559524476528168, "step": 120 }, { "completion_length": 760.0518188476562, "epoch": 0.26666666666666666, "grad_norm": 0.525125891329557, "kl": 0.048138427734375, "learning_rate": 2.753003860684943e-06, "loss": 0.0019, "reward": 3.015795236825943, "reward_std": 0.7571237944066525, "rewards/accuracy_reward": 0.7303571730852128, "rewards/cosine_scaled_reward": 0.47650940530002117, "rewards/format_reward": 0.9767857253551483, "rewards/reasoning_steps_reward": 0.832142922282219, "step": 125 }, { "completion_length": 787.3678970336914, "epoch": 0.2773333333333333, "grad_norm": 1.6324108230600705, "kl": 0.0462982177734375, "learning_rate": 2.721371856769793e-06, "loss": 0.0019, "reward": 2.96227542757988, "reward_std": 0.7570989470928907, "rewards/accuracy_reward": 0.6714286040514708, "rewards/cosine_scaled_reward": 0.40870389440096916, "rewards/format_reward": 0.9892857193946838, "rewards/reasoning_steps_reward": 0.8928571999073028, "step": 130 }, { "completion_length": 739.2839584350586, "epoch": 0.288, "grad_norm": 1.2279064196254164, "kl": 0.0569366455078125, "learning_rate": 2.688039758254093e-06, "loss": 0.0023, "reward": 3.0940194368362426, "reward_std": 0.7407944872975349, "rewards/accuracy_reward": 0.748214321769774, "rewards/cosine_scaled_reward": 0.4910431296331808, "rewards/format_reward": 0.9946428596973419, "rewards/reasoning_steps_reward": 0.8601191073656083, "step": 135 }, { "completion_length": 742.6107498168946, "epoch": 0.2986666666666667, "grad_norm": 1.142989559294876, "kl": 0.0421722412109375, "learning_rate": 2.65305396191733e-06, "loss": 0.0017, "reward": 3.0360836148262025, "reward_std": 0.7486013866961002, "rewards/accuracy_reward": 0.7267857477068901, "rewards/cosine_scaled_reward": 0.44679776607081295, "rewards/format_reward": 0.9785714343190193, "rewards/reasoning_steps_reward": 0.8839286372065545, "step": 140 }, { "completion_length": 708.2089630126953, "epoch": 0.30933333333333335, "grad_norm": 0.5591745022956734, "kl": 0.0389068603515625, "learning_rate": 2.61646316641186e-06, "loss": 0.0016, "reward": 3.039272406697273, "reward_std": 0.6369190786033869, "rewards/accuracy_reward": 0.7375000305473804, "rewards/cosine_scaled_reward": 0.4565341799054295, "rewards/format_reward": 0.9928571462631226, "rewards/reasoning_steps_reward": 0.85238102003932, "step": 145 }, { "completion_length": 693.1196723937989, "epoch": 0.32, "grad_norm": 0.8445904699258806, "kl": 0.04276123046875, "learning_rate": 2.5783183044765715e-06, "loss": 0.0017, "reward": 3.065075045824051, "reward_std": 0.6227528784424067, "rewards/accuracy_reward": 0.7553571730852127, "rewards/cosine_scaled_reward": 0.48888447135686874, "rewards/format_reward": 0.980357152223587, "rewards/reasoning_steps_reward": 0.8404762536287308, "step": 150 }, { "completion_length": 666.7178871154786, "epoch": 0.33066666666666666, "grad_norm": 0.6870106883522581, "kl": 0.041412353515625, "learning_rate": 2.5386724720408135e-06, "loss": 0.0017, "reward": 3.156706044077873, "reward_std": 0.6447003319859504, "rewards/accuracy_reward": 0.798214316740632, "rewards/cosine_scaled_reward": 0.5305154274217785, "rewards/format_reward": 0.9821428656578064, "rewards/reasoning_steps_reward": 0.8458333984017372, "step": 155 }, { "completion_length": 723.2357452392578, "epoch": 0.3413333333333333, "grad_norm": 0.598319397397006, "kl": 0.0435455322265625, "learning_rate": 2.49758085431725e-06, "loss": 0.0017, "reward": 3.0701630294322966, "reward_std": 0.6298764709383249, "rewards/accuracy_reward": 0.7410714581608773, "rewards/cosine_scaled_reward": 0.463615276478231, "rewards/format_reward": 0.9696428686380386, "rewards/reasoning_steps_reward": 0.895833395421505, "step": 160 }, { "completion_length": 799.8250381469727, "epoch": 0.352, "grad_norm": 0.6301042631581175, "kl": 0.0475830078125, "learning_rate": 2.455100648986533e-06, "loss": 0.0019, "reward": 2.9756604552268984, "reward_std": 0.7367240894585848, "rewards/accuracy_reward": 0.6785714585334063, "rewards/cosine_scaled_reward": 0.4125651277601719, "rewards/format_reward": 0.9535714492201806, "rewards/reasoning_steps_reward": 0.9309524342417717, "step": 165 }, { "completion_length": 826.1500366210937, "epoch": 0.3626666666666667, "grad_norm": 0.8610873292738485, "kl": 0.054132080078125, "learning_rate": 2.4112909865807053e-06, "loss": 0.0022, "reward": 2.680763456225395, "reward_std": 0.8146794062107802, "rewards/accuracy_reward": 0.5303571671247482, "rewards/cosine_scaled_reward": 0.2390966679668054, "rewards/format_reward": 0.9714285835623742, "rewards/reasoning_steps_reward": 0.9398810014128685, "step": 170 }, { "completion_length": 800.7339706420898, "epoch": 0.37333333333333335, "grad_norm": 0.7079176746358598, "kl": 0.067926025390625, "learning_rate": 2.366212848176164e-06, "loss": 0.0027, "reward": 2.7596707075834273, "reward_std": 0.8444030195474624, "rewards/accuracy_reward": 0.5714285975322128, "rewards/cosine_scaled_reward": 0.3090753515250981, "rewards/format_reward": 0.9732142984867096, "rewards/reasoning_steps_reward": 0.9059524446725845, "step": 175 }, { "completion_length": 778.9928909301758, "epoch": 0.384, "grad_norm": 1.1001119066854874, "kl": 0.0932373046875, "learning_rate": 2.319928980510752e-06, "loss": 0.0037, "reward": 2.765022465586662, "reward_std": 0.8211875937879085, "rewards/accuracy_reward": 0.5821428883820772, "rewards/cosine_scaled_reward": 0.32335569793358443, "rewards/format_reward": 0.9732142984867096, "rewards/reasoning_steps_reward": 0.8863095760345459, "step": 180 }, { "completion_length": 792.8321823120117, "epoch": 0.39466666666666667, "grad_norm": 0.621075404533647, "kl": 0.170367431640625, "learning_rate": 2.272503808643123e-06, "loss": 0.0068, "reward": 2.6604900896549224, "reward_std": 0.7879154846072197, "rewards/accuracy_reward": 0.5410714561119676, "rewards/cosine_scaled_reward": 0.26644235964631663, "rewards/format_reward": 0.9892857193946838, "rewards/reasoning_steps_reward": 0.8636905372142791, "step": 185 }, { "completion_length": 734.221459197998, "epoch": 0.4053333333333333, "grad_norm": 2.6622395485180066, "kl": 0.1951171875, "learning_rate": 2.2240033462759628e-06, "loss": 0.0078, "reward": 2.9028186976909636, "reward_std": 0.7853110164403916, "rewards/accuracy_reward": 0.6750000350177288, "rewards/cosine_scaled_reward": 0.41293763257563115, "rewards/format_reward": 0.9767857238650322, "rewards/reasoning_steps_reward": 0.8380952909588814, "step": 190 }, { "completion_length": 757.7036010742188, "epoch": 0.416, "grad_norm": 1.9232019767182387, "kl": 0.197918701171875, "learning_rate": 2.1744951038678905e-06, "loss": 0.0079, "reward": 2.847309911251068, "reward_std": 0.8383581660687923, "rewards/accuracy_reward": 0.6553571727126837, "rewards/cosine_scaled_reward": 0.3895717078819871, "rewards/format_reward": 0.9660714402794838, "rewards/reasoning_steps_reward": 0.836309588700533, "step": 195 }, { "completion_length": 704.6071769714356, "epoch": 0.4266666666666667, "grad_norm": 2.187135193267794, "kl": 0.2211669921875, "learning_rate": 2.124047994661941e-06, "loss": 0.0088, "reward": 2.8649386018514633, "reward_std": 0.8465140253305435, "rewards/accuracy_reward": 0.6785714600235224, "rewards/cosine_scaled_reward": 0.4107717891223729, "rewards/format_reward": 0.9767857253551483, "rewards/reasoning_steps_reward": 0.7988095879554749, "step": 200 }, { "epoch": 0.4266666666666667, "eval_completion_length": 726.0360606079101, "eval_kl": 0.4157892578125, "eval_loss": 0.016393402591347694, "eval_reward": 2.668296795749664, "eval_reward_std": 0.848984938377142, "eval_rewards/accuracy_reward": 0.5848571698188781, "eval_rewards/cosine_scaled_reward": 0.31035383035842096, "eval_rewards/format_reward": 0.9741428675889969, "eval_rewards/reasoning_steps_reward": 0.7989429181694985, "eval_runtime": 34304.9807, "eval_samples_per_second": 0.146, "eval_steps_per_second": 0.01, "step": 200 }, { "completion_length": 711.1785987854004, "epoch": 0.43733333333333335, "grad_norm": 13.425767555167974, "kl": 0.522259521484375, "learning_rate": 2.072732238761434e-06, "loss": 0.0209, "reward": 2.5607248514890673, "reward_std": 0.8997476093471051, "rewards/accuracy_reward": 0.5821428846567869, "rewards/cosine_scaled_reward": 0.29524860471719877, "rewards/format_reward": 0.9446428805589676, "rewards/reasoning_steps_reward": 0.7386905312538147, "step": 205 }, { "completion_length": 703.9107421875, "epoch": 0.448, "grad_norm": 1.9515561067024476, "kl": 0.407659912109375, "learning_rate": 2.0206192653867536e-06, "loss": 0.0163, "reward": 2.5372259259223937, "reward_std": 1.0205993868410588, "rewards/accuracy_reward": 0.6178571723401547, "rewards/cosine_scaled_reward": 0.334844835329568, "rewards/format_reward": 0.887500025331974, "rewards/reasoning_steps_reward": 0.6970238521695137, "step": 210 }, { "completion_length": 743.6161041259766, "epoch": 0.45866666666666667, "grad_norm": 3.1537839868905078, "kl": 0.660333251953125, "learning_rate": 1.967781613449095e-06, "loss": 0.0264, "reward": 1.9156419575214385, "reward_std": 1.1042504251003264, "rewards/accuracy_reward": 0.4267857324331999, "rewards/cosine_scaled_reward": 0.1459990169329103, "rewards/format_reward": 0.7571428924798965, "rewards/reasoning_steps_reward": 0.5857143320143223, "step": 215 }, { "completion_length": 776.1553909301758, "epoch": 0.4693333333333333, "grad_norm": 4.038362237813945, "kl": 0.3061279296875, "learning_rate": 1.9142928305795637e-06, "loss": 0.0122, "reward": 2.4867064505815506, "reward_std": 0.9735465314239263, "rewards/accuracy_reward": 0.5696428786963225, "rewards/cosine_scaled_reward": 0.2962301469407976, "rewards/format_reward": 0.9053571730852127, "rewards/reasoning_steps_reward": 0.7154762424528599, "step": 220 }, { "completion_length": 729.4678955078125, "epoch": 0.48, "grad_norm": 3.1439180898202466, "kl": 0.28773193359375, "learning_rate": 1.8602273707541886e-06, "loss": 0.0115, "reward": 2.7534320563077928, "reward_std": 0.9354146108031273, "rewards/accuracy_reward": 0.671428600884974, "rewards/cosine_scaled_reward": 0.4331938370829448, "rewards/format_reward": 0.9357143059372902, "rewards/reasoning_steps_reward": 0.7130952820181846, "step": 225 }, { "completion_length": 753.7571792602539, "epoch": 0.49066666666666664, "grad_norm": 9.742972287112453, "kl": 0.672515869140625, "learning_rate": 1.8056604906573418e-06, "loss": 0.0269, "reward": 2.4900094658136367, "reward_std": 1.071580182760954, "rewards/accuracy_reward": 0.6053571721538902, "rewards/cosine_scaled_reward": 0.3417950821574777, "rewards/format_reward": 0.900000023841858, "rewards/reasoning_steps_reward": 0.6428571917116642, "step": 230 }, { "completion_length": 769.7821792602539, "epoch": 0.5013333333333333, "grad_norm": 12.748394988794061, "kl": 0.615447998046875, "learning_rate": 1.7506681449278226e-06, "loss": 0.0246, "reward": 2.4423232048749925, "reward_std": 1.0899774149060248, "rewards/accuracy_reward": 0.6035714549943805, "rewards/cosine_scaled_reward": 0.3429183505475521, "rewards/format_reward": 0.8517857506871224, "rewards/reasoning_steps_reward": 0.6440476708114147, "step": 235 }, { "completion_length": 721.4928901672363, "epoch": 0.512, "grad_norm": 316.5564758948846, "kl": 2.6417724609375, "learning_rate": 1.6953268804334257e-06, "loss": 0.1056, "reward": 2.6406149938702583, "reward_std": 0.9681473188102245, "rewards/accuracy_reward": 0.6732143137603999, "rewards/cosine_scaled_reward": 0.43644821029156444, "rewards/format_reward": 0.833928607404232, "rewards/reasoning_steps_reward": 0.6970238670706749, "step": 240 }, { "completion_length": 699.0089614868164, "epoch": 0.5226666666666666, "grad_norm": 144932.71498560338, "kl": 41.92372436523438, "learning_rate": 1.6397137297211436e-06, "loss": 1.677, "reward": 2.9064596563577654, "reward_std": 0.8081207755953074, "rewards/accuracy_reward": 0.7678571745753289, "rewards/cosine_scaled_reward": 0.5046738618053496, "rewards/format_reward": 0.9178571611642837, "rewards/reasoning_steps_reward": 0.7160714864730835, "step": 245 }, { "completion_length": 719.6250274658203, "epoch": 0.5333333333333333, "grad_norm": 12.128586542505051, "kl": 13.330413818359375, "learning_rate": 1.5839061037913395e-06, "loss": 0.5321, "reward": 2.8467082887887956, "reward_std": 0.8377620510756969, "rewards/accuracy_reward": 0.7500000335276127, "rewards/cosine_scaled_reward": 0.4984938623383641, "rewards/format_reward": 0.8803571656346321, "rewards/reasoning_steps_reward": 0.7178571961820126, "step": 250 }, { "completion_length": 740.6482467651367, "epoch": 0.544, "grad_norm": 21.125458441852672, "kl": 0.171630859375, "learning_rate": 1.527981684345115e-06, "loss": 0.0069, "reward": 2.7478116720914842, "reward_std": 0.8047603815793991, "rewards/accuracy_reward": 0.6946428874507546, "rewards/cosine_scaled_reward": 0.4323353324783966, "rewards/format_reward": 0.9214285910129547, "rewards/reasoning_steps_reward": 0.6994048036634922, "step": 255 }, { "completion_length": 741.4857482910156, "epoch": 0.5546666666666666, "grad_norm": 24.989348854776196, "kl": 0.7936920166015625, "learning_rate": 1.4720183156548855e-06, "loss": 0.0317, "reward": 2.8889215648174287, "reward_std": 0.8046084839850665, "rewards/accuracy_reward": 0.7053571708500386, "rewards/cosine_scaled_reward": 0.4633262232731795, "rewards/format_reward": 0.9500000178813934, "rewards/reasoning_steps_reward": 0.7702381610870361, "step": 260 }, { "completion_length": 742.701823425293, "epoch": 0.5653333333333334, "grad_norm": 31.341677037958075, "kl": 0.67542724609375, "learning_rate": 1.4160938962086612e-06, "loss": 0.027, "reward": 2.8618105918169023, "reward_std": 0.756641275063157, "rewards/accuracy_reward": 0.7321428785100579, "rewards/cosine_scaled_reward": 0.4826438320800662, "rewards/format_reward": 0.9267857372760773, "rewards/reasoning_steps_reward": 0.7202381417155266, "step": 265 }, { "completion_length": 741.5803863525391, "epoch": 0.576, "grad_norm": 208.94608010239415, "kl": 0.9906982421875, "learning_rate": 1.3602862702788567e-06, "loss": 0.0396, "reward": 2.706931698322296, "reward_std": 0.9525154523551465, "rewards/accuracy_reward": 0.6732143200933933, "rewards/cosine_scaled_reward": 0.43609824670711533, "rewards/format_reward": 0.8946428880095482, "rewards/reasoning_steps_reward": 0.7029762521386147, "step": 270 }, { "completion_length": 732.1143157958984, "epoch": 0.5866666666666667, "grad_norm": 38.275566557137644, "kl": 1.51126708984375, "learning_rate": 1.3046731195665748e-06, "loss": 0.0605, "reward": 2.7598373025655745, "reward_std": 0.9775024671107531, "rewards/accuracy_reward": 0.7053571678698063, "rewards/cosine_scaled_reward": 0.45864667696878314, "rewards/format_reward": 0.8839286029338836, "rewards/reasoning_steps_reward": 0.7119048096239566, "step": 275 }, { "completion_length": 766.3411087036133, "epoch": 0.5973333333333334, "grad_norm": 34.400362223993966, "kl": 0.446759033203125, "learning_rate": 1.2493318550721775e-06, "loss": 0.0179, "reward": 2.80095117688179, "reward_std": 0.9185693945735693, "rewards/accuracy_reward": 0.701785746216774, "rewards/cosine_scaled_reward": 0.44499868620187044, "rewards/format_reward": 0.9017857417464257, "rewards/reasoning_steps_reward": 0.7523810148239136, "step": 280 }, { "completion_length": 745.8839614868164, "epoch": 0.608, "grad_norm": 70.23076333110882, "kl": 3.24036865234375, "learning_rate": 1.1943395093426585e-06, "loss": 0.1296, "reward": 2.9303120017051696, "reward_std": 0.7858256082981825, "rewards/accuracy_reward": 0.7339286036789417, "rewards/cosine_scaled_reward": 0.5118595488369465, "rewards/format_reward": 0.9214285984635353, "rewards/reasoning_steps_reward": 0.7630952894687653, "step": 285 }, { "completion_length": 729.6464584350585, "epoch": 0.6186666666666667, "grad_norm": 51.01572887577541, "kl": 2.792974853515625, "learning_rate": 1.1397726292458115e-06, "loss": 0.1118, "reward": 2.8507910460233687, "reward_std": 0.8466649554669857, "rewards/accuracy_reward": 0.7142857432365417, "rewards/cosine_scaled_reward": 0.46924334414070473, "rewards/format_reward": 0.9303571686148644, "rewards/reasoning_steps_reward": 0.7369048215448857, "step": 290 }, { "completion_length": 758.4518173217773, "epoch": 0.6293333333333333, "grad_norm": 21.800940990494052, "kl": 3.139178466796875, "learning_rate": 1.085707169420437e-06, "loss": 0.1259, "reward": 2.687264183163643, "reward_std": 0.8229550156742335, "rewards/accuracy_reward": 0.6535714585334063, "rewards/cosine_scaled_reward": 0.4211926580406725, "rewards/format_reward": 0.8946428805589676, "rewards/reasoning_steps_reward": 0.7178571917116642, "step": 295 }, { "completion_length": 736.9268188476562, "epoch": 0.64, "grad_norm": 8.05056275297732, "kl": 0.720037841796875, "learning_rate": 1.0322183865509054e-06, "loss": 0.0288, "reward": 2.7745183438062666, "reward_std": 0.9824469141662121, "rewards/accuracy_reward": 0.7232143133878708, "rewards/cosine_scaled_reward": 0.46856586267240347, "rewards/format_reward": 0.9071428790688515, "rewards/reasoning_steps_reward": 0.6755952909588814, "step": 300 }, { "epoch": 0.64, "eval_completion_length": 756.2787765625, "eval_kl": 2.815593212890625, "eval_loss": 0.11271046847105026, "eval_reward": 2.6679908989191055, "eval_reward_std": 0.9292156134605408, "eval_rewards/accuracy_reward": 0.6384000294238329, "eval_rewards/cosine_scaled_reward": 0.386524124888191, "eval_rewards/format_reward": 0.9171143096089364, "eval_rewards/reasoning_steps_reward": 0.7259524360120296, "eval_runtime": 34511.2614, "eval_samples_per_second": 0.145, "eval_steps_per_second": 0.01, "step": 300 }, { "completion_length": 760.3768211364746, "epoch": 0.6506666666666666, "grad_norm": 16.434699024651234, "kl": 0.956842041015625, "learning_rate": 9.793807346132464e-07, "loss": 0.0383, "reward": 2.901669743657112, "reward_std": 0.8654729023575782, "rewards/accuracy_reward": 0.7464286040514707, "rewards/cosine_scaled_reward": 0.5034553268924356, "rewards/format_reward": 0.9392857357859612, "rewards/reasoning_steps_reward": 0.7125000521540642, "step": 305 }, { "completion_length": 780.1018203735351, "epoch": 0.6613333333333333, "grad_norm": 9.30628937439928, "kl": 0.76141357421875, "learning_rate": 9.272677612385667e-07, "loss": 0.0305, "reward": 2.715619903802872, "reward_std": 0.9227011248469352, "rewards/accuracy_reward": 0.6767857443541289, "rewards/cosine_scaled_reward": 0.4328816962428391, "rewards/format_reward": 0.9000000298023224, "rewards/reasoning_steps_reward": 0.7059524297714234, "step": 310 }, { "completion_length": 768.9500411987304, "epoch": 0.672, "grad_norm": 1.7983007811573581, "kl": 0.573968505859375, "learning_rate": 8.759520053380591e-07, "loss": 0.023, "reward": 2.786320286989212, "reward_std": 0.7833445437252522, "rewards/accuracy_reward": 0.6910714630037547, "rewards/cosine_scaled_reward": 0.4357249645516276, "rewards/format_reward": 0.9464285910129547, "rewards/reasoning_steps_reward": 0.7130952931940555, "step": 315 }, { "completion_length": 723.1536033630371, "epoch": 0.6826666666666666, "grad_norm": 12.691694660091667, "kl": 0.486602783203125, "learning_rate": 8.255048961321088e-07, "loss": 0.0194, "reward": 2.831101644039154, "reward_std": 0.8475235715508461, "rewards/accuracy_reward": 0.7071428868919611, "rewards/cosine_scaled_reward": 0.46919678517151625, "rewards/format_reward": 0.930357164144516, "rewards/reasoning_steps_reward": 0.7244048148393631, "step": 320 }, { "completion_length": 723.8268165588379, "epoch": 0.6933333333333334, "grad_norm": 4.135287895650402, "kl": 1.668939208984375, "learning_rate": 7.759966537240373e-07, "loss": 0.0664, "reward": 3.0232764929533005, "reward_std": 0.7830160673707723, "rewards/accuracy_reward": 0.7607143174856901, "rewards/cosine_scaled_reward": 0.5054192344192415, "rewards/format_reward": 0.9696428701281548, "rewards/reasoning_steps_reward": 0.7875000596046448, "step": 325 }, { "completion_length": 743.2911071777344, "epoch": 0.704, "grad_norm": 2.428880550991712, "kl": 0.23929443359375, "learning_rate": 7.274961913568773e-07, "loss": 0.0096, "reward": 2.980492576956749, "reward_std": 0.7519855977967381, "rewards/accuracy_reward": 0.7517857424914837, "rewards/cosine_scaled_reward": 0.5114448417443782, "rewards/format_reward": 0.9589285865426064, "rewards/reasoning_steps_reward": 0.7583333984017372, "step": 330 }, { "completion_length": 777.944676208496, "epoch": 0.7146666666666667, "grad_norm": 10.667405755387346, "kl": 0.491375732421875, "learning_rate": 6.800710194892484e-07, "loss": 0.0197, "reward": 2.9669879227876663, "reward_std": 0.7909464325755835, "rewards/accuracy_reward": 0.7285714603960514, "rewards/cosine_scaled_reward": 0.4985354314424967, "rewards/format_reward": 0.9696428716182709, "rewards/reasoning_steps_reward": 0.7702381551265717, "step": 335 }, { "completion_length": 783.7107513427734, "epoch": 0.7253333333333334, "grad_norm": 20.401487806762606, "kl": 1.50914306640625, "learning_rate": 6.33787151823836e-07, "loss": 0.0604, "reward": 2.733260214328766, "reward_std": 0.9950653843581676, "rewards/accuracy_reward": 0.6535714577883482, "rewards/cosine_scaled_reward": 0.4219506177818403, "rewards/format_reward": 0.9071428775787354, "rewards/reasoning_steps_reward": 0.7505952984094619, "step": 340 }, { "completion_length": 747.0571807861328, "epoch": 0.736, "grad_norm": 3.8988932597998756, "kl": 0.765008544921875, "learning_rate": 5.887090134192947e-07, "loss": 0.0306, "reward": 2.950327825546265, "reward_std": 0.7912764519453048, "rewards/accuracy_reward": 0.7142857464030385, "rewards/cosine_scaled_reward": 0.4931848540902138, "rewards/format_reward": 0.9553571596741677, "rewards/reasoning_steps_reward": 0.7875000700354576, "step": 345 }, { "completion_length": 768.9161071777344, "epoch": 0.7466666666666667, "grad_norm": 5.964808758728403, "kl": 0.8441162109375, "learning_rate": 5.448993510134669e-07, "loss": 0.0338, "reward": 2.7927453070878983, "reward_std": 0.8378362115472555, "rewards/accuracy_reward": 0.6892857491970062, "rewards/cosine_scaled_reward": 0.44631660780869425, "rewards/format_reward": 0.9250000208616257, "rewards/reasoning_steps_reward": 0.7321429125964641, "step": 350 }, { "completion_length": 771.8000366210938, "epoch": 0.7573333333333333, "grad_norm": 19.67075460524417, "kl": 1.37095947265625, "learning_rate": 5.024191456827498e-07, "loss": 0.0549, "reward": 2.757797637581825, "reward_std": 0.9828206066042184, "rewards/accuracy_reward": 0.6946428813040256, "rewards/cosine_scaled_reward": 0.4447022658772767, "rewards/format_reward": 0.900000023841858, "rewards/reasoning_steps_reward": 0.7184524320065975, "step": 355 }, { "completion_length": 764.0839599609375, "epoch": 0.768, "grad_norm": 20.908101755536627, "kl": 1.678790283203125, "learning_rate": 4.6132752795918667e-07, "loss": 0.0672, "reward": 2.710779735445976, "reward_std": 0.9775115817785263, "rewards/accuracy_reward": 0.6642857462167739, "rewards/cosine_scaled_reward": 0.40601770151406524, "rewards/format_reward": 0.9107143133878708, "rewards/reasoning_steps_reward": 0.7297619603574276, "step": 360 }, { "completion_length": 763.6000320434571, "epoch": 0.7786666666666666, "grad_norm": 12.87435288128971, "kl": 2.571148681640625, "learning_rate": 4.2168169552342905e-07, "loss": 0.1028, "reward": 2.6531503438949584, "reward_std": 0.9446496672928333, "rewards/accuracy_reward": 0.6321428865194321, "rewards/cosine_scaled_reward": 0.38410261063836515, "rewards/format_reward": 0.9035714536905288, "rewards/reasoning_steps_reward": 0.7333333849906921, "step": 365 }, { "completion_length": 743.5286102294922, "epoch": 0.7893333333333333, "grad_norm": 9.936371570504198, "kl": 1.2698486328125, "learning_rate": 3.8353683358814046e-07, "loss": 0.0508, "reward": 2.7013536602258683, "reward_std": 0.862298522144556, "rewards/accuracy_reward": 0.6571428874507547, "rewards/cosine_scaled_reward": 0.4073059491813183, "rewards/format_reward": 0.9000000208616257, "rewards/reasoning_steps_reward": 0.7369048207998276, "step": 370 }, { "completion_length": 759.3643180847168, "epoch": 0.8, "grad_norm": 23.218570111303308, "kl": 2.2372650146484374, "learning_rate": 3.469460380826697e-07, "loss": 0.0895, "reward": 2.7877288803458216, "reward_std": 0.9023670472204686, "rewards/accuracy_reward": 0.6785714600235224, "rewards/cosine_scaled_reward": 0.4454668462276459, "rewards/format_reward": 0.9142857402563095, "rewards/reasoning_steps_reward": 0.7494048178195953, "step": 375 }, { "completion_length": 747.8786026000977, "epoch": 0.8106666666666666, "grad_norm": 5.9371726712745065, "kl": 1.312139892578125, "learning_rate": 3.119602417459075e-07, "loss": 0.0525, "reward": 2.8006283044815063, "reward_std": 0.8084595888853073, "rewards/accuracy_reward": 0.6607143096625805, "rewards/cosine_scaled_reward": 0.4244377123657614, "rewards/format_reward": 0.9464285865426063, "rewards/reasoning_steps_reward": 0.7690476909279823, "step": 380 }, { "completion_length": 746.4321762084961, "epoch": 0.8213333333333334, "grad_norm": 5.13518428447357, "kl": 0.994195556640625, "learning_rate": 2.786281432302071e-07, "loss": 0.0397, "reward": 2.861383581161499, "reward_std": 0.8250645313411951, "rewards/accuracy_reward": 0.7000000301748515, "rewards/cosine_scaled_reward": 0.44947873409837485, "rewards/format_reward": 0.9428571656346321, "rewards/reasoning_steps_reward": 0.7690476924180984, "step": 385 }, { "completion_length": 748.0875358581543, "epoch": 0.832, "grad_norm": 4.45042845549648, "kl": 1.18310546875, "learning_rate": 2.46996139315057e-07, "loss": 0.0474, "reward": 2.8951289474964144, "reward_std": 0.8875481400638818, "rewards/accuracy_reward": 0.7357143111526966, "rewards/cosine_scaled_reward": 0.4856050438596867, "rewards/format_reward": 0.9357143044471741, "rewards/reasoning_steps_reward": 0.738095298409462, "step": 390 }, { "completion_length": 742.2714614868164, "epoch": 0.8426666666666667, "grad_norm": 9.790655066948279, "kl": 1.13563232421875, "learning_rate": 2.1710826032485286e-07, "loss": 0.0454, "reward": 2.8695475578308107, "reward_std": 0.8187286149710417, "rewards/accuracy_reward": 0.7142857439815998, "rewards/cosine_scaled_reward": 0.4582379271276295, "rewards/format_reward": 0.9464285880327225, "rewards/reasoning_steps_reward": 0.7505952954292298, "step": 395 }, { "completion_length": 770.3428955078125, "epoch": 0.8533333333333334, "grad_norm": 4.399010377251051, "kl": 1.116912841796875, "learning_rate": 1.8900610884066817e-07, "loss": 0.0447, "reward": 2.7512567728757857, "reward_std": 0.9337207470089197, "rewards/accuracy_reward": 0.651785746961832, "rewards/cosine_scaled_reward": 0.4012566165998578, "rewards/format_reward": 0.9428571671247482, "rewards/reasoning_steps_reward": 0.7553571984171867, "step": 400 }, { "epoch": 0.8533333333333334, "eval_completion_length": 760.9953487792968, "eval_kl": 1.14858369140625, "eval_loss": 0.045983318239450455, "eval_reward": 2.729253951013088, "eval_reward_std": 0.8810034032523633, "eval_rewards/accuracy_reward": 0.6346000292986631, "eval_rewards/cosine_scaled_reward": 0.385330027777364, "eval_rewards/format_reward": 0.9519428731679916, "eval_rewards/reasoning_steps_reward": 0.7573810091674328, "eval_runtime": 34525.1878, "eval_samples_per_second": 0.145, "eval_steps_per_second": 0.01, "step": 400 }, { "completion_length": 753.4678894042969, "epoch": 0.864, "grad_norm": 4.304417705259892, "kl": 1.15230712890625, "learning_rate": 1.627288017913383e-07, "loss": 0.0461, "reward": 2.8077996015548705, "reward_std": 0.9062907833606004, "rewards/accuracy_reward": 0.676785746961832, "rewards/cosine_scaled_reward": 0.44053755011409523, "rewards/format_reward": 0.9517857328057289, "rewards/reasoning_steps_reward": 0.738690534979105, "step": 405 }, { "completion_length": 785.1232528686523, "epoch": 0.8746666666666667, "grad_norm": 5.246554486522317, "kl": 1.4623046875, "learning_rate": 1.3831291600445573e-07, "loss": 0.0585, "reward": 2.7128711313009264, "reward_std": 0.8996678274124861, "rewards/accuracy_reward": 0.6446428809314966, "rewards/cosine_scaled_reward": 0.4003709977958351, "rewards/format_reward": 0.9267857372760773, "rewards/reasoning_steps_reward": 0.7410714901983738, "step": 410 }, { "completion_length": 759.5286041259766, "epoch": 0.8853333333333333, "grad_norm": 4.4529003923911565, "kl": 0.970086669921875, "learning_rate": 1.1579243729307487e-07, "loss": 0.0388, "reward": 2.6863596469163893, "reward_std": 0.9374732062220573, "rewards/accuracy_reward": 0.6446428891271353, "rewards/cosine_scaled_reward": 0.3893356985412538, "rewards/format_reward": 0.9250000223517418, "rewards/reasoning_steps_reward": 0.7273810058832169, "step": 415 }, { "completion_length": 768.6143203735352, "epoch": 0.896, "grad_norm": 5.297256486680652, "kl": 1.40426025390625, "learning_rate": 9.519871314899092e-08, "loss": 0.0562, "reward": 2.7144743263721467, "reward_std": 0.9721049644052983, "rewards/accuracy_reward": 0.682142886146903, "rewards/cosine_scaled_reward": 0.4079266074113548, "rewards/format_reward": 0.9125000268220902, "rewards/reasoning_steps_reward": 0.7119048140943051, "step": 420 }, { "completion_length": 762.5839622497558, "epoch": 0.9066666666666666, "grad_norm": 5.808442367606542, "kl": 1.266558837890625, "learning_rate": 7.656040910844358e-08, "loss": 0.0507, "reward": 2.7946069568395613, "reward_std": 0.9051171492785215, "rewards/accuracy_reward": 0.6750000327825546, "rewards/cosine_scaled_reward": 0.42258303044363854, "rewards/format_reward": 0.9500000178813934, "rewards/reasoning_steps_reward": 0.7470238760113717, "step": 425 }, { "completion_length": 758.2143196105957, "epoch": 0.9173333333333333, "grad_norm": 4.124091919765889, "kl": 1.363201904296875, "learning_rate": 5.990346885098235e-08, "loss": 0.0546, "reward": 2.8003338128328323, "reward_std": 0.9829879272729158, "rewards/accuracy_reward": 0.698214316368103, "rewards/cosine_scaled_reward": 0.44914320297539234, "rewards/format_reward": 0.9303571596741677, "rewards/reasoning_steps_reward": 0.7226191096007824, "step": 430 }, { "completion_length": 761.5821762084961, "epoch": 0.928, "grad_norm": 41.306011448835555, "kl": 1.1220947265625, "learning_rate": 4.5251078087033493e-08, "loss": 0.0449, "reward": 2.9364974945783615, "reward_std": 0.9109487719833851, "rewards/accuracy_reward": 0.7285714596509933, "rewards/cosine_scaled_reward": 0.4841164079494774, "rewards/format_reward": 0.9303571656346321, "rewards/reasoning_steps_reward": 0.7934524446725846, "step": 435 }, { "completion_length": 762.6250328063965, "epoch": 0.9386666666666666, "grad_norm": 3.9945081680140406, "kl": 1.48448486328125, "learning_rate": 3.262363228443427e-08, "loss": 0.0594, "reward": 2.6933425307273864, "reward_std": 0.9241555985063314, "rewards/accuracy_reward": 0.6500000320374966, "rewards/cosine_scaled_reward": 0.399890087870881, "rewards/format_reward": 0.9357143074274064, "rewards/reasoning_steps_reward": 0.7077381528913975, "step": 440 }, { "completion_length": 768.3500335693359, "epoch": 0.9493333333333334, "grad_norm": 13.713505614062974, "kl": 1.749468994140625, "learning_rate": 2.2038708278862952e-08, "loss": 0.07, "reward": 2.676873904466629, "reward_std": 0.9034410756081342, "rewards/accuracy_reward": 0.6410714581608772, "rewards/cosine_scaled_reward": 0.3983023501932621, "rewards/format_reward": 0.9178571671247482, "rewards/reasoning_steps_reward": 0.7196429133415222, "step": 445 }, { "completion_length": 754.7732467651367, "epoch": 0.96, "grad_norm": 5.550660972818955, "kl": 1.152679443359375, "learning_rate": 1.3511039807673209e-08, "loss": 0.0461, "reward": 2.84182793200016, "reward_std": 0.8377097636461258, "rewards/accuracy_reward": 0.6964286038652062, "rewards/cosine_scaled_reward": 0.44599451111862437, "rewards/format_reward": 0.939285734295845, "rewards/reasoning_steps_reward": 0.7601191103458405, "step": 450 }, { "completion_length": 743.9089569091797, "epoch": 0.9706666666666667, "grad_norm": 2.2837748295404885, "kl": 0.9584228515625, "learning_rate": 7.0524970011963675e-09, "loss": 0.0383, "reward": 2.9682214707136154, "reward_std": 0.8571841098368168, "rewards/accuracy_reward": 0.7517857450991869, "rewards/cosine_scaled_reward": 0.5009594126604497, "rewards/format_reward": 0.9553571596741677, "rewards/reasoning_steps_reward": 0.7601191103458405, "step": 455 }, { "completion_length": 747.2446739196778, "epoch": 0.9813333333333333, "grad_norm": 60.34548052033379, "kl": 1.486627197265625, "learning_rate": 2.6720698600553595e-09, "loss": 0.0595, "reward": 2.8397951513528823, "reward_std": 0.8803306795656681, "rewards/accuracy_reward": 0.7000000329688192, "rewards/cosine_scaled_reward": 0.4570569400675595, "rewards/format_reward": 0.9517857328057289, "rewards/reasoning_steps_reward": 0.7309524327516556, "step": 460 }, { "completion_length": 766.3500358581543, "epoch": 0.992, "grad_norm": 18.459219357783663, "kl": 1.420770263671875, "learning_rate": 3.7585574148779613e-10, "loss": 0.0569, "reward": 2.704774260520935, "reward_std": 0.9208621602505446, "rewards/accuracy_reward": 0.6678571715950966, "rewards/cosine_scaled_reward": 0.4119170166202821, "rewards/format_reward": 0.8982143089175224, "rewards/reasoning_steps_reward": 0.72678577080369, "step": 465 }, { "completion_length": 773.5982462565104, "epoch": 0.9984, "kl": 1.5178934733072917, "reward": 2.810507302482923, "reward_std": 0.9296036226054033, "rewards/accuracy_reward": 0.6785714613894621, "rewards/cosine_scaled_reward": 0.44344370051597554, "rewards/format_reward": 0.9523809651533762, "rewards/reasoning_steps_reward": 0.7361111715435982, "step": 468, "total_flos": 0.0, "train_loss": 0.05152365299789334, "train_runtime": 191788.2346, "train_samples_per_second": 0.039, "train_steps_per_second": 0.002 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }