{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 638.4285987854004, "epoch": 0.010666666666666666, "grad_norm": 2.42055344581604, "kl": 0.00011775493621826171, "learning_rate": 3.1914893617021275e-07, "loss": 0.0, "reward": 1.1089806377887725, "reward_std": 0.8793896824121475, "rewards/accuracy_reward": 0.5767857443541289, "rewards/cosine_scaled_reward": 0.25779010977130384, "rewards/format_reward": 0.00357142873108387, "rewards/reasoning_steps_reward": 0.270833354908973, "step": 5 }, { "completion_length": 611.7714553833008, "epoch": 0.021333333333333333, "grad_norm": 1.2694348096847534, "kl": 0.00021342039108276367, "learning_rate": 6.382978723404255e-07, "loss": 0.0, "reward": 1.1431605055928231, "reward_std": 0.8805145360529423, "rewards/accuracy_reward": 0.5892857421189547, "rewards/cosine_scaled_reward": 0.3008985619725536, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.25119049232453106, "step": 10 }, { "completion_length": 603.9625274658204, "epoch": 0.032, "grad_norm": 4.813971042633057, "kl": 0.00024839639663696287, "learning_rate": 9.574468085106384e-07, "loss": 0.0, "reward": 1.2853217244148254, "reward_std": 0.8000296212732791, "rewards/accuracy_reward": 0.6535714577883482, "rewards/cosine_scaled_reward": 0.3317502578254789, "rewards/format_reward": 0.00357142873108387, "rewards/reasoning_steps_reward": 0.2964285886846483, "step": 15 }, { "completion_length": 595.6982391357421, "epoch": 0.042666666666666665, "grad_norm": 0.9142511487007141, "kl": 0.00047616958618164064, "learning_rate": 1.276595744680851e-06, "loss": 0.0, "reward": 1.2075286597013473, "reward_std": 0.7695159167051315, "rewards/accuracy_reward": 0.6428571730852127, "rewards/cosine_scaled_reward": 0.3247905206750147, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.23988096807152032, "step": 20 }, { "completion_length": 644.2071708679199, "epoch": 0.05333333333333334, "grad_norm": 0.7305999994277954, "kl": 0.0009862899780273438, "learning_rate": 1.5957446808510639e-06, "loss": 0.0, "reward": 1.2186039187014104, "reward_std": 0.7756468575447798, "rewards/accuracy_reward": 0.6339286027476192, "rewards/cosine_scaled_reward": 0.3200919725000858, "rewards/format_reward": 0.0008928571827709675, "rewards/reasoning_steps_reward": 0.26369049586355686, "step": 25 }, { "completion_length": 669.2125297546387, "epoch": 0.064, "grad_norm": 0.9431222677230835, "kl": 0.0020453929901123047, "learning_rate": 1.9148936170212767e-06, "loss": 0.0001, "reward": 1.3601950403302907, "reward_std": 0.760306540131569, "rewards/accuracy_reward": 0.6732143152505159, "rewards/cosine_scaled_reward": 0.3795402319636196, "rewards/format_reward": 0.0008928571827709675, "rewards/reasoning_steps_reward": 0.3065476375631988, "step": 30 }, { "completion_length": 643.0696685791015, "epoch": 0.07466666666666667, "grad_norm": 48319720.0, "kl": 6272.004669189453, "learning_rate": 2.2340425531914894e-06, "loss": 250.5145, "reward": 1.289367458410561, "reward_std": 0.7266312446445227, "rewards/accuracy_reward": 0.676785746589303, "rewards/cosine_scaled_reward": 0.3441293075971771, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.26666668280959127, "step": 35 }, { "completion_length": 669.8018165588379, "epoch": 0.08533333333333333, "grad_norm": 0.4609315097332001, "kl": 0.018144559860229493, "learning_rate": 2.553191489361702e-06, "loss": 0.0007, "reward": 1.4497444801032544, "reward_std": 0.6942832075059414, "rewards/accuracy_reward": 0.6982143171131611, "rewards/cosine_scaled_reward": 0.41283965120092037, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3386905025690794, "step": 40 }, { "completion_length": 642.2321701049805, "epoch": 0.096, "grad_norm": 1.9606070518493652, "kl": 0.004248189926147461, "learning_rate": 2.872340425531915e-06, "loss": 0.0002, "reward": 1.4603484645485878, "reward_std": 0.7034628570079804, "rewards/accuracy_reward": 0.7232143163681031, "rewards/cosine_scaled_reward": 0.4335627053398639, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3035714477300644, "step": 45 }, { "completion_length": 659.8625274658203, "epoch": 0.10666666666666667, "grad_norm": 1.5518417358398438, "kl": 0.004150962829589844, "learning_rate": 2.9996241442585123e-06, "loss": 0.0002, "reward": 1.5205835647881032, "reward_std": 0.7311159037053585, "rewards/accuracy_reward": 0.7303571753203869, "rewards/cosine_scaled_reward": 0.4155239976942539, "rewards/format_reward": 0.0008928571827709675, "rewards/reasoning_steps_reward": 0.37380955144762995, "step": 50 }, { "completion_length": 638.6232460021972, "epoch": 0.11733333333333333, "grad_norm": 1.3417630195617676, "kl": 0.01309032440185547, "learning_rate": 2.9973279301399446e-06, "loss": 0.0005, "reward": 1.4900454580783844, "reward_std": 0.7109502237290144, "rewards/accuracy_reward": 0.7035714603960515, "rewards/cosine_scaled_reward": 0.3855811151210219, "rewards/format_reward": 0.0026785715483129023, "rewards/reasoning_steps_reward": 0.3982143113389611, "step": 55 }, { "completion_length": 637.7786003112793, "epoch": 0.128, "grad_norm": 0.7655097842216492, "kl": 0.005014801025390625, "learning_rate": 2.992947502998804e-06, "loss": 0.0002, "reward": 1.5817858844995498, "reward_std": 0.7063489355146885, "rewards/accuracy_reward": 0.7285714685916901, "rewards/cosine_scaled_reward": 0.4130358204245567, "rewards/format_reward": 0.0026785715483129023, "rewards/reasoning_steps_reward": 0.43750002793967724, "step": 60 }, { "completion_length": 619.2321716308594, "epoch": 0.13866666666666666, "grad_norm": 0.6214168667793274, "kl": 0.006450653076171875, "learning_rate": 2.9864889601923268e-06, "loss": 0.0003, "reward": 1.618351523578167, "reward_std": 0.6625145003199577, "rewards/accuracy_reward": 0.7232143163681031, "rewards/cosine_scaled_reward": 0.4254943021107465, "rewards/format_reward": 0.00357142873108387, "rewards/reasoning_steps_reward": 0.4660714641213417, "step": 65 }, { "completion_length": 620.4696716308594, "epoch": 0.14933333333333335, "grad_norm": 1.1358349323272705, "kl": 0.007819366455078126, "learning_rate": 2.977961291721137e-06, "loss": 0.0003, "reward": 1.809953036904335, "reward_std": 0.685565372928977, "rewards/accuracy_reward": 0.7589285969734192, "rewards/cosine_scaled_reward": 0.47126249115681274, "rewards/format_reward": 0.00714285746216774, "rewards/reasoning_steps_reward": 0.5726190894842148, "step": 70 }, { "completion_length": 591.2000267028809, "epoch": 0.16, "grad_norm": 0.9748697876930237, "kl": 0.01055145263671875, "learning_rate": 2.9673763677155655e-06, "loss": 0.0004, "reward": 1.801955761015415, "reward_std": 0.6809038281440735, "rewards/accuracy_reward": 0.7410714589059353, "rewards/cosine_scaled_reward": 0.4311223858210724, "rewards/format_reward": 0.0053571430966258046, "rewards/reasoning_steps_reward": 0.6244048129767179, "step": 75 }, { "completion_length": 604.2036003112793, "epoch": 0.17066666666666666, "grad_norm": 0.41957351565361023, "kl": 0.01427001953125, "learning_rate": 2.9547489219129666e-06, "loss": 0.0006, "reward": 1.9645369604229928, "reward_std": 0.5586295232176781, "rewards/accuracy_reward": 0.8053571732714773, "rewards/cosine_scaled_reward": 0.4785249759210274, "rewards/format_reward": 0.0026785715483129023, "rewards/reasoning_steps_reward": 0.6779762431979179, "step": 80 }, { "completion_length": 654.9571723937988, "epoch": 0.18133333333333335, "grad_norm": 0.5244520306587219, "kl": 0.01689453125, "learning_rate": 2.9400965311490175e-06, "loss": 0.0007, "reward": 1.9714522436261177, "reward_std": 0.654699632152915, "rewards/accuracy_reward": 0.7232143158093095, "rewards/cosine_scaled_reward": 0.4538926437497139, "rewards/format_reward": 0.0026785715483129023, "rewards/reasoning_steps_reward": 0.7916667237877846, "step": 85 }, { "completion_length": 635.4571701049805, "epoch": 0.192, "grad_norm": 0.5995980501174927, "kl": 0.0202392578125, "learning_rate": 2.9234395908915565e-06, "loss": 0.0008, "reward": 1.9059573337435722, "reward_std": 0.6402278915047646, "rewards/accuracy_reward": 0.6857143208384514, "rewards/cosine_scaled_reward": 0.3851239001378417, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.8333333969116211, "step": 90 }, { "completion_length": 646.8857414245606, "epoch": 0.20266666666666666, "grad_norm": 0.3573947846889496, "kl": 0.02309417724609375, "learning_rate": 2.904801286851009e-06, "loss": 0.0009, "reward": 2.067735290527344, "reward_std": 0.5824727656319737, "rewards/accuracy_reward": 0.739285740442574, "rewards/cosine_scaled_reward": 0.45374711682088675, "rewards/format_reward": 0.0026785715483129023, "rewards/reasoning_steps_reward": 0.8720238715410232, "step": 95 }, { "completion_length": 617.1839553833008, "epoch": 0.21333333333333335, "grad_norm": 0.4597730338573456, "kl": 0.0259674072265625, "learning_rate": 2.884207562706925e-06, "loss": 0.001, "reward": 2.1270981818437575, "reward_std": 0.5901715014129877, "rewards/accuracy_reward": 0.7750000283122063, "rewards/cosine_scaled_reward": 0.49138382682576776, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.858928632736206, "step": 100 }, { "epoch": 0.21333333333333335, "eval_completion_length": 645.8245434692383, "eval_kl": 0.0286726806640625, "eval_loss": 0.0011589155765250325, "eval_reward": 1.9586333739757538, "eval_reward_std": 0.6544256884813309, "eval_rewards/accuracy_reward": 0.667685743278265, "eval_rewards/cosine_scaled_reward": 0.38711901631861545, "eval_rewards/format_reward": 0.0034000001534819605, "eval_rewards/reasoning_steps_reward": 0.9004286315560341, "eval_runtime": 30593.3304, "eval_samples_per_second": 0.163, "eval_steps_per_second": 0.012, "step": 100 }, { "completion_length": 666.8482437133789, "epoch": 0.224, "grad_norm": 0.3493061065673828, "kl": 0.027435302734375, "learning_rate": 2.8616870839955444e-06, "loss": 0.0011, "reward": 2.089837631583214, "reward_std": 0.6151095872744918, "rewards/accuracy_reward": 0.7214286036789417, "rewards/cosine_scaled_reward": 0.44727803440764546, "rewards/format_reward": 0.0026785715483129023, "rewards/reasoning_steps_reward": 0.9184524416923523, "step": 105 }, { "completion_length": 689.3928901672364, "epoch": 0.23466666666666666, "grad_norm": 0.7455374002456665, "kl": 0.0316131591796875, "learning_rate": 2.837271198208662e-06, "loss": 0.0013, "reward": 2.129907730221748, "reward_std": 0.5857493598014116, "rewards/accuracy_reward": 0.7428571671247483, "rewards/cosine_scaled_reward": 0.47633622232824563, "rewards/format_reward": 0.001785714365541935, "rewards/reasoning_steps_reward": 0.9089286178350449, "step": 110 }, { "completion_length": 648.7071731567382, "epoch": 0.24533333333333332, "grad_norm": 0.423718124628067, "kl": 0.0330322265625, "learning_rate": 2.8109938911593322e-06, "loss": 0.0013, "reward": 2.1340139895677566, "reward_std": 0.5813389342278242, "rewards/accuracy_reward": 0.7232143152505159, "rewards/cosine_scaled_reward": 0.4527639038278721, "rewards/format_reward": 0.008035714644938708, "rewards/reasoning_steps_reward": 0.9500000536441803, "step": 115 }, { "completion_length": 669.1518165588379, "epoch": 0.256, "grad_norm": 0.5140184760093689, "kl": 0.0355621337890625, "learning_rate": 2.7828917396751474e-06, "loss": 0.0015, "reward": 2.123083771765232, "reward_std": 0.6066710935905576, "rewards/accuracy_reward": 0.7285714626312256, "rewards/cosine_scaled_reward": 0.4486789128568489, "rewards/format_reward": 0.00357142873108387, "rewards/reasoning_steps_reward": 0.9422619551420212, "step": 120 }, { "completion_length": 651.8714584350586, "epoch": 0.26666666666666666, "grad_norm": 14.626392364501953, "kl": 0.054949951171875, "learning_rate": 2.753003860684943e-06, "loss": 0.0022, "reward": 2.238000822067261, "reward_std": 0.6136660899966955, "rewards/accuracy_reward": 0.7803571730852127, "rewards/cosine_scaled_reward": 0.5305602598935366, "rewards/format_reward": 0.00625000037252903, "rewards/reasoning_steps_reward": 0.9208333969116211, "step": 125 }, { "completion_length": 687.6518211364746, "epoch": 0.2773333333333333, "grad_norm": 0.4999578297138214, "kl": 0.0349700927734375, "learning_rate": 2.721371856769793e-06, "loss": 0.0014, "reward": 2.085830058157444, "reward_std": 0.5865108415484428, "rewards/accuracy_reward": 0.6946428868919611, "rewards/cosine_scaled_reward": 0.4269014226272702, "rewards/format_reward": 0.010714286193251609, "rewards/reasoning_steps_reward": 0.9535714745521545, "step": 130 }, { "completion_length": 625.8964546203613, "epoch": 0.288, "grad_norm": 0.5372105240821838, "kl": 0.039947509765625, "learning_rate": 2.688039758254093e-06, "loss": 0.0016, "reward": 2.243368774652481, "reward_std": 0.6026589145883918, "rewards/accuracy_reward": 0.7750000316649676, "rewards/cosine_scaled_reward": 0.4892020009458065, "rewards/format_reward": 0.01428571492433548, "rewards/reasoning_steps_reward": 0.9648810073733329, "step": 135 }, { "completion_length": 657.5696739196777, "epoch": 0.2986666666666667, "grad_norm": 0.6321460008621216, "kl": 0.0416748046875, "learning_rate": 2.65305396191733e-06, "loss": 0.0017, "reward": 2.1084320515394213, "reward_std": 0.7091156661510467, "rewards/accuracy_reward": 0.7071428887546063, "rewards/cosine_scaled_reward": 0.42956293127499523, "rewards/format_reward": 0.01696428647264838, "rewards/reasoning_steps_reward": 0.9547619611024857, "step": 140 }, { "completion_length": 669.2839614868165, "epoch": 0.30933333333333335, "grad_norm": 0.4815462529659271, "kl": 0.0447967529296875, "learning_rate": 2.61646316641186e-06, "loss": 0.0018, "reward": 2.022332654893398, "reward_std": 0.6810956679284572, "rewards/accuracy_reward": 0.6750000305473804, "rewards/cosine_scaled_reward": 0.4035826030303724, "rewards/format_reward": 0.009821429010480642, "rewards/reasoning_steps_reward": 0.9339286148548126, "step": 145 }, { "completion_length": 647.9393119812012, "epoch": 0.32, "grad_norm": 0.4961049556732178, "kl": 0.0510009765625, "learning_rate": 2.5783183044765715e-06, "loss": 0.002, "reward": 1.9679424732923507, "reward_std": 0.7103641763329506, "rewards/accuracy_reward": 0.6678571719676256, "rewards/cosine_scaled_reward": 0.3926447768812068, "rewards/format_reward": 0.008035714644938708, "rewards/reasoning_steps_reward": 0.8994048193097115, "step": 150 }, { "completion_length": 625.0803840637207, "epoch": 0.33066666666666666, "grad_norm": 0.5056385397911072, "kl": 0.06827392578125, "learning_rate": 2.5386724720408135e-06, "loss": 0.0027, "reward": 2.0651650190353394, "reward_std": 0.6431151006370783, "rewards/accuracy_reward": 0.7375000286847353, "rewards/cosine_scaled_reward": 0.45712922792881727, "rewards/format_reward": 0.009821429010480642, "rewards/reasoning_steps_reward": 0.860714353621006, "step": 155 }, { "completion_length": 611.2964599609375, "epoch": 0.3413333333333333, "grad_norm": 0.4566061794757843, "kl": 0.08109130859375, "learning_rate": 2.49758085431725e-06, "loss": 0.0032, "reward": 2.0270636796951296, "reward_std": 0.6181262265890837, "rewards/accuracy_reward": 0.7464285964146257, "rewards/cosine_scaled_reward": 0.43301601126149764, "rewards/format_reward": 0.012500000558793545, "rewards/reasoning_steps_reward": 0.8351191088557244, "step": 160 }, { "completion_length": 643.8536026000977, "epoch": 0.352, "grad_norm": 0.3811902403831482, "kl": 0.12118072509765625, "learning_rate": 2.455100648986533e-06, "loss": 0.0048, "reward": 1.9545473739504815, "reward_std": 0.7556829001754523, "rewards/accuracy_reward": 0.6857143171131611, "rewards/cosine_scaled_reward": 0.40722584864124656, "rewards/format_reward": 0.008035714644938708, "rewards/reasoning_steps_reward": 0.8535714909434319, "step": 165 }, { "completion_length": 666.0553924560547, "epoch": 0.3626666666666667, "grad_norm": 0.5245673656463623, "kl": 0.1458251953125, "learning_rate": 2.4112909865807053e-06, "loss": 0.0058, "reward": 1.7782447993755341, "reward_std": 0.7516406249254942, "rewards/accuracy_reward": 0.6017857398837805, "rewards/cosine_scaled_reward": 0.32050663968548176, "rewards/format_reward": 0.008928571827709675, "rewards/reasoning_steps_reward": 0.8470238700509072, "step": 170 }, { "completion_length": 656.5375297546386, "epoch": 0.37333333333333335, "grad_norm": 0.3452568054199219, "kl": 0.100311279296875, "learning_rate": 2.366212848176164e-06, "loss": 0.004, "reward": 2.048972634971142, "reward_std": 0.706931572034955, "rewards/accuracy_reward": 0.7178571790456771, "rewards/cosine_scaled_reward": 0.4686154007911682, "rewards/format_reward": 0.0053571430966258046, "rewards/reasoning_steps_reward": 0.8571429222822189, "step": 175 }, { "completion_length": 625.3053825378418, "epoch": 0.384, "grad_norm": 0.7150533199310303, "kl": 0.1327392578125, "learning_rate": 2.319928980510752e-06, "loss": 0.0053, "reward": 1.9931991159915925, "reward_std": 0.7508711714297533, "rewards/accuracy_reward": 0.7053571753203869, "rewards/cosine_scaled_reward": 0.4312942801974714, "rewards/format_reward": 0.012500000651925802, "rewards/reasoning_steps_reward": 0.8440476790070534, "step": 180 }, { "completion_length": 662.8018127441406, "epoch": 0.39466666666666667, "grad_norm": 0.5048078894615173, "kl": 0.2526092529296875, "learning_rate": 2.272503808643123e-06, "loss": 0.0101, "reward": 1.6179989255964755, "reward_std": 0.826694194227457, "rewards/accuracy_reward": 0.5803571738302707, "rewards/cosine_scaled_reward": 0.283177447039634, "rewards/format_reward": 0.009821429010480642, "rewards/reasoning_steps_reward": 0.7446429103612899, "step": 185 }, { "completion_length": 612.7518165588378, "epoch": 0.4053333333333333, "grad_norm": 15.008540153503418, "kl": 0.2234375, "learning_rate": 2.2240033462759628e-06, "loss": 0.0089, "reward": 1.8689126953482629, "reward_std": 0.8196798441931605, "rewards/accuracy_reward": 0.6892857480794191, "rewards/cosine_scaled_reward": 0.38706737738102676, "rewards/format_reward": 0.0062500002793967726, "rewards/reasoning_steps_reward": 0.786309577524662, "step": 190 }, { "completion_length": 636.9143135070801, "epoch": 0.416, "grad_norm": 1.8482682704925537, "kl": 0.1047210693359375, "learning_rate": 2.1744951038678905e-06, "loss": 0.0042, "reward": 2.1131999254226685, "reward_std": 0.6497842017561197, "rewards/accuracy_reward": 0.7464286003261804, "rewards/cosine_scaled_reward": 0.475997456186451, "rewards/format_reward": 0.011607143562287092, "rewards/reasoning_steps_reward": 0.8791667327284813, "step": 195 }, { "completion_length": 643.9375305175781, "epoch": 0.4266666666666667, "grad_norm": 1.6493088006973267, "kl": 0.1329986572265625, "learning_rate": 2.124047994661941e-06, "loss": 0.0053, "reward": 2.1098326206207276, "reward_std": 0.6796215798705816, "rewards/accuracy_reward": 0.7250000312924385, "rewards/cosine_scaled_reward": 0.4544754126574844, "rewards/format_reward": 0.019642858114093543, "rewards/reasoning_steps_reward": 0.9107143446803093, "step": 200 }, { "epoch": 0.4266666666666667, "eval_completion_length": 658.4622008789063, "eval_kl": 0.16044462890625, "eval_loss": 0.006352806463837624, "eval_reward": 2.0124860629320143, "eval_reward_std": 0.7236331352472305, "eval_rewards/accuracy_reward": 0.6609143146038056, "eval_rewards/cosine_scaled_reward": 0.3881764653600403, "eval_rewards/format_reward": 0.03644285902827978, "eval_rewards/reasoning_steps_reward": 0.9269524365663528, "eval_runtime": 39396.7661, "eval_samples_per_second": 0.127, "eval_steps_per_second": 0.009, "step": 200 }, { "completion_length": 664.1589576721192, "epoch": 0.43733333333333335, "grad_norm": 0.5569435358047485, "kl": 0.1752899169921875, "learning_rate": 2.072732238761434e-06, "loss": 0.007, "reward": 2.151426687836647, "reward_std": 0.6980113681405783, "rewards/accuracy_reward": 0.732142886519432, "rewards/cosine_scaled_reward": 0.4615456376457587, "rewards/format_reward": 0.03750000176951289, "rewards/reasoning_steps_reward": 0.9202381521463394, "step": 205 }, { "completion_length": 638.1839584350586, "epoch": 0.448, "grad_norm": 0.8609623312950134, "kl": 0.1427032470703125, "learning_rate": 2.0206192653867536e-06, "loss": 0.0057, "reward": 2.260927739739418, "reward_std": 0.6022655628621578, "rewards/accuracy_reward": 0.7767857406288385, "rewards/cosine_scaled_reward": 0.5171776844188571, "rewards/format_reward": 0.045535716507583854, "rewards/reasoning_steps_reward": 0.9214286223053932, "step": 210 }, { "completion_length": 723.9089622497559, "epoch": 0.45866666666666667, "grad_norm": 1.7239713668823242, "kl": 0.304364013671875, "learning_rate": 1.967781613449095e-06, "loss": 0.0122, "reward": 1.6578506268560886, "reward_std": 0.8106920622289181, "rewards/accuracy_reward": 0.5500000214204193, "rewards/cosine_scaled_reward": 0.2706481910310686, "rewards/format_reward": 0.020535715389996767, "rewards/reasoning_steps_reward": 0.8166667185723782, "step": 215 }, { "completion_length": 722.1321716308594, "epoch": 0.4693333333333333, "grad_norm": 3.4466731548309326, "kl": 0.541461181640625, "learning_rate": 1.9142928305795637e-06, "loss": 0.0217, "reward": 1.1115448012948037, "reward_std": 0.9772109590470791, "rewards/accuracy_reward": 0.3910714492201805, "rewards/cosine_scaled_reward": 0.09517570563766639, "rewards/format_reward": 0.01339285783469677, "rewards/reasoning_steps_reward": 0.6119048058986664, "step": 220 }, { "completion_length": 655.9178810119629, "epoch": 0.48, "grad_norm": 0.7818121910095215, "kl": 0.38446044921875, "learning_rate": 1.8602273707541886e-06, "loss": 0.0154, "reward": 1.443753632903099, "reward_std": 1.1735275402665137, "rewards/accuracy_reward": 0.5517857423052192, "rewards/cosine_scaled_reward": 0.263098827842623, "rewards/format_reward": 0.022321429569274187, "rewards/reasoning_steps_reward": 0.6065476641058922, "step": 225 }, { "completion_length": 669.6357391357421, "epoch": 0.49066666666666664, "grad_norm": 0.3009130656719208, "kl": 0.186187744140625, "learning_rate": 1.8056604906573418e-06, "loss": 0.0074, "reward": 1.8981928735971452, "reward_std": 0.8392410669475794, "rewards/accuracy_reward": 0.6964286047965288, "rewards/cosine_scaled_reward": 0.438073761574924, "rewards/format_reward": 0.012500000558793545, "rewards/reasoning_steps_reward": 0.7511905305087566, "step": 230 }, { "completion_length": 653.5732406616211, "epoch": 0.5013333333333333, "grad_norm": 4.625399589538574, "kl": 0.05731353759765625, "learning_rate": 1.7506681449278226e-06, "loss": 0.0023, "reward": 2.11394245326519, "reward_std": 0.6526154175400734, "rewards/accuracy_reward": 0.7375000312924385, "rewards/cosine_scaled_reward": 0.4865614231675863, "rewards/format_reward": 0.021428572479635477, "rewards/reasoning_steps_reward": 0.8684524416923523, "step": 235 }, { "completion_length": 645.69467086792, "epoch": 0.512, "grad_norm": 0.2851395606994629, "kl": 0.050042724609375, "learning_rate": 1.6953268804334257e-06, "loss": 0.002, "reward": 2.228366295993328, "reward_std": 0.5444579780101776, "rewards/accuracy_reward": 0.7642857372760773, "rewards/cosine_scaled_reward": 0.514675722271204, "rewards/format_reward": 0.03392857322469354, "rewards/reasoning_steps_reward": 0.9154762506484986, "step": 240 }, { "completion_length": 632.8053848266602, "epoch": 0.5226666666666666, "grad_norm": 0.34539249539375305, "kl": 0.0962860107421875, "learning_rate": 1.6397137297211436e-06, "loss": 0.0039, "reward": 2.337542861700058, "reward_std": 0.557469642162323, "rewards/accuracy_reward": 0.8142857376486063, "rewards/cosine_scaled_reward": 0.553614255785942, "rewards/format_reward": 0.041071430593729016, "rewards/reasoning_steps_reward": 0.9285714775323868, "step": 245 }, { "completion_length": 659.1518135070801, "epoch": 0.5333333333333333, "grad_norm": 0.30278754234313965, "kl": 0.0599609375, "learning_rate": 1.5839061037913395e-06, "loss": 0.0024, "reward": 2.421455779671669, "reward_std": 0.5301560776308178, "rewards/accuracy_reward": 0.8410714566707611, "rewards/cosine_scaled_reward": 0.6059795372188092, "rewards/format_reward": 0.028571429941803218, "rewards/reasoning_steps_reward": 0.9458333805203438, "step": 250 }, { "completion_length": 713.3518188476562, "epoch": 0.544, "grad_norm": 0.3068905770778656, "kl": 0.0843505859375, "learning_rate": 1.527981684345115e-06, "loss": 0.0034, "reward": 2.1079654544591904, "reward_std": 0.6414080807939172, "rewards/accuracy_reward": 0.7000000279396772, "rewards/cosine_scaled_reward": 0.4207629946060479, "rewards/format_reward": 0.03482143022119999, "rewards/reasoning_steps_reward": 0.9523809969425201, "step": 255 }, { "completion_length": 707.9411071777344, "epoch": 0.5546666666666666, "grad_norm": 0.33481159806251526, "kl": 0.07261962890625, "learning_rate": 1.4720183156548855e-06, "loss": 0.0029, "reward": 2.2514570981264113, "reward_std": 0.6310873694717885, "rewards/accuracy_reward": 0.7321428906172514, "rewards/cosine_scaled_reward": 0.5041355590336025, "rewards/format_reward": 0.05089285969734192, "rewards/reasoning_steps_reward": 0.9642857611179352, "step": 260 }, { "completion_length": 718.3107444763184, "epoch": 0.5653333333333334, "grad_norm": 0.38096827268600464, "kl": 0.08720703125, "learning_rate": 1.4160938962086612e-06, "loss": 0.0035, "reward": 2.1334225252270698, "reward_std": 0.6659108363091946, "rewards/accuracy_reward": 0.7035714538767934, "rewards/cosine_scaled_reward": 0.44383912505581974, "rewards/format_reward": 0.04732143105939031, "rewards/reasoning_steps_reward": 0.9386905342340469, "step": 265 }, { "completion_length": 691.2268180847168, "epoch": 0.576, "grad_norm": 0.28566083312034607, "kl": 0.088311767578125, "learning_rate": 1.3602862702788567e-06, "loss": 0.0035, "reward": 2.2702409833669663, "reward_std": 0.6507027853280306, "rewards/accuracy_reward": 0.7500000327825547, "rewards/cosine_scaled_reward": 0.508336108038202, "rewards/format_reward": 0.057142859976738694, "rewards/reasoning_steps_reward": 0.9547619566321373, "step": 270 }, { "completion_length": 688.7464607238769, "epoch": 0.5866666666666667, "grad_norm": 0.35180163383483887, "kl": 0.0841522216796875, "learning_rate": 1.3046731195665748e-06, "loss": 0.0034, "reward": 2.3181463330984116, "reward_std": 0.6268971297889948, "rewards/accuracy_reward": 0.7696428872644901, "rewards/cosine_scaled_reward": 0.5246938619762659, "rewards/format_reward": 0.06785714644938708, "rewards/reasoning_steps_reward": 0.9559524253010749, "step": 275 }, { "completion_length": 687.6589630126953, "epoch": 0.5973333333333334, "grad_norm": 0.3075723648071289, "kl": 0.0902801513671875, "learning_rate": 1.2493318550721775e-06, "loss": 0.0036, "reward": 2.242439457774162, "reward_std": 0.6110217805951834, "rewards/accuracy_reward": 0.7375000305473804, "rewards/cosine_scaled_reward": 0.49422508366405965, "rewards/format_reward": 0.0678571461699903, "rewards/reasoning_steps_reward": 0.9428571984171867, "step": 280 }, { "completion_length": 702.2446731567383, "epoch": 0.608, "grad_norm": 0.3365749716758728, "kl": 0.1429901123046875, "learning_rate": 1.1943395093426585e-06, "loss": 0.0057, "reward": 2.236051079630852, "reward_std": 0.6939984124153853, "rewards/accuracy_reward": 0.7464286036789417, "rewards/cosine_scaled_reward": 0.515515277441591, "rewards/format_reward": 0.06517857508733868, "rewards/reasoning_steps_reward": 0.9089286342263222, "step": 285 }, { "completion_length": 667.2232437133789, "epoch": 0.6186666666666667, "grad_norm": 0.3665401041507721, "kl": 0.1312591552734375, "learning_rate": 1.1397726292458115e-06, "loss": 0.0053, "reward": 2.2684289827942847, "reward_std": 0.6653882045298815, "rewards/accuracy_reward": 0.751785746589303, "rewards/cosine_scaled_reward": 0.5151550889015197, "rewards/format_reward": 0.07410714691504836, "rewards/reasoning_steps_reward": 0.927381020784378, "step": 290 }, { "completion_length": 723.4911033630372, "epoch": 0.6293333333333333, "grad_norm": 0.25975558161735535, "kl": 0.14805908203125, "learning_rate": 1.085707169420437e-06, "loss": 0.0059, "reward": 2.1135958269238473, "reward_std": 0.6525055527687073, "rewards/accuracy_reward": 0.6732143115252256, "rewards/cosine_scaled_reward": 0.4448457522317767, "rewards/format_reward": 0.08660714710131287, "rewards/reasoning_steps_reward": 0.9089286252856255, "step": 295 }, { "completion_length": 685.9928894042969, "epoch": 0.64, "grad_norm": 0.32239413261413574, "kl": 0.1186309814453125, "learning_rate": 1.0322183865509054e-06, "loss": 0.0047, "reward": 2.3491215094923974, "reward_std": 0.6740828949958086, "rewards/accuracy_reward": 0.7928571742027998, "rewards/cosine_scaled_reward": 0.5387047556228935, "rewards/format_reward": 0.11339286286383868, "rewards/reasoning_steps_reward": 0.9041667297482491, "step": 300 }, { "epoch": 0.64, "eval_completion_length": 692.8417169921875, "eval_kl": 0.126764990234375, "eval_loss": 0.005116811487823725, "eval_reward": 2.1439696138501168, "eval_reward_std": 0.7277915328145027, "eval_rewards/accuracy_reward": 0.6845143143117428, "eval_rewards/cosine_scaled_reward": 0.44376001094253736, "eval_rewards/format_reward": 0.10977143388986588, "eval_rewards/reasoning_steps_reward": 0.9059238699197769, "eval_runtime": 39599.431, "eval_samples_per_second": 0.126, "eval_steps_per_second": 0.009, "step": 300 }, { "completion_length": 699.3339584350585, "epoch": 0.6506666666666666, "grad_norm": 0.3004523515701294, "kl": 0.1110198974609375, "learning_rate": 9.793807346132464e-07, "loss": 0.0044, "reward": 2.2790004700422286, "reward_std": 0.7021285973489284, "rewards/accuracy_reward": 0.7607143171131611, "rewards/cosine_scaled_reward": 0.5272146660834551, "rewards/format_reward": 0.09107143292203546, "rewards/reasoning_steps_reward": 0.9000000640749931, "step": 305 }, { "completion_length": 699.3411010742187, "epoch": 0.6613333333333333, "grad_norm": 0.29891085624694824, "kl": 0.13359375, "learning_rate": 9.272677612385667e-07, "loss": 0.0053, "reward": 2.1629825204610826, "reward_std": 0.6936808105558157, "rewards/accuracy_reward": 0.7142857410013675, "rewards/cosine_scaled_reward": 0.46447055372409524, "rewards/format_reward": 0.097321433480829, "rewards/reasoning_steps_reward": 0.8869048282504082, "step": 310 }, { "completion_length": 706.1196708679199, "epoch": 0.672, "grad_norm": 0.3089028596878052, "kl": 0.1456146240234375, "learning_rate": 8.759520053380591e-07, "loss": 0.0058, "reward": 2.1019979074597357, "reward_std": 0.740777799114585, "rewards/accuracy_reward": 0.6803571773692966, "rewards/cosine_scaled_reward": 0.44158115636964795, "rewards/format_reward": 0.08839286118745804, "rewards/reasoning_steps_reward": 0.8916667312383652, "step": 315 }, { "completion_length": 669.9286071777344, "epoch": 0.6826666666666666, "grad_norm": 0.6965083479881287, "kl": 0.145257568359375, "learning_rate": 8.255048961321088e-07, "loss": 0.0058, "reward": 2.272875265777111, "reward_std": 0.7233634147793054, "rewards/accuracy_reward": 0.7553571749478578, "rewards/cosine_scaled_reward": 0.5264466149732471, "rewards/format_reward": 0.10892857704311609, "rewards/reasoning_steps_reward": 0.8821429207921028, "step": 320 }, { "completion_length": 692.9786003112793, "epoch": 0.6933333333333334, "grad_norm": 0.33192208409309387, "kl": 0.1722869873046875, "learning_rate": 7.759966537240373e-07, "loss": 0.0069, "reward": 2.169807307422161, "reward_std": 0.8210989892482757, "rewards/accuracy_reward": 0.7089285964146257, "rewards/cosine_scaled_reward": 0.4582000946626067, "rewards/format_reward": 0.11160714961588383, "rewards/reasoning_steps_reward": 0.8910714894533157, "step": 325 }, { "completion_length": 705.5911056518555, "epoch": 0.704, "grad_norm": 0.5306978821754456, "kl": 0.24217529296875, "learning_rate": 7.274961913568773e-07, "loss": 0.0097, "reward": 2.0500947162508965, "reward_std": 0.8266730591654777, "rewards/accuracy_reward": 0.667857170663774, "rewards/cosine_scaled_reward": 0.44116610190831124, "rewards/format_reward": 0.11071429131552576, "rewards/reasoning_steps_reward": 0.8303571999073028, "step": 330 }, { "completion_length": 731.1839591979981, "epoch": 0.7146666666666667, "grad_norm": 0.4849264919757843, "kl": 0.2579345703125, "learning_rate": 6.800710194892484e-07, "loss": 0.0103, "reward": 1.9499552190303802, "reward_std": 0.9148915704339743, "rewards/accuracy_reward": 0.6428571701049804, "rewards/cosine_scaled_reward": 0.3943003877531737, "rewards/format_reward": 0.10267857694998384, "rewards/reasoning_steps_reward": 0.8101191118359565, "step": 335 }, { "completion_length": 733.7321731567383, "epoch": 0.7253333333333334, "grad_norm": 0.6697672009468079, "kl": 0.3463623046875, "learning_rate": 6.33787151823836e-07, "loss": 0.0139, "reward": 1.6754619617015123, "reward_std": 1.0652375385165214, "rewards/accuracy_reward": 0.5571428790688515, "rewards/cosine_scaled_reward": 0.29867618879216024, "rewards/format_reward": 0.07500000381842256, "rewards/reasoning_steps_reward": 0.7446429077535868, "step": 340 }, { "completion_length": 708.6964599609375, "epoch": 0.736, "grad_norm": 0.342541366815567, "kl": 0.3625, "learning_rate": 5.887090134192947e-07, "loss": 0.0145, "reward": 1.7074500739574432, "reward_std": 1.06461516097188, "rewards/accuracy_reward": 0.5910714540630579, "rewards/cosine_scaled_reward": 0.3163785987533629, "rewards/format_reward": 0.07321428908035159, "rewards/reasoning_steps_reward": 0.7267857655882836, "step": 345 }, { "completion_length": 741.1839630126954, "epoch": 0.7466666666666667, "grad_norm": 0.5499653816223145, "kl": 0.4493896484375, "learning_rate": 5.448993510134669e-07, "loss": 0.018, "reward": 1.4979531578719616, "reward_std": 1.063758409768343, "rewards/accuracy_reward": 0.5339285928755999, "rewards/cosine_scaled_reward": 0.2485483249882236, "rewards/format_reward": 0.06071428917348385, "rewards/reasoning_steps_reward": 0.6547619514167309, "step": 350 }, { "completion_length": 718.5732467651367, "epoch": 0.7573333333333333, "grad_norm": 0.8918161988258362, "kl": 0.5351806640625, "learning_rate": 5.024191456827498e-07, "loss": 0.0214, "reward": 1.2987217612564563, "reward_std": 1.1655599363148212, "rewards/accuracy_reward": 0.49464288353919983, "rewards/cosine_scaled_reward": 0.19366217765491456, "rewards/format_reward": 0.04017857378348708, "rewards/reasoning_steps_reward": 0.5702381379902363, "step": 355 }, { "completion_length": 703.1107444763184, "epoch": 0.768, "grad_norm": 0.3764072358608246, "kl": 0.4150390625, "learning_rate": 4.6132752795918667e-07, "loss": 0.0166, "reward": 1.4548213778063654, "reward_std": 1.117940279096365, "rewards/accuracy_reward": 0.5250000219792128, "rewards/cosine_scaled_reward": 0.25392846008762715, "rewards/format_reward": 0.054464288800954816, "rewards/reasoning_steps_reward": 0.6214286223053932, "step": 360 }, { "completion_length": 702.7321716308594, "epoch": 0.7786666666666666, "grad_norm": 0.536405086517334, "kl": 0.2932281494140625, "learning_rate": 4.2168169552342905e-07, "loss": 0.0117, "reward": 1.7472290426492691, "reward_std": 1.064868475496769, "rewards/accuracy_reward": 0.614285746589303, "rewards/cosine_scaled_reward": 0.35437183200847355, "rewards/format_reward": 0.06071428880095482, "rewards/reasoning_steps_reward": 0.7178571954369545, "step": 365 }, { "completion_length": 667.4143173217774, "epoch": 0.7893333333333333, "grad_norm": 1.1500115394592285, "kl": 0.255450439453125, "learning_rate": 3.8353683358814046e-07, "loss": 0.0102, "reward": 1.826224359869957, "reward_std": 0.9232858289033175, "rewards/accuracy_reward": 0.6482143169268966, "rewards/cosine_scaled_reward": 0.3732480947277509, "rewards/format_reward": 0.06607143199071289, "rewards/reasoning_steps_reward": 0.7386905357241631, "step": 370 }, { "completion_length": 681.9518119812012, "epoch": 0.8, "grad_norm": 0.8492513298988342, "kl": 0.2910614013671875, "learning_rate": 3.469460380826697e-07, "loss": 0.0117, "reward": 1.7300246395170689, "reward_std": 0.9816528409719467, "rewards/accuracy_reward": 0.6125000230967999, "rewards/cosine_scaled_reward": 0.3600841243751347, "rewards/format_reward": 0.052678574342280626, "rewards/reasoning_steps_reward": 0.7047619506716728, "step": 375 }, { "completion_length": 683.8536003112793, "epoch": 0.8106666666666666, "grad_norm": 0.43946385383605957, "kl": 0.35491943359375, "learning_rate": 3.119602417459075e-07, "loss": 0.0142, "reward": 1.6164295073598622, "reward_std": 1.0403125062584877, "rewards/accuracy_reward": 0.5767857421189546, "rewards/cosine_scaled_reward": 0.29619133038795553, "rewards/format_reward": 0.0482142879627645, "rewards/reasoning_steps_reward": 0.6952381365001201, "step": 380 }, { "completion_length": 661.137525177002, "epoch": 0.8213333333333334, "grad_norm": 0.5927759408950806, "kl": 0.251336669921875, "learning_rate": 2.786281432302071e-07, "loss": 0.0101, "reward": 1.8459785029292106, "reward_std": 0.8845801506191492, "rewards/accuracy_reward": 0.6821428865194321, "rewards/cosine_scaled_reward": 0.3781212717294693, "rewards/format_reward": 0.06428571781143547, "rewards/reasoning_steps_reward": 0.7214286215603352, "step": 385 }, { "completion_length": 700.2768203735352, "epoch": 0.832, "grad_norm": 0.5752273797988892, "kl": 0.379559326171875, "learning_rate": 2.46996139315057e-07, "loss": 0.0152, "reward": 1.6465823888778686, "reward_std": 1.0167622987180949, "rewards/accuracy_reward": 0.6142857445403933, "rewards/cosine_scaled_reward": 0.31146327857859435, "rewards/format_reward": 0.07500000344589353, "rewards/reasoning_steps_reward": 0.6458333857357502, "step": 390 }, { "completion_length": 688.6482421875, "epoch": 0.8426666666666667, "grad_norm": 0.41832882165908813, "kl": 0.379150390625, "learning_rate": 2.1710826032485286e-07, "loss": 0.0152, "reward": 1.6644656013697385, "reward_std": 0.9824759595096111, "rewards/accuracy_reward": 0.6250000283122062, "rewards/cosine_scaled_reward": 0.3260727058397606, "rewards/format_reward": 0.054464288614690305, "rewards/reasoning_steps_reward": 0.658928620070219, "step": 395 }, { "completion_length": 719.937533569336, "epoch": 0.8533333333333334, "grad_norm": 0.5534791350364685, "kl": 0.382177734375, "learning_rate": 1.8900610884066817e-07, "loss": 0.0153, "reward": 1.4879010431468487, "reward_std": 1.0550432510674, "rewards/accuracy_reward": 0.5410714585334062, "rewards/cosine_scaled_reward": 0.2453414467825496, "rewards/format_reward": 0.0491071455180645, "rewards/reasoning_steps_reward": 0.6523810014128685, "step": 400 }, { "epoch": 0.8533333333333334, "eval_completion_length": 695.8426594726562, "eval_kl": 0.383571875, "eval_loss": 0.015375643037259579, "eval_reward": 1.5146705395892262, "eval_reward_std": 1.0417588331997394, "eval_rewards/accuracy_reward": 0.5409714534372091, "eval_rewards/cosine_scaled_reward": 0.24984666706966235, "eval_rewards/format_reward": 0.060157146042585374, "eval_rewards/reasoning_steps_reward": 0.66369528632164, "eval_runtime": 40348.1586, "eval_samples_per_second": 0.124, "eval_steps_per_second": 0.009, "step": 400 }, { "completion_length": 709.2018096923828, "epoch": 0.864, "grad_norm": 0.37454745173454285, "kl": 0.43958740234375, "learning_rate": 1.627288017913383e-07, "loss": 0.0176, "reward": 1.5630248546600343, "reward_std": 1.0267837572842837, "rewards/accuracy_reward": 0.5678571719676256, "rewards/cosine_scaled_reward": 0.28772715290542694, "rewards/format_reward": 0.04910714561119676, "rewards/reasoning_steps_reward": 0.6583333760499954, "step": 405 }, { "completion_length": 715.1696792602539, "epoch": 0.8746666666666667, "grad_norm": 0.5133277773857117, "kl": 0.399395751953125, "learning_rate": 1.3831291600445573e-07, "loss": 0.016, "reward": 1.5371075724251568, "reward_std": 1.0601157665252685, "rewards/accuracy_reward": 0.553571455925703, "rewards/cosine_scaled_reward": 0.28829799513332544, "rewards/format_reward": 0.053571431431919336, "rewards/reasoning_steps_reward": 0.6416667148470878, "step": 410 }, { "completion_length": 693.0446723937988, "epoch": 0.8853333333333333, "grad_norm": 0.7482662200927734, "kl": 0.376470947265625, "learning_rate": 1.1579243729307487e-07, "loss": 0.0151, "reward": 1.516674379259348, "reward_std": 0.9749270871281623, "rewards/accuracy_reward": 0.560714315250516, "rewards/cosine_scaled_reward": 0.27411481700837614, "rewards/format_reward": 0.043750001955777405, "rewards/reasoning_steps_reward": 0.638095286488533, "step": 415 }, { "completion_length": 708.925032043457, "epoch": 0.896, "grad_norm": 0.38554155826568604, "kl": 0.4101318359375, "learning_rate": 9.519871314899092e-08, "loss": 0.0164, "reward": 1.5347512325271964, "reward_std": 1.034306138008833, "rewards/accuracy_reward": 0.585714316368103, "rewards/cosine_scaled_reward": 0.2793940259842202, "rewards/format_reward": 0.05000000260770321, "rewards/reasoning_steps_reward": 0.6196429081261158, "step": 420 }, { "completion_length": 692.3571731567383, "epoch": 0.9066666666666666, "grad_norm": 0.390541672706604, "kl": 0.294134521484375, "learning_rate": 7.656040910844358e-08, "loss": 0.0118, "reward": 1.7413318648934364, "reward_std": 0.9963843055069447, "rewards/accuracy_reward": 0.6285714589059352, "rewards/cosine_scaled_reward": 0.3463913181563839, "rewards/format_reward": 0.04732143124565482, "rewards/reasoning_steps_reward": 0.7190476730465889, "step": 425 }, { "completion_length": 683.8750282287598, "epoch": 0.9173333333333333, "grad_norm": 0.5177262425422668, "kl": 0.330364990234375, "learning_rate": 5.990346885098235e-08, "loss": 0.0132, "reward": 1.6970172494649887, "reward_std": 1.0683425880968571, "rewards/accuracy_reward": 0.6142857454717159, "rewards/cosine_scaled_reward": 0.3476124212145805, "rewards/format_reward": 0.057142860256135464, "rewards/reasoning_steps_reward": 0.6779762372374535, "step": 430 }, { "completion_length": 693.9232406616211, "epoch": 0.928, "grad_norm": 0.41641440987586975, "kl": 0.335888671875, "learning_rate": 4.5251078087033493e-08, "loss": 0.0134, "reward": 1.7540825940668583, "reward_std": 1.0200565621256827, "rewards/accuracy_reward": 0.6160714615136385, "rewards/cosine_scaled_reward": 0.35378490211442115, "rewards/format_reward": 0.06875000363215804, "rewards/reasoning_steps_reward": 0.7154762402176857, "step": 435 }, { "completion_length": 684.9786003112793, "epoch": 0.9386666666666666, "grad_norm": 0.6882645487785339, "kl": 0.365093994140625, "learning_rate": 3.262363228443427e-08, "loss": 0.0146, "reward": 1.6049893379211426, "reward_std": 0.9915731698274612, "rewards/accuracy_reward": 0.6035714587196708, "rewards/cosine_scaled_reward": 0.3165964335203171, "rewards/format_reward": 0.04732143105939031, "rewards/reasoning_steps_reward": 0.6375000439584255, "step": 440 }, { "completion_length": 713.3928909301758, "epoch": 0.9493333333333334, "grad_norm": 0.48911258578300476, "kl": 0.3521331787109375, "learning_rate": 2.2038708278862952e-08, "loss": 0.0141, "reward": 1.5449063807725907, "reward_std": 0.9845283433794976, "rewards/accuracy_reward": 0.5500000244006514, "rewards/cosine_scaled_reward": 0.28419203840894625, "rewards/format_reward": 0.05178571678698063, "rewards/reasoning_steps_reward": 0.6589286208152771, "step": 445 }, { "completion_length": 672.1143127441406, "epoch": 0.96, "grad_norm": 0.5151104927062988, "kl": 0.319622802734375, "learning_rate": 1.3511039807673209e-08, "loss": 0.0128, "reward": 1.7190548315644265, "reward_std": 1.052689327299595, "rewards/accuracy_reward": 0.6339285988360643, "rewards/cosine_scaled_reward": 0.3443523827940226, "rewards/format_reward": 0.0562500024214387, "rewards/reasoning_steps_reward": 0.6845238626003265, "step": 450 }, { "completion_length": 676.0714645385742, "epoch": 0.9706666666666667, "grad_norm": 0.6873491406440735, "kl": 0.286529541015625, "learning_rate": 7.0524970011963675e-09, "loss": 0.0115, "reward": 1.8955881476402283, "reward_std": 0.9624031879007816, "rewards/accuracy_reward": 0.682142891176045, "rewards/cosine_scaled_reward": 0.4223737971391529, "rewards/format_reward": 0.07857143282890319, "rewards/reasoning_steps_reward": 0.7125000573694706, "step": 455 }, { "completion_length": 679.2321739196777, "epoch": 0.9813333333333333, "grad_norm": 0.3787095546722412, "kl": 0.304974365234375, "learning_rate": 2.6720698600553595e-09, "loss": 0.0122, "reward": 1.7936133489012718, "reward_std": 1.0248655170202254, "rewards/accuracy_reward": 0.6535714577883482, "rewards/cosine_scaled_reward": 0.38111327985534443, "rewards/format_reward": 0.08214286155998707, "rewards/reasoning_steps_reward": 0.6767857633531094, "step": 460 }, { "completion_length": 696.1339584350586, "epoch": 0.992, "grad_norm": 0.40489259362220764, "kl": 0.3529052734375, "learning_rate": 3.7585574148779613e-10, "loss": 0.0141, "reward": 1.6771088674664498, "reward_std": 1.0866830073297025, "rewards/accuracy_reward": 0.5982143137603998, "rewards/cosine_scaled_reward": 0.3318706821650267, "rewards/format_reward": 0.053571431525051595, "rewards/reasoning_steps_reward": 0.6934524282813073, "step": 465 }, { "completion_length": 688.1964645385742, "epoch": 0.9984, "kl": 0.2928059895833333, "reward": 1.8073695426185925, "reward_std": 1.0462930103143055, "rewards/accuracy_reward": 0.6517857536673546, "rewards/cosine_scaled_reward": 0.40607976416746777, "rewards/format_reward": 0.049107145673284926, "rewards/reasoning_steps_reward": 0.700396885474523, "step": 468, "total_flos": 0.0, "train_loss": 2.683533102224817, "train_runtime": 211196.195, "train_samples_per_second": 0.036, "train_steps_per_second": 0.002 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }