Qwen-2.5-7B-Simple-RL / trainer_state.json
shanshanbang's picture
Model save
4189a2c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 100,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 626.9428821563721,
"epoch": 0.010666666666666666,
"grad_norm": 3.135220183260807,
"kl": 0.00012028217315673828,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0,
"reward": 1.1311939403414726,
"reward_std": 0.9199116222560406,
"rewards/accuracy_reward": 0.5946428846567869,
"rewards/cosine_scaled_reward": 0.28655105652287605,
"rewards/format_reward": 0.00714285746216774,
"rewards/reasoning_steps_reward": 0.2428571599535644,
"step": 5
},
{
"completion_length": 610.6661018371582,
"epoch": 0.021333333333333333,
"grad_norm": 5.7259660177649625,
"kl": 0.0001917600631713867,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0,
"reward": 1.2825231447815895,
"reward_std": 0.8677008092403412,
"rewards/accuracy_reward": 0.6535714611411094,
"rewards/cosine_scaled_reward": 0.33728500208817425,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.28988096918910744,
"step": 10
},
{
"completion_length": 609.1125274658203,
"epoch": 0.032,
"grad_norm": 1.0344908113531213,
"kl": 0.00023615360260009766,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0,
"reward": 1.3210706368088723,
"reward_std": 0.8099735792726278,
"rewards/accuracy_reward": 0.6767857499420643,
"rewards/cosine_scaled_reward": 0.3460705999750644,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.29642859203740957,
"step": 15
},
{
"completion_length": 611.9785972595215,
"epoch": 0.042666666666666665,
"grad_norm": 1.2756183417905018,
"kl": 0.0007787942886352539,
"learning_rate": 1.276595744680851e-06,
"loss": 0.0,
"reward": 1.2778369441628457,
"reward_std": 0.7753925062716007,
"rewards/accuracy_reward": 0.6625000327825546,
"rewards/cosine_scaled_reward": 0.35878926496952773,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.2547619211487472,
"step": 20
},
{
"completion_length": 644.8321746826172,
"epoch": 0.05333333333333334,
"grad_norm": 12.666142241785211,
"kl": 0.002669668197631836,
"learning_rate": 1.5957446808510639e-06,
"loss": 0.0001,
"reward": 1.2453887628391385,
"reward_std": 0.7985292233526706,
"rewards/accuracy_reward": 0.641071455925703,
"rewards/cosine_scaled_reward": 0.34419824378564956,
"rewards/format_reward": 0.00357142873108387,
"rewards/reasoning_steps_reward": 0.25654763616621495,
"step": 25
},
{
"completion_length": 664.1107467651367,
"epoch": 0.064,
"grad_norm": 2.039644155087791,
"kl": 0.0030992507934570314,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.0001,
"reward": 1.3286324340850115,
"reward_std": 0.7313086107373238,
"rewards/accuracy_reward": 0.6696428894996643,
"rewards/cosine_scaled_reward": 0.3649419266730547,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.2922619292512536,
"step": 30
},
{
"completion_length": 646.792887878418,
"epoch": 0.07466666666666667,
"grad_norm": 1.110778707013103,
"kl": 0.004967975616455078,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.0002,
"reward": 1.2785038705915213,
"reward_std": 0.732379237562418,
"rewards/accuracy_reward": 0.6642857423052192,
"rewards/cosine_scaled_reward": 0.335646699834615,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.276785734295845,
"step": 35
},
{
"completion_length": 657.1143142700196,
"epoch": 0.08533333333333333,
"grad_norm": 0.9844730773757054,
"kl": 0.014885807037353515,
"learning_rate": 2.553191489361702e-06,
"loss": 0.0006,
"reward": 1.5404972655698657,
"reward_std": 0.6586654607206583,
"rewards/accuracy_reward": 0.7446428902447224,
"rewards/cosine_scaled_reward": 0.4643067395314574,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.329761927947402,
"step": 40
},
{
"completion_length": 617.8857398986817,
"epoch": 0.096,
"grad_norm": 0.7427114343652937,
"kl": 0.010532951354980469,
"learning_rate": 2.872340425531915e-06,
"loss": 0.0004,
"reward": 1.3987550295889377,
"reward_std": 0.6983755130320788,
"rewards/accuracy_reward": 0.7196428939700127,
"rewards/cosine_scaled_reward": 0.39756449486594647,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.28154764492064716,
"step": 45
},
{
"completion_length": 658.5303855895996,
"epoch": 0.10666666666666667,
"grad_norm": 1.6404392956435,
"kl": 0.00817718505859375,
"learning_rate": 2.9996241442585123e-06,
"loss": 0.0003,
"reward": 1.5072809681296349,
"reward_std": 0.7857246264815331,
"rewards/accuracy_reward": 0.7125000357627869,
"rewards/cosine_scaled_reward": 0.41680470630526545,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.37797621842473744,
"step": 50
},
{
"completion_length": 633.0089584350586,
"epoch": 0.11733333333333333,
"grad_norm": 1.4925006192982901,
"kl": 0.005891990661621094,
"learning_rate": 2.9973279301399446e-06,
"loss": 0.0002,
"reward": 1.3728282183408738,
"reward_std": 0.7632799297571182,
"rewards/accuracy_reward": 0.6642857462167739,
"rewards/cosine_scaled_reward": 0.3627091235946864,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.3458333555608988,
"step": 55
},
{
"completion_length": 612.4803871154785,
"epoch": 0.128,
"grad_norm": 0.6602171934295183,
"kl": 0.005200958251953125,
"learning_rate": 2.992947502998804e-06,
"loss": 0.0002,
"reward": 1.550386269390583,
"reward_std": 0.7166643626987934,
"rewards/accuracy_reward": 0.7267857551574707,
"rewards/cosine_scaled_reward": 0.40752905812114476,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.4160714574158192,
"step": 60
},
{
"completion_length": 637.0893135070801,
"epoch": 0.13866666666666666,
"grad_norm": 0.7395014891892754,
"kl": 0.005736923217773438,
"learning_rate": 2.9864889601923268e-06,
"loss": 0.0002,
"reward": 1.548742873966694,
"reward_std": 0.7250724889338016,
"rewards/accuracy_reward": 0.6928571745753288,
"rewards/cosine_scaled_reward": 0.40052852691151203,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.4553571745753288,
"step": 65
},
{
"completion_length": 624.2607398986817,
"epoch": 0.14933333333333335,
"grad_norm": 1.3446269002594227,
"kl": 0.00745697021484375,
"learning_rate": 2.977961291721137e-06,
"loss": 0.0003,
"reward": 1.726818972826004,
"reward_std": 0.7127795048058033,
"rewards/accuracy_reward": 0.7500000286847353,
"rewards/cosine_scaled_reward": 0.4518189021851867,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.5250000316649676,
"step": 70
},
{
"completion_length": 582.6161026000976,
"epoch": 0.16,
"grad_norm": 1.6339747436033418,
"kl": 0.01014862060546875,
"learning_rate": 2.9673763677155655e-06,
"loss": 0.0004,
"reward": 1.7601879060268402,
"reward_std": 0.6540568165481091,
"rewards/accuracy_reward": 0.7571428894996644,
"rewards/cosine_scaled_reward": 0.44530690894462166,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.5577381297945976,
"step": 75
},
{
"completion_length": 604.8535972595215,
"epoch": 0.17066666666666666,
"grad_norm": 0.5255090188980472,
"kl": 0.011274337768554688,
"learning_rate": 2.9547489219129666e-06,
"loss": 0.0005,
"reward": 1.8713448494672775,
"reward_std": 0.6472877942025661,
"rewards/accuracy_reward": 0.7910714626312256,
"rewards/cosine_scaled_reward": 0.46479713870212436,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.6136905208230019,
"step": 80
},
{
"completion_length": 652.2661026000976,
"epoch": 0.18133333333333335,
"grad_norm": 0.9972915077162576,
"kl": 0.030218505859375,
"learning_rate": 2.9400965311490175e-06,
"loss": 0.0012,
"reward": 1.8985693082213402,
"reward_std": 0.6914638102054596,
"rewards/accuracy_reward": 0.7303571823984385,
"rewards/cosine_scaled_reward": 0.4414263550657779,
"rewards/format_reward": 0.00714285746216774,
"rewards/reasoning_steps_reward": 0.7196429148316383,
"step": 85
},
{
"completion_length": 639.8696754455566,
"epoch": 0.192,
"grad_norm": 0.6349460842058491,
"kl": 0.021417236328125,
"learning_rate": 2.9234395908915565e-06,
"loss": 0.0009,
"reward": 1.8425026133656501,
"reward_std": 0.6659979414194822,
"rewards/accuracy_reward": 0.6660714538767933,
"rewards/cosine_scaled_reward": 0.37762160785496235,
"rewards/format_reward": 0.008928571827709675,
"rewards/reasoning_steps_reward": 0.7898810163140297,
"step": 90
},
{
"completion_length": 653.3714622497558,
"epoch": 0.20266666666666666,
"grad_norm": 0.7068217863673739,
"kl": 0.0266326904296875,
"learning_rate": 2.904801286851009e-06,
"loss": 0.0011,
"reward": 2.013348326086998,
"reward_std": 0.6553943566977978,
"rewards/accuracy_reward": 0.7017857439815998,
"rewards/cosine_scaled_reward": 0.41513395444490014,
"rewards/format_reward": 0.06785714626312256,
"rewards/reasoning_steps_reward": 0.8285714961588383,
"step": 95
},
{
"completion_length": 677.8946754455567,
"epoch": 0.21333333333333335,
"grad_norm": 2.9570623395086475,
"kl": 0.0402923583984375,
"learning_rate": 2.884207562706925e-06,
"loss": 0.0016,
"reward": 2.5119238168001177,
"reward_std": 0.8375062063336373,
"rewards/accuracy_reward": 0.7125000339001417,
"rewards/cosine_scaled_reward": 0.4250189420999959,
"rewards/format_reward": 0.5553571665659547,
"rewards/reasoning_steps_reward": 0.8190476760268212,
"step": 100
},
{
"epoch": 0.21333333333333335,
"eval_completion_length": 717.3023177124023,
"eval_kl": 0.045336328125,
"eval_loss": 0.0018938997527584434,
"eval_reward": 2.535167279243469,
"eval_reward_std": 0.8789399468839169,
"eval_rewards/accuracy_reward": 0.5740285987168551,
"eval_rewards/cosine_scaled_reward": 0.29340527021205964,
"eval_rewards/format_reward": 0.8470000272035598,
"eval_rewards/reasoning_steps_reward": 0.8207333937764167,
"eval_runtime": 34268.8498,
"eval_samples_per_second": 0.146,
"eval_steps_per_second": 0.01,
"step": 100
},
{
"completion_length": 724.1053909301758,
"epoch": 0.224,
"grad_norm": 1.1690466064365364,
"kl": 0.1235015869140625,
"learning_rate": 2.8616870839955444e-06,
"loss": 0.0049,
"reward": 2.752804014086723,
"reward_std": 0.8537621341645718,
"rewards/accuracy_reward": 0.6767857506871223,
"rewards/cosine_scaled_reward": 0.4045896364772489,
"rewards/format_reward": 0.8553571686148643,
"rewards/reasoning_steps_reward": 0.816071481257677,
"step": 105
},
{
"completion_length": 768.4839645385742,
"epoch": 0.23466666666666666,
"grad_norm": 14.95234115514868,
"kl": 0.0612457275390625,
"learning_rate": 2.837271198208662e-06,
"loss": 0.0025,
"reward": 2.6233972758054733,
"reward_std": 0.8962277337908745,
"rewards/accuracy_reward": 0.6214286016300321,
"rewards/cosine_scaled_reward": 0.340063818404451,
"rewards/format_reward": 0.8839285984635353,
"rewards/reasoning_steps_reward": 0.7779762476682663,
"step": 110
},
{
"completion_length": 731.7893173217774,
"epoch": 0.24533333333333332,
"grad_norm": 0.7880946164162693,
"kl": 0.163909912109375,
"learning_rate": 2.8109938911593322e-06,
"loss": 0.0066,
"reward": 2.6163390249013903,
"reward_std": 0.9033554136753082,
"rewards/accuracy_reward": 0.6589286033064127,
"rewards/cosine_scaled_reward": 0.3818151192739606,
"rewards/format_reward": 0.7267857484519482,
"rewards/reasoning_steps_reward": 0.8488095819950103,
"step": 115
},
{
"completion_length": 759.9339622497558,
"epoch": 0.256,
"grad_norm": 1.7990639203312269,
"kl": 0.042535400390625,
"learning_rate": 2.7828917396751474e-06,
"loss": 0.0017,
"reward": 2.6672952473163605,
"reward_std": 0.8769947469234467,
"rewards/accuracy_reward": 0.6535714585334063,
"rewards/cosine_scaled_reward": 0.384557047416456,
"rewards/format_reward": 0.7732143148779869,
"rewards/reasoning_steps_reward": 0.8559524476528168,
"step": 120
},
{
"completion_length": 760.0518188476562,
"epoch": 0.26666666666666666,
"grad_norm": 0.525125891329557,
"kl": 0.048138427734375,
"learning_rate": 2.753003860684943e-06,
"loss": 0.0019,
"reward": 3.015795236825943,
"reward_std": 0.7571237944066525,
"rewards/accuracy_reward": 0.7303571730852128,
"rewards/cosine_scaled_reward": 0.47650940530002117,
"rewards/format_reward": 0.9767857253551483,
"rewards/reasoning_steps_reward": 0.832142922282219,
"step": 125
},
{
"completion_length": 787.3678970336914,
"epoch": 0.2773333333333333,
"grad_norm": 1.6324108230600705,
"kl": 0.0462982177734375,
"learning_rate": 2.721371856769793e-06,
"loss": 0.0019,
"reward": 2.96227542757988,
"reward_std": 0.7570989470928907,
"rewards/accuracy_reward": 0.6714286040514708,
"rewards/cosine_scaled_reward": 0.40870389440096916,
"rewards/format_reward": 0.9892857193946838,
"rewards/reasoning_steps_reward": 0.8928571999073028,
"step": 130
},
{
"completion_length": 739.2839584350586,
"epoch": 0.288,
"grad_norm": 1.2279064196254164,
"kl": 0.0569366455078125,
"learning_rate": 2.688039758254093e-06,
"loss": 0.0023,
"reward": 3.0940194368362426,
"reward_std": 0.7407944872975349,
"rewards/accuracy_reward": 0.748214321769774,
"rewards/cosine_scaled_reward": 0.4910431296331808,
"rewards/format_reward": 0.9946428596973419,
"rewards/reasoning_steps_reward": 0.8601191073656083,
"step": 135
},
{
"completion_length": 742.6107498168946,
"epoch": 0.2986666666666667,
"grad_norm": 1.142989559294876,
"kl": 0.0421722412109375,
"learning_rate": 2.65305396191733e-06,
"loss": 0.0017,
"reward": 3.0360836148262025,
"reward_std": 0.7486013866961002,
"rewards/accuracy_reward": 0.7267857477068901,
"rewards/cosine_scaled_reward": 0.44679776607081295,
"rewards/format_reward": 0.9785714343190193,
"rewards/reasoning_steps_reward": 0.8839286372065545,
"step": 140
},
{
"completion_length": 708.2089630126953,
"epoch": 0.30933333333333335,
"grad_norm": 0.5591745022956734,
"kl": 0.0389068603515625,
"learning_rate": 2.61646316641186e-06,
"loss": 0.0016,
"reward": 3.039272406697273,
"reward_std": 0.6369190786033869,
"rewards/accuracy_reward": 0.7375000305473804,
"rewards/cosine_scaled_reward": 0.4565341799054295,
"rewards/format_reward": 0.9928571462631226,
"rewards/reasoning_steps_reward": 0.85238102003932,
"step": 145
},
{
"completion_length": 693.1196723937989,
"epoch": 0.32,
"grad_norm": 0.8445904699258806,
"kl": 0.04276123046875,
"learning_rate": 2.5783183044765715e-06,
"loss": 0.0017,
"reward": 3.065075045824051,
"reward_std": 0.6227528784424067,
"rewards/accuracy_reward": 0.7553571730852127,
"rewards/cosine_scaled_reward": 0.48888447135686874,
"rewards/format_reward": 0.980357152223587,
"rewards/reasoning_steps_reward": 0.8404762536287308,
"step": 150
},
{
"completion_length": 666.7178871154786,
"epoch": 0.33066666666666666,
"grad_norm": 0.6870106883522581,
"kl": 0.041412353515625,
"learning_rate": 2.5386724720408135e-06,
"loss": 0.0017,
"reward": 3.156706044077873,
"reward_std": 0.6447003319859504,
"rewards/accuracy_reward": 0.798214316740632,
"rewards/cosine_scaled_reward": 0.5305154274217785,
"rewards/format_reward": 0.9821428656578064,
"rewards/reasoning_steps_reward": 0.8458333984017372,
"step": 155
},
{
"completion_length": 723.2357452392578,
"epoch": 0.3413333333333333,
"grad_norm": 0.598319397397006,
"kl": 0.0435455322265625,
"learning_rate": 2.49758085431725e-06,
"loss": 0.0017,
"reward": 3.0701630294322966,
"reward_std": 0.6298764709383249,
"rewards/accuracy_reward": 0.7410714581608773,
"rewards/cosine_scaled_reward": 0.463615276478231,
"rewards/format_reward": 0.9696428686380386,
"rewards/reasoning_steps_reward": 0.895833395421505,
"step": 160
},
{
"completion_length": 799.8250381469727,
"epoch": 0.352,
"grad_norm": 0.6301042631581175,
"kl": 0.0475830078125,
"learning_rate": 2.455100648986533e-06,
"loss": 0.0019,
"reward": 2.9756604552268984,
"reward_std": 0.7367240894585848,
"rewards/accuracy_reward": 0.6785714585334063,
"rewards/cosine_scaled_reward": 0.4125651277601719,
"rewards/format_reward": 0.9535714492201806,
"rewards/reasoning_steps_reward": 0.9309524342417717,
"step": 165
},
{
"completion_length": 826.1500366210937,
"epoch": 0.3626666666666667,
"grad_norm": 0.8610873292738485,
"kl": 0.054132080078125,
"learning_rate": 2.4112909865807053e-06,
"loss": 0.0022,
"reward": 2.680763456225395,
"reward_std": 0.8146794062107802,
"rewards/accuracy_reward": 0.5303571671247482,
"rewards/cosine_scaled_reward": 0.2390966679668054,
"rewards/format_reward": 0.9714285835623742,
"rewards/reasoning_steps_reward": 0.9398810014128685,
"step": 170
},
{
"completion_length": 800.7339706420898,
"epoch": 0.37333333333333335,
"grad_norm": 0.7079176746358598,
"kl": 0.067926025390625,
"learning_rate": 2.366212848176164e-06,
"loss": 0.0027,
"reward": 2.7596707075834273,
"reward_std": 0.8444030195474624,
"rewards/accuracy_reward": 0.5714285975322128,
"rewards/cosine_scaled_reward": 0.3090753515250981,
"rewards/format_reward": 0.9732142984867096,
"rewards/reasoning_steps_reward": 0.9059524446725845,
"step": 175
},
{
"completion_length": 778.9928909301758,
"epoch": 0.384,
"grad_norm": 1.1001119066854874,
"kl": 0.0932373046875,
"learning_rate": 2.319928980510752e-06,
"loss": 0.0037,
"reward": 2.765022465586662,
"reward_std": 0.8211875937879085,
"rewards/accuracy_reward": 0.5821428883820772,
"rewards/cosine_scaled_reward": 0.32335569793358443,
"rewards/format_reward": 0.9732142984867096,
"rewards/reasoning_steps_reward": 0.8863095760345459,
"step": 180
},
{
"completion_length": 792.8321823120117,
"epoch": 0.39466666666666667,
"grad_norm": 0.621075404533647,
"kl": 0.170367431640625,
"learning_rate": 2.272503808643123e-06,
"loss": 0.0068,
"reward": 2.6604900896549224,
"reward_std": 0.7879154846072197,
"rewards/accuracy_reward": 0.5410714561119676,
"rewards/cosine_scaled_reward": 0.26644235964631663,
"rewards/format_reward": 0.9892857193946838,
"rewards/reasoning_steps_reward": 0.8636905372142791,
"step": 185
},
{
"completion_length": 734.221459197998,
"epoch": 0.4053333333333333,
"grad_norm": 2.6622395485180066,
"kl": 0.1951171875,
"learning_rate": 2.2240033462759628e-06,
"loss": 0.0078,
"reward": 2.9028186976909636,
"reward_std": 0.7853110164403916,
"rewards/accuracy_reward": 0.6750000350177288,
"rewards/cosine_scaled_reward": 0.41293763257563115,
"rewards/format_reward": 0.9767857238650322,
"rewards/reasoning_steps_reward": 0.8380952909588814,
"step": 190
},
{
"completion_length": 757.7036010742188,
"epoch": 0.416,
"grad_norm": 1.9232019767182387,
"kl": 0.197918701171875,
"learning_rate": 2.1744951038678905e-06,
"loss": 0.0079,
"reward": 2.847309911251068,
"reward_std": 0.8383581660687923,
"rewards/accuracy_reward": 0.6553571727126837,
"rewards/cosine_scaled_reward": 0.3895717078819871,
"rewards/format_reward": 0.9660714402794838,
"rewards/reasoning_steps_reward": 0.836309588700533,
"step": 195
},
{
"completion_length": 704.6071769714356,
"epoch": 0.4266666666666667,
"grad_norm": 2.187135193267794,
"kl": 0.2211669921875,
"learning_rate": 2.124047994661941e-06,
"loss": 0.0088,
"reward": 2.8649386018514633,
"reward_std": 0.8465140253305435,
"rewards/accuracy_reward": 0.6785714600235224,
"rewards/cosine_scaled_reward": 0.4107717891223729,
"rewards/format_reward": 0.9767857253551483,
"rewards/reasoning_steps_reward": 0.7988095879554749,
"step": 200
},
{
"epoch": 0.4266666666666667,
"eval_completion_length": 726.0360606079101,
"eval_kl": 0.4157892578125,
"eval_loss": 0.016393402591347694,
"eval_reward": 2.668296795749664,
"eval_reward_std": 0.848984938377142,
"eval_rewards/accuracy_reward": 0.5848571698188781,
"eval_rewards/cosine_scaled_reward": 0.31035383035842096,
"eval_rewards/format_reward": 0.9741428675889969,
"eval_rewards/reasoning_steps_reward": 0.7989429181694985,
"eval_runtime": 34304.9807,
"eval_samples_per_second": 0.146,
"eval_steps_per_second": 0.01,
"step": 200
},
{
"completion_length": 711.1785987854004,
"epoch": 0.43733333333333335,
"grad_norm": 13.425767555167974,
"kl": 0.522259521484375,
"learning_rate": 2.072732238761434e-06,
"loss": 0.0209,
"reward": 2.5607248514890673,
"reward_std": 0.8997476093471051,
"rewards/accuracy_reward": 0.5821428846567869,
"rewards/cosine_scaled_reward": 0.29524860471719877,
"rewards/format_reward": 0.9446428805589676,
"rewards/reasoning_steps_reward": 0.7386905312538147,
"step": 205
},
{
"completion_length": 703.9107421875,
"epoch": 0.448,
"grad_norm": 1.9515561067024476,
"kl": 0.407659912109375,
"learning_rate": 2.0206192653867536e-06,
"loss": 0.0163,
"reward": 2.5372259259223937,
"reward_std": 1.0205993868410588,
"rewards/accuracy_reward": 0.6178571723401547,
"rewards/cosine_scaled_reward": 0.334844835329568,
"rewards/format_reward": 0.887500025331974,
"rewards/reasoning_steps_reward": 0.6970238521695137,
"step": 210
},
{
"completion_length": 743.6161041259766,
"epoch": 0.45866666666666667,
"grad_norm": 3.1537839868905078,
"kl": 0.660333251953125,
"learning_rate": 1.967781613449095e-06,
"loss": 0.0264,
"reward": 1.9156419575214385,
"reward_std": 1.1042504251003264,
"rewards/accuracy_reward": 0.4267857324331999,
"rewards/cosine_scaled_reward": 0.1459990169329103,
"rewards/format_reward": 0.7571428924798965,
"rewards/reasoning_steps_reward": 0.5857143320143223,
"step": 215
},
{
"completion_length": 776.1553909301758,
"epoch": 0.4693333333333333,
"grad_norm": 4.038362237813945,
"kl": 0.3061279296875,
"learning_rate": 1.9142928305795637e-06,
"loss": 0.0122,
"reward": 2.4867064505815506,
"reward_std": 0.9735465314239263,
"rewards/accuracy_reward": 0.5696428786963225,
"rewards/cosine_scaled_reward": 0.2962301469407976,
"rewards/format_reward": 0.9053571730852127,
"rewards/reasoning_steps_reward": 0.7154762424528599,
"step": 220
},
{
"completion_length": 729.4678955078125,
"epoch": 0.48,
"grad_norm": 3.1439180898202466,
"kl": 0.28773193359375,
"learning_rate": 1.8602273707541886e-06,
"loss": 0.0115,
"reward": 2.7534320563077928,
"reward_std": 0.9354146108031273,
"rewards/accuracy_reward": 0.671428600884974,
"rewards/cosine_scaled_reward": 0.4331938370829448,
"rewards/format_reward": 0.9357143059372902,
"rewards/reasoning_steps_reward": 0.7130952820181846,
"step": 225
},
{
"completion_length": 753.7571792602539,
"epoch": 0.49066666666666664,
"grad_norm": 9.742972287112453,
"kl": 0.672515869140625,
"learning_rate": 1.8056604906573418e-06,
"loss": 0.0269,
"reward": 2.4900094658136367,
"reward_std": 1.071580182760954,
"rewards/accuracy_reward": 0.6053571721538902,
"rewards/cosine_scaled_reward": 0.3417950821574777,
"rewards/format_reward": 0.900000023841858,
"rewards/reasoning_steps_reward": 0.6428571917116642,
"step": 230
},
{
"completion_length": 769.7821792602539,
"epoch": 0.5013333333333333,
"grad_norm": 12.748394988794061,
"kl": 0.615447998046875,
"learning_rate": 1.7506681449278226e-06,
"loss": 0.0246,
"reward": 2.4423232048749925,
"reward_std": 1.0899774149060248,
"rewards/accuracy_reward": 0.6035714549943805,
"rewards/cosine_scaled_reward": 0.3429183505475521,
"rewards/format_reward": 0.8517857506871224,
"rewards/reasoning_steps_reward": 0.6440476708114147,
"step": 235
},
{
"completion_length": 721.4928901672363,
"epoch": 0.512,
"grad_norm": 316.5564758948846,
"kl": 2.6417724609375,
"learning_rate": 1.6953268804334257e-06,
"loss": 0.1056,
"reward": 2.6406149938702583,
"reward_std": 0.9681473188102245,
"rewards/accuracy_reward": 0.6732143137603999,
"rewards/cosine_scaled_reward": 0.43644821029156444,
"rewards/format_reward": 0.833928607404232,
"rewards/reasoning_steps_reward": 0.6970238670706749,
"step": 240
},
{
"completion_length": 699.0089614868164,
"epoch": 0.5226666666666666,
"grad_norm": 144932.71498560338,
"kl": 41.92372436523438,
"learning_rate": 1.6397137297211436e-06,
"loss": 1.677,
"reward": 2.9064596563577654,
"reward_std": 0.8081207755953074,
"rewards/accuracy_reward": 0.7678571745753289,
"rewards/cosine_scaled_reward": 0.5046738618053496,
"rewards/format_reward": 0.9178571611642837,
"rewards/reasoning_steps_reward": 0.7160714864730835,
"step": 245
},
{
"completion_length": 719.6250274658203,
"epoch": 0.5333333333333333,
"grad_norm": 12.128586542505051,
"kl": 13.330413818359375,
"learning_rate": 1.5839061037913395e-06,
"loss": 0.5321,
"reward": 2.8467082887887956,
"reward_std": 0.8377620510756969,
"rewards/accuracy_reward": 0.7500000335276127,
"rewards/cosine_scaled_reward": 0.4984938623383641,
"rewards/format_reward": 0.8803571656346321,
"rewards/reasoning_steps_reward": 0.7178571961820126,
"step": 250
},
{
"completion_length": 740.6482467651367,
"epoch": 0.544,
"grad_norm": 21.125458441852672,
"kl": 0.171630859375,
"learning_rate": 1.527981684345115e-06,
"loss": 0.0069,
"reward": 2.7478116720914842,
"reward_std": 0.8047603815793991,
"rewards/accuracy_reward": 0.6946428874507546,
"rewards/cosine_scaled_reward": 0.4323353324783966,
"rewards/format_reward": 0.9214285910129547,
"rewards/reasoning_steps_reward": 0.6994048036634922,
"step": 255
},
{
"completion_length": 741.4857482910156,
"epoch": 0.5546666666666666,
"grad_norm": 24.989348854776196,
"kl": 0.7936920166015625,
"learning_rate": 1.4720183156548855e-06,
"loss": 0.0317,
"reward": 2.8889215648174287,
"reward_std": 0.8046084839850665,
"rewards/accuracy_reward": 0.7053571708500386,
"rewards/cosine_scaled_reward": 0.4633262232731795,
"rewards/format_reward": 0.9500000178813934,
"rewards/reasoning_steps_reward": 0.7702381610870361,
"step": 260
},
{
"completion_length": 742.701823425293,
"epoch": 0.5653333333333334,
"grad_norm": 31.341677037958075,
"kl": 0.67542724609375,
"learning_rate": 1.4160938962086612e-06,
"loss": 0.027,
"reward": 2.8618105918169023,
"reward_std": 0.756641275063157,
"rewards/accuracy_reward": 0.7321428785100579,
"rewards/cosine_scaled_reward": 0.4826438320800662,
"rewards/format_reward": 0.9267857372760773,
"rewards/reasoning_steps_reward": 0.7202381417155266,
"step": 265
},
{
"completion_length": 741.5803863525391,
"epoch": 0.576,
"grad_norm": 208.94608010239415,
"kl": 0.9906982421875,
"learning_rate": 1.3602862702788567e-06,
"loss": 0.0396,
"reward": 2.706931698322296,
"reward_std": 0.9525154523551465,
"rewards/accuracy_reward": 0.6732143200933933,
"rewards/cosine_scaled_reward": 0.43609824670711533,
"rewards/format_reward": 0.8946428880095482,
"rewards/reasoning_steps_reward": 0.7029762521386147,
"step": 270
},
{
"completion_length": 732.1143157958984,
"epoch": 0.5866666666666667,
"grad_norm": 38.275566557137644,
"kl": 1.51126708984375,
"learning_rate": 1.3046731195665748e-06,
"loss": 0.0605,
"reward": 2.7598373025655745,
"reward_std": 0.9775024671107531,
"rewards/accuracy_reward": 0.7053571678698063,
"rewards/cosine_scaled_reward": 0.45864667696878314,
"rewards/format_reward": 0.8839286029338836,
"rewards/reasoning_steps_reward": 0.7119048096239566,
"step": 275
},
{
"completion_length": 766.3411087036133,
"epoch": 0.5973333333333334,
"grad_norm": 34.400362223993966,
"kl": 0.446759033203125,
"learning_rate": 1.2493318550721775e-06,
"loss": 0.0179,
"reward": 2.80095117688179,
"reward_std": 0.9185693945735693,
"rewards/accuracy_reward": 0.701785746216774,
"rewards/cosine_scaled_reward": 0.44499868620187044,
"rewards/format_reward": 0.9017857417464257,
"rewards/reasoning_steps_reward": 0.7523810148239136,
"step": 280
},
{
"completion_length": 745.8839614868164,
"epoch": 0.608,
"grad_norm": 70.23076333110882,
"kl": 3.24036865234375,
"learning_rate": 1.1943395093426585e-06,
"loss": 0.1296,
"reward": 2.9303120017051696,
"reward_std": 0.7858256082981825,
"rewards/accuracy_reward": 0.7339286036789417,
"rewards/cosine_scaled_reward": 0.5118595488369465,
"rewards/format_reward": 0.9214285984635353,
"rewards/reasoning_steps_reward": 0.7630952894687653,
"step": 285
},
{
"completion_length": 729.6464584350585,
"epoch": 0.6186666666666667,
"grad_norm": 51.01572887577541,
"kl": 2.792974853515625,
"learning_rate": 1.1397726292458115e-06,
"loss": 0.1118,
"reward": 2.8507910460233687,
"reward_std": 0.8466649554669857,
"rewards/accuracy_reward": 0.7142857432365417,
"rewards/cosine_scaled_reward": 0.46924334414070473,
"rewards/format_reward": 0.9303571686148644,
"rewards/reasoning_steps_reward": 0.7369048215448857,
"step": 290
},
{
"completion_length": 758.4518173217773,
"epoch": 0.6293333333333333,
"grad_norm": 21.800940990494052,
"kl": 3.139178466796875,
"learning_rate": 1.085707169420437e-06,
"loss": 0.1259,
"reward": 2.687264183163643,
"reward_std": 0.8229550156742335,
"rewards/accuracy_reward": 0.6535714585334063,
"rewards/cosine_scaled_reward": 0.4211926580406725,
"rewards/format_reward": 0.8946428805589676,
"rewards/reasoning_steps_reward": 0.7178571917116642,
"step": 295
},
{
"completion_length": 736.9268188476562,
"epoch": 0.64,
"grad_norm": 8.05056275297732,
"kl": 0.720037841796875,
"learning_rate": 1.0322183865509054e-06,
"loss": 0.0288,
"reward": 2.7745183438062666,
"reward_std": 0.9824469141662121,
"rewards/accuracy_reward": 0.7232143133878708,
"rewards/cosine_scaled_reward": 0.46856586267240347,
"rewards/format_reward": 0.9071428790688515,
"rewards/reasoning_steps_reward": 0.6755952909588814,
"step": 300
},
{
"epoch": 0.64,
"eval_completion_length": 756.2787765625,
"eval_kl": 2.815593212890625,
"eval_loss": 0.11271046847105026,
"eval_reward": 2.6679908989191055,
"eval_reward_std": 0.9292156134605408,
"eval_rewards/accuracy_reward": 0.6384000294238329,
"eval_rewards/cosine_scaled_reward": 0.386524124888191,
"eval_rewards/format_reward": 0.9171143096089364,
"eval_rewards/reasoning_steps_reward": 0.7259524360120296,
"eval_runtime": 34511.2614,
"eval_samples_per_second": 0.145,
"eval_steps_per_second": 0.01,
"step": 300
},
{
"completion_length": 760.3768211364746,
"epoch": 0.6506666666666666,
"grad_norm": 16.434699024651234,
"kl": 0.956842041015625,
"learning_rate": 9.793807346132464e-07,
"loss": 0.0383,
"reward": 2.901669743657112,
"reward_std": 0.8654729023575782,
"rewards/accuracy_reward": 0.7464286040514707,
"rewards/cosine_scaled_reward": 0.5034553268924356,
"rewards/format_reward": 0.9392857357859612,
"rewards/reasoning_steps_reward": 0.7125000521540642,
"step": 305
},
{
"completion_length": 780.1018203735351,
"epoch": 0.6613333333333333,
"grad_norm": 9.30628937439928,
"kl": 0.76141357421875,
"learning_rate": 9.272677612385667e-07,
"loss": 0.0305,
"reward": 2.715619903802872,
"reward_std": 0.9227011248469352,
"rewards/accuracy_reward": 0.6767857443541289,
"rewards/cosine_scaled_reward": 0.4328816962428391,
"rewards/format_reward": 0.9000000298023224,
"rewards/reasoning_steps_reward": 0.7059524297714234,
"step": 310
},
{
"completion_length": 768.9500411987304,
"epoch": 0.672,
"grad_norm": 1.7983007811573581,
"kl": 0.573968505859375,
"learning_rate": 8.759520053380591e-07,
"loss": 0.023,
"reward": 2.786320286989212,
"reward_std": 0.7833445437252522,
"rewards/accuracy_reward": 0.6910714630037547,
"rewards/cosine_scaled_reward": 0.4357249645516276,
"rewards/format_reward": 0.9464285910129547,
"rewards/reasoning_steps_reward": 0.7130952931940555,
"step": 315
},
{
"completion_length": 723.1536033630371,
"epoch": 0.6826666666666666,
"grad_norm": 12.691694660091667,
"kl": 0.486602783203125,
"learning_rate": 8.255048961321088e-07,
"loss": 0.0194,
"reward": 2.831101644039154,
"reward_std": 0.8475235715508461,
"rewards/accuracy_reward": 0.7071428868919611,
"rewards/cosine_scaled_reward": 0.46919678517151625,
"rewards/format_reward": 0.930357164144516,
"rewards/reasoning_steps_reward": 0.7244048148393631,
"step": 320
},
{
"completion_length": 723.8268165588379,
"epoch": 0.6933333333333334,
"grad_norm": 4.135287895650402,
"kl": 1.668939208984375,
"learning_rate": 7.759966537240373e-07,
"loss": 0.0664,
"reward": 3.0232764929533005,
"reward_std": 0.7830160673707723,
"rewards/accuracy_reward": 0.7607143174856901,
"rewards/cosine_scaled_reward": 0.5054192344192415,
"rewards/format_reward": 0.9696428701281548,
"rewards/reasoning_steps_reward": 0.7875000596046448,
"step": 325
},
{
"completion_length": 743.2911071777344,
"epoch": 0.704,
"grad_norm": 2.428880550991712,
"kl": 0.23929443359375,
"learning_rate": 7.274961913568773e-07,
"loss": 0.0096,
"reward": 2.980492576956749,
"reward_std": 0.7519855977967381,
"rewards/accuracy_reward": 0.7517857424914837,
"rewards/cosine_scaled_reward": 0.5114448417443782,
"rewards/format_reward": 0.9589285865426064,
"rewards/reasoning_steps_reward": 0.7583333984017372,
"step": 330
},
{
"completion_length": 777.944676208496,
"epoch": 0.7146666666666667,
"grad_norm": 10.667405755387346,
"kl": 0.491375732421875,
"learning_rate": 6.800710194892484e-07,
"loss": 0.0197,
"reward": 2.9669879227876663,
"reward_std": 0.7909464325755835,
"rewards/accuracy_reward": 0.7285714603960514,
"rewards/cosine_scaled_reward": 0.4985354314424967,
"rewards/format_reward": 0.9696428716182709,
"rewards/reasoning_steps_reward": 0.7702381551265717,
"step": 335
},
{
"completion_length": 783.7107513427734,
"epoch": 0.7253333333333334,
"grad_norm": 20.401487806762606,
"kl": 1.50914306640625,
"learning_rate": 6.33787151823836e-07,
"loss": 0.0604,
"reward": 2.733260214328766,
"reward_std": 0.9950653843581676,
"rewards/accuracy_reward": 0.6535714577883482,
"rewards/cosine_scaled_reward": 0.4219506177818403,
"rewards/format_reward": 0.9071428775787354,
"rewards/reasoning_steps_reward": 0.7505952984094619,
"step": 340
},
{
"completion_length": 747.0571807861328,
"epoch": 0.736,
"grad_norm": 3.8988932597998756,
"kl": 0.765008544921875,
"learning_rate": 5.887090134192947e-07,
"loss": 0.0306,
"reward": 2.950327825546265,
"reward_std": 0.7912764519453048,
"rewards/accuracy_reward": 0.7142857464030385,
"rewards/cosine_scaled_reward": 0.4931848540902138,
"rewards/format_reward": 0.9553571596741677,
"rewards/reasoning_steps_reward": 0.7875000700354576,
"step": 345
},
{
"completion_length": 768.9161071777344,
"epoch": 0.7466666666666667,
"grad_norm": 5.964808758728403,
"kl": 0.8441162109375,
"learning_rate": 5.448993510134669e-07,
"loss": 0.0338,
"reward": 2.7927453070878983,
"reward_std": 0.8378362115472555,
"rewards/accuracy_reward": 0.6892857491970062,
"rewards/cosine_scaled_reward": 0.44631660780869425,
"rewards/format_reward": 0.9250000208616257,
"rewards/reasoning_steps_reward": 0.7321429125964641,
"step": 350
},
{
"completion_length": 771.8000366210938,
"epoch": 0.7573333333333333,
"grad_norm": 19.67075460524417,
"kl": 1.37095947265625,
"learning_rate": 5.024191456827498e-07,
"loss": 0.0549,
"reward": 2.757797637581825,
"reward_std": 0.9828206066042184,
"rewards/accuracy_reward": 0.6946428813040256,
"rewards/cosine_scaled_reward": 0.4447022658772767,
"rewards/format_reward": 0.900000023841858,
"rewards/reasoning_steps_reward": 0.7184524320065975,
"step": 355
},
{
"completion_length": 764.0839599609375,
"epoch": 0.768,
"grad_norm": 20.908101755536627,
"kl": 1.678790283203125,
"learning_rate": 4.6132752795918667e-07,
"loss": 0.0672,
"reward": 2.710779735445976,
"reward_std": 0.9775115817785263,
"rewards/accuracy_reward": 0.6642857462167739,
"rewards/cosine_scaled_reward": 0.40601770151406524,
"rewards/format_reward": 0.9107143133878708,
"rewards/reasoning_steps_reward": 0.7297619603574276,
"step": 360
},
{
"completion_length": 763.6000320434571,
"epoch": 0.7786666666666666,
"grad_norm": 12.87435288128971,
"kl": 2.571148681640625,
"learning_rate": 4.2168169552342905e-07,
"loss": 0.1028,
"reward": 2.6531503438949584,
"reward_std": 0.9446496672928333,
"rewards/accuracy_reward": 0.6321428865194321,
"rewards/cosine_scaled_reward": 0.38410261063836515,
"rewards/format_reward": 0.9035714536905288,
"rewards/reasoning_steps_reward": 0.7333333849906921,
"step": 365
},
{
"completion_length": 743.5286102294922,
"epoch": 0.7893333333333333,
"grad_norm": 9.936371570504198,
"kl": 1.2698486328125,
"learning_rate": 3.8353683358814046e-07,
"loss": 0.0508,
"reward": 2.7013536602258683,
"reward_std": 0.862298522144556,
"rewards/accuracy_reward": 0.6571428874507547,
"rewards/cosine_scaled_reward": 0.4073059491813183,
"rewards/format_reward": 0.9000000208616257,
"rewards/reasoning_steps_reward": 0.7369048207998276,
"step": 370
},
{
"completion_length": 759.3643180847168,
"epoch": 0.8,
"grad_norm": 23.218570111303308,
"kl": 2.2372650146484374,
"learning_rate": 3.469460380826697e-07,
"loss": 0.0895,
"reward": 2.7877288803458216,
"reward_std": 0.9023670472204686,
"rewards/accuracy_reward": 0.6785714600235224,
"rewards/cosine_scaled_reward": 0.4454668462276459,
"rewards/format_reward": 0.9142857402563095,
"rewards/reasoning_steps_reward": 0.7494048178195953,
"step": 375
},
{
"completion_length": 747.8786026000977,
"epoch": 0.8106666666666666,
"grad_norm": 5.9371726712745065,
"kl": 1.312139892578125,
"learning_rate": 3.119602417459075e-07,
"loss": 0.0525,
"reward": 2.8006283044815063,
"reward_std": 0.8084595888853073,
"rewards/accuracy_reward": 0.6607143096625805,
"rewards/cosine_scaled_reward": 0.4244377123657614,
"rewards/format_reward": 0.9464285865426063,
"rewards/reasoning_steps_reward": 0.7690476909279823,
"step": 380
},
{
"completion_length": 746.4321762084961,
"epoch": 0.8213333333333334,
"grad_norm": 5.13518428447357,
"kl": 0.994195556640625,
"learning_rate": 2.786281432302071e-07,
"loss": 0.0397,
"reward": 2.861383581161499,
"reward_std": 0.8250645313411951,
"rewards/accuracy_reward": 0.7000000301748515,
"rewards/cosine_scaled_reward": 0.44947873409837485,
"rewards/format_reward": 0.9428571656346321,
"rewards/reasoning_steps_reward": 0.7690476924180984,
"step": 385
},
{
"completion_length": 748.0875358581543,
"epoch": 0.832,
"grad_norm": 4.45042845549648,
"kl": 1.18310546875,
"learning_rate": 2.46996139315057e-07,
"loss": 0.0474,
"reward": 2.8951289474964144,
"reward_std": 0.8875481400638818,
"rewards/accuracy_reward": 0.7357143111526966,
"rewards/cosine_scaled_reward": 0.4856050438596867,
"rewards/format_reward": 0.9357143044471741,
"rewards/reasoning_steps_reward": 0.738095298409462,
"step": 390
},
{
"completion_length": 742.2714614868164,
"epoch": 0.8426666666666667,
"grad_norm": 9.790655066948279,
"kl": 1.13563232421875,
"learning_rate": 2.1710826032485286e-07,
"loss": 0.0454,
"reward": 2.8695475578308107,
"reward_std": 0.8187286149710417,
"rewards/accuracy_reward": 0.7142857439815998,
"rewards/cosine_scaled_reward": 0.4582379271276295,
"rewards/format_reward": 0.9464285880327225,
"rewards/reasoning_steps_reward": 0.7505952954292298,
"step": 395
},
{
"completion_length": 770.3428955078125,
"epoch": 0.8533333333333334,
"grad_norm": 4.399010377251051,
"kl": 1.116912841796875,
"learning_rate": 1.8900610884066817e-07,
"loss": 0.0447,
"reward": 2.7512567728757857,
"reward_std": 0.9337207470089197,
"rewards/accuracy_reward": 0.651785746961832,
"rewards/cosine_scaled_reward": 0.4012566165998578,
"rewards/format_reward": 0.9428571671247482,
"rewards/reasoning_steps_reward": 0.7553571984171867,
"step": 400
},
{
"epoch": 0.8533333333333334,
"eval_completion_length": 760.9953487792968,
"eval_kl": 1.14858369140625,
"eval_loss": 0.045983318239450455,
"eval_reward": 2.729253951013088,
"eval_reward_std": 0.8810034032523633,
"eval_rewards/accuracy_reward": 0.6346000292986631,
"eval_rewards/cosine_scaled_reward": 0.385330027777364,
"eval_rewards/format_reward": 0.9519428731679916,
"eval_rewards/reasoning_steps_reward": 0.7573810091674328,
"eval_runtime": 34525.1878,
"eval_samples_per_second": 0.145,
"eval_steps_per_second": 0.01,
"step": 400
},
{
"completion_length": 753.4678894042969,
"epoch": 0.864,
"grad_norm": 4.304417705259892,
"kl": 1.15230712890625,
"learning_rate": 1.627288017913383e-07,
"loss": 0.0461,
"reward": 2.8077996015548705,
"reward_std": 0.9062907833606004,
"rewards/accuracy_reward": 0.676785746961832,
"rewards/cosine_scaled_reward": 0.44053755011409523,
"rewards/format_reward": 0.9517857328057289,
"rewards/reasoning_steps_reward": 0.738690534979105,
"step": 405
},
{
"completion_length": 785.1232528686523,
"epoch": 0.8746666666666667,
"grad_norm": 5.246554486522317,
"kl": 1.4623046875,
"learning_rate": 1.3831291600445573e-07,
"loss": 0.0585,
"reward": 2.7128711313009264,
"reward_std": 0.8996678274124861,
"rewards/accuracy_reward": 0.6446428809314966,
"rewards/cosine_scaled_reward": 0.4003709977958351,
"rewards/format_reward": 0.9267857372760773,
"rewards/reasoning_steps_reward": 0.7410714901983738,
"step": 410
},
{
"completion_length": 759.5286041259766,
"epoch": 0.8853333333333333,
"grad_norm": 4.4529003923911565,
"kl": 0.970086669921875,
"learning_rate": 1.1579243729307487e-07,
"loss": 0.0388,
"reward": 2.6863596469163893,
"reward_std": 0.9374732062220573,
"rewards/accuracy_reward": 0.6446428891271353,
"rewards/cosine_scaled_reward": 0.3893356985412538,
"rewards/format_reward": 0.9250000223517418,
"rewards/reasoning_steps_reward": 0.7273810058832169,
"step": 415
},
{
"completion_length": 768.6143203735352,
"epoch": 0.896,
"grad_norm": 5.297256486680652,
"kl": 1.40426025390625,
"learning_rate": 9.519871314899092e-08,
"loss": 0.0562,
"reward": 2.7144743263721467,
"reward_std": 0.9721049644052983,
"rewards/accuracy_reward": 0.682142886146903,
"rewards/cosine_scaled_reward": 0.4079266074113548,
"rewards/format_reward": 0.9125000268220902,
"rewards/reasoning_steps_reward": 0.7119048140943051,
"step": 420
},
{
"completion_length": 762.5839622497558,
"epoch": 0.9066666666666666,
"grad_norm": 5.808442367606542,
"kl": 1.266558837890625,
"learning_rate": 7.656040910844358e-08,
"loss": 0.0507,
"reward": 2.7946069568395613,
"reward_std": 0.9051171492785215,
"rewards/accuracy_reward": 0.6750000327825546,
"rewards/cosine_scaled_reward": 0.42258303044363854,
"rewards/format_reward": 0.9500000178813934,
"rewards/reasoning_steps_reward": 0.7470238760113717,
"step": 425
},
{
"completion_length": 758.2143196105957,
"epoch": 0.9173333333333333,
"grad_norm": 4.124091919765889,
"kl": 1.363201904296875,
"learning_rate": 5.990346885098235e-08,
"loss": 0.0546,
"reward": 2.8003338128328323,
"reward_std": 0.9829879272729158,
"rewards/accuracy_reward": 0.698214316368103,
"rewards/cosine_scaled_reward": 0.44914320297539234,
"rewards/format_reward": 0.9303571596741677,
"rewards/reasoning_steps_reward": 0.7226191096007824,
"step": 430
},
{
"completion_length": 761.5821762084961,
"epoch": 0.928,
"grad_norm": 41.306011448835555,
"kl": 1.1220947265625,
"learning_rate": 4.5251078087033493e-08,
"loss": 0.0449,
"reward": 2.9364974945783615,
"reward_std": 0.9109487719833851,
"rewards/accuracy_reward": 0.7285714596509933,
"rewards/cosine_scaled_reward": 0.4841164079494774,
"rewards/format_reward": 0.9303571656346321,
"rewards/reasoning_steps_reward": 0.7934524446725846,
"step": 435
},
{
"completion_length": 762.6250328063965,
"epoch": 0.9386666666666666,
"grad_norm": 3.9945081680140406,
"kl": 1.48448486328125,
"learning_rate": 3.262363228443427e-08,
"loss": 0.0594,
"reward": 2.6933425307273864,
"reward_std": 0.9241555985063314,
"rewards/accuracy_reward": 0.6500000320374966,
"rewards/cosine_scaled_reward": 0.399890087870881,
"rewards/format_reward": 0.9357143074274064,
"rewards/reasoning_steps_reward": 0.7077381528913975,
"step": 440
},
{
"completion_length": 768.3500335693359,
"epoch": 0.9493333333333334,
"grad_norm": 13.713505614062974,
"kl": 1.749468994140625,
"learning_rate": 2.2038708278862952e-08,
"loss": 0.07,
"reward": 2.676873904466629,
"reward_std": 0.9034410756081342,
"rewards/accuracy_reward": 0.6410714581608772,
"rewards/cosine_scaled_reward": 0.3983023501932621,
"rewards/format_reward": 0.9178571671247482,
"rewards/reasoning_steps_reward": 0.7196429133415222,
"step": 445
},
{
"completion_length": 754.7732467651367,
"epoch": 0.96,
"grad_norm": 5.550660972818955,
"kl": 1.152679443359375,
"learning_rate": 1.3511039807673209e-08,
"loss": 0.0461,
"reward": 2.84182793200016,
"reward_std": 0.8377097636461258,
"rewards/accuracy_reward": 0.6964286038652062,
"rewards/cosine_scaled_reward": 0.44599451111862437,
"rewards/format_reward": 0.939285734295845,
"rewards/reasoning_steps_reward": 0.7601191103458405,
"step": 450
},
{
"completion_length": 743.9089569091797,
"epoch": 0.9706666666666667,
"grad_norm": 2.2837748295404885,
"kl": 0.9584228515625,
"learning_rate": 7.0524970011963675e-09,
"loss": 0.0383,
"reward": 2.9682214707136154,
"reward_std": 0.8571841098368168,
"rewards/accuracy_reward": 0.7517857450991869,
"rewards/cosine_scaled_reward": 0.5009594126604497,
"rewards/format_reward": 0.9553571596741677,
"rewards/reasoning_steps_reward": 0.7601191103458405,
"step": 455
},
{
"completion_length": 747.2446739196778,
"epoch": 0.9813333333333333,
"grad_norm": 60.34548052033379,
"kl": 1.486627197265625,
"learning_rate": 2.6720698600553595e-09,
"loss": 0.0595,
"reward": 2.8397951513528823,
"reward_std": 0.8803306795656681,
"rewards/accuracy_reward": 0.7000000329688192,
"rewards/cosine_scaled_reward": 0.4570569400675595,
"rewards/format_reward": 0.9517857328057289,
"rewards/reasoning_steps_reward": 0.7309524327516556,
"step": 460
},
{
"completion_length": 766.3500358581543,
"epoch": 0.992,
"grad_norm": 18.459219357783663,
"kl": 1.420770263671875,
"learning_rate": 3.7585574148779613e-10,
"loss": 0.0569,
"reward": 2.704774260520935,
"reward_std": 0.9208621602505446,
"rewards/accuracy_reward": 0.6678571715950966,
"rewards/cosine_scaled_reward": 0.4119170166202821,
"rewards/format_reward": 0.8982143089175224,
"rewards/reasoning_steps_reward": 0.72678577080369,
"step": 465
},
{
"completion_length": 773.5982462565104,
"epoch": 0.9984,
"kl": 1.5178934733072917,
"reward": 2.810507302482923,
"reward_std": 0.9296036226054033,
"rewards/accuracy_reward": 0.6785714613894621,
"rewards/cosine_scaled_reward": 0.44344370051597554,
"rewards/format_reward": 0.9523809651533762,
"rewards/reasoning_steps_reward": 0.7361111715435982,
"step": 468,
"total_flos": 0.0,
"train_loss": 0.05152365299789334,
"train_runtime": 191788.2346,
"train_samples_per_second": 0.039,
"train_steps_per_second": 0.002
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}