Qwen-2.5-7B-Simple-RL / trainer_state.json
zxnstc's picture
Model save
5b5cd2c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 100,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 638.4285987854004,
"epoch": 0.010666666666666666,
"grad_norm": 2.42055344581604,
"kl": 0.00011775493621826171,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0,
"reward": 1.1089806377887725,
"reward_std": 0.8793896824121475,
"rewards/accuracy_reward": 0.5767857443541289,
"rewards/cosine_scaled_reward": 0.25779010977130384,
"rewards/format_reward": 0.00357142873108387,
"rewards/reasoning_steps_reward": 0.270833354908973,
"step": 5
},
{
"completion_length": 611.7714553833008,
"epoch": 0.021333333333333333,
"grad_norm": 1.2694348096847534,
"kl": 0.00021342039108276367,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0,
"reward": 1.1431605055928231,
"reward_std": 0.8805145360529423,
"rewards/accuracy_reward": 0.5892857421189547,
"rewards/cosine_scaled_reward": 0.3008985619725536,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.25119049232453106,
"step": 10
},
{
"completion_length": 603.9625274658204,
"epoch": 0.032,
"grad_norm": 4.813971042633057,
"kl": 0.00024839639663696287,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0,
"reward": 1.2853217244148254,
"reward_std": 0.8000296212732791,
"rewards/accuracy_reward": 0.6535714577883482,
"rewards/cosine_scaled_reward": 0.3317502578254789,
"rewards/format_reward": 0.00357142873108387,
"rewards/reasoning_steps_reward": 0.2964285886846483,
"step": 15
},
{
"completion_length": 595.6982391357421,
"epoch": 0.042666666666666665,
"grad_norm": 0.9142511487007141,
"kl": 0.00047616958618164064,
"learning_rate": 1.276595744680851e-06,
"loss": 0.0,
"reward": 1.2075286597013473,
"reward_std": 0.7695159167051315,
"rewards/accuracy_reward": 0.6428571730852127,
"rewards/cosine_scaled_reward": 0.3247905206750147,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.23988096807152032,
"step": 20
},
{
"completion_length": 644.2071708679199,
"epoch": 0.05333333333333334,
"grad_norm": 0.7305999994277954,
"kl": 0.0009862899780273438,
"learning_rate": 1.5957446808510639e-06,
"loss": 0.0,
"reward": 1.2186039187014104,
"reward_std": 0.7756468575447798,
"rewards/accuracy_reward": 0.6339286027476192,
"rewards/cosine_scaled_reward": 0.3200919725000858,
"rewards/format_reward": 0.0008928571827709675,
"rewards/reasoning_steps_reward": 0.26369049586355686,
"step": 25
},
{
"completion_length": 669.2125297546387,
"epoch": 0.064,
"grad_norm": 0.9431222677230835,
"kl": 0.0020453929901123047,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.0001,
"reward": 1.3601950403302907,
"reward_std": 0.760306540131569,
"rewards/accuracy_reward": 0.6732143152505159,
"rewards/cosine_scaled_reward": 0.3795402319636196,
"rewards/format_reward": 0.0008928571827709675,
"rewards/reasoning_steps_reward": 0.3065476375631988,
"step": 30
},
{
"completion_length": 643.0696685791015,
"epoch": 0.07466666666666667,
"grad_norm": 48319720.0,
"kl": 6272.004669189453,
"learning_rate": 2.2340425531914894e-06,
"loss": 250.5145,
"reward": 1.289367458410561,
"reward_std": 0.7266312446445227,
"rewards/accuracy_reward": 0.676785746589303,
"rewards/cosine_scaled_reward": 0.3441293075971771,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.26666668280959127,
"step": 35
},
{
"completion_length": 669.8018165588379,
"epoch": 0.08533333333333333,
"grad_norm": 0.4609315097332001,
"kl": 0.018144559860229493,
"learning_rate": 2.553191489361702e-06,
"loss": 0.0007,
"reward": 1.4497444801032544,
"reward_std": 0.6942832075059414,
"rewards/accuracy_reward": 0.6982143171131611,
"rewards/cosine_scaled_reward": 0.41283965120092037,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.3386905025690794,
"step": 40
},
{
"completion_length": 642.2321701049805,
"epoch": 0.096,
"grad_norm": 1.9606070518493652,
"kl": 0.004248189926147461,
"learning_rate": 2.872340425531915e-06,
"loss": 0.0002,
"reward": 1.4603484645485878,
"reward_std": 0.7034628570079804,
"rewards/accuracy_reward": 0.7232143163681031,
"rewards/cosine_scaled_reward": 0.4335627053398639,
"rewards/format_reward": 0.0,
"rewards/reasoning_steps_reward": 0.3035714477300644,
"step": 45
},
{
"completion_length": 659.8625274658203,
"epoch": 0.10666666666666667,
"grad_norm": 1.5518417358398438,
"kl": 0.004150962829589844,
"learning_rate": 2.9996241442585123e-06,
"loss": 0.0002,
"reward": 1.5205835647881032,
"reward_std": 0.7311159037053585,
"rewards/accuracy_reward": 0.7303571753203869,
"rewards/cosine_scaled_reward": 0.4155239976942539,
"rewards/format_reward": 0.0008928571827709675,
"rewards/reasoning_steps_reward": 0.37380955144762995,
"step": 50
},
{
"completion_length": 638.6232460021972,
"epoch": 0.11733333333333333,
"grad_norm": 1.3417630195617676,
"kl": 0.01309032440185547,
"learning_rate": 2.9973279301399446e-06,
"loss": 0.0005,
"reward": 1.4900454580783844,
"reward_std": 0.7109502237290144,
"rewards/accuracy_reward": 0.7035714603960515,
"rewards/cosine_scaled_reward": 0.3855811151210219,
"rewards/format_reward": 0.0026785715483129023,
"rewards/reasoning_steps_reward": 0.3982143113389611,
"step": 55
},
{
"completion_length": 637.7786003112793,
"epoch": 0.128,
"grad_norm": 0.7655097842216492,
"kl": 0.005014801025390625,
"learning_rate": 2.992947502998804e-06,
"loss": 0.0002,
"reward": 1.5817858844995498,
"reward_std": 0.7063489355146885,
"rewards/accuracy_reward": 0.7285714685916901,
"rewards/cosine_scaled_reward": 0.4130358204245567,
"rewards/format_reward": 0.0026785715483129023,
"rewards/reasoning_steps_reward": 0.43750002793967724,
"step": 60
},
{
"completion_length": 619.2321716308594,
"epoch": 0.13866666666666666,
"grad_norm": 0.6214168667793274,
"kl": 0.006450653076171875,
"learning_rate": 2.9864889601923268e-06,
"loss": 0.0003,
"reward": 1.618351523578167,
"reward_std": 0.6625145003199577,
"rewards/accuracy_reward": 0.7232143163681031,
"rewards/cosine_scaled_reward": 0.4254943021107465,
"rewards/format_reward": 0.00357142873108387,
"rewards/reasoning_steps_reward": 0.4660714641213417,
"step": 65
},
{
"completion_length": 620.4696716308594,
"epoch": 0.14933333333333335,
"grad_norm": 1.1358349323272705,
"kl": 0.007819366455078126,
"learning_rate": 2.977961291721137e-06,
"loss": 0.0003,
"reward": 1.809953036904335,
"reward_std": 0.685565372928977,
"rewards/accuracy_reward": 0.7589285969734192,
"rewards/cosine_scaled_reward": 0.47126249115681274,
"rewards/format_reward": 0.00714285746216774,
"rewards/reasoning_steps_reward": 0.5726190894842148,
"step": 70
},
{
"completion_length": 591.2000267028809,
"epoch": 0.16,
"grad_norm": 0.9748697876930237,
"kl": 0.01055145263671875,
"learning_rate": 2.9673763677155655e-06,
"loss": 0.0004,
"reward": 1.801955761015415,
"reward_std": 0.6809038281440735,
"rewards/accuracy_reward": 0.7410714589059353,
"rewards/cosine_scaled_reward": 0.4311223858210724,
"rewards/format_reward": 0.0053571430966258046,
"rewards/reasoning_steps_reward": 0.6244048129767179,
"step": 75
},
{
"completion_length": 604.2036003112793,
"epoch": 0.17066666666666666,
"grad_norm": 0.41957351565361023,
"kl": 0.01427001953125,
"learning_rate": 2.9547489219129666e-06,
"loss": 0.0006,
"reward": 1.9645369604229928,
"reward_std": 0.5586295232176781,
"rewards/accuracy_reward": 0.8053571732714773,
"rewards/cosine_scaled_reward": 0.4785249759210274,
"rewards/format_reward": 0.0026785715483129023,
"rewards/reasoning_steps_reward": 0.6779762431979179,
"step": 80
},
{
"completion_length": 654.9571723937988,
"epoch": 0.18133333333333335,
"grad_norm": 0.5244520306587219,
"kl": 0.01689453125,
"learning_rate": 2.9400965311490175e-06,
"loss": 0.0007,
"reward": 1.9714522436261177,
"reward_std": 0.654699632152915,
"rewards/accuracy_reward": 0.7232143158093095,
"rewards/cosine_scaled_reward": 0.4538926437497139,
"rewards/format_reward": 0.0026785715483129023,
"rewards/reasoning_steps_reward": 0.7916667237877846,
"step": 85
},
{
"completion_length": 635.4571701049805,
"epoch": 0.192,
"grad_norm": 0.5995980501174927,
"kl": 0.0202392578125,
"learning_rate": 2.9234395908915565e-06,
"loss": 0.0008,
"reward": 1.9059573337435722,
"reward_std": 0.6402278915047646,
"rewards/accuracy_reward": 0.6857143208384514,
"rewards/cosine_scaled_reward": 0.3851239001378417,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.8333333969116211,
"step": 90
},
{
"completion_length": 646.8857414245606,
"epoch": 0.20266666666666666,
"grad_norm": 0.3573947846889496,
"kl": 0.02309417724609375,
"learning_rate": 2.904801286851009e-06,
"loss": 0.0009,
"reward": 2.067735290527344,
"reward_std": 0.5824727656319737,
"rewards/accuracy_reward": 0.739285740442574,
"rewards/cosine_scaled_reward": 0.45374711682088675,
"rewards/format_reward": 0.0026785715483129023,
"rewards/reasoning_steps_reward": 0.8720238715410232,
"step": 95
},
{
"completion_length": 617.1839553833008,
"epoch": 0.21333333333333335,
"grad_norm": 0.4597730338573456,
"kl": 0.0259674072265625,
"learning_rate": 2.884207562706925e-06,
"loss": 0.001,
"reward": 2.1270981818437575,
"reward_std": 0.5901715014129877,
"rewards/accuracy_reward": 0.7750000283122063,
"rewards/cosine_scaled_reward": 0.49138382682576776,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.858928632736206,
"step": 100
},
{
"epoch": 0.21333333333333335,
"eval_completion_length": 645.8245434692383,
"eval_kl": 0.0286726806640625,
"eval_loss": 0.0011589155765250325,
"eval_reward": 1.9586333739757538,
"eval_reward_std": 0.6544256884813309,
"eval_rewards/accuracy_reward": 0.667685743278265,
"eval_rewards/cosine_scaled_reward": 0.38711901631861545,
"eval_rewards/format_reward": 0.0034000001534819605,
"eval_rewards/reasoning_steps_reward": 0.9004286315560341,
"eval_runtime": 30593.3304,
"eval_samples_per_second": 0.163,
"eval_steps_per_second": 0.012,
"step": 100
},
{
"completion_length": 666.8482437133789,
"epoch": 0.224,
"grad_norm": 0.3493061065673828,
"kl": 0.027435302734375,
"learning_rate": 2.8616870839955444e-06,
"loss": 0.0011,
"reward": 2.089837631583214,
"reward_std": 0.6151095872744918,
"rewards/accuracy_reward": 0.7214286036789417,
"rewards/cosine_scaled_reward": 0.44727803440764546,
"rewards/format_reward": 0.0026785715483129023,
"rewards/reasoning_steps_reward": 0.9184524416923523,
"step": 105
},
{
"completion_length": 689.3928901672364,
"epoch": 0.23466666666666666,
"grad_norm": 0.7455374002456665,
"kl": 0.0316131591796875,
"learning_rate": 2.837271198208662e-06,
"loss": 0.0013,
"reward": 2.129907730221748,
"reward_std": 0.5857493598014116,
"rewards/accuracy_reward": 0.7428571671247483,
"rewards/cosine_scaled_reward": 0.47633622232824563,
"rewards/format_reward": 0.001785714365541935,
"rewards/reasoning_steps_reward": 0.9089286178350449,
"step": 110
},
{
"completion_length": 648.7071731567382,
"epoch": 0.24533333333333332,
"grad_norm": 0.423718124628067,
"kl": 0.0330322265625,
"learning_rate": 2.8109938911593322e-06,
"loss": 0.0013,
"reward": 2.1340139895677566,
"reward_std": 0.5813389342278242,
"rewards/accuracy_reward": 0.7232143152505159,
"rewards/cosine_scaled_reward": 0.4527639038278721,
"rewards/format_reward": 0.008035714644938708,
"rewards/reasoning_steps_reward": 0.9500000536441803,
"step": 115
},
{
"completion_length": 669.1518165588379,
"epoch": 0.256,
"grad_norm": 0.5140184760093689,
"kl": 0.0355621337890625,
"learning_rate": 2.7828917396751474e-06,
"loss": 0.0015,
"reward": 2.123083771765232,
"reward_std": 0.6066710935905576,
"rewards/accuracy_reward": 0.7285714626312256,
"rewards/cosine_scaled_reward": 0.4486789128568489,
"rewards/format_reward": 0.00357142873108387,
"rewards/reasoning_steps_reward": 0.9422619551420212,
"step": 120
},
{
"completion_length": 651.8714584350586,
"epoch": 0.26666666666666666,
"grad_norm": 14.626392364501953,
"kl": 0.054949951171875,
"learning_rate": 2.753003860684943e-06,
"loss": 0.0022,
"reward": 2.238000822067261,
"reward_std": 0.6136660899966955,
"rewards/accuracy_reward": 0.7803571730852127,
"rewards/cosine_scaled_reward": 0.5305602598935366,
"rewards/format_reward": 0.00625000037252903,
"rewards/reasoning_steps_reward": 0.9208333969116211,
"step": 125
},
{
"completion_length": 687.6518211364746,
"epoch": 0.2773333333333333,
"grad_norm": 0.4999578297138214,
"kl": 0.0349700927734375,
"learning_rate": 2.721371856769793e-06,
"loss": 0.0014,
"reward": 2.085830058157444,
"reward_std": 0.5865108415484428,
"rewards/accuracy_reward": 0.6946428868919611,
"rewards/cosine_scaled_reward": 0.4269014226272702,
"rewards/format_reward": 0.010714286193251609,
"rewards/reasoning_steps_reward": 0.9535714745521545,
"step": 130
},
{
"completion_length": 625.8964546203613,
"epoch": 0.288,
"grad_norm": 0.5372105240821838,
"kl": 0.039947509765625,
"learning_rate": 2.688039758254093e-06,
"loss": 0.0016,
"reward": 2.243368774652481,
"reward_std": 0.6026589145883918,
"rewards/accuracy_reward": 0.7750000316649676,
"rewards/cosine_scaled_reward": 0.4892020009458065,
"rewards/format_reward": 0.01428571492433548,
"rewards/reasoning_steps_reward": 0.9648810073733329,
"step": 135
},
{
"completion_length": 657.5696739196777,
"epoch": 0.2986666666666667,
"grad_norm": 0.6321460008621216,
"kl": 0.0416748046875,
"learning_rate": 2.65305396191733e-06,
"loss": 0.0017,
"reward": 2.1084320515394213,
"reward_std": 0.7091156661510467,
"rewards/accuracy_reward": 0.7071428887546063,
"rewards/cosine_scaled_reward": 0.42956293127499523,
"rewards/format_reward": 0.01696428647264838,
"rewards/reasoning_steps_reward": 0.9547619611024857,
"step": 140
},
{
"completion_length": 669.2839614868165,
"epoch": 0.30933333333333335,
"grad_norm": 0.4815462529659271,
"kl": 0.0447967529296875,
"learning_rate": 2.61646316641186e-06,
"loss": 0.0018,
"reward": 2.022332654893398,
"reward_std": 0.6810956679284572,
"rewards/accuracy_reward": 0.6750000305473804,
"rewards/cosine_scaled_reward": 0.4035826030303724,
"rewards/format_reward": 0.009821429010480642,
"rewards/reasoning_steps_reward": 0.9339286148548126,
"step": 145
},
{
"completion_length": 647.9393119812012,
"epoch": 0.32,
"grad_norm": 0.4961049556732178,
"kl": 0.0510009765625,
"learning_rate": 2.5783183044765715e-06,
"loss": 0.002,
"reward": 1.9679424732923507,
"reward_std": 0.7103641763329506,
"rewards/accuracy_reward": 0.6678571719676256,
"rewards/cosine_scaled_reward": 0.3926447768812068,
"rewards/format_reward": 0.008035714644938708,
"rewards/reasoning_steps_reward": 0.8994048193097115,
"step": 150
},
{
"completion_length": 625.0803840637207,
"epoch": 0.33066666666666666,
"grad_norm": 0.5056385397911072,
"kl": 0.06827392578125,
"learning_rate": 2.5386724720408135e-06,
"loss": 0.0027,
"reward": 2.0651650190353394,
"reward_std": 0.6431151006370783,
"rewards/accuracy_reward": 0.7375000286847353,
"rewards/cosine_scaled_reward": 0.45712922792881727,
"rewards/format_reward": 0.009821429010480642,
"rewards/reasoning_steps_reward": 0.860714353621006,
"step": 155
},
{
"completion_length": 611.2964599609375,
"epoch": 0.3413333333333333,
"grad_norm": 0.4566061794757843,
"kl": 0.08109130859375,
"learning_rate": 2.49758085431725e-06,
"loss": 0.0032,
"reward": 2.0270636796951296,
"reward_std": 0.6181262265890837,
"rewards/accuracy_reward": 0.7464285964146257,
"rewards/cosine_scaled_reward": 0.43301601126149764,
"rewards/format_reward": 0.012500000558793545,
"rewards/reasoning_steps_reward": 0.8351191088557244,
"step": 160
},
{
"completion_length": 643.8536026000977,
"epoch": 0.352,
"grad_norm": 0.3811902403831482,
"kl": 0.12118072509765625,
"learning_rate": 2.455100648986533e-06,
"loss": 0.0048,
"reward": 1.9545473739504815,
"reward_std": 0.7556829001754523,
"rewards/accuracy_reward": 0.6857143171131611,
"rewards/cosine_scaled_reward": 0.40722584864124656,
"rewards/format_reward": 0.008035714644938708,
"rewards/reasoning_steps_reward": 0.8535714909434319,
"step": 165
},
{
"completion_length": 666.0553924560547,
"epoch": 0.3626666666666667,
"grad_norm": 0.5245673656463623,
"kl": 0.1458251953125,
"learning_rate": 2.4112909865807053e-06,
"loss": 0.0058,
"reward": 1.7782447993755341,
"reward_std": 0.7516406249254942,
"rewards/accuracy_reward": 0.6017857398837805,
"rewards/cosine_scaled_reward": 0.32050663968548176,
"rewards/format_reward": 0.008928571827709675,
"rewards/reasoning_steps_reward": 0.8470238700509072,
"step": 170
},
{
"completion_length": 656.5375297546386,
"epoch": 0.37333333333333335,
"grad_norm": 0.3452568054199219,
"kl": 0.100311279296875,
"learning_rate": 2.366212848176164e-06,
"loss": 0.004,
"reward": 2.048972634971142,
"reward_std": 0.706931572034955,
"rewards/accuracy_reward": 0.7178571790456771,
"rewards/cosine_scaled_reward": 0.4686154007911682,
"rewards/format_reward": 0.0053571430966258046,
"rewards/reasoning_steps_reward": 0.8571429222822189,
"step": 175
},
{
"completion_length": 625.3053825378418,
"epoch": 0.384,
"grad_norm": 0.7150533199310303,
"kl": 0.1327392578125,
"learning_rate": 2.319928980510752e-06,
"loss": 0.0053,
"reward": 1.9931991159915925,
"reward_std": 0.7508711714297533,
"rewards/accuracy_reward": 0.7053571753203869,
"rewards/cosine_scaled_reward": 0.4312942801974714,
"rewards/format_reward": 0.012500000651925802,
"rewards/reasoning_steps_reward": 0.8440476790070534,
"step": 180
},
{
"completion_length": 662.8018127441406,
"epoch": 0.39466666666666667,
"grad_norm": 0.5048078894615173,
"kl": 0.2526092529296875,
"learning_rate": 2.272503808643123e-06,
"loss": 0.0101,
"reward": 1.6179989255964755,
"reward_std": 0.826694194227457,
"rewards/accuracy_reward": 0.5803571738302707,
"rewards/cosine_scaled_reward": 0.283177447039634,
"rewards/format_reward": 0.009821429010480642,
"rewards/reasoning_steps_reward": 0.7446429103612899,
"step": 185
},
{
"completion_length": 612.7518165588378,
"epoch": 0.4053333333333333,
"grad_norm": 15.008540153503418,
"kl": 0.2234375,
"learning_rate": 2.2240033462759628e-06,
"loss": 0.0089,
"reward": 1.8689126953482629,
"reward_std": 0.8196798441931605,
"rewards/accuracy_reward": 0.6892857480794191,
"rewards/cosine_scaled_reward": 0.38706737738102676,
"rewards/format_reward": 0.0062500002793967726,
"rewards/reasoning_steps_reward": 0.786309577524662,
"step": 190
},
{
"completion_length": 636.9143135070801,
"epoch": 0.416,
"grad_norm": 1.8482682704925537,
"kl": 0.1047210693359375,
"learning_rate": 2.1744951038678905e-06,
"loss": 0.0042,
"reward": 2.1131999254226685,
"reward_std": 0.6497842017561197,
"rewards/accuracy_reward": 0.7464286003261804,
"rewards/cosine_scaled_reward": 0.475997456186451,
"rewards/format_reward": 0.011607143562287092,
"rewards/reasoning_steps_reward": 0.8791667327284813,
"step": 195
},
{
"completion_length": 643.9375305175781,
"epoch": 0.4266666666666667,
"grad_norm": 1.6493088006973267,
"kl": 0.1329986572265625,
"learning_rate": 2.124047994661941e-06,
"loss": 0.0053,
"reward": 2.1098326206207276,
"reward_std": 0.6796215798705816,
"rewards/accuracy_reward": 0.7250000312924385,
"rewards/cosine_scaled_reward": 0.4544754126574844,
"rewards/format_reward": 0.019642858114093543,
"rewards/reasoning_steps_reward": 0.9107143446803093,
"step": 200
},
{
"epoch": 0.4266666666666667,
"eval_completion_length": 658.4622008789063,
"eval_kl": 0.16044462890625,
"eval_loss": 0.006352806463837624,
"eval_reward": 2.0124860629320143,
"eval_reward_std": 0.7236331352472305,
"eval_rewards/accuracy_reward": 0.6609143146038056,
"eval_rewards/cosine_scaled_reward": 0.3881764653600403,
"eval_rewards/format_reward": 0.03644285902827978,
"eval_rewards/reasoning_steps_reward": 0.9269524365663528,
"eval_runtime": 39396.7661,
"eval_samples_per_second": 0.127,
"eval_steps_per_second": 0.009,
"step": 200
},
{
"completion_length": 664.1589576721192,
"epoch": 0.43733333333333335,
"grad_norm": 0.5569435358047485,
"kl": 0.1752899169921875,
"learning_rate": 2.072732238761434e-06,
"loss": 0.007,
"reward": 2.151426687836647,
"reward_std": 0.6980113681405783,
"rewards/accuracy_reward": 0.732142886519432,
"rewards/cosine_scaled_reward": 0.4615456376457587,
"rewards/format_reward": 0.03750000176951289,
"rewards/reasoning_steps_reward": 0.9202381521463394,
"step": 205
},
{
"completion_length": 638.1839584350586,
"epoch": 0.448,
"grad_norm": 0.8609623312950134,
"kl": 0.1427032470703125,
"learning_rate": 2.0206192653867536e-06,
"loss": 0.0057,
"reward": 2.260927739739418,
"reward_std": 0.6022655628621578,
"rewards/accuracy_reward": 0.7767857406288385,
"rewards/cosine_scaled_reward": 0.5171776844188571,
"rewards/format_reward": 0.045535716507583854,
"rewards/reasoning_steps_reward": 0.9214286223053932,
"step": 210
},
{
"completion_length": 723.9089622497559,
"epoch": 0.45866666666666667,
"grad_norm": 1.7239713668823242,
"kl": 0.304364013671875,
"learning_rate": 1.967781613449095e-06,
"loss": 0.0122,
"reward": 1.6578506268560886,
"reward_std": 0.8106920622289181,
"rewards/accuracy_reward": 0.5500000214204193,
"rewards/cosine_scaled_reward": 0.2706481910310686,
"rewards/format_reward": 0.020535715389996767,
"rewards/reasoning_steps_reward": 0.8166667185723782,
"step": 215
},
{
"completion_length": 722.1321716308594,
"epoch": 0.4693333333333333,
"grad_norm": 3.4466731548309326,
"kl": 0.541461181640625,
"learning_rate": 1.9142928305795637e-06,
"loss": 0.0217,
"reward": 1.1115448012948037,
"reward_std": 0.9772109590470791,
"rewards/accuracy_reward": 0.3910714492201805,
"rewards/cosine_scaled_reward": 0.09517570563766639,
"rewards/format_reward": 0.01339285783469677,
"rewards/reasoning_steps_reward": 0.6119048058986664,
"step": 220
},
{
"completion_length": 655.9178810119629,
"epoch": 0.48,
"grad_norm": 0.7818121910095215,
"kl": 0.38446044921875,
"learning_rate": 1.8602273707541886e-06,
"loss": 0.0154,
"reward": 1.443753632903099,
"reward_std": 1.1735275402665137,
"rewards/accuracy_reward": 0.5517857423052192,
"rewards/cosine_scaled_reward": 0.263098827842623,
"rewards/format_reward": 0.022321429569274187,
"rewards/reasoning_steps_reward": 0.6065476641058922,
"step": 225
},
{
"completion_length": 669.6357391357421,
"epoch": 0.49066666666666664,
"grad_norm": 0.3009130656719208,
"kl": 0.186187744140625,
"learning_rate": 1.8056604906573418e-06,
"loss": 0.0074,
"reward": 1.8981928735971452,
"reward_std": 0.8392410669475794,
"rewards/accuracy_reward": 0.6964286047965288,
"rewards/cosine_scaled_reward": 0.438073761574924,
"rewards/format_reward": 0.012500000558793545,
"rewards/reasoning_steps_reward": 0.7511905305087566,
"step": 230
},
{
"completion_length": 653.5732406616211,
"epoch": 0.5013333333333333,
"grad_norm": 4.625399589538574,
"kl": 0.05731353759765625,
"learning_rate": 1.7506681449278226e-06,
"loss": 0.0023,
"reward": 2.11394245326519,
"reward_std": 0.6526154175400734,
"rewards/accuracy_reward": 0.7375000312924385,
"rewards/cosine_scaled_reward": 0.4865614231675863,
"rewards/format_reward": 0.021428572479635477,
"rewards/reasoning_steps_reward": 0.8684524416923523,
"step": 235
},
{
"completion_length": 645.69467086792,
"epoch": 0.512,
"grad_norm": 0.2851395606994629,
"kl": 0.050042724609375,
"learning_rate": 1.6953268804334257e-06,
"loss": 0.002,
"reward": 2.228366295993328,
"reward_std": 0.5444579780101776,
"rewards/accuracy_reward": 0.7642857372760773,
"rewards/cosine_scaled_reward": 0.514675722271204,
"rewards/format_reward": 0.03392857322469354,
"rewards/reasoning_steps_reward": 0.9154762506484986,
"step": 240
},
{
"completion_length": 632.8053848266602,
"epoch": 0.5226666666666666,
"grad_norm": 0.34539249539375305,
"kl": 0.0962860107421875,
"learning_rate": 1.6397137297211436e-06,
"loss": 0.0039,
"reward": 2.337542861700058,
"reward_std": 0.557469642162323,
"rewards/accuracy_reward": 0.8142857376486063,
"rewards/cosine_scaled_reward": 0.553614255785942,
"rewards/format_reward": 0.041071430593729016,
"rewards/reasoning_steps_reward": 0.9285714775323868,
"step": 245
},
{
"completion_length": 659.1518135070801,
"epoch": 0.5333333333333333,
"grad_norm": 0.30278754234313965,
"kl": 0.0599609375,
"learning_rate": 1.5839061037913395e-06,
"loss": 0.0024,
"reward": 2.421455779671669,
"reward_std": 0.5301560776308178,
"rewards/accuracy_reward": 0.8410714566707611,
"rewards/cosine_scaled_reward": 0.6059795372188092,
"rewards/format_reward": 0.028571429941803218,
"rewards/reasoning_steps_reward": 0.9458333805203438,
"step": 250
},
{
"completion_length": 713.3518188476562,
"epoch": 0.544,
"grad_norm": 0.3068905770778656,
"kl": 0.0843505859375,
"learning_rate": 1.527981684345115e-06,
"loss": 0.0034,
"reward": 2.1079654544591904,
"reward_std": 0.6414080807939172,
"rewards/accuracy_reward": 0.7000000279396772,
"rewards/cosine_scaled_reward": 0.4207629946060479,
"rewards/format_reward": 0.03482143022119999,
"rewards/reasoning_steps_reward": 0.9523809969425201,
"step": 255
},
{
"completion_length": 707.9411071777344,
"epoch": 0.5546666666666666,
"grad_norm": 0.33481159806251526,
"kl": 0.07261962890625,
"learning_rate": 1.4720183156548855e-06,
"loss": 0.0029,
"reward": 2.2514570981264113,
"reward_std": 0.6310873694717885,
"rewards/accuracy_reward": 0.7321428906172514,
"rewards/cosine_scaled_reward": 0.5041355590336025,
"rewards/format_reward": 0.05089285969734192,
"rewards/reasoning_steps_reward": 0.9642857611179352,
"step": 260
},
{
"completion_length": 718.3107444763184,
"epoch": 0.5653333333333334,
"grad_norm": 0.38096827268600464,
"kl": 0.08720703125,
"learning_rate": 1.4160938962086612e-06,
"loss": 0.0035,
"reward": 2.1334225252270698,
"reward_std": 0.6659108363091946,
"rewards/accuracy_reward": 0.7035714538767934,
"rewards/cosine_scaled_reward": 0.44383912505581974,
"rewards/format_reward": 0.04732143105939031,
"rewards/reasoning_steps_reward": 0.9386905342340469,
"step": 265
},
{
"completion_length": 691.2268180847168,
"epoch": 0.576,
"grad_norm": 0.28566083312034607,
"kl": 0.088311767578125,
"learning_rate": 1.3602862702788567e-06,
"loss": 0.0035,
"reward": 2.2702409833669663,
"reward_std": 0.6507027853280306,
"rewards/accuracy_reward": 0.7500000327825547,
"rewards/cosine_scaled_reward": 0.508336108038202,
"rewards/format_reward": 0.057142859976738694,
"rewards/reasoning_steps_reward": 0.9547619566321373,
"step": 270
},
{
"completion_length": 688.7464607238769,
"epoch": 0.5866666666666667,
"grad_norm": 0.35180163383483887,
"kl": 0.0841522216796875,
"learning_rate": 1.3046731195665748e-06,
"loss": 0.0034,
"reward": 2.3181463330984116,
"reward_std": 0.6268971297889948,
"rewards/accuracy_reward": 0.7696428872644901,
"rewards/cosine_scaled_reward": 0.5246938619762659,
"rewards/format_reward": 0.06785714644938708,
"rewards/reasoning_steps_reward": 0.9559524253010749,
"step": 275
},
{
"completion_length": 687.6589630126953,
"epoch": 0.5973333333333334,
"grad_norm": 0.3075723648071289,
"kl": 0.0902801513671875,
"learning_rate": 1.2493318550721775e-06,
"loss": 0.0036,
"reward": 2.242439457774162,
"reward_std": 0.6110217805951834,
"rewards/accuracy_reward": 0.7375000305473804,
"rewards/cosine_scaled_reward": 0.49422508366405965,
"rewards/format_reward": 0.0678571461699903,
"rewards/reasoning_steps_reward": 0.9428571984171867,
"step": 280
},
{
"completion_length": 702.2446731567383,
"epoch": 0.608,
"grad_norm": 0.3365749716758728,
"kl": 0.1429901123046875,
"learning_rate": 1.1943395093426585e-06,
"loss": 0.0057,
"reward": 2.236051079630852,
"reward_std": 0.6939984124153853,
"rewards/accuracy_reward": 0.7464286036789417,
"rewards/cosine_scaled_reward": 0.515515277441591,
"rewards/format_reward": 0.06517857508733868,
"rewards/reasoning_steps_reward": 0.9089286342263222,
"step": 285
},
{
"completion_length": 667.2232437133789,
"epoch": 0.6186666666666667,
"grad_norm": 0.3665401041507721,
"kl": 0.1312591552734375,
"learning_rate": 1.1397726292458115e-06,
"loss": 0.0053,
"reward": 2.2684289827942847,
"reward_std": 0.6653882045298815,
"rewards/accuracy_reward": 0.751785746589303,
"rewards/cosine_scaled_reward": 0.5151550889015197,
"rewards/format_reward": 0.07410714691504836,
"rewards/reasoning_steps_reward": 0.927381020784378,
"step": 290
},
{
"completion_length": 723.4911033630372,
"epoch": 0.6293333333333333,
"grad_norm": 0.25975558161735535,
"kl": 0.14805908203125,
"learning_rate": 1.085707169420437e-06,
"loss": 0.0059,
"reward": 2.1135958269238473,
"reward_std": 0.6525055527687073,
"rewards/accuracy_reward": 0.6732143115252256,
"rewards/cosine_scaled_reward": 0.4448457522317767,
"rewards/format_reward": 0.08660714710131287,
"rewards/reasoning_steps_reward": 0.9089286252856255,
"step": 295
},
{
"completion_length": 685.9928894042969,
"epoch": 0.64,
"grad_norm": 0.32239413261413574,
"kl": 0.1186309814453125,
"learning_rate": 1.0322183865509054e-06,
"loss": 0.0047,
"reward": 2.3491215094923974,
"reward_std": 0.6740828949958086,
"rewards/accuracy_reward": 0.7928571742027998,
"rewards/cosine_scaled_reward": 0.5387047556228935,
"rewards/format_reward": 0.11339286286383868,
"rewards/reasoning_steps_reward": 0.9041667297482491,
"step": 300
},
{
"epoch": 0.64,
"eval_completion_length": 692.8417169921875,
"eval_kl": 0.126764990234375,
"eval_loss": 0.005116811487823725,
"eval_reward": 2.1439696138501168,
"eval_reward_std": 0.7277915328145027,
"eval_rewards/accuracy_reward": 0.6845143143117428,
"eval_rewards/cosine_scaled_reward": 0.44376001094253736,
"eval_rewards/format_reward": 0.10977143388986588,
"eval_rewards/reasoning_steps_reward": 0.9059238699197769,
"eval_runtime": 39599.431,
"eval_samples_per_second": 0.126,
"eval_steps_per_second": 0.009,
"step": 300
},
{
"completion_length": 699.3339584350585,
"epoch": 0.6506666666666666,
"grad_norm": 0.3004523515701294,
"kl": 0.1110198974609375,
"learning_rate": 9.793807346132464e-07,
"loss": 0.0044,
"reward": 2.2790004700422286,
"reward_std": 0.7021285973489284,
"rewards/accuracy_reward": 0.7607143171131611,
"rewards/cosine_scaled_reward": 0.5272146660834551,
"rewards/format_reward": 0.09107143292203546,
"rewards/reasoning_steps_reward": 0.9000000640749931,
"step": 305
},
{
"completion_length": 699.3411010742187,
"epoch": 0.6613333333333333,
"grad_norm": 0.29891085624694824,
"kl": 0.13359375,
"learning_rate": 9.272677612385667e-07,
"loss": 0.0053,
"reward": 2.1629825204610826,
"reward_std": 0.6936808105558157,
"rewards/accuracy_reward": 0.7142857410013675,
"rewards/cosine_scaled_reward": 0.46447055372409524,
"rewards/format_reward": 0.097321433480829,
"rewards/reasoning_steps_reward": 0.8869048282504082,
"step": 310
},
{
"completion_length": 706.1196708679199,
"epoch": 0.672,
"grad_norm": 0.3089028596878052,
"kl": 0.1456146240234375,
"learning_rate": 8.759520053380591e-07,
"loss": 0.0058,
"reward": 2.1019979074597357,
"reward_std": 0.740777799114585,
"rewards/accuracy_reward": 0.6803571773692966,
"rewards/cosine_scaled_reward": 0.44158115636964795,
"rewards/format_reward": 0.08839286118745804,
"rewards/reasoning_steps_reward": 0.8916667312383652,
"step": 315
},
{
"completion_length": 669.9286071777344,
"epoch": 0.6826666666666666,
"grad_norm": 0.6965083479881287,
"kl": 0.145257568359375,
"learning_rate": 8.255048961321088e-07,
"loss": 0.0058,
"reward": 2.272875265777111,
"reward_std": 0.7233634147793054,
"rewards/accuracy_reward": 0.7553571749478578,
"rewards/cosine_scaled_reward": 0.5264466149732471,
"rewards/format_reward": 0.10892857704311609,
"rewards/reasoning_steps_reward": 0.8821429207921028,
"step": 320
},
{
"completion_length": 692.9786003112793,
"epoch": 0.6933333333333334,
"grad_norm": 0.33192208409309387,
"kl": 0.1722869873046875,
"learning_rate": 7.759966537240373e-07,
"loss": 0.0069,
"reward": 2.169807307422161,
"reward_std": 0.8210989892482757,
"rewards/accuracy_reward": 0.7089285964146257,
"rewards/cosine_scaled_reward": 0.4582000946626067,
"rewards/format_reward": 0.11160714961588383,
"rewards/reasoning_steps_reward": 0.8910714894533157,
"step": 325
},
{
"completion_length": 705.5911056518555,
"epoch": 0.704,
"grad_norm": 0.5306978821754456,
"kl": 0.24217529296875,
"learning_rate": 7.274961913568773e-07,
"loss": 0.0097,
"reward": 2.0500947162508965,
"reward_std": 0.8266730591654777,
"rewards/accuracy_reward": 0.667857170663774,
"rewards/cosine_scaled_reward": 0.44116610190831124,
"rewards/format_reward": 0.11071429131552576,
"rewards/reasoning_steps_reward": 0.8303571999073028,
"step": 330
},
{
"completion_length": 731.1839591979981,
"epoch": 0.7146666666666667,
"grad_norm": 0.4849264919757843,
"kl": 0.2579345703125,
"learning_rate": 6.800710194892484e-07,
"loss": 0.0103,
"reward": 1.9499552190303802,
"reward_std": 0.9148915704339743,
"rewards/accuracy_reward": 0.6428571701049804,
"rewards/cosine_scaled_reward": 0.3943003877531737,
"rewards/format_reward": 0.10267857694998384,
"rewards/reasoning_steps_reward": 0.8101191118359565,
"step": 335
},
{
"completion_length": 733.7321731567383,
"epoch": 0.7253333333333334,
"grad_norm": 0.6697672009468079,
"kl": 0.3463623046875,
"learning_rate": 6.33787151823836e-07,
"loss": 0.0139,
"reward": 1.6754619617015123,
"reward_std": 1.0652375385165214,
"rewards/accuracy_reward": 0.5571428790688515,
"rewards/cosine_scaled_reward": 0.29867618879216024,
"rewards/format_reward": 0.07500000381842256,
"rewards/reasoning_steps_reward": 0.7446429077535868,
"step": 340
},
{
"completion_length": 708.6964599609375,
"epoch": 0.736,
"grad_norm": 0.342541366815567,
"kl": 0.3625,
"learning_rate": 5.887090134192947e-07,
"loss": 0.0145,
"reward": 1.7074500739574432,
"reward_std": 1.06461516097188,
"rewards/accuracy_reward": 0.5910714540630579,
"rewards/cosine_scaled_reward": 0.3163785987533629,
"rewards/format_reward": 0.07321428908035159,
"rewards/reasoning_steps_reward": 0.7267857655882836,
"step": 345
},
{
"completion_length": 741.1839630126954,
"epoch": 0.7466666666666667,
"grad_norm": 0.5499653816223145,
"kl": 0.4493896484375,
"learning_rate": 5.448993510134669e-07,
"loss": 0.018,
"reward": 1.4979531578719616,
"reward_std": 1.063758409768343,
"rewards/accuracy_reward": 0.5339285928755999,
"rewards/cosine_scaled_reward": 0.2485483249882236,
"rewards/format_reward": 0.06071428917348385,
"rewards/reasoning_steps_reward": 0.6547619514167309,
"step": 350
},
{
"completion_length": 718.5732467651367,
"epoch": 0.7573333333333333,
"grad_norm": 0.8918161988258362,
"kl": 0.5351806640625,
"learning_rate": 5.024191456827498e-07,
"loss": 0.0214,
"reward": 1.2987217612564563,
"reward_std": 1.1655599363148212,
"rewards/accuracy_reward": 0.49464288353919983,
"rewards/cosine_scaled_reward": 0.19366217765491456,
"rewards/format_reward": 0.04017857378348708,
"rewards/reasoning_steps_reward": 0.5702381379902363,
"step": 355
},
{
"completion_length": 703.1107444763184,
"epoch": 0.768,
"grad_norm": 0.3764072358608246,
"kl": 0.4150390625,
"learning_rate": 4.6132752795918667e-07,
"loss": 0.0166,
"reward": 1.4548213778063654,
"reward_std": 1.117940279096365,
"rewards/accuracy_reward": 0.5250000219792128,
"rewards/cosine_scaled_reward": 0.25392846008762715,
"rewards/format_reward": 0.054464288800954816,
"rewards/reasoning_steps_reward": 0.6214286223053932,
"step": 360
},
{
"completion_length": 702.7321716308594,
"epoch": 0.7786666666666666,
"grad_norm": 0.536405086517334,
"kl": 0.2932281494140625,
"learning_rate": 4.2168169552342905e-07,
"loss": 0.0117,
"reward": 1.7472290426492691,
"reward_std": 1.064868475496769,
"rewards/accuracy_reward": 0.614285746589303,
"rewards/cosine_scaled_reward": 0.35437183200847355,
"rewards/format_reward": 0.06071428880095482,
"rewards/reasoning_steps_reward": 0.7178571954369545,
"step": 365
},
{
"completion_length": 667.4143173217774,
"epoch": 0.7893333333333333,
"grad_norm": 1.1500115394592285,
"kl": 0.255450439453125,
"learning_rate": 3.8353683358814046e-07,
"loss": 0.0102,
"reward": 1.826224359869957,
"reward_std": 0.9232858289033175,
"rewards/accuracy_reward": 0.6482143169268966,
"rewards/cosine_scaled_reward": 0.3732480947277509,
"rewards/format_reward": 0.06607143199071289,
"rewards/reasoning_steps_reward": 0.7386905357241631,
"step": 370
},
{
"completion_length": 681.9518119812012,
"epoch": 0.8,
"grad_norm": 0.8492513298988342,
"kl": 0.2910614013671875,
"learning_rate": 3.469460380826697e-07,
"loss": 0.0117,
"reward": 1.7300246395170689,
"reward_std": 0.9816528409719467,
"rewards/accuracy_reward": 0.6125000230967999,
"rewards/cosine_scaled_reward": 0.3600841243751347,
"rewards/format_reward": 0.052678574342280626,
"rewards/reasoning_steps_reward": 0.7047619506716728,
"step": 375
},
{
"completion_length": 683.8536003112793,
"epoch": 0.8106666666666666,
"grad_norm": 0.43946385383605957,
"kl": 0.35491943359375,
"learning_rate": 3.119602417459075e-07,
"loss": 0.0142,
"reward": 1.6164295073598622,
"reward_std": 1.0403125062584877,
"rewards/accuracy_reward": 0.5767857421189546,
"rewards/cosine_scaled_reward": 0.29619133038795553,
"rewards/format_reward": 0.0482142879627645,
"rewards/reasoning_steps_reward": 0.6952381365001201,
"step": 380
},
{
"completion_length": 661.137525177002,
"epoch": 0.8213333333333334,
"grad_norm": 0.5927759408950806,
"kl": 0.251336669921875,
"learning_rate": 2.786281432302071e-07,
"loss": 0.0101,
"reward": 1.8459785029292106,
"reward_std": 0.8845801506191492,
"rewards/accuracy_reward": 0.6821428865194321,
"rewards/cosine_scaled_reward": 0.3781212717294693,
"rewards/format_reward": 0.06428571781143547,
"rewards/reasoning_steps_reward": 0.7214286215603352,
"step": 385
},
{
"completion_length": 700.2768203735352,
"epoch": 0.832,
"grad_norm": 0.5752273797988892,
"kl": 0.379559326171875,
"learning_rate": 2.46996139315057e-07,
"loss": 0.0152,
"reward": 1.6465823888778686,
"reward_std": 1.0167622987180949,
"rewards/accuracy_reward": 0.6142857445403933,
"rewards/cosine_scaled_reward": 0.31146327857859435,
"rewards/format_reward": 0.07500000344589353,
"rewards/reasoning_steps_reward": 0.6458333857357502,
"step": 390
},
{
"completion_length": 688.6482421875,
"epoch": 0.8426666666666667,
"grad_norm": 0.41832882165908813,
"kl": 0.379150390625,
"learning_rate": 2.1710826032485286e-07,
"loss": 0.0152,
"reward": 1.6644656013697385,
"reward_std": 0.9824759595096111,
"rewards/accuracy_reward": 0.6250000283122062,
"rewards/cosine_scaled_reward": 0.3260727058397606,
"rewards/format_reward": 0.054464288614690305,
"rewards/reasoning_steps_reward": 0.658928620070219,
"step": 395
},
{
"completion_length": 719.937533569336,
"epoch": 0.8533333333333334,
"grad_norm": 0.5534791350364685,
"kl": 0.382177734375,
"learning_rate": 1.8900610884066817e-07,
"loss": 0.0153,
"reward": 1.4879010431468487,
"reward_std": 1.0550432510674,
"rewards/accuracy_reward": 0.5410714585334062,
"rewards/cosine_scaled_reward": 0.2453414467825496,
"rewards/format_reward": 0.0491071455180645,
"rewards/reasoning_steps_reward": 0.6523810014128685,
"step": 400
},
{
"epoch": 0.8533333333333334,
"eval_completion_length": 695.8426594726562,
"eval_kl": 0.383571875,
"eval_loss": 0.015375643037259579,
"eval_reward": 1.5146705395892262,
"eval_reward_std": 1.0417588331997394,
"eval_rewards/accuracy_reward": 0.5409714534372091,
"eval_rewards/cosine_scaled_reward": 0.24984666706966235,
"eval_rewards/format_reward": 0.060157146042585374,
"eval_rewards/reasoning_steps_reward": 0.66369528632164,
"eval_runtime": 40348.1586,
"eval_samples_per_second": 0.124,
"eval_steps_per_second": 0.009,
"step": 400
},
{
"completion_length": 709.2018096923828,
"epoch": 0.864,
"grad_norm": 0.37454745173454285,
"kl": 0.43958740234375,
"learning_rate": 1.627288017913383e-07,
"loss": 0.0176,
"reward": 1.5630248546600343,
"reward_std": 1.0267837572842837,
"rewards/accuracy_reward": 0.5678571719676256,
"rewards/cosine_scaled_reward": 0.28772715290542694,
"rewards/format_reward": 0.04910714561119676,
"rewards/reasoning_steps_reward": 0.6583333760499954,
"step": 405
},
{
"completion_length": 715.1696792602539,
"epoch": 0.8746666666666667,
"grad_norm": 0.5133277773857117,
"kl": 0.399395751953125,
"learning_rate": 1.3831291600445573e-07,
"loss": 0.016,
"reward": 1.5371075724251568,
"reward_std": 1.0601157665252685,
"rewards/accuracy_reward": 0.553571455925703,
"rewards/cosine_scaled_reward": 0.28829799513332544,
"rewards/format_reward": 0.053571431431919336,
"rewards/reasoning_steps_reward": 0.6416667148470878,
"step": 410
},
{
"completion_length": 693.0446723937988,
"epoch": 0.8853333333333333,
"grad_norm": 0.7482662200927734,
"kl": 0.376470947265625,
"learning_rate": 1.1579243729307487e-07,
"loss": 0.0151,
"reward": 1.516674379259348,
"reward_std": 0.9749270871281623,
"rewards/accuracy_reward": 0.560714315250516,
"rewards/cosine_scaled_reward": 0.27411481700837614,
"rewards/format_reward": 0.043750001955777405,
"rewards/reasoning_steps_reward": 0.638095286488533,
"step": 415
},
{
"completion_length": 708.925032043457,
"epoch": 0.896,
"grad_norm": 0.38554155826568604,
"kl": 0.4101318359375,
"learning_rate": 9.519871314899092e-08,
"loss": 0.0164,
"reward": 1.5347512325271964,
"reward_std": 1.034306138008833,
"rewards/accuracy_reward": 0.585714316368103,
"rewards/cosine_scaled_reward": 0.2793940259842202,
"rewards/format_reward": 0.05000000260770321,
"rewards/reasoning_steps_reward": 0.6196429081261158,
"step": 420
},
{
"completion_length": 692.3571731567383,
"epoch": 0.9066666666666666,
"grad_norm": 0.390541672706604,
"kl": 0.294134521484375,
"learning_rate": 7.656040910844358e-08,
"loss": 0.0118,
"reward": 1.7413318648934364,
"reward_std": 0.9963843055069447,
"rewards/accuracy_reward": 0.6285714589059352,
"rewards/cosine_scaled_reward": 0.3463913181563839,
"rewards/format_reward": 0.04732143124565482,
"rewards/reasoning_steps_reward": 0.7190476730465889,
"step": 425
},
{
"completion_length": 683.8750282287598,
"epoch": 0.9173333333333333,
"grad_norm": 0.5177262425422668,
"kl": 0.330364990234375,
"learning_rate": 5.990346885098235e-08,
"loss": 0.0132,
"reward": 1.6970172494649887,
"reward_std": 1.0683425880968571,
"rewards/accuracy_reward": 0.6142857454717159,
"rewards/cosine_scaled_reward": 0.3476124212145805,
"rewards/format_reward": 0.057142860256135464,
"rewards/reasoning_steps_reward": 0.6779762372374535,
"step": 430
},
{
"completion_length": 693.9232406616211,
"epoch": 0.928,
"grad_norm": 0.41641440987586975,
"kl": 0.335888671875,
"learning_rate": 4.5251078087033493e-08,
"loss": 0.0134,
"reward": 1.7540825940668583,
"reward_std": 1.0200565621256827,
"rewards/accuracy_reward": 0.6160714615136385,
"rewards/cosine_scaled_reward": 0.35378490211442115,
"rewards/format_reward": 0.06875000363215804,
"rewards/reasoning_steps_reward": 0.7154762402176857,
"step": 435
},
{
"completion_length": 684.9786003112793,
"epoch": 0.9386666666666666,
"grad_norm": 0.6882645487785339,
"kl": 0.365093994140625,
"learning_rate": 3.262363228443427e-08,
"loss": 0.0146,
"reward": 1.6049893379211426,
"reward_std": 0.9915731698274612,
"rewards/accuracy_reward": 0.6035714587196708,
"rewards/cosine_scaled_reward": 0.3165964335203171,
"rewards/format_reward": 0.04732143105939031,
"rewards/reasoning_steps_reward": 0.6375000439584255,
"step": 440
},
{
"completion_length": 713.3928909301758,
"epoch": 0.9493333333333334,
"grad_norm": 0.48911258578300476,
"kl": 0.3521331787109375,
"learning_rate": 2.2038708278862952e-08,
"loss": 0.0141,
"reward": 1.5449063807725907,
"reward_std": 0.9845283433794976,
"rewards/accuracy_reward": 0.5500000244006514,
"rewards/cosine_scaled_reward": 0.28419203840894625,
"rewards/format_reward": 0.05178571678698063,
"rewards/reasoning_steps_reward": 0.6589286208152771,
"step": 445
},
{
"completion_length": 672.1143127441406,
"epoch": 0.96,
"grad_norm": 0.5151104927062988,
"kl": 0.319622802734375,
"learning_rate": 1.3511039807673209e-08,
"loss": 0.0128,
"reward": 1.7190548315644265,
"reward_std": 1.052689327299595,
"rewards/accuracy_reward": 0.6339285988360643,
"rewards/cosine_scaled_reward": 0.3443523827940226,
"rewards/format_reward": 0.0562500024214387,
"rewards/reasoning_steps_reward": 0.6845238626003265,
"step": 450
},
{
"completion_length": 676.0714645385742,
"epoch": 0.9706666666666667,
"grad_norm": 0.6873491406440735,
"kl": 0.286529541015625,
"learning_rate": 7.0524970011963675e-09,
"loss": 0.0115,
"reward": 1.8955881476402283,
"reward_std": 0.9624031879007816,
"rewards/accuracy_reward": 0.682142891176045,
"rewards/cosine_scaled_reward": 0.4223737971391529,
"rewards/format_reward": 0.07857143282890319,
"rewards/reasoning_steps_reward": 0.7125000573694706,
"step": 455
},
{
"completion_length": 679.2321739196777,
"epoch": 0.9813333333333333,
"grad_norm": 0.3787095546722412,
"kl": 0.304974365234375,
"learning_rate": 2.6720698600553595e-09,
"loss": 0.0122,
"reward": 1.7936133489012718,
"reward_std": 1.0248655170202254,
"rewards/accuracy_reward": 0.6535714577883482,
"rewards/cosine_scaled_reward": 0.38111327985534443,
"rewards/format_reward": 0.08214286155998707,
"rewards/reasoning_steps_reward": 0.6767857633531094,
"step": 460
},
{
"completion_length": 696.1339584350586,
"epoch": 0.992,
"grad_norm": 0.40489259362220764,
"kl": 0.3529052734375,
"learning_rate": 3.7585574148779613e-10,
"loss": 0.0141,
"reward": 1.6771088674664498,
"reward_std": 1.0866830073297025,
"rewards/accuracy_reward": 0.5982143137603998,
"rewards/cosine_scaled_reward": 0.3318706821650267,
"rewards/format_reward": 0.053571431525051595,
"rewards/reasoning_steps_reward": 0.6934524282813073,
"step": 465
},
{
"completion_length": 688.1964645385742,
"epoch": 0.9984,
"kl": 0.2928059895833333,
"reward": 1.8073695426185925,
"reward_std": 1.0462930103143055,
"rewards/accuracy_reward": 0.6517857536673546,
"rewards/cosine_scaled_reward": 0.40607976416746777,
"rewards/format_reward": 0.049107145673284926,
"rewards/reasoning_steps_reward": 0.700396885474523,
"step": 468,
"total_flos": 0.0,
"train_loss": 2.683533102224817,
"train_runtime": 211196.195,
"train_samples_per_second": 0.036,
"train_steps_per_second": 0.002
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}