|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 100, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 638.4285987854004, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 2.42055344581604, |
|
"kl": 0.00011775493621826171, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0, |
|
"reward": 1.1089806377887725, |
|
"reward_std": 0.8793896824121475, |
|
"rewards/accuracy_reward": 0.5767857443541289, |
|
"rewards/cosine_scaled_reward": 0.25779010977130384, |
|
"rewards/format_reward": 0.00357142873108387, |
|
"rewards/reasoning_steps_reward": 0.270833354908973, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 611.7714553833008, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 1.2694348096847534, |
|
"kl": 0.00021342039108276367, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0, |
|
"reward": 1.1431605055928231, |
|
"reward_std": 0.8805145360529423, |
|
"rewards/accuracy_reward": 0.5892857421189547, |
|
"rewards/cosine_scaled_reward": 0.3008985619725536, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.25119049232453106, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 603.9625274658204, |
|
"epoch": 0.032, |
|
"grad_norm": 4.813971042633057, |
|
"kl": 0.00024839639663696287, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0, |
|
"reward": 1.2853217244148254, |
|
"reward_std": 0.8000296212732791, |
|
"rewards/accuracy_reward": 0.6535714577883482, |
|
"rewards/cosine_scaled_reward": 0.3317502578254789, |
|
"rewards/format_reward": 0.00357142873108387, |
|
"rewards/reasoning_steps_reward": 0.2964285886846483, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 595.6982391357421, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 0.9142511487007141, |
|
"kl": 0.00047616958618164064, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0, |
|
"reward": 1.2075286597013473, |
|
"reward_std": 0.7695159167051315, |
|
"rewards/accuracy_reward": 0.6428571730852127, |
|
"rewards/cosine_scaled_reward": 0.3247905206750147, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.23988096807152032, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 644.2071708679199, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.7305999994277954, |
|
"kl": 0.0009862899780273438, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0, |
|
"reward": 1.2186039187014104, |
|
"reward_std": 0.7756468575447798, |
|
"rewards/accuracy_reward": 0.6339286027476192, |
|
"rewards/cosine_scaled_reward": 0.3200919725000858, |
|
"rewards/format_reward": 0.0008928571827709675, |
|
"rewards/reasoning_steps_reward": 0.26369049586355686, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 669.2125297546387, |
|
"epoch": 0.064, |
|
"grad_norm": 0.9431222677230835, |
|
"kl": 0.0020453929901123047, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0001, |
|
"reward": 1.3601950403302907, |
|
"reward_std": 0.760306540131569, |
|
"rewards/accuracy_reward": 0.6732143152505159, |
|
"rewards/cosine_scaled_reward": 0.3795402319636196, |
|
"rewards/format_reward": 0.0008928571827709675, |
|
"rewards/reasoning_steps_reward": 0.3065476375631988, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 643.0696685791015, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 48319720.0, |
|
"kl": 6272.004669189453, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 250.5145, |
|
"reward": 1.289367458410561, |
|
"reward_std": 0.7266312446445227, |
|
"rewards/accuracy_reward": 0.676785746589303, |
|
"rewards/cosine_scaled_reward": 0.3441293075971771, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.26666668280959127, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 669.8018165588379, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.4609315097332001, |
|
"kl": 0.018144559860229493, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 0.0007, |
|
"reward": 1.4497444801032544, |
|
"reward_std": 0.6942832075059414, |
|
"rewards/accuracy_reward": 0.6982143171131611, |
|
"rewards/cosine_scaled_reward": 0.41283965120092037, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.3386905025690794, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 642.2321701049805, |
|
"epoch": 0.096, |
|
"grad_norm": 1.9606070518493652, |
|
"kl": 0.004248189926147461, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0002, |
|
"reward": 1.4603484645485878, |
|
"reward_std": 0.7034628570079804, |
|
"rewards/accuracy_reward": 0.7232143163681031, |
|
"rewards/cosine_scaled_reward": 0.4335627053398639, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.3035714477300644, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 659.8625274658203, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 1.5518417358398438, |
|
"kl": 0.004150962829589844, |
|
"learning_rate": 2.9996241442585123e-06, |
|
"loss": 0.0002, |
|
"reward": 1.5205835647881032, |
|
"reward_std": 0.7311159037053585, |
|
"rewards/accuracy_reward": 0.7303571753203869, |
|
"rewards/cosine_scaled_reward": 0.4155239976942539, |
|
"rewards/format_reward": 0.0008928571827709675, |
|
"rewards/reasoning_steps_reward": 0.37380955144762995, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 638.6232460021972, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 1.3417630195617676, |
|
"kl": 0.01309032440185547, |
|
"learning_rate": 2.9973279301399446e-06, |
|
"loss": 0.0005, |
|
"reward": 1.4900454580783844, |
|
"reward_std": 0.7109502237290144, |
|
"rewards/accuracy_reward": 0.7035714603960515, |
|
"rewards/cosine_scaled_reward": 0.3855811151210219, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/reasoning_steps_reward": 0.3982143113389611, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 637.7786003112793, |
|
"epoch": 0.128, |
|
"grad_norm": 0.7655097842216492, |
|
"kl": 0.005014801025390625, |
|
"learning_rate": 2.992947502998804e-06, |
|
"loss": 0.0002, |
|
"reward": 1.5817858844995498, |
|
"reward_std": 0.7063489355146885, |
|
"rewards/accuracy_reward": 0.7285714685916901, |
|
"rewards/cosine_scaled_reward": 0.4130358204245567, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/reasoning_steps_reward": 0.43750002793967724, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 619.2321716308594, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.6214168667793274, |
|
"kl": 0.006450653076171875, |
|
"learning_rate": 2.9864889601923268e-06, |
|
"loss": 0.0003, |
|
"reward": 1.618351523578167, |
|
"reward_std": 0.6625145003199577, |
|
"rewards/accuracy_reward": 0.7232143163681031, |
|
"rewards/cosine_scaled_reward": 0.4254943021107465, |
|
"rewards/format_reward": 0.00357142873108387, |
|
"rewards/reasoning_steps_reward": 0.4660714641213417, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 620.4696716308594, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 1.1358349323272705, |
|
"kl": 0.007819366455078126, |
|
"learning_rate": 2.977961291721137e-06, |
|
"loss": 0.0003, |
|
"reward": 1.809953036904335, |
|
"reward_std": 0.685565372928977, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/cosine_scaled_reward": 0.47126249115681274, |
|
"rewards/format_reward": 0.00714285746216774, |
|
"rewards/reasoning_steps_reward": 0.5726190894842148, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 591.2000267028809, |
|
"epoch": 0.16, |
|
"grad_norm": 0.9748697876930237, |
|
"kl": 0.01055145263671875, |
|
"learning_rate": 2.9673763677155655e-06, |
|
"loss": 0.0004, |
|
"reward": 1.801955761015415, |
|
"reward_std": 0.6809038281440735, |
|
"rewards/accuracy_reward": 0.7410714589059353, |
|
"rewards/cosine_scaled_reward": 0.4311223858210724, |
|
"rewards/format_reward": 0.0053571430966258046, |
|
"rewards/reasoning_steps_reward": 0.6244048129767179, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 604.2036003112793, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.41957351565361023, |
|
"kl": 0.01427001953125, |
|
"learning_rate": 2.9547489219129666e-06, |
|
"loss": 0.0006, |
|
"reward": 1.9645369604229928, |
|
"reward_std": 0.5586295232176781, |
|
"rewards/accuracy_reward": 0.8053571732714773, |
|
"rewards/cosine_scaled_reward": 0.4785249759210274, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/reasoning_steps_reward": 0.6779762431979179, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 654.9571723937988, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.5244520306587219, |
|
"kl": 0.01689453125, |
|
"learning_rate": 2.9400965311490175e-06, |
|
"loss": 0.0007, |
|
"reward": 1.9714522436261177, |
|
"reward_std": 0.654699632152915, |
|
"rewards/accuracy_reward": 0.7232143158093095, |
|
"rewards/cosine_scaled_reward": 0.4538926437497139, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/reasoning_steps_reward": 0.7916667237877846, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 635.4571701049805, |
|
"epoch": 0.192, |
|
"grad_norm": 0.5995980501174927, |
|
"kl": 0.0202392578125, |
|
"learning_rate": 2.9234395908915565e-06, |
|
"loss": 0.0008, |
|
"reward": 1.9059573337435722, |
|
"reward_std": 0.6402278915047646, |
|
"rewards/accuracy_reward": 0.6857143208384514, |
|
"rewards/cosine_scaled_reward": 0.3851239001378417, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.8333333969116211, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 646.8857414245606, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.3573947846889496, |
|
"kl": 0.02309417724609375, |
|
"learning_rate": 2.904801286851009e-06, |
|
"loss": 0.0009, |
|
"reward": 2.067735290527344, |
|
"reward_std": 0.5824727656319737, |
|
"rewards/accuracy_reward": 0.739285740442574, |
|
"rewards/cosine_scaled_reward": 0.45374711682088675, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/reasoning_steps_reward": 0.8720238715410232, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 617.1839553833008, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.4597730338573456, |
|
"kl": 0.0259674072265625, |
|
"learning_rate": 2.884207562706925e-06, |
|
"loss": 0.001, |
|
"reward": 2.1270981818437575, |
|
"reward_std": 0.5901715014129877, |
|
"rewards/accuracy_reward": 0.7750000283122063, |
|
"rewards/cosine_scaled_reward": 0.49138382682576776, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.858928632736206, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_completion_length": 645.8245434692383, |
|
"eval_kl": 0.0286726806640625, |
|
"eval_loss": 0.0011589155765250325, |
|
"eval_reward": 1.9586333739757538, |
|
"eval_reward_std": 0.6544256884813309, |
|
"eval_rewards/accuracy_reward": 0.667685743278265, |
|
"eval_rewards/cosine_scaled_reward": 0.38711901631861545, |
|
"eval_rewards/format_reward": 0.0034000001534819605, |
|
"eval_rewards/reasoning_steps_reward": 0.9004286315560341, |
|
"eval_runtime": 30593.3304, |
|
"eval_samples_per_second": 0.163, |
|
"eval_steps_per_second": 0.012, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 666.8482437133789, |
|
"epoch": 0.224, |
|
"grad_norm": 0.3493061065673828, |
|
"kl": 0.027435302734375, |
|
"learning_rate": 2.8616870839955444e-06, |
|
"loss": 0.0011, |
|
"reward": 2.089837631583214, |
|
"reward_std": 0.6151095872744918, |
|
"rewards/accuracy_reward": 0.7214286036789417, |
|
"rewards/cosine_scaled_reward": 0.44727803440764546, |
|
"rewards/format_reward": 0.0026785715483129023, |
|
"rewards/reasoning_steps_reward": 0.9184524416923523, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 689.3928901672364, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.7455374002456665, |
|
"kl": 0.0316131591796875, |
|
"learning_rate": 2.837271198208662e-06, |
|
"loss": 0.0013, |
|
"reward": 2.129907730221748, |
|
"reward_std": 0.5857493598014116, |
|
"rewards/accuracy_reward": 0.7428571671247483, |
|
"rewards/cosine_scaled_reward": 0.47633622232824563, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.9089286178350449, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 648.7071731567382, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.423718124628067, |
|
"kl": 0.0330322265625, |
|
"learning_rate": 2.8109938911593322e-06, |
|
"loss": 0.0013, |
|
"reward": 2.1340139895677566, |
|
"reward_std": 0.5813389342278242, |
|
"rewards/accuracy_reward": 0.7232143152505159, |
|
"rewards/cosine_scaled_reward": 0.4527639038278721, |
|
"rewards/format_reward": 0.008035714644938708, |
|
"rewards/reasoning_steps_reward": 0.9500000536441803, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 669.1518165588379, |
|
"epoch": 0.256, |
|
"grad_norm": 0.5140184760093689, |
|
"kl": 0.0355621337890625, |
|
"learning_rate": 2.7828917396751474e-06, |
|
"loss": 0.0015, |
|
"reward": 2.123083771765232, |
|
"reward_std": 0.6066710935905576, |
|
"rewards/accuracy_reward": 0.7285714626312256, |
|
"rewards/cosine_scaled_reward": 0.4486789128568489, |
|
"rewards/format_reward": 0.00357142873108387, |
|
"rewards/reasoning_steps_reward": 0.9422619551420212, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 651.8714584350586, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 14.626392364501953, |
|
"kl": 0.054949951171875, |
|
"learning_rate": 2.753003860684943e-06, |
|
"loss": 0.0022, |
|
"reward": 2.238000822067261, |
|
"reward_std": 0.6136660899966955, |
|
"rewards/accuracy_reward": 0.7803571730852127, |
|
"rewards/cosine_scaled_reward": 0.5305602598935366, |
|
"rewards/format_reward": 0.00625000037252903, |
|
"rewards/reasoning_steps_reward": 0.9208333969116211, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 687.6518211364746, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.4999578297138214, |
|
"kl": 0.0349700927734375, |
|
"learning_rate": 2.721371856769793e-06, |
|
"loss": 0.0014, |
|
"reward": 2.085830058157444, |
|
"reward_std": 0.5865108415484428, |
|
"rewards/accuracy_reward": 0.6946428868919611, |
|
"rewards/cosine_scaled_reward": 0.4269014226272702, |
|
"rewards/format_reward": 0.010714286193251609, |
|
"rewards/reasoning_steps_reward": 0.9535714745521545, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 625.8964546203613, |
|
"epoch": 0.288, |
|
"grad_norm": 0.5372105240821838, |
|
"kl": 0.039947509765625, |
|
"learning_rate": 2.688039758254093e-06, |
|
"loss": 0.0016, |
|
"reward": 2.243368774652481, |
|
"reward_std": 0.6026589145883918, |
|
"rewards/accuracy_reward": 0.7750000316649676, |
|
"rewards/cosine_scaled_reward": 0.4892020009458065, |
|
"rewards/format_reward": 0.01428571492433548, |
|
"rewards/reasoning_steps_reward": 0.9648810073733329, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 657.5696739196777, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 0.6321460008621216, |
|
"kl": 0.0416748046875, |
|
"learning_rate": 2.65305396191733e-06, |
|
"loss": 0.0017, |
|
"reward": 2.1084320515394213, |
|
"reward_std": 0.7091156661510467, |
|
"rewards/accuracy_reward": 0.7071428887546063, |
|
"rewards/cosine_scaled_reward": 0.42956293127499523, |
|
"rewards/format_reward": 0.01696428647264838, |
|
"rewards/reasoning_steps_reward": 0.9547619611024857, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 669.2839614868165, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.4815462529659271, |
|
"kl": 0.0447967529296875, |
|
"learning_rate": 2.61646316641186e-06, |
|
"loss": 0.0018, |
|
"reward": 2.022332654893398, |
|
"reward_std": 0.6810956679284572, |
|
"rewards/accuracy_reward": 0.6750000305473804, |
|
"rewards/cosine_scaled_reward": 0.4035826030303724, |
|
"rewards/format_reward": 0.009821429010480642, |
|
"rewards/reasoning_steps_reward": 0.9339286148548126, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 647.9393119812012, |
|
"epoch": 0.32, |
|
"grad_norm": 0.4961049556732178, |
|
"kl": 0.0510009765625, |
|
"learning_rate": 2.5783183044765715e-06, |
|
"loss": 0.002, |
|
"reward": 1.9679424732923507, |
|
"reward_std": 0.7103641763329506, |
|
"rewards/accuracy_reward": 0.6678571719676256, |
|
"rewards/cosine_scaled_reward": 0.3926447768812068, |
|
"rewards/format_reward": 0.008035714644938708, |
|
"rewards/reasoning_steps_reward": 0.8994048193097115, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 625.0803840637207, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.5056385397911072, |
|
"kl": 0.06827392578125, |
|
"learning_rate": 2.5386724720408135e-06, |
|
"loss": 0.0027, |
|
"reward": 2.0651650190353394, |
|
"reward_std": 0.6431151006370783, |
|
"rewards/accuracy_reward": 0.7375000286847353, |
|
"rewards/cosine_scaled_reward": 0.45712922792881727, |
|
"rewards/format_reward": 0.009821429010480642, |
|
"rewards/reasoning_steps_reward": 0.860714353621006, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 611.2964599609375, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.4566061794757843, |
|
"kl": 0.08109130859375, |
|
"learning_rate": 2.49758085431725e-06, |
|
"loss": 0.0032, |
|
"reward": 2.0270636796951296, |
|
"reward_std": 0.6181262265890837, |
|
"rewards/accuracy_reward": 0.7464285964146257, |
|
"rewards/cosine_scaled_reward": 0.43301601126149764, |
|
"rewards/format_reward": 0.012500000558793545, |
|
"rewards/reasoning_steps_reward": 0.8351191088557244, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 643.8536026000977, |
|
"epoch": 0.352, |
|
"grad_norm": 0.3811902403831482, |
|
"kl": 0.12118072509765625, |
|
"learning_rate": 2.455100648986533e-06, |
|
"loss": 0.0048, |
|
"reward": 1.9545473739504815, |
|
"reward_std": 0.7556829001754523, |
|
"rewards/accuracy_reward": 0.6857143171131611, |
|
"rewards/cosine_scaled_reward": 0.40722584864124656, |
|
"rewards/format_reward": 0.008035714644938708, |
|
"rewards/reasoning_steps_reward": 0.8535714909434319, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 666.0553924560547, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 0.5245673656463623, |
|
"kl": 0.1458251953125, |
|
"learning_rate": 2.4112909865807053e-06, |
|
"loss": 0.0058, |
|
"reward": 1.7782447993755341, |
|
"reward_std": 0.7516406249254942, |
|
"rewards/accuracy_reward": 0.6017857398837805, |
|
"rewards/cosine_scaled_reward": 0.32050663968548176, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/reasoning_steps_reward": 0.8470238700509072, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 656.5375297546386, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.3452568054199219, |
|
"kl": 0.100311279296875, |
|
"learning_rate": 2.366212848176164e-06, |
|
"loss": 0.004, |
|
"reward": 2.048972634971142, |
|
"reward_std": 0.706931572034955, |
|
"rewards/accuracy_reward": 0.7178571790456771, |
|
"rewards/cosine_scaled_reward": 0.4686154007911682, |
|
"rewards/format_reward": 0.0053571430966258046, |
|
"rewards/reasoning_steps_reward": 0.8571429222822189, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 625.3053825378418, |
|
"epoch": 0.384, |
|
"grad_norm": 0.7150533199310303, |
|
"kl": 0.1327392578125, |
|
"learning_rate": 2.319928980510752e-06, |
|
"loss": 0.0053, |
|
"reward": 1.9931991159915925, |
|
"reward_std": 0.7508711714297533, |
|
"rewards/accuracy_reward": 0.7053571753203869, |
|
"rewards/cosine_scaled_reward": 0.4312942801974714, |
|
"rewards/format_reward": 0.012500000651925802, |
|
"rewards/reasoning_steps_reward": 0.8440476790070534, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 662.8018127441406, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 0.5048078894615173, |
|
"kl": 0.2526092529296875, |
|
"learning_rate": 2.272503808643123e-06, |
|
"loss": 0.0101, |
|
"reward": 1.6179989255964755, |
|
"reward_std": 0.826694194227457, |
|
"rewards/accuracy_reward": 0.5803571738302707, |
|
"rewards/cosine_scaled_reward": 0.283177447039634, |
|
"rewards/format_reward": 0.009821429010480642, |
|
"rewards/reasoning_steps_reward": 0.7446429103612899, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 612.7518165588378, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 15.008540153503418, |
|
"kl": 0.2234375, |
|
"learning_rate": 2.2240033462759628e-06, |
|
"loss": 0.0089, |
|
"reward": 1.8689126953482629, |
|
"reward_std": 0.8196798441931605, |
|
"rewards/accuracy_reward": 0.6892857480794191, |
|
"rewards/cosine_scaled_reward": 0.38706737738102676, |
|
"rewards/format_reward": 0.0062500002793967726, |
|
"rewards/reasoning_steps_reward": 0.786309577524662, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 636.9143135070801, |
|
"epoch": 0.416, |
|
"grad_norm": 1.8482682704925537, |
|
"kl": 0.1047210693359375, |
|
"learning_rate": 2.1744951038678905e-06, |
|
"loss": 0.0042, |
|
"reward": 2.1131999254226685, |
|
"reward_std": 0.6497842017561197, |
|
"rewards/accuracy_reward": 0.7464286003261804, |
|
"rewards/cosine_scaled_reward": 0.475997456186451, |
|
"rewards/format_reward": 0.011607143562287092, |
|
"rewards/reasoning_steps_reward": 0.8791667327284813, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 643.9375305175781, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 1.6493088006973267, |
|
"kl": 0.1329986572265625, |
|
"learning_rate": 2.124047994661941e-06, |
|
"loss": 0.0053, |
|
"reward": 2.1098326206207276, |
|
"reward_std": 0.6796215798705816, |
|
"rewards/accuracy_reward": 0.7250000312924385, |
|
"rewards/cosine_scaled_reward": 0.4544754126574844, |
|
"rewards/format_reward": 0.019642858114093543, |
|
"rewards/reasoning_steps_reward": 0.9107143446803093, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_completion_length": 658.4622008789063, |
|
"eval_kl": 0.16044462890625, |
|
"eval_loss": 0.006352806463837624, |
|
"eval_reward": 2.0124860629320143, |
|
"eval_reward_std": 0.7236331352472305, |
|
"eval_rewards/accuracy_reward": 0.6609143146038056, |
|
"eval_rewards/cosine_scaled_reward": 0.3881764653600403, |
|
"eval_rewards/format_reward": 0.03644285902827978, |
|
"eval_rewards/reasoning_steps_reward": 0.9269524365663528, |
|
"eval_runtime": 39396.7661, |
|
"eval_samples_per_second": 0.127, |
|
"eval_steps_per_second": 0.009, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 664.1589576721192, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 0.5569435358047485, |
|
"kl": 0.1752899169921875, |
|
"learning_rate": 2.072732238761434e-06, |
|
"loss": 0.007, |
|
"reward": 2.151426687836647, |
|
"reward_std": 0.6980113681405783, |
|
"rewards/accuracy_reward": 0.732142886519432, |
|
"rewards/cosine_scaled_reward": 0.4615456376457587, |
|
"rewards/format_reward": 0.03750000176951289, |
|
"rewards/reasoning_steps_reward": 0.9202381521463394, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 638.1839584350586, |
|
"epoch": 0.448, |
|
"grad_norm": 0.8609623312950134, |
|
"kl": 0.1427032470703125, |
|
"learning_rate": 2.0206192653867536e-06, |
|
"loss": 0.0057, |
|
"reward": 2.260927739739418, |
|
"reward_std": 0.6022655628621578, |
|
"rewards/accuracy_reward": 0.7767857406288385, |
|
"rewards/cosine_scaled_reward": 0.5171776844188571, |
|
"rewards/format_reward": 0.045535716507583854, |
|
"rewards/reasoning_steps_reward": 0.9214286223053932, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 723.9089622497559, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 1.7239713668823242, |
|
"kl": 0.304364013671875, |
|
"learning_rate": 1.967781613449095e-06, |
|
"loss": 0.0122, |
|
"reward": 1.6578506268560886, |
|
"reward_std": 0.8106920622289181, |
|
"rewards/accuracy_reward": 0.5500000214204193, |
|
"rewards/cosine_scaled_reward": 0.2706481910310686, |
|
"rewards/format_reward": 0.020535715389996767, |
|
"rewards/reasoning_steps_reward": 0.8166667185723782, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 722.1321716308594, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 3.4466731548309326, |
|
"kl": 0.541461181640625, |
|
"learning_rate": 1.9142928305795637e-06, |
|
"loss": 0.0217, |
|
"reward": 1.1115448012948037, |
|
"reward_std": 0.9772109590470791, |
|
"rewards/accuracy_reward": 0.3910714492201805, |
|
"rewards/cosine_scaled_reward": 0.09517570563766639, |
|
"rewards/format_reward": 0.01339285783469677, |
|
"rewards/reasoning_steps_reward": 0.6119048058986664, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 655.9178810119629, |
|
"epoch": 0.48, |
|
"grad_norm": 0.7818121910095215, |
|
"kl": 0.38446044921875, |
|
"learning_rate": 1.8602273707541886e-06, |
|
"loss": 0.0154, |
|
"reward": 1.443753632903099, |
|
"reward_std": 1.1735275402665137, |
|
"rewards/accuracy_reward": 0.5517857423052192, |
|
"rewards/cosine_scaled_reward": 0.263098827842623, |
|
"rewards/format_reward": 0.022321429569274187, |
|
"rewards/reasoning_steps_reward": 0.6065476641058922, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 669.6357391357421, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 0.3009130656719208, |
|
"kl": 0.186187744140625, |
|
"learning_rate": 1.8056604906573418e-06, |
|
"loss": 0.0074, |
|
"reward": 1.8981928735971452, |
|
"reward_std": 0.8392410669475794, |
|
"rewards/accuracy_reward": 0.6964286047965288, |
|
"rewards/cosine_scaled_reward": 0.438073761574924, |
|
"rewards/format_reward": 0.012500000558793545, |
|
"rewards/reasoning_steps_reward": 0.7511905305087566, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 653.5732406616211, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 4.625399589538574, |
|
"kl": 0.05731353759765625, |
|
"learning_rate": 1.7506681449278226e-06, |
|
"loss": 0.0023, |
|
"reward": 2.11394245326519, |
|
"reward_std": 0.6526154175400734, |
|
"rewards/accuracy_reward": 0.7375000312924385, |
|
"rewards/cosine_scaled_reward": 0.4865614231675863, |
|
"rewards/format_reward": 0.021428572479635477, |
|
"rewards/reasoning_steps_reward": 0.8684524416923523, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 645.69467086792, |
|
"epoch": 0.512, |
|
"grad_norm": 0.2851395606994629, |
|
"kl": 0.050042724609375, |
|
"learning_rate": 1.6953268804334257e-06, |
|
"loss": 0.002, |
|
"reward": 2.228366295993328, |
|
"reward_std": 0.5444579780101776, |
|
"rewards/accuracy_reward": 0.7642857372760773, |
|
"rewards/cosine_scaled_reward": 0.514675722271204, |
|
"rewards/format_reward": 0.03392857322469354, |
|
"rewards/reasoning_steps_reward": 0.9154762506484986, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 632.8053848266602, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 0.34539249539375305, |
|
"kl": 0.0962860107421875, |
|
"learning_rate": 1.6397137297211436e-06, |
|
"loss": 0.0039, |
|
"reward": 2.337542861700058, |
|
"reward_std": 0.557469642162323, |
|
"rewards/accuracy_reward": 0.8142857376486063, |
|
"rewards/cosine_scaled_reward": 0.553614255785942, |
|
"rewards/format_reward": 0.041071430593729016, |
|
"rewards/reasoning_steps_reward": 0.9285714775323868, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 659.1518135070801, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.30278754234313965, |
|
"kl": 0.0599609375, |
|
"learning_rate": 1.5839061037913395e-06, |
|
"loss": 0.0024, |
|
"reward": 2.421455779671669, |
|
"reward_std": 0.5301560776308178, |
|
"rewards/accuracy_reward": 0.8410714566707611, |
|
"rewards/cosine_scaled_reward": 0.6059795372188092, |
|
"rewards/format_reward": 0.028571429941803218, |
|
"rewards/reasoning_steps_reward": 0.9458333805203438, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 713.3518188476562, |
|
"epoch": 0.544, |
|
"grad_norm": 0.3068905770778656, |
|
"kl": 0.0843505859375, |
|
"learning_rate": 1.527981684345115e-06, |
|
"loss": 0.0034, |
|
"reward": 2.1079654544591904, |
|
"reward_std": 0.6414080807939172, |
|
"rewards/accuracy_reward": 0.7000000279396772, |
|
"rewards/cosine_scaled_reward": 0.4207629946060479, |
|
"rewards/format_reward": 0.03482143022119999, |
|
"rewards/reasoning_steps_reward": 0.9523809969425201, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 707.9411071777344, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 0.33481159806251526, |
|
"kl": 0.07261962890625, |
|
"learning_rate": 1.4720183156548855e-06, |
|
"loss": 0.0029, |
|
"reward": 2.2514570981264113, |
|
"reward_std": 0.6310873694717885, |
|
"rewards/accuracy_reward": 0.7321428906172514, |
|
"rewards/cosine_scaled_reward": 0.5041355590336025, |
|
"rewards/format_reward": 0.05089285969734192, |
|
"rewards/reasoning_steps_reward": 0.9642857611179352, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 718.3107444763184, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 0.38096827268600464, |
|
"kl": 0.08720703125, |
|
"learning_rate": 1.4160938962086612e-06, |
|
"loss": 0.0035, |
|
"reward": 2.1334225252270698, |
|
"reward_std": 0.6659108363091946, |
|
"rewards/accuracy_reward": 0.7035714538767934, |
|
"rewards/cosine_scaled_reward": 0.44383912505581974, |
|
"rewards/format_reward": 0.04732143105939031, |
|
"rewards/reasoning_steps_reward": 0.9386905342340469, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 691.2268180847168, |
|
"epoch": 0.576, |
|
"grad_norm": 0.28566083312034607, |
|
"kl": 0.088311767578125, |
|
"learning_rate": 1.3602862702788567e-06, |
|
"loss": 0.0035, |
|
"reward": 2.2702409833669663, |
|
"reward_std": 0.6507027853280306, |
|
"rewards/accuracy_reward": 0.7500000327825547, |
|
"rewards/cosine_scaled_reward": 0.508336108038202, |
|
"rewards/format_reward": 0.057142859976738694, |
|
"rewards/reasoning_steps_reward": 0.9547619566321373, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 688.7464607238769, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.35180163383483887, |
|
"kl": 0.0841522216796875, |
|
"learning_rate": 1.3046731195665748e-06, |
|
"loss": 0.0034, |
|
"reward": 2.3181463330984116, |
|
"reward_std": 0.6268971297889948, |
|
"rewards/accuracy_reward": 0.7696428872644901, |
|
"rewards/cosine_scaled_reward": 0.5246938619762659, |
|
"rewards/format_reward": 0.06785714644938708, |
|
"rewards/reasoning_steps_reward": 0.9559524253010749, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 687.6589630126953, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 0.3075723648071289, |
|
"kl": 0.0902801513671875, |
|
"learning_rate": 1.2493318550721775e-06, |
|
"loss": 0.0036, |
|
"reward": 2.242439457774162, |
|
"reward_std": 0.6110217805951834, |
|
"rewards/accuracy_reward": 0.7375000305473804, |
|
"rewards/cosine_scaled_reward": 0.49422508366405965, |
|
"rewards/format_reward": 0.0678571461699903, |
|
"rewards/reasoning_steps_reward": 0.9428571984171867, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 702.2446731567383, |
|
"epoch": 0.608, |
|
"grad_norm": 0.3365749716758728, |
|
"kl": 0.1429901123046875, |
|
"learning_rate": 1.1943395093426585e-06, |
|
"loss": 0.0057, |
|
"reward": 2.236051079630852, |
|
"reward_std": 0.6939984124153853, |
|
"rewards/accuracy_reward": 0.7464286036789417, |
|
"rewards/cosine_scaled_reward": 0.515515277441591, |
|
"rewards/format_reward": 0.06517857508733868, |
|
"rewards/reasoning_steps_reward": 0.9089286342263222, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 667.2232437133789, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 0.3665401041507721, |
|
"kl": 0.1312591552734375, |
|
"learning_rate": 1.1397726292458115e-06, |
|
"loss": 0.0053, |
|
"reward": 2.2684289827942847, |
|
"reward_std": 0.6653882045298815, |
|
"rewards/accuracy_reward": 0.751785746589303, |
|
"rewards/cosine_scaled_reward": 0.5151550889015197, |
|
"rewards/format_reward": 0.07410714691504836, |
|
"rewards/reasoning_steps_reward": 0.927381020784378, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 723.4911033630372, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 0.25975558161735535, |
|
"kl": 0.14805908203125, |
|
"learning_rate": 1.085707169420437e-06, |
|
"loss": 0.0059, |
|
"reward": 2.1135958269238473, |
|
"reward_std": 0.6525055527687073, |
|
"rewards/accuracy_reward": 0.6732143115252256, |
|
"rewards/cosine_scaled_reward": 0.4448457522317767, |
|
"rewards/format_reward": 0.08660714710131287, |
|
"rewards/reasoning_steps_reward": 0.9089286252856255, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 685.9928894042969, |
|
"epoch": 0.64, |
|
"grad_norm": 0.32239413261413574, |
|
"kl": 0.1186309814453125, |
|
"learning_rate": 1.0322183865509054e-06, |
|
"loss": 0.0047, |
|
"reward": 2.3491215094923974, |
|
"reward_std": 0.6740828949958086, |
|
"rewards/accuracy_reward": 0.7928571742027998, |
|
"rewards/cosine_scaled_reward": 0.5387047556228935, |
|
"rewards/format_reward": 0.11339286286383868, |
|
"rewards/reasoning_steps_reward": 0.9041667297482491, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_completion_length": 692.8417169921875, |
|
"eval_kl": 0.126764990234375, |
|
"eval_loss": 0.005116811487823725, |
|
"eval_reward": 2.1439696138501168, |
|
"eval_reward_std": 0.7277915328145027, |
|
"eval_rewards/accuracy_reward": 0.6845143143117428, |
|
"eval_rewards/cosine_scaled_reward": 0.44376001094253736, |
|
"eval_rewards/format_reward": 0.10977143388986588, |
|
"eval_rewards/reasoning_steps_reward": 0.9059238699197769, |
|
"eval_runtime": 39599.431, |
|
"eval_samples_per_second": 0.126, |
|
"eval_steps_per_second": 0.009, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 699.3339584350585, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 0.3004523515701294, |
|
"kl": 0.1110198974609375, |
|
"learning_rate": 9.793807346132464e-07, |
|
"loss": 0.0044, |
|
"reward": 2.2790004700422286, |
|
"reward_std": 0.7021285973489284, |
|
"rewards/accuracy_reward": 0.7607143171131611, |
|
"rewards/cosine_scaled_reward": 0.5272146660834551, |
|
"rewards/format_reward": 0.09107143292203546, |
|
"rewards/reasoning_steps_reward": 0.9000000640749931, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 699.3411010742187, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 0.29891085624694824, |
|
"kl": 0.13359375, |
|
"learning_rate": 9.272677612385667e-07, |
|
"loss": 0.0053, |
|
"reward": 2.1629825204610826, |
|
"reward_std": 0.6936808105558157, |
|
"rewards/accuracy_reward": 0.7142857410013675, |
|
"rewards/cosine_scaled_reward": 0.46447055372409524, |
|
"rewards/format_reward": 0.097321433480829, |
|
"rewards/reasoning_steps_reward": 0.8869048282504082, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 706.1196708679199, |
|
"epoch": 0.672, |
|
"grad_norm": 0.3089028596878052, |
|
"kl": 0.1456146240234375, |
|
"learning_rate": 8.759520053380591e-07, |
|
"loss": 0.0058, |
|
"reward": 2.1019979074597357, |
|
"reward_std": 0.740777799114585, |
|
"rewards/accuracy_reward": 0.6803571773692966, |
|
"rewards/cosine_scaled_reward": 0.44158115636964795, |
|
"rewards/format_reward": 0.08839286118745804, |
|
"rewards/reasoning_steps_reward": 0.8916667312383652, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 669.9286071777344, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 0.6965083479881287, |
|
"kl": 0.145257568359375, |
|
"learning_rate": 8.255048961321088e-07, |
|
"loss": 0.0058, |
|
"reward": 2.272875265777111, |
|
"reward_std": 0.7233634147793054, |
|
"rewards/accuracy_reward": 0.7553571749478578, |
|
"rewards/cosine_scaled_reward": 0.5264466149732471, |
|
"rewards/format_reward": 0.10892857704311609, |
|
"rewards/reasoning_steps_reward": 0.8821429207921028, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 692.9786003112793, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.33192208409309387, |
|
"kl": 0.1722869873046875, |
|
"learning_rate": 7.759966537240373e-07, |
|
"loss": 0.0069, |
|
"reward": 2.169807307422161, |
|
"reward_std": 0.8210989892482757, |
|
"rewards/accuracy_reward": 0.7089285964146257, |
|
"rewards/cosine_scaled_reward": 0.4582000946626067, |
|
"rewards/format_reward": 0.11160714961588383, |
|
"rewards/reasoning_steps_reward": 0.8910714894533157, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 705.5911056518555, |
|
"epoch": 0.704, |
|
"grad_norm": 0.5306978821754456, |
|
"kl": 0.24217529296875, |
|
"learning_rate": 7.274961913568773e-07, |
|
"loss": 0.0097, |
|
"reward": 2.0500947162508965, |
|
"reward_std": 0.8266730591654777, |
|
"rewards/accuracy_reward": 0.667857170663774, |
|
"rewards/cosine_scaled_reward": 0.44116610190831124, |
|
"rewards/format_reward": 0.11071429131552576, |
|
"rewards/reasoning_steps_reward": 0.8303571999073028, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 731.1839591979981, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 0.4849264919757843, |
|
"kl": 0.2579345703125, |
|
"learning_rate": 6.800710194892484e-07, |
|
"loss": 0.0103, |
|
"reward": 1.9499552190303802, |
|
"reward_std": 0.9148915704339743, |
|
"rewards/accuracy_reward": 0.6428571701049804, |
|
"rewards/cosine_scaled_reward": 0.3943003877531737, |
|
"rewards/format_reward": 0.10267857694998384, |
|
"rewards/reasoning_steps_reward": 0.8101191118359565, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 733.7321731567383, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 0.6697672009468079, |
|
"kl": 0.3463623046875, |
|
"learning_rate": 6.33787151823836e-07, |
|
"loss": 0.0139, |
|
"reward": 1.6754619617015123, |
|
"reward_std": 1.0652375385165214, |
|
"rewards/accuracy_reward": 0.5571428790688515, |
|
"rewards/cosine_scaled_reward": 0.29867618879216024, |
|
"rewards/format_reward": 0.07500000381842256, |
|
"rewards/reasoning_steps_reward": 0.7446429077535868, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 708.6964599609375, |
|
"epoch": 0.736, |
|
"grad_norm": 0.342541366815567, |
|
"kl": 0.3625, |
|
"learning_rate": 5.887090134192947e-07, |
|
"loss": 0.0145, |
|
"reward": 1.7074500739574432, |
|
"reward_std": 1.06461516097188, |
|
"rewards/accuracy_reward": 0.5910714540630579, |
|
"rewards/cosine_scaled_reward": 0.3163785987533629, |
|
"rewards/format_reward": 0.07321428908035159, |
|
"rewards/reasoning_steps_reward": 0.7267857655882836, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 741.1839630126954, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.5499653816223145, |
|
"kl": 0.4493896484375, |
|
"learning_rate": 5.448993510134669e-07, |
|
"loss": 0.018, |
|
"reward": 1.4979531578719616, |
|
"reward_std": 1.063758409768343, |
|
"rewards/accuracy_reward": 0.5339285928755999, |
|
"rewards/cosine_scaled_reward": 0.2485483249882236, |
|
"rewards/format_reward": 0.06071428917348385, |
|
"rewards/reasoning_steps_reward": 0.6547619514167309, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 718.5732467651367, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 0.8918161988258362, |
|
"kl": 0.5351806640625, |
|
"learning_rate": 5.024191456827498e-07, |
|
"loss": 0.0214, |
|
"reward": 1.2987217612564563, |
|
"reward_std": 1.1655599363148212, |
|
"rewards/accuracy_reward": 0.49464288353919983, |
|
"rewards/cosine_scaled_reward": 0.19366217765491456, |
|
"rewards/format_reward": 0.04017857378348708, |
|
"rewards/reasoning_steps_reward": 0.5702381379902363, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 703.1107444763184, |
|
"epoch": 0.768, |
|
"grad_norm": 0.3764072358608246, |
|
"kl": 0.4150390625, |
|
"learning_rate": 4.6132752795918667e-07, |
|
"loss": 0.0166, |
|
"reward": 1.4548213778063654, |
|
"reward_std": 1.117940279096365, |
|
"rewards/accuracy_reward": 0.5250000219792128, |
|
"rewards/cosine_scaled_reward": 0.25392846008762715, |
|
"rewards/format_reward": 0.054464288800954816, |
|
"rewards/reasoning_steps_reward": 0.6214286223053932, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 702.7321716308594, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 0.536405086517334, |
|
"kl": 0.2932281494140625, |
|
"learning_rate": 4.2168169552342905e-07, |
|
"loss": 0.0117, |
|
"reward": 1.7472290426492691, |
|
"reward_std": 1.064868475496769, |
|
"rewards/accuracy_reward": 0.614285746589303, |
|
"rewards/cosine_scaled_reward": 0.35437183200847355, |
|
"rewards/format_reward": 0.06071428880095482, |
|
"rewards/reasoning_steps_reward": 0.7178571954369545, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 667.4143173217774, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 1.1500115394592285, |
|
"kl": 0.255450439453125, |
|
"learning_rate": 3.8353683358814046e-07, |
|
"loss": 0.0102, |
|
"reward": 1.826224359869957, |
|
"reward_std": 0.9232858289033175, |
|
"rewards/accuracy_reward": 0.6482143169268966, |
|
"rewards/cosine_scaled_reward": 0.3732480947277509, |
|
"rewards/format_reward": 0.06607143199071289, |
|
"rewards/reasoning_steps_reward": 0.7386905357241631, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 681.9518119812012, |
|
"epoch": 0.8, |
|
"grad_norm": 0.8492513298988342, |
|
"kl": 0.2910614013671875, |
|
"learning_rate": 3.469460380826697e-07, |
|
"loss": 0.0117, |
|
"reward": 1.7300246395170689, |
|
"reward_std": 0.9816528409719467, |
|
"rewards/accuracy_reward": 0.6125000230967999, |
|
"rewards/cosine_scaled_reward": 0.3600841243751347, |
|
"rewards/format_reward": 0.052678574342280626, |
|
"rewards/reasoning_steps_reward": 0.7047619506716728, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 683.8536003112793, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 0.43946385383605957, |
|
"kl": 0.35491943359375, |
|
"learning_rate": 3.119602417459075e-07, |
|
"loss": 0.0142, |
|
"reward": 1.6164295073598622, |
|
"reward_std": 1.0403125062584877, |
|
"rewards/accuracy_reward": 0.5767857421189546, |
|
"rewards/cosine_scaled_reward": 0.29619133038795553, |
|
"rewards/format_reward": 0.0482142879627645, |
|
"rewards/reasoning_steps_reward": 0.6952381365001201, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 661.137525177002, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 0.5927759408950806, |
|
"kl": 0.251336669921875, |
|
"learning_rate": 2.786281432302071e-07, |
|
"loss": 0.0101, |
|
"reward": 1.8459785029292106, |
|
"reward_std": 0.8845801506191492, |
|
"rewards/accuracy_reward": 0.6821428865194321, |
|
"rewards/cosine_scaled_reward": 0.3781212717294693, |
|
"rewards/format_reward": 0.06428571781143547, |
|
"rewards/reasoning_steps_reward": 0.7214286215603352, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 700.2768203735352, |
|
"epoch": 0.832, |
|
"grad_norm": 0.5752273797988892, |
|
"kl": 0.379559326171875, |
|
"learning_rate": 2.46996139315057e-07, |
|
"loss": 0.0152, |
|
"reward": 1.6465823888778686, |
|
"reward_std": 1.0167622987180949, |
|
"rewards/accuracy_reward": 0.6142857445403933, |
|
"rewards/cosine_scaled_reward": 0.31146327857859435, |
|
"rewards/format_reward": 0.07500000344589353, |
|
"rewards/reasoning_steps_reward": 0.6458333857357502, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 688.6482421875, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 0.41832882165908813, |
|
"kl": 0.379150390625, |
|
"learning_rate": 2.1710826032485286e-07, |
|
"loss": 0.0152, |
|
"reward": 1.6644656013697385, |
|
"reward_std": 0.9824759595096111, |
|
"rewards/accuracy_reward": 0.6250000283122062, |
|
"rewards/cosine_scaled_reward": 0.3260727058397606, |
|
"rewards/format_reward": 0.054464288614690305, |
|
"rewards/reasoning_steps_reward": 0.658928620070219, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 719.937533569336, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.5534791350364685, |
|
"kl": 0.382177734375, |
|
"learning_rate": 1.8900610884066817e-07, |
|
"loss": 0.0153, |
|
"reward": 1.4879010431468487, |
|
"reward_std": 1.0550432510674, |
|
"rewards/accuracy_reward": 0.5410714585334062, |
|
"rewards/cosine_scaled_reward": 0.2453414467825496, |
|
"rewards/format_reward": 0.0491071455180645, |
|
"rewards/reasoning_steps_reward": 0.6523810014128685, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_completion_length": 695.8426594726562, |
|
"eval_kl": 0.383571875, |
|
"eval_loss": 0.015375643037259579, |
|
"eval_reward": 1.5146705395892262, |
|
"eval_reward_std": 1.0417588331997394, |
|
"eval_rewards/accuracy_reward": 0.5409714534372091, |
|
"eval_rewards/cosine_scaled_reward": 0.24984666706966235, |
|
"eval_rewards/format_reward": 0.060157146042585374, |
|
"eval_rewards/reasoning_steps_reward": 0.66369528632164, |
|
"eval_runtime": 40348.1586, |
|
"eval_samples_per_second": 0.124, |
|
"eval_steps_per_second": 0.009, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 709.2018096923828, |
|
"epoch": 0.864, |
|
"grad_norm": 0.37454745173454285, |
|
"kl": 0.43958740234375, |
|
"learning_rate": 1.627288017913383e-07, |
|
"loss": 0.0176, |
|
"reward": 1.5630248546600343, |
|
"reward_std": 1.0267837572842837, |
|
"rewards/accuracy_reward": 0.5678571719676256, |
|
"rewards/cosine_scaled_reward": 0.28772715290542694, |
|
"rewards/format_reward": 0.04910714561119676, |
|
"rewards/reasoning_steps_reward": 0.6583333760499954, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 715.1696792602539, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 0.5133277773857117, |
|
"kl": 0.399395751953125, |
|
"learning_rate": 1.3831291600445573e-07, |
|
"loss": 0.016, |
|
"reward": 1.5371075724251568, |
|
"reward_std": 1.0601157665252685, |
|
"rewards/accuracy_reward": 0.553571455925703, |
|
"rewards/cosine_scaled_reward": 0.28829799513332544, |
|
"rewards/format_reward": 0.053571431431919336, |
|
"rewards/reasoning_steps_reward": 0.6416667148470878, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 693.0446723937988, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 0.7482662200927734, |
|
"kl": 0.376470947265625, |
|
"learning_rate": 1.1579243729307487e-07, |
|
"loss": 0.0151, |
|
"reward": 1.516674379259348, |
|
"reward_std": 0.9749270871281623, |
|
"rewards/accuracy_reward": 0.560714315250516, |
|
"rewards/cosine_scaled_reward": 0.27411481700837614, |
|
"rewards/format_reward": 0.043750001955777405, |
|
"rewards/reasoning_steps_reward": 0.638095286488533, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 708.925032043457, |
|
"epoch": 0.896, |
|
"grad_norm": 0.38554155826568604, |
|
"kl": 0.4101318359375, |
|
"learning_rate": 9.519871314899092e-08, |
|
"loss": 0.0164, |
|
"reward": 1.5347512325271964, |
|
"reward_std": 1.034306138008833, |
|
"rewards/accuracy_reward": 0.585714316368103, |
|
"rewards/cosine_scaled_reward": 0.2793940259842202, |
|
"rewards/format_reward": 0.05000000260770321, |
|
"rewards/reasoning_steps_reward": 0.6196429081261158, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 692.3571731567383, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.390541672706604, |
|
"kl": 0.294134521484375, |
|
"learning_rate": 7.656040910844358e-08, |
|
"loss": 0.0118, |
|
"reward": 1.7413318648934364, |
|
"reward_std": 0.9963843055069447, |
|
"rewards/accuracy_reward": 0.6285714589059352, |
|
"rewards/cosine_scaled_reward": 0.3463913181563839, |
|
"rewards/format_reward": 0.04732143124565482, |
|
"rewards/reasoning_steps_reward": 0.7190476730465889, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 683.8750282287598, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 0.5177262425422668, |
|
"kl": 0.330364990234375, |
|
"learning_rate": 5.990346885098235e-08, |
|
"loss": 0.0132, |
|
"reward": 1.6970172494649887, |
|
"reward_std": 1.0683425880968571, |
|
"rewards/accuracy_reward": 0.6142857454717159, |
|
"rewards/cosine_scaled_reward": 0.3476124212145805, |
|
"rewards/format_reward": 0.057142860256135464, |
|
"rewards/reasoning_steps_reward": 0.6779762372374535, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 693.9232406616211, |
|
"epoch": 0.928, |
|
"grad_norm": 0.41641440987586975, |
|
"kl": 0.335888671875, |
|
"learning_rate": 4.5251078087033493e-08, |
|
"loss": 0.0134, |
|
"reward": 1.7540825940668583, |
|
"reward_std": 1.0200565621256827, |
|
"rewards/accuracy_reward": 0.6160714615136385, |
|
"rewards/cosine_scaled_reward": 0.35378490211442115, |
|
"rewards/format_reward": 0.06875000363215804, |
|
"rewards/reasoning_steps_reward": 0.7154762402176857, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 684.9786003112793, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 0.6882645487785339, |
|
"kl": 0.365093994140625, |
|
"learning_rate": 3.262363228443427e-08, |
|
"loss": 0.0146, |
|
"reward": 1.6049893379211426, |
|
"reward_std": 0.9915731698274612, |
|
"rewards/accuracy_reward": 0.6035714587196708, |
|
"rewards/cosine_scaled_reward": 0.3165964335203171, |
|
"rewards/format_reward": 0.04732143105939031, |
|
"rewards/reasoning_steps_reward": 0.6375000439584255, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 713.3928909301758, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 0.48911258578300476, |
|
"kl": 0.3521331787109375, |
|
"learning_rate": 2.2038708278862952e-08, |
|
"loss": 0.0141, |
|
"reward": 1.5449063807725907, |
|
"reward_std": 0.9845283433794976, |
|
"rewards/accuracy_reward": 0.5500000244006514, |
|
"rewards/cosine_scaled_reward": 0.28419203840894625, |
|
"rewards/format_reward": 0.05178571678698063, |
|
"rewards/reasoning_steps_reward": 0.6589286208152771, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 672.1143127441406, |
|
"epoch": 0.96, |
|
"grad_norm": 0.5151104927062988, |
|
"kl": 0.319622802734375, |
|
"learning_rate": 1.3511039807673209e-08, |
|
"loss": 0.0128, |
|
"reward": 1.7190548315644265, |
|
"reward_std": 1.052689327299595, |
|
"rewards/accuracy_reward": 0.6339285988360643, |
|
"rewards/cosine_scaled_reward": 0.3443523827940226, |
|
"rewards/format_reward": 0.0562500024214387, |
|
"rewards/reasoning_steps_reward": 0.6845238626003265, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 676.0714645385742, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 0.6873491406440735, |
|
"kl": 0.286529541015625, |
|
"learning_rate": 7.0524970011963675e-09, |
|
"loss": 0.0115, |
|
"reward": 1.8955881476402283, |
|
"reward_std": 0.9624031879007816, |
|
"rewards/accuracy_reward": 0.682142891176045, |
|
"rewards/cosine_scaled_reward": 0.4223737971391529, |
|
"rewards/format_reward": 0.07857143282890319, |
|
"rewards/reasoning_steps_reward": 0.7125000573694706, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 679.2321739196777, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 0.3787095546722412, |
|
"kl": 0.304974365234375, |
|
"learning_rate": 2.6720698600553595e-09, |
|
"loss": 0.0122, |
|
"reward": 1.7936133489012718, |
|
"reward_std": 1.0248655170202254, |
|
"rewards/accuracy_reward": 0.6535714577883482, |
|
"rewards/cosine_scaled_reward": 0.38111327985534443, |
|
"rewards/format_reward": 0.08214286155998707, |
|
"rewards/reasoning_steps_reward": 0.6767857633531094, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 696.1339584350586, |
|
"epoch": 0.992, |
|
"grad_norm": 0.40489259362220764, |
|
"kl": 0.3529052734375, |
|
"learning_rate": 3.7585574148779613e-10, |
|
"loss": 0.0141, |
|
"reward": 1.6771088674664498, |
|
"reward_std": 1.0866830073297025, |
|
"rewards/accuracy_reward": 0.5982143137603998, |
|
"rewards/cosine_scaled_reward": 0.3318706821650267, |
|
"rewards/format_reward": 0.053571431525051595, |
|
"rewards/reasoning_steps_reward": 0.6934524282813073, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 688.1964645385742, |
|
"epoch": 0.9984, |
|
"kl": 0.2928059895833333, |
|
"reward": 1.8073695426185925, |
|
"reward_std": 1.0462930103143055, |
|
"rewards/accuracy_reward": 0.6517857536673546, |
|
"rewards/cosine_scaled_reward": 0.40607976416746777, |
|
"rewards/format_reward": 0.049107145673284926, |
|
"rewards/reasoning_steps_reward": 0.700396885474523, |
|
"step": 468, |
|
"total_flos": 0.0, |
|
"train_loss": 2.683533102224817, |
|
"train_runtime": 211196.195, |
|
"train_samples_per_second": 0.036, |
|
"train_steps_per_second": 0.002 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|