|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 100, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 626.9428821563721, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 3.135220183260807, |
|
"kl": 0.00012028217315673828, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0, |
|
"reward": 1.1311939403414726, |
|
"reward_std": 0.9199116222560406, |
|
"rewards/accuracy_reward": 0.5946428846567869, |
|
"rewards/cosine_scaled_reward": 0.28655105652287605, |
|
"rewards/format_reward": 0.00714285746216774, |
|
"rewards/reasoning_steps_reward": 0.2428571599535644, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 610.6661018371582, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 5.7259660177649625, |
|
"kl": 0.0001917600631713867, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0, |
|
"reward": 1.2825231447815895, |
|
"reward_std": 0.8677008092403412, |
|
"rewards/accuracy_reward": 0.6535714611411094, |
|
"rewards/cosine_scaled_reward": 0.33728500208817425, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.28988096918910744, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 609.1125274658203, |
|
"epoch": 0.032, |
|
"grad_norm": 1.0344908113531213, |
|
"kl": 0.00023615360260009766, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0, |
|
"reward": 1.3210706368088723, |
|
"reward_std": 0.8099735792726278, |
|
"rewards/accuracy_reward": 0.6767857499420643, |
|
"rewards/cosine_scaled_reward": 0.3460705999750644, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.29642859203740957, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 611.9785972595215, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 1.2756183417905018, |
|
"kl": 0.0007787942886352539, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0, |
|
"reward": 1.2778369441628457, |
|
"reward_std": 0.7753925062716007, |
|
"rewards/accuracy_reward": 0.6625000327825546, |
|
"rewards/cosine_scaled_reward": 0.35878926496952773, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.2547619211487472, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 644.8321746826172, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 12.666142241785211, |
|
"kl": 0.002669668197631836, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0001, |
|
"reward": 1.2453887628391385, |
|
"reward_std": 0.7985292233526706, |
|
"rewards/accuracy_reward": 0.641071455925703, |
|
"rewards/cosine_scaled_reward": 0.34419824378564956, |
|
"rewards/format_reward": 0.00357142873108387, |
|
"rewards/reasoning_steps_reward": 0.25654763616621495, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 664.1107467651367, |
|
"epoch": 0.064, |
|
"grad_norm": 2.039644155087791, |
|
"kl": 0.0030992507934570314, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0001, |
|
"reward": 1.3286324340850115, |
|
"reward_std": 0.7313086107373238, |
|
"rewards/accuracy_reward": 0.6696428894996643, |
|
"rewards/cosine_scaled_reward": 0.3649419266730547, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.2922619292512536, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 646.792887878418, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 1.110778707013103, |
|
"kl": 0.004967975616455078, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 0.0002, |
|
"reward": 1.2785038705915213, |
|
"reward_std": 0.732379237562418, |
|
"rewards/accuracy_reward": 0.6642857423052192, |
|
"rewards/cosine_scaled_reward": 0.335646699834615, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.276785734295845, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 657.1143142700196, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.9844730773757054, |
|
"kl": 0.014885807037353515, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 0.0006, |
|
"reward": 1.5404972655698657, |
|
"reward_std": 0.6586654607206583, |
|
"rewards/accuracy_reward": 0.7446428902447224, |
|
"rewards/cosine_scaled_reward": 0.4643067395314574, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.329761927947402, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 617.8857398986817, |
|
"epoch": 0.096, |
|
"grad_norm": 0.7427114343652937, |
|
"kl": 0.010532951354980469, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0004, |
|
"reward": 1.3987550295889377, |
|
"reward_std": 0.6983755130320788, |
|
"rewards/accuracy_reward": 0.7196428939700127, |
|
"rewards/cosine_scaled_reward": 0.39756449486594647, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.28154764492064716, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 658.5303855895996, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 1.6404392956435, |
|
"kl": 0.00817718505859375, |
|
"learning_rate": 2.9996241442585123e-06, |
|
"loss": 0.0003, |
|
"reward": 1.5072809681296349, |
|
"reward_std": 0.7857246264815331, |
|
"rewards/accuracy_reward": 0.7125000357627869, |
|
"rewards/cosine_scaled_reward": 0.41680470630526545, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.37797621842473744, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 633.0089584350586, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 1.4925006192982901, |
|
"kl": 0.005891990661621094, |
|
"learning_rate": 2.9973279301399446e-06, |
|
"loss": 0.0002, |
|
"reward": 1.3728282183408738, |
|
"reward_std": 0.7632799297571182, |
|
"rewards/accuracy_reward": 0.6642857462167739, |
|
"rewards/cosine_scaled_reward": 0.3627091235946864, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.3458333555608988, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 612.4803871154785, |
|
"epoch": 0.128, |
|
"grad_norm": 0.6602171934295183, |
|
"kl": 0.005200958251953125, |
|
"learning_rate": 2.992947502998804e-06, |
|
"loss": 0.0002, |
|
"reward": 1.550386269390583, |
|
"reward_std": 0.7166643626987934, |
|
"rewards/accuracy_reward": 0.7267857551574707, |
|
"rewards/cosine_scaled_reward": 0.40752905812114476, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.4160714574158192, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 637.0893135070801, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.7395014891892754, |
|
"kl": 0.005736923217773438, |
|
"learning_rate": 2.9864889601923268e-06, |
|
"loss": 0.0002, |
|
"reward": 1.548742873966694, |
|
"reward_std": 0.7250724889338016, |
|
"rewards/accuracy_reward": 0.6928571745753288, |
|
"rewards/cosine_scaled_reward": 0.40052852691151203, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.4553571745753288, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 624.2607398986817, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 1.3446269002594227, |
|
"kl": 0.00745697021484375, |
|
"learning_rate": 2.977961291721137e-06, |
|
"loss": 0.0003, |
|
"reward": 1.726818972826004, |
|
"reward_std": 0.7127795048058033, |
|
"rewards/accuracy_reward": 0.7500000286847353, |
|
"rewards/cosine_scaled_reward": 0.4518189021851867, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.5250000316649676, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 582.6161026000976, |
|
"epoch": 0.16, |
|
"grad_norm": 1.6339747436033418, |
|
"kl": 0.01014862060546875, |
|
"learning_rate": 2.9673763677155655e-06, |
|
"loss": 0.0004, |
|
"reward": 1.7601879060268402, |
|
"reward_std": 0.6540568165481091, |
|
"rewards/accuracy_reward": 0.7571428894996644, |
|
"rewards/cosine_scaled_reward": 0.44530690894462166, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.5577381297945976, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 604.8535972595215, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.5255090188980472, |
|
"kl": 0.011274337768554688, |
|
"learning_rate": 2.9547489219129666e-06, |
|
"loss": 0.0005, |
|
"reward": 1.8713448494672775, |
|
"reward_std": 0.6472877942025661, |
|
"rewards/accuracy_reward": 0.7910714626312256, |
|
"rewards/cosine_scaled_reward": 0.46479713870212436, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"rewards/reasoning_steps_reward": 0.6136905208230019, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 652.2661026000976, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.9972915077162576, |
|
"kl": 0.030218505859375, |
|
"learning_rate": 2.9400965311490175e-06, |
|
"loss": 0.0012, |
|
"reward": 1.8985693082213402, |
|
"reward_std": 0.6914638102054596, |
|
"rewards/accuracy_reward": 0.7303571823984385, |
|
"rewards/cosine_scaled_reward": 0.4414263550657779, |
|
"rewards/format_reward": 0.00714285746216774, |
|
"rewards/reasoning_steps_reward": 0.7196429148316383, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 639.8696754455566, |
|
"epoch": 0.192, |
|
"grad_norm": 0.6349460842058491, |
|
"kl": 0.021417236328125, |
|
"learning_rate": 2.9234395908915565e-06, |
|
"loss": 0.0009, |
|
"reward": 1.8425026133656501, |
|
"reward_std": 0.6659979414194822, |
|
"rewards/accuracy_reward": 0.6660714538767933, |
|
"rewards/cosine_scaled_reward": 0.37762160785496235, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/reasoning_steps_reward": 0.7898810163140297, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 653.3714622497558, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.7068217863673739, |
|
"kl": 0.0266326904296875, |
|
"learning_rate": 2.904801286851009e-06, |
|
"loss": 0.0011, |
|
"reward": 2.013348326086998, |
|
"reward_std": 0.6553943566977978, |
|
"rewards/accuracy_reward": 0.7017857439815998, |
|
"rewards/cosine_scaled_reward": 0.41513395444490014, |
|
"rewards/format_reward": 0.06785714626312256, |
|
"rewards/reasoning_steps_reward": 0.8285714961588383, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 677.8946754455567, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 2.9570623395086475, |
|
"kl": 0.0402923583984375, |
|
"learning_rate": 2.884207562706925e-06, |
|
"loss": 0.0016, |
|
"reward": 2.5119238168001177, |
|
"reward_std": 0.8375062063336373, |
|
"rewards/accuracy_reward": 0.7125000339001417, |
|
"rewards/cosine_scaled_reward": 0.4250189420999959, |
|
"rewards/format_reward": 0.5553571665659547, |
|
"rewards/reasoning_steps_reward": 0.8190476760268212, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_completion_length": 717.3023177124023, |
|
"eval_kl": 0.045336328125, |
|
"eval_loss": 0.0018938997527584434, |
|
"eval_reward": 2.535167279243469, |
|
"eval_reward_std": 0.8789399468839169, |
|
"eval_rewards/accuracy_reward": 0.5740285987168551, |
|
"eval_rewards/cosine_scaled_reward": 0.29340527021205964, |
|
"eval_rewards/format_reward": 0.8470000272035598, |
|
"eval_rewards/reasoning_steps_reward": 0.8207333937764167, |
|
"eval_runtime": 34268.8498, |
|
"eval_samples_per_second": 0.146, |
|
"eval_steps_per_second": 0.01, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 724.1053909301758, |
|
"epoch": 0.224, |
|
"grad_norm": 1.1690466064365364, |
|
"kl": 0.1235015869140625, |
|
"learning_rate": 2.8616870839955444e-06, |
|
"loss": 0.0049, |
|
"reward": 2.752804014086723, |
|
"reward_std": 0.8537621341645718, |
|
"rewards/accuracy_reward": 0.6767857506871223, |
|
"rewards/cosine_scaled_reward": 0.4045896364772489, |
|
"rewards/format_reward": 0.8553571686148643, |
|
"rewards/reasoning_steps_reward": 0.816071481257677, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 768.4839645385742, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 14.95234115514868, |
|
"kl": 0.0612457275390625, |
|
"learning_rate": 2.837271198208662e-06, |
|
"loss": 0.0025, |
|
"reward": 2.6233972758054733, |
|
"reward_std": 0.8962277337908745, |
|
"rewards/accuracy_reward": 0.6214286016300321, |
|
"rewards/cosine_scaled_reward": 0.340063818404451, |
|
"rewards/format_reward": 0.8839285984635353, |
|
"rewards/reasoning_steps_reward": 0.7779762476682663, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 731.7893173217774, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.7880946164162693, |
|
"kl": 0.163909912109375, |
|
"learning_rate": 2.8109938911593322e-06, |
|
"loss": 0.0066, |
|
"reward": 2.6163390249013903, |
|
"reward_std": 0.9033554136753082, |
|
"rewards/accuracy_reward": 0.6589286033064127, |
|
"rewards/cosine_scaled_reward": 0.3818151192739606, |
|
"rewards/format_reward": 0.7267857484519482, |
|
"rewards/reasoning_steps_reward": 0.8488095819950103, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 759.9339622497558, |
|
"epoch": 0.256, |
|
"grad_norm": 1.7990639203312269, |
|
"kl": 0.042535400390625, |
|
"learning_rate": 2.7828917396751474e-06, |
|
"loss": 0.0017, |
|
"reward": 2.6672952473163605, |
|
"reward_std": 0.8769947469234467, |
|
"rewards/accuracy_reward": 0.6535714585334063, |
|
"rewards/cosine_scaled_reward": 0.384557047416456, |
|
"rewards/format_reward": 0.7732143148779869, |
|
"rewards/reasoning_steps_reward": 0.8559524476528168, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 760.0518188476562, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.525125891329557, |
|
"kl": 0.048138427734375, |
|
"learning_rate": 2.753003860684943e-06, |
|
"loss": 0.0019, |
|
"reward": 3.015795236825943, |
|
"reward_std": 0.7571237944066525, |
|
"rewards/accuracy_reward": 0.7303571730852128, |
|
"rewards/cosine_scaled_reward": 0.47650940530002117, |
|
"rewards/format_reward": 0.9767857253551483, |
|
"rewards/reasoning_steps_reward": 0.832142922282219, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 787.3678970336914, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 1.6324108230600705, |
|
"kl": 0.0462982177734375, |
|
"learning_rate": 2.721371856769793e-06, |
|
"loss": 0.0019, |
|
"reward": 2.96227542757988, |
|
"reward_std": 0.7570989470928907, |
|
"rewards/accuracy_reward": 0.6714286040514708, |
|
"rewards/cosine_scaled_reward": 0.40870389440096916, |
|
"rewards/format_reward": 0.9892857193946838, |
|
"rewards/reasoning_steps_reward": 0.8928571999073028, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 739.2839584350586, |
|
"epoch": 0.288, |
|
"grad_norm": 1.2279064196254164, |
|
"kl": 0.0569366455078125, |
|
"learning_rate": 2.688039758254093e-06, |
|
"loss": 0.0023, |
|
"reward": 3.0940194368362426, |
|
"reward_std": 0.7407944872975349, |
|
"rewards/accuracy_reward": 0.748214321769774, |
|
"rewards/cosine_scaled_reward": 0.4910431296331808, |
|
"rewards/format_reward": 0.9946428596973419, |
|
"rewards/reasoning_steps_reward": 0.8601191073656083, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 742.6107498168946, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 1.142989559294876, |
|
"kl": 0.0421722412109375, |
|
"learning_rate": 2.65305396191733e-06, |
|
"loss": 0.0017, |
|
"reward": 3.0360836148262025, |
|
"reward_std": 0.7486013866961002, |
|
"rewards/accuracy_reward": 0.7267857477068901, |
|
"rewards/cosine_scaled_reward": 0.44679776607081295, |
|
"rewards/format_reward": 0.9785714343190193, |
|
"rewards/reasoning_steps_reward": 0.8839286372065545, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 708.2089630126953, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.5591745022956734, |
|
"kl": 0.0389068603515625, |
|
"learning_rate": 2.61646316641186e-06, |
|
"loss": 0.0016, |
|
"reward": 3.039272406697273, |
|
"reward_std": 0.6369190786033869, |
|
"rewards/accuracy_reward": 0.7375000305473804, |
|
"rewards/cosine_scaled_reward": 0.4565341799054295, |
|
"rewards/format_reward": 0.9928571462631226, |
|
"rewards/reasoning_steps_reward": 0.85238102003932, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 693.1196723937989, |
|
"epoch": 0.32, |
|
"grad_norm": 0.8445904699258806, |
|
"kl": 0.04276123046875, |
|
"learning_rate": 2.5783183044765715e-06, |
|
"loss": 0.0017, |
|
"reward": 3.065075045824051, |
|
"reward_std": 0.6227528784424067, |
|
"rewards/accuracy_reward": 0.7553571730852127, |
|
"rewards/cosine_scaled_reward": 0.48888447135686874, |
|
"rewards/format_reward": 0.980357152223587, |
|
"rewards/reasoning_steps_reward": 0.8404762536287308, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 666.7178871154786, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.6870106883522581, |
|
"kl": 0.041412353515625, |
|
"learning_rate": 2.5386724720408135e-06, |
|
"loss": 0.0017, |
|
"reward": 3.156706044077873, |
|
"reward_std": 0.6447003319859504, |
|
"rewards/accuracy_reward": 0.798214316740632, |
|
"rewards/cosine_scaled_reward": 0.5305154274217785, |
|
"rewards/format_reward": 0.9821428656578064, |
|
"rewards/reasoning_steps_reward": 0.8458333984017372, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 723.2357452392578, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.598319397397006, |
|
"kl": 0.0435455322265625, |
|
"learning_rate": 2.49758085431725e-06, |
|
"loss": 0.0017, |
|
"reward": 3.0701630294322966, |
|
"reward_std": 0.6298764709383249, |
|
"rewards/accuracy_reward": 0.7410714581608773, |
|
"rewards/cosine_scaled_reward": 0.463615276478231, |
|
"rewards/format_reward": 0.9696428686380386, |
|
"rewards/reasoning_steps_reward": 0.895833395421505, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 799.8250381469727, |
|
"epoch": 0.352, |
|
"grad_norm": 0.6301042631581175, |
|
"kl": 0.0475830078125, |
|
"learning_rate": 2.455100648986533e-06, |
|
"loss": 0.0019, |
|
"reward": 2.9756604552268984, |
|
"reward_std": 0.7367240894585848, |
|
"rewards/accuracy_reward": 0.6785714585334063, |
|
"rewards/cosine_scaled_reward": 0.4125651277601719, |
|
"rewards/format_reward": 0.9535714492201806, |
|
"rewards/reasoning_steps_reward": 0.9309524342417717, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 826.1500366210937, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 0.8610873292738485, |
|
"kl": 0.054132080078125, |
|
"learning_rate": 2.4112909865807053e-06, |
|
"loss": 0.0022, |
|
"reward": 2.680763456225395, |
|
"reward_std": 0.8146794062107802, |
|
"rewards/accuracy_reward": 0.5303571671247482, |
|
"rewards/cosine_scaled_reward": 0.2390966679668054, |
|
"rewards/format_reward": 0.9714285835623742, |
|
"rewards/reasoning_steps_reward": 0.9398810014128685, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 800.7339706420898, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.7079176746358598, |
|
"kl": 0.067926025390625, |
|
"learning_rate": 2.366212848176164e-06, |
|
"loss": 0.0027, |
|
"reward": 2.7596707075834273, |
|
"reward_std": 0.8444030195474624, |
|
"rewards/accuracy_reward": 0.5714285975322128, |
|
"rewards/cosine_scaled_reward": 0.3090753515250981, |
|
"rewards/format_reward": 0.9732142984867096, |
|
"rewards/reasoning_steps_reward": 0.9059524446725845, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 778.9928909301758, |
|
"epoch": 0.384, |
|
"grad_norm": 1.1001119066854874, |
|
"kl": 0.0932373046875, |
|
"learning_rate": 2.319928980510752e-06, |
|
"loss": 0.0037, |
|
"reward": 2.765022465586662, |
|
"reward_std": 0.8211875937879085, |
|
"rewards/accuracy_reward": 0.5821428883820772, |
|
"rewards/cosine_scaled_reward": 0.32335569793358443, |
|
"rewards/format_reward": 0.9732142984867096, |
|
"rewards/reasoning_steps_reward": 0.8863095760345459, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 792.8321823120117, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 0.621075404533647, |
|
"kl": 0.170367431640625, |
|
"learning_rate": 2.272503808643123e-06, |
|
"loss": 0.0068, |
|
"reward": 2.6604900896549224, |
|
"reward_std": 0.7879154846072197, |
|
"rewards/accuracy_reward": 0.5410714561119676, |
|
"rewards/cosine_scaled_reward": 0.26644235964631663, |
|
"rewards/format_reward": 0.9892857193946838, |
|
"rewards/reasoning_steps_reward": 0.8636905372142791, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 734.221459197998, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 2.6622395485180066, |
|
"kl": 0.1951171875, |
|
"learning_rate": 2.2240033462759628e-06, |
|
"loss": 0.0078, |
|
"reward": 2.9028186976909636, |
|
"reward_std": 0.7853110164403916, |
|
"rewards/accuracy_reward": 0.6750000350177288, |
|
"rewards/cosine_scaled_reward": 0.41293763257563115, |
|
"rewards/format_reward": 0.9767857238650322, |
|
"rewards/reasoning_steps_reward": 0.8380952909588814, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 757.7036010742188, |
|
"epoch": 0.416, |
|
"grad_norm": 1.9232019767182387, |
|
"kl": 0.197918701171875, |
|
"learning_rate": 2.1744951038678905e-06, |
|
"loss": 0.0079, |
|
"reward": 2.847309911251068, |
|
"reward_std": 0.8383581660687923, |
|
"rewards/accuracy_reward": 0.6553571727126837, |
|
"rewards/cosine_scaled_reward": 0.3895717078819871, |
|
"rewards/format_reward": 0.9660714402794838, |
|
"rewards/reasoning_steps_reward": 0.836309588700533, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 704.6071769714356, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 2.187135193267794, |
|
"kl": 0.2211669921875, |
|
"learning_rate": 2.124047994661941e-06, |
|
"loss": 0.0088, |
|
"reward": 2.8649386018514633, |
|
"reward_std": 0.8465140253305435, |
|
"rewards/accuracy_reward": 0.6785714600235224, |
|
"rewards/cosine_scaled_reward": 0.4107717891223729, |
|
"rewards/format_reward": 0.9767857253551483, |
|
"rewards/reasoning_steps_reward": 0.7988095879554749, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_completion_length": 726.0360606079101, |
|
"eval_kl": 0.4157892578125, |
|
"eval_loss": 0.016393402591347694, |
|
"eval_reward": 2.668296795749664, |
|
"eval_reward_std": 0.848984938377142, |
|
"eval_rewards/accuracy_reward": 0.5848571698188781, |
|
"eval_rewards/cosine_scaled_reward": 0.31035383035842096, |
|
"eval_rewards/format_reward": 0.9741428675889969, |
|
"eval_rewards/reasoning_steps_reward": 0.7989429181694985, |
|
"eval_runtime": 34304.9807, |
|
"eval_samples_per_second": 0.146, |
|
"eval_steps_per_second": 0.01, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 711.1785987854004, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 13.425767555167974, |
|
"kl": 0.522259521484375, |
|
"learning_rate": 2.072732238761434e-06, |
|
"loss": 0.0209, |
|
"reward": 2.5607248514890673, |
|
"reward_std": 0.8997476093471051, |
|
"rewards/accuracy_reward": 0.5821428846567869, |
|
"rewards/cosine_scaled_reward": 0.29524860471719877, |
|
"rewards/format_reward": 0.9446428805589676, |
|
"rewards/reasoning_steps_reward": 0.7386905312538147, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 703.9107421875, |
|
"epoch": 0.448, |
|
"grad_norm": 1.9515561067024476, |
|
"kl": 0.407659912109375, |
|
"learning_rate": 2.0206192653867536e-06, |
|
"loss": 0.0163, |
|
"reward": 2.5372259259223937, |
|
"reward_std": 1.0205993868410588, |
|
"rewards/accuracy_reward": 0.6178571723401547, |
|
"rewards/cosine_scaled_reward": 0.334844835329568, |
|
"rewards/format_reward": 0.887500025331974, |
|
"rewards/reasoning_steps_reward": 0.6970238521695137, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 743.6161041259766, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 3.1537839868905078, |
|
"kl": 0.660333251953125, |
|
"learning_rate": 1.967781613449095e-06, |
|
"loss": 0.0264, |
|
"reward": 1.9156419575214385, |
|
"reward_std": 1.1042504251003264, |
|
"rewards/accuracy_reward": 0.4267857324331999, |
|
"rewards/cosine_scaled_reward": 0.1459990169329103, |
|
"rewards/format_reward": 0.7571428924798965, |
|
"rewards/reasoning_steps_reward": 0.5857143320143223, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 776.1553909301758, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 4.038362237813945, |
|
"kl": 0.3061279296875, |
|
"learning_rate": 1.9142928305795637e-06, |
|
"loss": 0.0122, |
|
"reward": 2.4867064505815506, |
|
"reward_std": 0.9735465314239263, |
|
"rewards/accuracy_reward": 0.5696428786963225, |
|
"rewards/cosine_scaled_reward": 0.2962301469407976, |
|
"rewards/format_reward": 0.9053571730852127, |
|
"rewards/reasoning_steps_reward": 0.7154762424528599, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 729.4678955078125, |
|
"epoch": 0.48, |
|
"grad_norm": 3.1439180898202466, |
|
"kl": 0.28773193359375, |
|
"learning_rate": 1.8602273707541886e-06, |
|
"loss": 0.0115, |
|
"reward": 2.7534320563077928, |
|
"reward_std": 0.9354146108031273, |
|
"rewards/accuracy_reward": 0.671428600884974, |
|
"rewards/cosine_scaled_reward": 0.4331938370829448, |
|
"rewards/format_reward": 0.9357143059372902, |
|
"rewards/reasoning_steps_reward": 0.7130952820181846, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 753.7571792602539, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 9.742972287112453, |
|
"kl": 0.672515869140625, |
|
"learning_rate": 1.8056604906573418e-06, |
|
"loss": 0.0269, |
|
"reward": 2.4900094658136367, |
|
"reward_std": 1.071580182760954, |
|
"rewards/accuracy_reward": 0.6053571721538902, |
|
"rewards/cosine_scaled_reward": 0.3417950821574777, |
|
"rewards/format_reward": 0.900000023841858, |
|
"rewards/reasoning_steps_reward": 0.6428571917116642, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 769.7821792602539, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 12.748394988794061, |
|
"kl": 0.615447998046875, |
|
"learning_rate": 1.7506681449278226e-06, |
|
"loss": 0.0246, |
|
"reward": 2.4423232048749925, |
|
"reward_std": 1.0899774149060248, |
|
"rewards/accuracy_reward": 0.6035714549943805, |
|
"rewards/cosine_scaled_reward": 0.3429183505475521, |
|
"rewards/format_reward": 0.8517857506871224, |
|
"rewards/reasoning_steps_reward": 0.6440476708114147, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 721.4928901672363, |
|
"epoch": 0.512, |
|
"grad_norm": 316.5564758948846, |
|
"kl": 2.6417724609375, |
|
"learning_rate": 1.6953268804334257e-06, |
|
"loss": 0.1056, |
|
"reward": 2.6406149938702583, |
|
"reward_std": 0.9681473188102245, |
|
"rewards/accuracy_reward": 0.6732143137603999, |
|
"rewards/cosine_scaled_reward": 0.43644821029156444, |
|
"rewards/format_reward": 0.833928607404232, |
|
"rewards/reasoning_steps_reward": 0.6970238670706749, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 699.0089614868164, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 144932.71498560338, |
|
"kl": 41.92372436523438, |
|
"learning_rate": 1.6397137297211436e-06, |
|
"loss": 1.677, |
|
"reward": 2.9064596563577654, |
|
"reward_std": 0.8081207755953074, |
|
"rewards/accuracy_reward": 0.7678571745753289, |
|
"rewards/cosine_scaled_reward": 0.5046738618053496, |
|
"rewards/format_reward": 0.9178571611642837, |
|
"rewards/reasoning_steps_reward": 0.7160714864730835, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 719.6250274658203, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 12.128586542505051, |
|
"kl": 13.330413818359375, |
|
"learning_rate": 1.5839061037913395e-06, |
|
"loss": 0.5321, |
|
"reward": 2.8467082887887956, |
|
"reward_std": 0.8377620510756969, |
|
"rewards/accuracy_reward": 0.7500000335276127, |
|
"rewards/cosine_scaled_reward": 0.4984938623383641, |
|
"rewards/format_reward": 0.8803571656346321, |
|
"rewards/reasoning_steps_reward": 0.7178571961820126, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 740.6482467651367, |
|
"epoch": 0.544, |
|
"grad_norm": 21.125458441852672, |
|
"kl": 0.171630859375, |
|
"learning_rate": 1.527981684345115e-06, |
|
"loss": 0.0069, |
|
"reward": 2.7478116720914842, |
|
"reward_std": 0.8047603815793991, |
|
"rewards/accuracy_reward": 0.6946428874507546, |
|
"rewards/cosine_scaled_reward": 0.4323353324783966, |
|
"rewards/format_reward": 0.9214285910129547, |
|
"rewards/reasoning_steps_reward": 0.6994048036634922, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 741.4857482910156, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 24.989348854776196, |
|
"kl": 0.7936920166015625, |
|
"learning_rate": 1.4720183156548855e-06, |
|
"loss": 0.0317, |
|
"reward": 2.8889215648174287, |
|
"reward_std": 0.8046084839850665, |
|
"rewards/accuracy_reward": 0.7053571708500386, |
|
"rewards/cosine_scaled_reward": 0.4633262232731795, |
|
"rewards/format_reward": 0.9500000178813934, |
|
"rewards/reasoning_steps_reward": 0.7702381610870361, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 742.701823425293, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 31.341677037958075, |
|
"kl": 0.67542724609375, |
|
"learning_rate": 1.4160938962086612e-06, |
|
"loss": 0.027, |
|
"reward": 2.8618105918169023, |
|
"reward_std": 0.756641275063157, |
|
"rewards/accuracy_reward": 0.7321428785100579, |
|
"rewards/cosine_scaled_reward": 0.4826438320800662, |
|
"rewards/format_reward": 0.9267857372760773, |
|
"rewards/reasoning_steps_reward": 0.7202381417155266, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 741.5803863525391, |
|
"epoch": 0.576, |
|
"grad_norm": 208.94608010239415, |
|
"kl": 0.9906982421875, |
|
"learning_rate": 1.3602862702788567e-06, |
|
"loss": 0.0396, |
|
"reward": 2.706931698322296, |
|
"reward_std": 0.9525154523551465, |
|
"rewards/accuracy_reward": 0.6732143200933933, |
|
"rewards/cosine_scaled_reward": 0.43609824670711533, |
|
"rewards/format_reward": 0.8946428880095482, |
|
"rewards/reasoning_steps_reward": 0.7029762521386147, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 732.1143157958984, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 38.275566557137644, |
|
"kl": 1.51126708984375, |
|
"learning_rate": 1.3046731195665748e-06, |
|
"loss": 0.0605, |
|
"reward": 2.7598373025655745, |
|
"reward_std": 0.9775024671107531, |
|
"rewards/accuracy_reward": 0.7053571678698063, |
|
"rewards/cosine_scaled_reward": 0.45864667696878314, |
|
"rewards/format_reward": 0.8839286029338836, |
|
"rewards/reasoning_steps_reward": 0.7119048096239566, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 766.3411087036133, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 34.400362223993966, |
|
"kl": 0.446759033203125, |
|
"learning_rate": 1.2493318550721775e-06, |
|
"loss": 0.0179, |
|
"reward": 2.80095117688179, |
|
"reward_std": 0.9185693945735693, |
|
"rewards/accuracy_reward": 0.701785746216774, |
|
"rewards/cosine_scaled_reward": 0.44499868620187044, |
|
"rewards/format_reward": 0.9017857417464257, |
|
"rewards/reasoning_steps_reward": 0.7523810148239136, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 745.8839614868164, |
|
"epoch": 0.608, |
|
"grad_norm": 70.23076333110882, |
|
"kl": 3.24036865234375, |
|
"learning_rate": 1.1943395093426585e-06, |
|
"loss": 0.1296, |
|
"reward": 2.9303120017051696, |
|
"reward_std": 0.7858256082981825, |
|
"rewards/accuracy_reward": 0.7339286036789417, |
|
"rewards/cosine_scaled_reward": 0.5118595488369465, |
|
"rewards/format_reward": 0.9214285984635353, |
|
"rewards/reasoning_steps_reward": 0.7630952894687653, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 729.6464584350585, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 51.01572887577541, |
|
"kl": 2.792974853515625, |
|
"learning_rate": 1.1397726292458115e-06, |
|
"loss": 0.1118, |
|
"reward": 2.8507910460233687, |
|
"reward_std": 0.8466649554669857, |
|
"rewards/accuracy_reward": 0.7142857432365417, |
|
"rewards/cosine_scaled_reward": 0.46924334414070473, |
|
"rewards/format_reward": 0.9303571686148644, |
|
"rewards/reasoning_steps_reward": 0.7369048215448857, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 758.4518173217773, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 21.800940990494052, |
|
"kl": 3.139178466796875, |
|
"learning_rate": 1.085707169420437e-06, |
|
"loss": 0.1259, |
|
"reward": 2.687264183163643, |
|
"reward_std": 0.8229550156742335, |
|
"rewards/accuracy_reward": 0.6535714585334063, |
|
"rewards/cosine_scaled_reward": 0.4211926580406725, |
|
"rewards/format_reward": 0.8946428805589676, |
|
"rewards/reasoning_steps_reward": 0.7178571917116642, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 736.9268188476562, |
|
"epoch": 0.64, |
|
"grad_norm": 8.05056275297732, |
|
"kl": 0.720037841796875, |
|
"learning_rate": 1.0322183865509054e-06, |
|
"loss": 0.0288, |
|
"reward": 2.7745183438062666, |
|
"reward_std": 0.9824469141662121, |
|
"rewards/accuracy_reward": 0.7232143133878708, |
|
"rewards/cosine_scaled_reward": 0.46856586267240347, |
|
"rewards/format_reward": 0.9071428790688515, |
|
"rewards/reasoning_steps_reward": 0.6755952909588814, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_completion_length": 756.2787765625, |
|
"eval_kl": 2.815593212890625, |
|
"eval_loss": 0.11271046847105026, |
|
"eval_reward": 2.6679908989191055, |
|
"eval_reward_std": 0.9292156134605408, |
|
"eval_rewards/accuracy_reward": 0.6384000294238329, |
|
"eval_rewards/cosine_scaled_reward": 0.386524124888191, |
|
"eval_rewards/format_reward": 0.9171143096089364, |
|
"eval_rewards/reasoning_steps_reward": 0.7259524360120296, |
|
"eval_runtime": 34511.2614, |
|
"eval_samples_per_second": 0.145, |
|
"eval_steps_per_second": 0.01, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 760.3768211364746, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 16.434699024651234, |
|
"kl": 0.956842041015625, |
|
"learning_rate": 9.793807346132464e-07, |
|
"loss": 0.0383, |
|
"reward": 2.901669743657112, |
|
"reward_std": 0.8654729023575782, |
|
"rewards/accuracy_reward": 0.7464286040514707, |
|
"rewards/cosine_scaled_reward": 0.5034553268924356, |
|
"rewards/format_reward": 0.9392857357859612, |
|
"rewards/reasoning_steps_reward": 0.7125000521540642, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 780.1018203735351, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 9.30628937439928, |
|
"kl": 0.76141357421875, |
|
"learning_rate": 9.272677612385667e-07, |
|
"loss": 0.0305, |
|
"reward": 2.715619903802872, |
|
"reward_std": 0.9227011248469352, |
|
"rewards/accuracy_reward": 0.6767857443541289, |
|
"rewards/cosine_scaled_reward": 0.4328816962428391, |
|
"rewards/format_reward": 0.9000000298023224, |
|
"rewards/reasoning_steps_reward": 0.7059524297714234, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 768.9500411987304, |
|
"epoch": 0.672, |
|
"grad_norm": 1.7983007811573581, |
|
"kl": 0.573968505859375, |
|
"learning_rate": 8.759520053380591e-07, |
|
"loss": 0.023, |
|
"reward": 2.786320286989212, |
|
"reward_std": 0.7833445437252522, |
|
"rewards/accuracy_reward": 0.6910714630037547, |
|
"rewards/cosine_scaled_reward": 0.4357249645516276, |
|
"rewards/format_reward": 0.9464285910129547, |
|
"rewards/reasoning_steps_reward": 0.7130952931940555, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 723.1536033630371, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 12.691694660091667, |
|
"kl": 0.486602783203125, |
|
"learning_rate": 8.255048961321088e-07, |
|
"loss": 0.0194, |
|
"reward": 2.831101644039154, |
|
"reward_std": 0.8475235715508461, |
|
"rewards/accuracy_reward": 0.7071428868919611, |
|
"rewards/cosine_scaled_reward": 0.46919678517151625, |
|
"rewards/format_reward": 0.930357164144516, |
|
"rewards/reasoning_steps_reward": 0.7244048148393631, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 723.8268165588379, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 4.135287895650402, |
|
"kl": 1.668939208984375, |
|
"learning_rate": 7.759966537240373e-07, |
|
"loss": 0.0664, |
|
"reward": 3.0232764929533005, |
|
"reward_std": 0.7830160673707723, |
|
"rewards/accuracy_reward": 0.7607143174856901, |
|
"rewards/cosine_scaled_reward": 0.5054192344192415, |
|
"rewards/format_reward": 0.9696428701281548, |
|
"rewards/reasoning_steps_reward": 0.7875000596046448, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 743.2911071777344, |
|
"epoch": 0.704, |
|
"grad_norm": 2.428880550991712, |
|
"kl": 0.23929443359375, |
|
"learning_rate": 7.274961913568773e-07, |
|
"loss": 0.0096, |
|
"reward": 2.980492576956749, |
|
"reward_std": 0.7519855977967381, |
|
"rewards/accuracy_reward": 0.7517857424914837, |
|
"rewards/cosine_scaled_reward": 0.5114448417443782, |
|
"rewards/format_reward": 0.9589285865426064, |
|
"rewards/reasoning_steps_reward": 0.7583333984017372, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 777.944676208496, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 10.667405755387346, |
|
"kl": 0.491375732421875, |
|
"learning_rate": 6.800710194892484e-07, |
|
"loss": 0.0197, |
|
"reward": 2.9669879227876663, |
|
"reward_std": 0.7909464325755835, |
|
"rewards/accuracy_reward": 0.7285714603960514, |
|
"rewards/cosine_scaled_reward": 0.4985354314424967, |
|
"rewards/format_reward": 0.9696428716182709, |
|
"rewards/reasoning_steps_reward": 0.7702381551265717, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 783.7107513427734, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 20.401487806762606, |
|
"kl": 1.50914306640625, |
|
"learning_rate": 6.33787151823836e-07, |
|
"loss": 0.0604, |
|
"reward": 2.733260214328766, |
|
"reward_std": 0.9950653843581676, |
|
"rewards/accuracy_reward": 0.6535714577883482, |
|
"rewards/cosine_scaled_reward": 0.4219506177818403, |
|
"rewards/format_reward": 0.9071428775787354, |
|
"rewards/reasoning_steps_reward": 0.7505952984094619, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 747.0571807861328, |
|
"epoch": 0.736, |
|
"grad_norm": 3.8988932597998756, |
|
"kl": 0.765008544921875, |
|
"learning_rate": 5.887090134192947e-07, |
|
"loss": 0.0306, |
|
"reward": 2.950327825546265, |
|
"reward_std": 0.7912764519453048, |
|
"rewards/accuracy_reward": 0.7142857464030385, |
|
"rewards/cosine_scaled_reward": 0.4931848540902138, |
|
"rewards/format_reward": 0.9553571596741677, |
|
"rewards/reasoning_steps_reward": 0.7875000700354576, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 768.9161071777344, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 5.964808758728403, |
|
"kl": 0.8441162109375, |
|
"learning_rate": 5.448993510134669e-07, |
|
"loss": 0.0338, |
|
"reward": 2.7927453070878983, |
|
"reward_std": 0.8378362115472555, |
|
"rewards/accuracy_reward": 0.6892857491970062, |
|
"rewards/cosine_scaled_reward": 0.44631660780869425, |
|
"rewards/format_reward": 0.9250000208616257, |
|
"rewards/reasoning_steps_reward": 0.7321429125964641, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 771.8000366210938, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 19.67075460524417, |
|
"kl": 1.37095947265625, |
|
"learning_rate": 5.024191456827498e-07, |
|
"loss": 0.0549, |
|
"reward": 2.757797637581825, |
|
"reward_std": 0.9828206066042184, |
|
"rewards/accuracy_reward": 0.6946428813040256, |
|
"rewards/cosine_scaled_reward": 0.4447022658772767, |
|
"rewards/format_reward": 0.900000023841858, |
|
"rewards/reasoning_steps_reward": 0.7184524320065975, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 764.0839599609375, |
|
"epoch": 0.768, |
|
"grad_norm": 20.908101755536627, |
|
"kl": 1.678790283203125, |
|
"learning_rate": 4.6132752795918667e-07, |
|
"loss": 0.0672, |
|
"reward": 2.710779735445976, |
|
"reward_std": 0.9775115817785263, |
|
"rewards/accuracy_reward": 0.6642857462167739, |
|
"rewards/cosine_scaled_reward": 0.40601770151406524, |
|
"rewards/format_reward": 0.9107143133878708, |
|
"rewards/reasoning_steps_reward": 0.7297619603574276, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 763.6000320434571, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 12.87435288128971, |
|
"kl": 2.571148681640625, |
|
"learning_rate": 4.2168169552342905e-07, |
|
"loss": 0.1028, |
|
"reward": 2.6531503438949584, |
|
"reward_std": 0.9446496672928333, |
|
"rewards/accuracy_reward": 0.6321428865194321, |
|
"rewards/cosine_scaled_reward": 0.38410261063836515, |
|
"rewards/format_reward": 0.9035714536905288, |
|
"rewards/reasoning_steps_reward": 0.7333333849906921, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 743.5286102294922, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 9.936371570504198, |
|
"kl": 1.2698486328125, |
|
"learning_rate": 3.8353683358814046e-07, |
|
"loss": 0.0508, |
|
"reward": 2.7013536602258683, |
|
"reward_std": 0.862298522144556, |
|
"rewards/accuracy_reward": 0.6571428874507547, |
|
"rewards/cosine_scaled_reward": 0.4073059491813183, |
|
"rewards/format_reward": 0.9000000208616257, |
|
"rewards/reasoning_steps_reward": 0.7369048207998276, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 759.3643180847168, |
|
"epoch": 0.8, |
|
"grad_norm": 23.218570111303308, |
|
"kl": 2.2372650146484374, |
|
"learning_rate": 3.469460380826697e-07, |
|
"loss": 0.0895, |
|
"reward": 2.7877288803458216, |
|
"reward_std": 0.9023670472204686, |
|
"rewards/accuracy_reward": 0.6785714600235224, |
|
"rewards/cosine_scaled_reward": 0.4454668462276459, |
|
"rewards/format_reward": 0.9142857402563095, |
|
"rewards/reasoning_steps_reward": 0.7494048178195953, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 747.8786026000977, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 5.9371726712745065, |
|
"kl": 1.312139892578125, |
|
"learning_rate": 3.119602417459075e-07, |
|
"loss": 0.0525, |
|
"reward": 2.8006283044815063, |
|
"reward_std": 0.8084595888853073, |
|
"rewards/accuracy_reward": 0.6607143096625805, |
|
"rewards/cosine_scaled_reward": 0.4244377123657614, |
|
"rewards/format_reward": 0.9464285865426063, |
|
"rewards/reasoning_steps_reward": 0.7690476909279823, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 746.4321762084961, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 5.13518428447357, |
|
"kl": 0.994195556640625, |
|
"learning_rate": 2.786281432302071e-07, |
|
"loss": 0.0397, |
|
"reward": 2.861383581161499, |
|
"reward_std": 0.8250645313411951, |
|
"rewards/accuracy_reward": 0.7000000301748515, |
|
"rewards/cosine_scaled_reward": 0.44947873409837485, |
|
"rewards/format_reward": 0.9428571656346321, |
|
"rewards/reasoning_steps_reward": 0.7690476924180984, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 748.0875358581543, |
|
"epoch": 0.832, |
|
"grad_norm": 4.45042845549648, |
|
"kl": 1.18310546875, |
|
"learning_rate": 2.46996139315057e-07, |
|
"loss": 0.0474, |
|
"reward": 2.8951289474964144, |
|
"reward_std": 0.8875481400638818, |
|
"rewards/accuracy_reward": 0.7357143111526966, |
|
"rewards/cosine_scaled_reward": 0.4856050438596867, |
|
"rewards/format_reward": 0.9357143044471741, |
|
"rewards/reasoning_steps_reward": 0.738095298409462, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 742.2714614868164, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 9.790655066948279, |
|
"kl": 1.13563232421875, |
|
"learning_rate": 2.1710826032485286e-07, |
|
"loss": 0.0454, |
|
"reward": 2.8695475578308107, |
|
"reward_std": 0.8187286149710417, |
|
"rewards/accuracy_reward": 0.7142857439815998, |
|
"rewards/cosine_scaled_reward": 0.4582379271276295, |
|
"rewards/format_reward": 0.9464285880327225, |
|
"rewards/reasoning_steps_reward": 0.7505952954292298, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 770.3428955078125, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 4.399010377251051, |
|
"kl": 1.116912841796875, |
|
"learning_rate": 1.8900610884066817e-07, |
|
"loss": 0.0447, |
|
"reward": 2.7512567728757857, |
|
"reward_std": 0.9337207470089197, |
|
"rewards/accuracy_reward": 0.651785746961832, |
|
"rewards/cosine_scaled_reward": 0.4012566165998578, |
|
"rewards/format_reward": 0.9428571671247482, |
|
"rewards/reasoning_steps_reward": 0.7553571984171867, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_completion_length": 760.9953487792968, |
|
"eval_kl": 1.14858369140625, |
|
"eval_loss": 0.045983318239450455, |
|
"eval_reward": 2.729253951013088, |
|
"eval_reward_std": 0.8810034032523633, |
|
"eval_rewards/accuracy_reward": 0.6346000292986631, |
|
"eval_rewards/cosine_scaled_reward": 0.385330027777364, |
|
"eval_rewards/format_reward": 0.9519428731679916, |
|
"eval_rewards/reasoning_steps_reward": 0.7573810091674328, |
|
"eval_runtime": 34525.1878, |
|
"eval_samples_per_second": 0.145, |
|
"eval_steps_per_second": 0.01, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 753.4678894042969, |
|
"epoch": 0.864, |
|
"grad_norm": 4.304417705259892, |
|
"kl": 1.15230712890625, |
|
"learning_rate": 1.627288017913383e-07, |
|
"loss": 0.0461, |
|
"reward": 2.8077996015548705, |
|
"reward_std": 0.9062907833606004, |
|
"rewards/accuracy_reward": 0.676785746961832, |
|
"rewards/cosine_scaled_reward": 0.44053755011409523, |
|
"rewards/format_reward": 0.9517857328057289, |
|
"rewards/reasoning_steps_reward": 0.738690534979105, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 785.1232528686523, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 5.246554486522317, |
|
"kl": 1.4623046875, |
|
"learning_rate": 1.3831291600445573e-07, |
|
"loss": 0.0585, |
|
"reward": 2.7128711313009264, |
|
"reward_std": 0.8996678274124861, |
|
"rewards/accuracy_reward": 0.6446428809314966, |
|
"rewards/cosine_scaled_reward": 0.4003709977958351, |
|
"rewards/format_reward": 0.9267857372760773, |
|
"rewards/reasoning_steps_reward": 0.7410714901983738, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 759.5286041259766, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 4.4529003923911565, |
|
"kl": 0.970086669921875, |
|
"learning_rate": 1.1579243729307487e-07, |
|
"loss": 0.0388, |
|
"reward": 2.6863596469163893, |
|
"reward_std": 0.9374732062220573, |
|
"rewards/accuracy_reward": 0.6446428891271353, |
|
"rewards/cosine_scaled_reward": 0.3893356985412538, |
|
"rewards/format_reward": 0.9250000223517418, |
|
"rewards/reasoning_steps_reward": 0.7273810058832169, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 768.6143203735352, |
|
"epoch": 0.896, |
|
"grad_norm": 5.297256486680652, |
|
"kl": 1.40426025390625, |
|
"learning_rate": 9.519871314899092e-08, |
|
"loss": 0.0562, |
|
"reward": 2.7144743263721467, |
|
"reward_std": 0.9721049644052983, |
|
"rewards/accuracy_reward": 0.682142886146903, |
|
"rewards/cosine_scaled_reward": 0.4079266074113548, |
|
"rewards/format_reward": 0.9125000268220902, |
|
"rewards/reasoning_steps_reward": 0.7119048140943051, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 762.5839622497558, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 5.808442367606542, |
|
"kl": 1.266558837890625, |
|
"learning_rate": 7.656040910844358e-08, |
|
"loss": 0.0507, |
|
"reward": 2.7946069568395613, |
|
"reward_std": 0.9051171492785215, |
|
"rewards/accuracy_reward": 0.6750000327825546, |
|
"rewards/cosine_scaled_reward": 0.42258303044363854, |
|
"rewards/format_reward": 0.9500000178813934, |
|
"rewards/reasoning_steps_reward": 0.7470238760113717, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 758.2143196105957, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 4.124091919765889, |
|
"kl": 1.363201904296875, |
|
"learning_rate": 5.990346885098235e-08, |
|
"loss": 0.0546, |
|
"reward": 2.8003338128328323, |
|
"reward_std": 0.9829879272729158, |
|
"rewards/accuracy_reward": 0.698214316368103, |
|
"rewards/cosine_scaled_reward": 0.44914320297539234, |
|
"rewards/format_reward": 0.9303571596741677, |
|
"rewards/reasoning_steps_reward": 0.7226191096007824, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 761.5821762084961, |
|
"epoch": 0.928, |
|
"grad_norm": 41.306011448835555, |
|
"kl": 1.1220947265625, |
|
"learning_rate": 4.5251078087033493e-08, |
|
"loss": 0.0449, |
|
"reward": 2.9364974945783615, |
|
"reward_std": 0.9109487719833851, |
|
"rewards/accuracy_reward": 0.7285714596509933, |
|
"rewards/cosine_scaled_reward": 0.4841164079494774, |
|
"rewards/format_reward": 0.9303571656346321, |
|
"rewards/reasoning_steps_reward": 0.7934524446725846, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 762.6250328063965, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 3.9945081680140406, |
|
"kl": 1.48448486328125, |
|
"learning_rate": 3.262363228443427e-08, |
|
"loss": 0.0594, |
|
"reward": 2.6933425307273864, |
|
"reward_std": 0.9241555985063314, |
|
"rewards/accuracy_reward": 0.6500000320374966, |
|
"rewards/cosine_scaled_reward": 0.399890087870881, |
|
"rewards/format_reward": 0.9357143074274064, |
|
"rewards/reasoning_steps_reward": 0.7077381528913975, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 768.3500335693359, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 13.713505614062974, |
|
"kl": 1.749468994140625, |
|
"learning_rate": 2.2038708278862952e-08, |
|
"loss": 0.07, |
|
"reward": 2.676873904466629, |
|
"reward_std": 0.9034410756081342, |
|
"rewards/accuracy_reward": 0.6410714581608772, |
|
"rewards/cosine_scaled_reward": 0.3983023501932621, |
|
"rewards/format_reward": 0.9178571671247482, |
|
"rewards/reasoning_steps_reward": 0.7196429133415222, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 754.7732467651367, |
|
"epoch": 0.96, |
|
"grad_norm": 5.550660972818955, |
|
"kl": 1.152679443359375, |
|
"learning_rate": 1.3511039807673209e-08, |
|
"loss": 0.0461, |
|
"reward": 2.84182793200016, |
|
"reward_std": 0.8377097636461258, |
|
"rewards/accuracy_reward": 0.6964286038652062, |
|
"rewards/cosine_scaled_reward": 0.44599451111862437, |
|
"rewards/format_reward": 0.939285734295845, |
|
"rewards/reasoning_steps_reward": 0.7601191103458405, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 743.9089569091797, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 2.2837748295404885, |
|
"kl": 0.9584228515625, |
|
"learning_rate": 7.0524970011963675e-09, |
|
"loss": 0.0383, |
|
"reward": 2.9682214707136154, |
|
"reward_std": 0.8571841098368168, |
|
"rewards/accuracy_reward": 0.7517857450991869, |
|
"rewards/cosine_scaled_reward": 0.5009594126604497, |
|
"rewards/format_reward": 0.9553571596741677, |
|
"rewards/reasoning_steps_reward": 0.7601191103458405, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 747.2446739196778, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 60.34548052033379, |
|
"kl": 1.486627197265625, |
|
"learning_rate": 2.6720698600553595e-09, |
|
"loss": 0.0595, |
|
"reward": 2.8397951513528823, |
|
"reward_std": 0.8803306795656681, |
|
"rewards/accuracy_reward": 0.7000000329688192, |
|
"rewards/cosine_scaled_reward": 0.4570569400675595, |
|
"rewards/format_reward": 0.9517857328057289, |
|
"rewards/reasoning_steps_reward": 0.7309524327516556, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 766.3500358581543, |
|
"epoch": 0.992, |
|
"grad_norm": 18.459219357783663, |
|
"kl": 1.420770263671875, |
|
"learning_rate": 3.7585574148779613e-10, |
|
"loss": 0.0569, |
|
"reward": 2.704774260520935, |
|
"reward_std": 0.9208621602505446, |
|
"rewards/accuracy_reward": 0.6678571715950966, |
|
"rewards/cosine_scaled_reward": 0.4119170166202821, |
|
"rewards/format_reward": 0.8982143089175224, |
|
"rewards/reasoning_steps_reward": 0.72678577080369, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 773.5982462565104, |
|
"epoch": 0.9984, |
|
"kl": 1.5178934733072917, |
|
"reward": 2.810507302482923, |
|
"reward_std": 0.9296036226054033, |
|
"rewards/accuracy_reward": 0.6785714613894621, |
|
"rewards/cosine_scaled_reward": 0.44344370051597554, |
|
"rewards/format_reward": 0.9523809651533762, |
|
"rewards/reasoning_steps_reward": 0.7361111715435982, |
|
"step": 468, |
|
"total_flos": 0.0, |
|
"train_loss": 0.05152365299789334, |
|
"train_runtime": 191788.2346, |
|
"train_samples_per_second": 0.039, |
|
"train_steps_per_second": 0.002 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|