|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2857142857142857, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3023.0, |
|
"epoch": 0.0005714285714285715, |
|
"grad_norm": 0.2460898458957672, |
|
"kl": 0.0, |
|
"learning_rate": 2e-08, |
|
"loss": -0.0314, |
|
"num_tokens": 151404.0, |
|
"reward": -0.17859874665737152, |
|
"reward_std": 0.18563616648316383, |
|
"rewards/cosine_scaled_reward": -0.08929938450455666, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.3958740234375, |
|
"epoch": 0.001142857142857143, |
|
"grad_norm": 0.22445940971374512, |
|
"kl": 0.0, |
|
"learning_rate": 4e-08, |
|
"loss": 0.0519, |
|
"num_tokens": 288319.0, |
|
"reward": -0.535461600869894, |
|
"reward_std": 0.16202664375305176, |
|
"rewards/cosine_scaled_reward": -0.2677307929843664, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2233.5416717529297, |
|
"epoch": 0.0017142857142857142, |
|
"grad_norm": 0.24249283969402313, |
|
"kl": 0.00036716461181640625, |
|
"learning_rate": 6e-08, |
|
"loss": 0.0576, |
|
"num_tokens": 401025.0, |
|
"reward": 0.33875009417533875, |
|
"reward_std": 0.6338529586791992, |
|
"rewards/cosine_scaled_reward": 0.16937505267560482, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2376.2500915527344, |
|
"epoch": 0.002285714285714286, |
|
"grad_norm": 0.3028571605682373, |
|
"kl": 0.0006170272827148438, |
|
"learning_rate": 8e-08, |
|
"loss": 0.0773, |
|
"num_tokens": 521001.0, |
|
"reward": -0.4882083088159561, |
|
"reward_std": 0.4496277957223356, |
|
"rewards/cosine_scaled_reward": -0.24410414695739746, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3372.3750610351562, |
|
"epoch": 0.002857142857142857, |
|
"grad_norm": 0.21095335483551025, |
|
"kl": 0.0006723403930664062, |
|
"learning_rate": 1e-07, |
|
"loss": 0.007, |
|
"num_tokens": 689631.0, |
|
"reward": -0.5746253430843353, |
|
"reward_std": 0.23810617998242378, |
|
"rewards/cosine_scaled_reward": -0.28731267154216766, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2820.375030517578, |
|
"epoch": 0.0034285714285714284, |
|
"grad_norm": 0.33144864439964294, |
|
"kl": 0.0005979537963867188, |
|
"learning_rate": 1.2e-07, |
|
"loss": -0.0916, |
|
"num_tokens": 831357.0, |
|
"reward": -0.4876829609274864, |
|
"reward_std": 0.20684241317212582, |
|
"rewards/cosine_scaled_reward": -0.24384147115051746, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3243.8750610351562, |
|
"epoch": 0.004, |
|
"grad_norm": 0.20102664828300476, |
|
"kl": 0.0005273818969726562, |
|
"learning_rate": 1.4e-07, |
|
"loss": -0.0443, |
|
"num_tokens": 992799.0, |
|
"reward": -0.30651520285755396, |
|
"reward_std": 0.3921822514384985, |
|
"rewards/cosine_scaled_reward": -0.15325760166160762, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3092.625, |
|
"epoch": 0.004571428571428572, |
|
"grad_norm": 0.23749692738056183, |
|
"kl": 0.0005922317504882812, |
|
"learning_rate": 1.6e-07, |
|
"loss": -0.0614, |
|
"num_tokens": 1146921.0, |
|
"reward": -0.8370707631111145, |
|
"reward_std": 0.16819308325648308, |
|
"rewards/cosine_scaled_reward": -0.41853538155555725, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2227.9166717529297, |
|
"epoch": 0.005142857142857143, |
|
"grad_norm": 0.3016290068626404, |
|
"kl": 0.0005288124084472656, |
|
"learning_rate": 1.8e-07, |
|
"loss": -0.0113, |
|
"num_tokens": 1259777.0, |
|
"reward": -0.15989744663238525, |
|
"reward_std": 0.25975861586630344, |
|
"rewards/cosine_scaled_reward": -0.07994873821735382, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3047.4376220703125, |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 0.21191374957561493, |
|
"kl": 0.000568389892578125, |
|
"learning_rate": 2e-07, |
|
"loss": -0.0152, |
|
"num_tokens": 1411430.0, |
|
"reward": -0.019763831049203873, |
|
"reward_std": 0.8465264737606049, |
|
"rewards/cosine_scaled_reward": -0.00988190807402134, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2482.4375, |
|
"epoch": 0.006285714285714286, |
|
"grad_norm": 0.330398291349411, |
|
"kl": 0.0006580352783203125, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 0.053, |
|
"num_tokens": 1536179.0, |
|
"reward": -0.4834251650609076, |
|
"reward_std": 0.5014891251921654, |
|
"rewards/cosine_scaled_reward": -0.2417125750798732, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2487.2500610351562, |
|
"epoch": 0.006857142857142857, |
|
"grad_norm": 0.30102092027664185, |
|
"kl": 0.000583648681640625, |
|
"learning_rate": 2.4e-07, |
|
"loss": -0.0912, |
|
"num_tokens": 1661063.0, |
|
"reward": -0.14752477407455444, |
|
"reward_std": 0.5868879407644272, |
|
"rewards/cosine_scaled_reward": -0.07376237958669662, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2876.229217529297, |
|
"epoch": 0.0074285714285714285, |
|
"grad_norm": 0.25117188692092896, |
|
"kl": 0.0005583763122558594, |
|
"learning_rate": 2.6e-07, |
|
"loss": -0.0352, |
|
"num_tokens": 1804702.0, |
|
"reward": -0.08496717864181846, |
|
"reward_std": 0.5994590483605862, |
|
"rewards/cosine_scaled_reward": -0.042483578145038337, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2376.166717529297, |
|
"epoch": 0.008, |
|
"grad_norm": 0.34051504731178284, |
|
"kl": 0.0006012916564941406, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.1844, |
|
"num_tokens": 1924458.0, |
|
"reward": 0.5019294954836369, |
|
"reward_std": 0.4014289937913418, |
|
"rewards/cosine_scaled_reward": 0.2509647514671087, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2104.3541870117188, |
|
"epoch": 0.008571428571428572, |
|
"grad_norm": 0.27839282155036926, |
|
"kl": 0.0004687309265136719, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0602, |
|
"num_tokens": 2031107.0, |
|
"reward": -0.2739022574387491, |
|
"reward_std": 0.5232805069535971, |
|
"rewards/cosine_scaled_reward": -0.13695112499408424, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3022.2709350585938, |
|
"epoch": 0.009142857142857144, |
|
"grad_norm": 0.23329903185367584, |
|
"kl": 0.0007066726684570312, |
|
"learning_rate": 3.2e-07, |
|
"loss": -0.0235, |
|
"num_tokens": 2182272.0, |
|
"reward": -0.28543997276574373, |
|
"reward_std": 0.5580427274107933, |
|
"rewards/cosine_scaled_reward": -0.14271997893229127, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2995.8959350585938, |
|
"epoch": 0.009714285714285713, |
|
"grad_norm": 0.20035724341869354, |
|
"kl": 0.00064849853515625, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 0.0584, |
|
"num_tokens": 2331607.0, |
|
"reward": -0.49751752614974976, |
|
"reward_std": 0.40262408554553986, |
|
"rewards/cosine_scaled_reward": -0.24875876307487488, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3225.3125, |
|
"epoch": 0.010285714285714285, |
|
"grad_norm": 0.20705807209014893, |
|
"kl": 0.0006122589111328125, |
|
"learning_rate": 3.6e-07, |
|
"loss": 0.0237, |
|
"num_tokens": 2492782.0, |
|
"reward": -0.5619450844824314, |
|
"reward_std": 0.47893428802490234, |
|
"rewards/cosine_scaled_reward": -0.2809725347906351, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2767.8541717529297, |
|
"epoch": 0.010857142857142857, |
|
"grad_norm": 0.25546160340309143, |
|
"kl": 0.0007276535034179688, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": -0.0882, |
|
"num_tokens": 2631327.0, |
|
"reward": -0.18378404527902603, |
|
"reward_std": 0.5495752617716789, |
|
"rewards/cosine_scaled_reward": -0.09189202263951302, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1481.000015258789, |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 0.3158569931983948, |
|
"kl": 0.0004367828369140625, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0439, |
|
"num_tokens": 2708367.0, |
|
"reward": -0.025459617376327515, |
|
"reward_std": 0.57894092425704, |
|
"rewards/cosine_scaled_reward": -0.01272980123758316, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1864.2500457763672, |
|
"epoch": 0.012, |
|
"grad_norm": 0.4832952618598938, |
|
"kl": 0.00046253204345703125, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0893, |
|
"num_tokens": 2803731.0, |
|
"reward": -0.20306236669421196, |
|
"reward_std": 0.7009828165173531, |
|
"rewards/cosine_scaled_reward": -0.10153118334710598, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3143.604248046875, |
|
"epoch": 0.012571428571428572, |
|
"grad_norm": 0.20443888008594513, |
|
"kl": 0.0005559921264648438, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 0.0298, |
|
"num_tokens": 2960960.0, |
|
"reward": 0.664834626019001, |
|
"reward_std": 1.0612835884094238, |
|
"rewards/cosine_scaled_reward": 0.3324173092842102, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2191.9791870117188, |
|
"epoch": 0.013142857142857144, |
|
"grad_norm": 0.23772495985031128, |
|
"kl": 0.00048279762268066406, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0419, |
|
"num_tokens": 3071671.0, |
|
"reward": -0.08595703169703484, |
|
"reward_std": 0.7537456881254911, |
|
"rewards/cosine_scaled_reward": -0.042978519573807716, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2681.1458740234375, |
|
"epoch": 0.013714285714285714, |
|
"grad_norm": 0.2397710382938385, |
|
"kl": 0.0005970001220703125, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0064, |
|
"num_tokens": 3206774.0, |
|
"reward": 0.038678646087646484, |
|
"reward_std": 0.3931765630841255, |
|
"rewards/cosine_scaled_reward": 0.01933930814266205, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1551.250015258789, |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.3846415579319, |
|
"kl": 0.0004100799560546875, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0557, |
|
"num_tokens": 3287126.0, |
|
"reward": -0.13053925335407257, |
|
"reward_std": 0.4432575963437557, |
|
"rewards/cosine_scaled_reward": -0.06526962295174599, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2773.6250610351562, |
|
"epoch": 0.014857142857142857, |
|
"grad_norm": 0.23315444588661194, |
|
"kl": 0.0005998611450195312, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0573, |
|
"num_tokens": 3426272.0, |
|
"reward": -0.14446274191141129, |
|
"reward_std": 0.7020265012979507, |
|
"rewards/cosine_scaled_reward": -0.07223137095570564, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3283.5, |
|
"epoch": 0.015428571428571429, |
|
"grad_norm": 0.21181795001029968, |
|
"kl": 0.0006422996520996094, |
|
"learning_rate": 5.4e-07, |
|
"loss": -0.0769, |
|
"num_tokens": 3590060.0, |
|
"reward": -0.299600200727582, |
|
"reward_std": 0.4262968748807907, |
|
"rewards/cosine_scaled_reward": -0.14980009896680713, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2844.0833740234375, |
|
"epoch": 0.016, |
|
"grad_norm": 0.21920780837535858, |
|
"kl": 0.000614166259765625, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 3733524.0, |
|
"reward": -0.4807719439268112, |
|
"reward_std": 0.42509571835398674, |
|
"rewards/cosine_scaled_reward": -0.24038597010076046, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2722.250030517578, |
|
"epoch": 0.01657142857142857, |
|
"grad_norm": 0.31188592314720154, |
|
"kl": 0.0006227493286132812, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0756, |
|
"num_tokens": 3871068.0, |
|
"reward": 0.08422036468982697, |
|
"reward_std": 0.6395114436745644, |
|
"rewards/cosine_scaled_reward": 0.04211018607020378, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3481.2709350585938, |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 0.20324808359146118, |
|
"kl": 0.000644683837890625, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 4044145.0, |
|
"reward": -0.18417476117610931, |
|
"reward_std": 0.6154340840876102, |
|
"rewards/cosine_scaled_reward": -0.0920873824506998, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2657.166793823242, |
|
"epoch": 0.017714285714285714, |
|
"grad_norm": 0.3732287287712097, |
|
"kl": 0.0005855560302734375, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.1126, |
|
"num_tokens": 4176981.0, |
|
"reward": -0.34596723690629005, |
|
"reward_std": 0.6994314044713974, |
|
"rewards/cosine_scaled_reward": -0.17298361286520958, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1848.4583587646484, |
|
"epoch": 0.018285714285714287, |
|
"grad_norm": 0.34063395857810974, |
|
"kl": 0.0004658699035644531, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0435, |
|
"num_tokens": 4271395.0, |
|
"reward": 0.1516597867012024, |
|
"reward_std": 0.7864086776971817, |
|
"rewards/cosine_scaled_reward": 0.0758299008011818, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.018857142857142857, |
|
"grad_norm": 0.22958050668239594, |
|
"kl": 0.000720977783203125, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0, |
|
"num_tokens": 4450063.0, |
|
"reward": -0.6370590478181839, |
|
"reward_std": 0.20538340508937836, |
|
"rewards/cosine_scaled_reward": -0.31852949783205986, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.3959350585938, |
|
"epoch": 0.019428571428571427, |
|
"grad_norm": 0.2298842966556549, |
|
"kl": 0.0005502700805664062, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0374, |
|
"num_tokens": 4596926.0, |
|
"reward": -0.41714829951524734, |
|
"reward_std": 0.39900972694158554, |
|
"rewards/cosine_scaled_reward": -0.20857414416968822, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3036.9166870117188, |
|
"epoch": 0.02, |
|
"grad_norm": 0.23063404858112335, |
|
"kl": 0.0006365776062011719, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0008, |
|
"num_tokens": 4749586.0, |
|
"reward": -0.7142433375120163, |
|
"reward_std": 0.3739009462296963, |
|
"rewards/cosine_scaled_reward": -0.35712166875600815, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2749.8958740234375, |
|
"epoch": 0.02057142857142857, |
|
"grad_norm": 0.27252835035324097, |
|
"kl": 0.000637054443359375, |
|
"learning_rate": 7.2e-07, |
|
"loss": -0.0058, |
|
"num_tokens": 4887449.0, |
|
"reward": -0.10459958261344582, |
|
"reward_std": 0.6155130080878735, |
|
"rewards/cosine_scaled_reward": -0.052299798757303506, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.562530517578, |
|
"epoch": 0.021142857142857144, |
|
"grad_norm": 0.2475793957710266, |
|
"kl": 0.000568389892578125, |
|
"learning_rate": 7.4e-07, |
|
"loss": -0.088, |
|
"num_tokens": 5034704.0, |
|
"reward": -0.29862387478351593, |
|
"reward_std": 0.39744907803833485, |
|
"rewards/cosine_scaled_reward": -0.14931193552911282, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.7083740234375, |
|
"epoch": 0.021714285714285714, |
|
"grad_norm": 0.22514301538467407, |
|
"kl": 0.0006685256958007812, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0347, |
|
"num_tokens": 5186814.0, |
|
"reward": -0.2246699258685112, |
|
"reward_std": 0.6279377490282059, |
|
"rewards/cosine_scaled_reward": -0.11233496479690075, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1896.0208435058594, |
|
"epoch": 0.022285714285714287, |
|
"grad_norm": 0.31844407320022583, |
|
"kl": 0.00075531005859375, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": -0.0971, |
|
"num_tokens": 5282791.0, |
|
"reward": -0.26946142315864563, |
|
"reward_std": 0.6844599097967148, |
|
"rewards/cosine_scaled_reward": -0.13473070412874222, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3461.6458740234375, |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 0.1988898515701294, |
|
"kl": 0.0005855560302734375, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0348, |
|
"num_tokens": 5455178.0, |
|
"reward": -0.3771579749882221, |
|
"reward_std": 0.43913378193974495, |
|
"rewards/cosine_scaled_reward": -0.18857897631824017, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2864.0625, |
|
"epoch": 0.023428571428571427, |
|
"grad_norm": 0.2541484534740448, |
|
"kl": 0.0006170272827148438, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0332, |
|
"num_tokens": 5599517.0, |
|
"reward": -0.15848201513290405, |
|
"reward_std": 0.28519516810774803, |
|
"rewards/cosine_scaled_reward": -0.07924101501703262, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2103.375045776367, |
|
"epoch": 0.024, |
|
"grad_norm": 0.4904050827026367, |
|
"kl": 0.0005826950073242188, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.1226, |
|
"num_tokens": 5706251.0, |
|
"reward": 0.05192290246486664, |
|
"reward_std": 0.5277432054281235, |
|
"rewards/cosine_scaled_reward": 0.02596145309507847, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3375.9583740234375, |
|
"epoch": 0.02457142857142857, |
|
"grad_norm": 0.20466555655002594, |
|
"kl": 0.0007343292236328125, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0472, |
|
"num_tokens": 5873601.0, |
|
"reward": -0.1930120848119259, |
|
"reward_std": 0.7160178981721401, |
|
"rewards/cosine_scaled_reward": -0.0965060293674469, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3035.7708740234375, |
|
"epoch": 0.025142857142857144, |
|
"grad_norm": 0.19474832713603973, |
|
"kl": 0.0006237030029296875, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": -0.0035, |
|
"num_tokens": 6025522.0, |
|
"reward": -0.16648699529469013, |
|
"reward_std": 0.6652341857552528, |
|
"rewards/cosine_scaled_reward": -0.08324349066242576, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3335.8750610351562, |
|
"epoch": 0.025714285714285714, |
|
"grad_norm": 0.21470214426517487, |
|
"kl": 0.0007848739624023438, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0771, |
|
"num_tokens": 6192064.0, |
|
"reward": -0.6564953848719597, |
|
"reward_std": 0.22902014665305614, |
|
"rewards/cosine_scaled_reward": -0.32824768498539925, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2241.8541870117188, |
|
"epoch": 0.026285714285714287, |
|
"grad_norm": 0.29658329486846924, |
|
"kl": 0.00080108642578125, |
|
"learning_rate": 9.2e-07, |
|
"loss": -0.0078, |
|
"num_tokens": 6305085.0, |
|
"reward": 0.3887103348970413, |
|
"reward_std": 1.0468786805868149, |
|
"rewards/cosine_scaled_reward": 0.19435517117381096, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2734.7083740234375, |
|
"epoch": 0.026857142857142857, |
|
"grad_norm": 0.28740641474723816, |
|
"kl": 0.0004706382751464844, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.049, |
|
"num_tokens": 6441355.0, |
|
"reward": 0.15308012068271637, |
|
"reward_std": 0.4208949161693454, |
|
"rewards/cosine_scaled_reward": 0.07654005661606789, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1500.8958740234375, |
|
"epoch": 0.027428571428571427, |
|
"grad_norm": 0.39340028166770935, |
|
"kl": 0.0004849433898925781, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0855, |
|
"num_tokens": 6520190.0, |
|
"reward": 0.44709211960434914, |
|
"reward_std": 0.7460008524358273, |
|
"rewards/cosine_scaled_reward": 0.22354605607688427, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2304.1458435058594, |
|
"epoch": 0.028, |
|
"grad_norm": 0.35876935720443726, |
|
"kl": 0.0008916854858398438, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0348, |
|
"num_tokens": 6636501.0, |
|
"reward": -0.05522707849740982, |
|
"reward_std": 0.5208401791751385, |
|
"rewards/cosine_scaled_reward": -0.02761353738605976, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3012.8541870117188, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.2135663479566574, |
|
"kl": 0.0007343292236328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0216, |
|
"num_tokens": 6786926.0, |
|
"reward": -0.08169351518154144, |
|
"reward_std": 0.6610444337129593, |
|
"rewards/cosine_scaled_reward": -0.040846746414899826, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2833.375, |
|
"epoch": 0.029142857142857144, |
|
"grad_norm": 0.24645818769931793, |
|
"kl": 0.0005292892456054688, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.0353, |
|
"num_tokens": 6929624.0, |
|
"reward": 0.0020843185484409332, |
|
"reward_std": 0.12999659916386008, |
|
"rewards/cosine_scaled_reward": 0.001042170450091362, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3556.0416870117188, |
|
"epoch": 0.029714285714285714, |
|
"grad_norm": 0.18516220152378082, |
|
"kl": 0.00058746337890625, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": 0.0009, |
|
"num_tokens": 7107826.0, |
|
"reward": -0.3568333759903908, |
|
"reward_std": 0.5824087001383305, |
|
"rewards/cosine_scaled_reward": -0.1784166805446148, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2816.1666717529297, |
|
"epoch": 0.030285714285714287, |
|
"grad_norm": 0.3401743173599243, |
|
"kl": 0.0006856918334960938, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": 0.06, |
|
"num_tokens": 7250142.0, |
|
"reward": -0.00018092244863510132, |
|
"reward_std": 0.16538633033633232, |
|
"rewards/cosine_scaled_reward": -9.047612547874451e-05, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3037.3958435058594, |
|
"epoch": 0.030857142857142857, |
|
"grad_norm": 0.39248475432395935, |
|
"kl": 0.000926971435546875, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": -0.0691, |
|
"num_tokens": 7402561.0, |
|
"reward": -0.5911240540444851, |
|
"reward_std": 0.18204397335648537, |
|
"rewards/cosine_scaled_reward": -0.29556202609091997, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3278.8958740234375, |
|
"epoch": 0.03142857142857143, |
|
"grad_norm": 0.22106441855430603, |
|
"kl": 0.000995635986328125, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0338, |
|
"num_tokens": 7566500.0, |
|
"reward": -0.07510977238416672, |
|
"reward_std": 0.608110748231411, |
|
"rewards/cosine_scaled_reward": -0.03755488805472851, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2919.937530517578, |
|
"epoch": 0.032, |
|
"grad_norm": 0.27260297536849976, |
|
"kl": 0.0011653900146484375, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0375, |
|
"num_tokens": 7712429.0, |
|
"reward": -0.1868463009595871, |
|
"reward_std": 0.5689870864152908, |
|
"rewards/cosine_scaled_reward": -0.09342315793037415, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2649.8751220703125, |
|
"epoch": 0.03257142857142857, |
|
"grad_norm": 0.21306385099887848, |
|
"kl": 0.0008172988891601562, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": -0.0171, |
|
"num_tokens": 7845899.0, |
|
"reward": -0.43829748034477234, |
|
"reward_std": 0.6765277907252312, |
|
"rewards/cosine_scaled_reward": -0.21914873644709587, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3418.3958740234375, |
|
"epoch": 0.03314285714285714, |
|
"grad_norm": 0.19094346463680267, |
|
"kl": 0.00072479248046875, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0116, |
|
"num_tokens": 8016522.0, |
|
"reward": -0.2600528746843338, |
|
"reward_std": 0.41667389310896397, |
|
"rewards/cosine_scaled_reward": -0.1300264373421669, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2712.9583740234375, |
|
"epoch": 0.03371428571428572, |
|
"grad_norm": 0.27561813592910767, |
|
"kl": 0.0011749267578125, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": -0.0132, |
|
"num_tokens": 8152660.0, |
|
"reward": -0.2609961926937103, |
|
"reward_std": 0.5313794314861298, |
|
"rewards/cosine_scaled_reward": -0.13049809262156487, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3343.8125610351562, |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 0.20157206058502197, |
|
"kl": 0.0007047653198242188, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0128, |
|
"num_tokens": 8319607.0, |
|
"reward": 0.07680468261241913, |
|
"reward_std": 0.3094941098242998, |
|
"rewards/cosine_scaled_reward": 0.038402341306209564, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2177.208396911621, |
|
"epoch": 0.03485714285714286, |
|
"grad_norm": 0.36359933018684387, |
|
"kl": 0.0006518363952636719, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.14, |
|
"num_tokens": 8430461.0, |
|
"reward": 0.17365121096372604, |
|
"reward_std": 0.5538155660033226, |
|
"rewards/cosine_scaled_reward": 0.08682558685541153, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2966.7291870117188, |
|
"epoch": 0.03542857142857143, |
|
"grad_norm": 0.3397689163684845, |
|
"kl": 0.0008535385131835938, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": -0.0699, |
|
"num_tokens": 8578936.0, |
|
"reward": -0.7255322933197021, |
|
"reward_std": 0.2827052026987076, |
|
"rewards/cosine_scaled_reward": -0.3627661466598511, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3450.8125610351562, |
|
"epoch": 0.036, |
|
"grad_norm": 0.18789339065551758, |
|
"kl": 0.0008687973022460938, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": -0.0483, |
|
"num_tokens": 8750479.0, |
|
"reward": -0.2192160151898861, |
|
"reward_std": 0.5799107477068901, |
|
"rewards/cosine_scaled_reward": -0.10960800759494305, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3443.3541870117188, |
|
"epoch": 0.036571428571428574, |
|
"grad_norm": 0.17766065895557404, |
|
"kl": 0.0008435249328613281, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": -0.0151, |
|
"num_tokens": 8922264.0, |
|
"reward": -0.48261551931500435, |
|
"reward_std": 0.19259289279580116, |
|
"rewards/cosine_scaled_reward": -0.24130774475634098, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3403.0625610351562, |
|
"epoch": 0.037142857142857144, |
|
"grad_norm": 0.1934821903705597, |
|
"kl": 0.00086212158203125, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0185, |
|
"num_tokens": 9092067.0, |
|
"reward": -0.46268967539072037, |
|
"reward_std": 0.28929166309535503, |
|
"rewards/cosine_scaled_reward": -0.23134482093155384, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2666.395965576172, |
|
"epoch": 0.037714285714285714, |
|
"grad_norm": 0.24617139995098114, |
|
"kl": 0.00070953369140625, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": -0.0574, |
|
"num_tokens": 9225802.0, |
|
"reward": 0.30002279952168465, |
|
"reward_std": 0.6946442574262619, |
|
"rewards/cosine_scaled_reward": 0.1500114006921649, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2073.750030517578, |
|
"epoch": 0.038285714285714284, |
|
"grad_norm": 0.4215092062950134, |
|
"kl": 0.0011243820190429688, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0529, |
|
"num_tokens": 9330898.0, |
|
"reward": -0.09548089653253555, |
|
"reward_std": 0.8580914586782455, |
|
"rewards/cosine_scaled_reward": -0.04774044919759035, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3528.1666870117188, |
|
"epoch": 0.038857142857142854, |
|
"grad_norm": 0.20039838552474976, |
|
"kl": 0.0005893707275390625, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": -0.0095, |
|
"num_tokens": 9506442.0, |
|
"reward": -0.42071669083088636, |
|
"reward_std": 0.42319002375006676, |
|
"rewards/cosine_scaled_reward": -0.21035834541544318, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2716.7291870117188, |
|
"epoch": 0.03942857142857143, |
|
"grad_norm": 0.2517124116420746, |
|
"kl": 0.0007944107055664062, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": -0.1279, |
|
"num_tokens": 9643061.0, |
|
"reward": -0.3758072182536125, |
|
"reward_std": 0.45701417699456215, |
|
"rewards/cosine_scaled_reward": -0.18790359422564507, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2648.0000228881836, |
|
"epoch": 0.04, |
|
"grad_norm": 0.29658740758895874, |
|
"kl": 0.0008521080017089844, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0124, |
|
"num_tokens": 9776189.0, |
|
"reward": -0.16740068793296814, |
|
"reward_std": 0.41449040174484253, |
|
"rewards/cosine_scaled_reward": -0.08370032906532288, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1873.812515258789, |
|
"epoch": 0.04057142857142857, |
|
"grad_norm": 0.3751354515552521, |
|
"kl": 0.0007829666137695312, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": 0.0863, |
|
"num_tokens": 9871712.0, |
|
"reward": 0.3852356970310211, |
|
"reward_std": 0.5030505172908306, |
|
"rewards/cosine_scaled_reward": 0.19261783733963966, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2817.8334197998047, |
|
"epoch": 0.04114285714285714, |
|
"grad_norm": 0.27796244621276855, |
|
"kl": 0.000804901123046875, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.0581, |
|
"num_tokens": 10013232.0, |
|
"reward": -0.10409137606620789, |
|
"reward_std": 0.20197268202900887, |
|
"rewards/cosine_scaled_reward": -0.05204569548368454, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2128.0208740234375, |
|
"epoch": 0.04171428571428572, |
|
"grad_norm": 0.33463340997695923, |
|
"kl": 0.0006952285766601562, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0541, |
|
"num_tokens": 10120693.0, |
|
"reward": -0.22259100899100304, |
|
"reward_std": 0.4416811428964138, |
|
"rewards/cosine_scaled_reward": -0.11129548959434032, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2830.6666717529297, |
|
"epoch": 0.04228571428571429, |
|
"grad_norm": 0.28111714124679565, |
|
"kl": 0.0005383491516113281, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.0078, |
|
"num_tokens": 10262025.0, |
|
"reward": -0.14538022875785828, |
|
"reward_std": 0.5454810187220573, |
|
"rewards/cosine_scaled_reward": -0.07269011810421944, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1554.2708435058594, |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.4327227771282196, |
|
"kl": 0.0011434555053710938, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0645, |
|
"num_tokens": 10341778.0, |
|
"reward": -0.28616778552532196, |
|
"reward_std": 0.6919333338737488, |
|
"rewards/cosine_scaled_reward": -0.1430838778614998, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3283.2083740234375, |
|
"epoch": 0.04342857142857143, |
|
"grad_norm": 0.2106700837612152, |
|
"kl": 0.0007944107055664062, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0367, |
|
"num_tokens": 10504976.0, |
|
"reward": -0.337122593075037, |
|
"reward_std": 0.5403655767440796, |
|
"rewards/cosine_scaled_reward": -0.1685612890869379, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2804.854248046875, |
|
"epoch": 0.044, |
|
"grad_norm": 0.290350079536438, |
|
"kl": 0.0010528564453125, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": -0.1465, |
|
"num_tokens": 10645033.0, |
|
"reward": -0.3672878537327051, |
|
"reward_std": 0.4394787736237049, |
|
"rewards/cosine_scaled_reward": -0.18364392640069127, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2761.9375610351562, |
|
"epoch": 0.044571428571428574, |
|
"grad_norm": 0.23817922174930573, |
|
"kl": 0.0006437301635742188, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0617, |
|
"num_tokens": 10783558.0, |
|
"reward": 0.039801888167858124, |
|
"reward_std": 0.8721425756812096, |
|
"rewards/cosine_scaled_reward": 0.019900942221283913, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2478.0000610351562, |
|
"epoch": 0.045142857142857144, |
|
"grad_norm": 0.2438346892595291, |
|
"kl": 0.0007276535034179688, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": -0.0649, |
|
"num_tokens": 10908622.0, |
|
"reward": -0.009920487180352211, |
|
"reward_std": 0.6310172341763973, |
|
"rewards/cosine_scaled_reward": -0.00496023939922452, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3292.5208740234375, |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.19675898551940918, |
|
"kl": 0.0007238388061523438, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.028, |
|
"num_tokens": 11073503.0, |
|
"reward": -0.26976824924349785, |
|
"reward_std": 0.44561611115932465, |
|
"rewards/cosine_scaled_reward": -0.13488411717116833, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2928.7500610351562, |
|
"epoch": 0.046285714285714284, |
|
"grad_norm": 0.23351161181926727, |
|
"kl": 0.0010251998901367188, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": -0.0533, |
|
"num_tokens": 11219663.0, |
|
"reward": 0.14206518977880478, |
|
"reward_std": 0.47988787665963173, |
|
"rewards/cosine_scaled_reward": 0.07103258091956377, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3216.3958740234375, |
|
"epoch": 0.046857142857142854, |
|
"grad_norm": 0.22799547016620636, |
|
"kl": 0.0006895065307617188, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": 0.0032, |
|
"num_tokens": 11380386.0, |
|
"reward": -0.16227489709854126, |
|
"reward_std": 0.33015402406454086, |
|
"rewards/cosine_scaled_reward": -0.08113745599985123, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 993.4583435058594, |
|
"epoch": 0.04742857142857143, |
|
"grad_norm": 0.551730751991272, |
|
"kl": 0.0010633468627929688, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.1338, |
|
"num_tokens": 11432752.0, |
|
"reward": 0.6598471999168396, |
|
"reward_std": 0.16890107188373804, |
|
"rewards/cosine_scaled_reward": 0.3299236036837101, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1853.7083740234375, |
|
"epoch": 0.048, |
|
"grad_norm": 0.29992803931236267, |
|
"kl": 0.0008602142333984375, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0295, |
|
"num_tokens": 11527454.0, |
|
"reward": -0.3242928695399314, |
|
"reward_std": 0.7351822834461927, |
|
"rewards/cosine_scaled_reward": -0.16214643290732056, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2737.2083740234375, |
|
"epoch": 0.04857142857142857, |
|
"grad_norm": 0.31860730051994324, |
|
"kl": 0.0012292861938476562, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0168, |
|
"num_tokens": 11664108.0, |
|
"reward": -0.3406180441379547, |
|
"reward_std": 0.4295322969555855, |
|
"rewards/cosine_scaled_reward": -0.17030901461839676, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2586.541732788086, |
|
"epoch": 0.04914285714285714, |
|
"grad_norm": 0.3602101802825928, |
|
"kl": 0.000606536865234375, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": -0.1276, |
|
"num_tokens": 11794454.0, |
|
"reward": 0.20340422540903091, |
|
"reward_std": 0.9888063967227936, |
|
"rewards/cosine_scaled_reward": 0.10170210711658001, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2964.75, |
|
"epoch": 0.04971428571428571, |
|
"grad_norm": 0.3094576895236969, |
|
"kl": 0.00102996826171875, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": -0.0531, |
|
"num_tokens": 11943002.0, |
|
"reward": 0.01601184532046318, |
|
"reward_std": 0.4094504490494728, |
|
"rewards/cosine_scaled_reward": 0.008005908690392971, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2426.1666870117188, |
|
"epoch": 0.05028571428571429, |
|
"grad_norm": 0.2704874277114868, |
|
"kl": 0.001644134521484375, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": 0.0654, |
|
"num_tokens": 12065278.0, |
|
"reward": -0.40290449309395626, |
|
"reward_std": 0.6259909272193909, |
|
"rewards/cosine_scaled_reward": -0.20145224656153005, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2142.1041870117188, |
|
"epoch": 0.05085714285714286, |
|
"grad_norm": 0.3306039273738861, |
|
"kl": 0.00197601318359375, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": -0.1126, |
|
"num_tokens": 12174291.0, |
|
"reward": -0.23806674778461456, |
|
"reward_std": 0.6727369725704193, |
|
"rewards/cosine_scaled_reward": -0.11903337389230728, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2580.7709350585938, |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.23774664103984833, |
|
"kl": 0.0007190704345703125, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.077, |
|
"num_tokens": 12303664.0, |
|
"reward": 0.3880194779485464, |
|
"reward_std": 1.0538864731788635, |
|
"rewards/cosine_scaled_reward": 0.1940097352489829, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2841.312530517578, |
|
"epoch": 0.052, |
|
"grad_norm": 0.24611423909664154, |
|
"kl": 0.000865936279296875, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0599, |
|
"num_tokens": 12445867.0, |
|
"reward": -0.19608542323112488, |
|
"reward_std": 0.42760632932186127, |
|
"rewards/cosine_scaled_reward": -0.09804270416498184, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2743.6041717529297, |
|
"epoch": 0.052571428571428575, |
|
"grad_norm": 0.23887218534946442, |
|
"kl": 0.0006628036499023438, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0646, |
|
"num_tokens": 12584352.0, |
|
"reward": 0.15040923655033112, |
|
"reward_std": 0.25195283722132444, |
|
"rewards/cosine_scaled_reward": 0.07520462200045586, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2130.0833435058594, |
|
"epoch": 0.053142857142857144, |
|
"grad_norm": 0.42298954725265503, |
|
"kl": 0.0020599365234375, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": 0.0779, |
|
"num_tokens": 12692224.0, |
|
"reward": -0.06379163265228271, |
|
"reward_std": 0.3432878144085407, |
|
"rewards/cosine_scaled_reward": -0.03189583122730255, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2588.2709350585938, |
|
"epoch": 0.053714285714285714, |
|
"grad_norm": 0.21005931496620178, |
|
"kl": 0.0005979537963867188, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.0561, |
|
"num_tokens": 12822065.0, |
|
"reward": -0.2485465258359909, |
|
"reward_std": 0.6237562894821167, |
|
"rewards/cosine_scaled_reward": -0.12427325546741486, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1483.5417175292969, |
|
"epoch": 0.054285714285714284, |
|
"grad_norm": 0.34288352727890015, |
|
"kl": 0.0010595321655273438, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0743, |
|
"num_tokens": 12899239.0, |
|
"reward": 0.16759111359715462, |
|
"reward_std": 0.9945086091756821, |
|
"rewards/cosine_scaled_reward": 0.08379554376006126, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2968.5000610351562, |
|
"epoch": 0.054857142857142854, |
|
"grad_norm": 0.20499330759048462, |
|
"kl": 0.0007104873657226562, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0123, |
|
"num_tokens": 13047499.0, |
|
"reward": -0.08611700683832169, |
|
"reward_std": 0.7486424595117569, |
|
"rewards/cosine_scaled_reward": -0.04305850435048342, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2057.5625228881836, |
|
"epoch": 0.05542857142857143, |
|
"grad_norm": 0.3967874348163605, |
|
"kl": 0.00153350830078125, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0677, |
|
"num_tokens": 13152466.0, |
|
"reward": -0.5269366651773453, |
|
"reward_std": 0.3624592386186123, |
|
"rewards/cosine_scaled_reward": -0.26346831768751144, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1952.1041717529297, |
|
"epoch": 0.056, |
|
"grad_norm": 0.3613995909690857, |
|
"kl": 0.0018596649169921875, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0947, |
|
"num_tokens": 13251459.0, |
|
"reward": 0.9773776829242706, |
|
"reward_std": 0.8529446795582771, |
|
"rewards/cosine_scaled_reward": 0.4886888340115547, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2277.0416717529297, |
|
"epoch": 0.05657142857142857, |
|
"grad_norm": 0.3505290150642395, |
|
"kl": 0.0008687973022460938, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": 0.0603, |
|
"num_tokens": 13366469.0, |
|
"reward": 0.27346290089190006, |
|
"reward_std": 0.47108355164527893, |
|
"rewards/cosine_scaled_reward": 0.13673143601045012, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1580.0625305175781, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.36085861921310425, |
|
"kl": 0.0011138916015625, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0027, |
|
"num_tokens": 13446920.0, |
|
"reward": 0.4378054551780224, |
|
"reward_std": 0.6341788824647665, |
|
"rewards/cosine_scaled_reward": 0.2189027275890112, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2714.8541717529297, |
|
"epoch": 0.05771428571428571, |
|
"grad_norm": 0.2826038599014282, |
|
"kl": 0.0008282661437988281, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.0606, |
|
"num_tokens": 13583809.0, |
|
"reward": -0.5446555614471436, |
|
"reward_std": 0.35200726985931396, |
|
"rewards/cosine_scaled_reward": -0.2723277732729912, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2409.666717529297, |
|
"epoch": 0.05828571428571429, |
|
"grad_norm": 0.30025118589401245, |
|
"kl": 0.0009298324584960938, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": -0.028, |
|
"num_tokens": 13705845.0, |
|
"reward": -0.1350867822766304, |
|
"reward_std": 0.3358248174190521, |
|
"rewards/cosine_scaled_reward": -0.0675433836877346, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2799.2708587646484, |
|
"epoch": 0.05885714285714286, |
|
"grad_norm": 0.2541813552379608, |
|
"kl": 0.0008907318115234375, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": 0.0025, |
|
"num_tokens": 13846618.0, |
|
"reward": -0.6046699732542038, |
|
"reward_std": 0.3693853598088026, |
|
"rewards/cosine_scaled_reward": -0.3023349717259407, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2963.1458435058594, |
|
"epoch": 0.05942857142857143, |
|
"grad_norm": 0.2182713747024536, |
|
"kl": 0.000762939453125, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0303, |
|
"num_tokens": 13995953.0, |
|
"reward": -0.20170933986082673, |
|
"reward_std": 0.5568380877375603, |
|
"rewards/cosine_scaled_reward": -0.10085467004682869, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2755.1458740234375, |
|
"epoch": 0.06, |
|
"grad_norm": 0.26797640323638916, |
|
"kl": 0.0011749267578125, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": -0.0094, |
|
"num_tokens": 14134524.0, |
|
"reward": -0.6127043217420578, |
|
"reward_std": 0.19923977181315422, |
|
"rewards/cosine_scaled_reward": -0.3063521459698677, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2781.0416717529297, |
|
"epoch": 0.060571428571428575, |
|
"grad_norm": 0.4417632222175598, |
|
"kl": 0.0021905899047851562, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0892, |
|
"num_tokens": 14274434.0, |
|
"reward": -0.042052820324897766, |
|
"reward_std": 0.555817510932684, |
|
"rewards/cosine_scaled_reward": -0.021026406437158585, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2453.125030517578, |
|
"epoch": 0.061142857142857145, |
|
"grad_norm": 0.2828550338745117, |
|
"kl": 0.000919342041015625, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": -0.0085, |
|
"num_tokens": 14397428.0, |
|
"reward": -0.37871552258729935, |
|
"reward_std": 0.4371586740016937, |
|
"rewards/cosine_scaled_reward": -0.18935775943100452, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2142.5625, |
|
"epoch": 0.061714285714285715, |
|
"grad_norm": 0.30579039454460144, |
|
"kl": 0.0007009506225585938, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.0216, |
|
"num_tokens": 14506031.0, |
|
"reward": -0.2899063751101494, |
|
"reward_std": 0.30772218108177185, |
|
"rewards/cosine_scaled_reward": -0.1449531875550747, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3271.8750610351562, |
|
"epoch": 0.062285714285714285, |
|
"grad_norm": 0.212229385972023, |
|
"kl": 0.0011157989501953125, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": -0.035, |
|
"num_tokens": 14669513.0, |
|
"reward": -0.5292222313582897, |
|
"reward_std": 0.40466723032295704, |
|
"rewards/cosine_scaled_reward": -0.26461111195385456, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2759.3125, |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 0.25697484612464905, |
|
"kl": 0.0011186599731445312, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0172, |
|
"num_tokens": 14807984.0, |
|
"reward": -0.4681231379508972, |
|
"reward_std": 0.29983806796371937, |
|
"rewards/cosine_scaled_reward": -0.23406155407428741, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2785.4166717529297, |
|
"epoch": 0.06342857142857143, |
|
"grad_norm": 0.47920992970466614, |
|
"kl": 0.009280204772949219, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0485, |
|
"num_tokens": 14949280.0, |
|
"reward": 0.15108218044042587, |
|
"reward_std": 0.20054534077644348, |
|
"rewards/cosine_scaled_reward": 0.07554109394550323, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2506.416748046875, |
|
"epoch": 0.064, |
|
"grad_norm": 0.26250529289245605, |
|
"kl": 0.0015287399291992188, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": -0.1711, |
|
"num_tokens": 15075480.0, |
|
"reward": -0.23084469139575958, |
|
"reward_std": 0.5984194576740265, |
|
"rewards/cosine_scaled_reward": -0.11542234569787979, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2602.5208740234375, |
|
"epoch": 0.06457142857142857, |
|
"grad_norm": 0.424373596906662, |
|
"kl": 0.0057659149169921875, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": 0.0718, |
|
"num_tokens": 15206593.0, |
|
"reward": -0.253767779096961, |
|
"reward_std": 0.4538792669773102, |
|
"rewards/cosine_scaled_reward": -0.12688388722017407, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3077.0000610351562, |
|
"epoch": 0.06514285714285714, |
|
"grad_norm": 0.2078198343515396, |
|
"kl": 0.0006704330444335938, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": -0.0122, |
|
"num_tokens": 15360649.0, |
|
"reward": -0.15475063771009445, |
|
"reward_std": 0.5910014770925045, |
|
"rewards/cosine_scaled_reward": -0.07737531885504723, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2643.6875610351562, |
|
"epoch": 0.06571428571428571, |
|
"grad_norm": 0.28771933913230896, |
|
"kl": 0.0011758804321289062, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0676, |
|
"num_tokens": 15493570.0, |
|
"reward": -0.19310491532087326, |
|
"reward_std": 0.5526712536811829, |
|
"rewards/cosine_scaled_reward": -0.09655245952308178, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3344.7291870117188, |
|
"epoch": 0.06628571428571428, |
|
"grad_norm": 0.2425944209098816, |
|
"kl": 0.0008716583251953125, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.0343, |
|
"num_tokens": 15661161.0, |
|
"reward": -0.7032309919595718, |
|
"reward_std": 0.3479248844087124, |
|
"rewards/cosine_scaled_reward": -0.351615484803915, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3084.4583740234375, |
|
"epoch": 0.06685714285714285, |
|
"grad_norm": 0.20971350371837616, |
|
"kl": 0.0007829666137695312, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": -0.0389, |
|
"num_tokens": 15815755.0, |
|
"reward": 0.0003270097076892853, |
|
"reward_std": 0.43772435188293457, |
|
"rewards/cosine_scaled_reward": 0.00016349367797374725, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2729.312530517578, |
|
"epoch": 0.06742857142857143, |
|
"grad_norm": 0.2613621950149536, |
|
"kl": 0.001468658447265625, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.0463, |
|
"num_tokens": 15952474.0, |
|
"reward": -0.06591695547103882, |
|
"reward_std": 0.39777015522122383, |
|
"rewards/cosine_scaled_reward": -0.032958466559648514, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2081.7916717529297, |
|
"epoch": 0.068, |
|
"grad_norm": 0.4527161717414856, |
|
"kl": 0.0013675689697265625, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0989, |
|
"num_tokens": 16059348.0, |
|
"reward": 0.35896405577659607, |
|
"reward_std": 0.3922106046229601, |
|
"rewards/cosine_scaled_reward": 0.17948202788829803, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2978.125, |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.2305285632610321, |
|
"kl": 0.0010137557983398438, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": -0.0725, |
|
"num_tokens": 16208058.0, |
|
"reward": -0.6749820820987225, |
|
"reward_std": 0.33936042711138725, |
|
"rewards/cosine_scaled_reward": -0.33749102614820004, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2772.125, |
|
"epoch": 0.06914285714285714, |
|
"grad_norm": 0.2650047242641449, |
|
"kl": 0.0009822845458984375, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": -0.0085, |
|
"num_tokens": 16347408.0, |
|
"reward": -0.032290175557136536, |
|
"reward_std": 0.4409598559141159, |
|
"rewards/cosine_scaled_reward": -0.016145076602697372, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2094.479217529297, |
|
"epoch": 0.06971428571428571, |
|
"grad_norm": 0.24298027157783508, |
|
"kl": 0.0012693405151367188, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": -0.0886, |
|
"num_tokens": 16453067.0, |
|
"reward": 0.5003323024138808, |
|
"reward_std": 0.5531654357910156, |
|
"rewards/cosine_scaled_reward": 0.2501661437563598, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1995.3958587646484, |
|
"epoch": 0.07028571428571428, |
|
"grad_norm": 0.2879078686237335, |
|
"kl": 0.001132965087890625, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.069, |
|
"num_tokens": 16554006.0, |
|
"reward": -0.1732923611998558, |
|
"reward_std": 0.6025716587901115, |
|
"rewards/cosine_scaled_reward": -0.08664617873728275, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3321.0625, |
|
"epoch": 0.07085714285714285, |
|
"grad_norm": 0.18874195218086243, |
|
"kl": 0.0006341934204101562, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.0448, |
|
"num_tokens": 16720257.0, |
|
"reward": 0.0892313290387392, |
|
"reward_std": 0.36883802339434624, |
|
"rewards/cosine_scaled_reward": 0.0446156719699502, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1953.6458740234375, |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.34396979212760925, |
|
"kl": 0.0025730133056640625, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": -0.0302, |
|
"num_tokens": 16819864.0, |
|
"reward": 0.04853908717632294, |
|
"reward_std": 0.6620666459202766, |
|
"rewards/cosine_scaled_reward": 0.024269558489322662, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1393.2708435058594, |
|
"epoch": 0.072, |
|
"grad_norm": 0.5013756155967712, |
|
"kl": 0.011034011840820312, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.1767, |
|
"num_tokens": 16891889.0, |
|
"reward": -0.013652913272380829, |
|
"reward_std": 0.3805545046925545, |
|
"rewards/cosine_scaled_reward": -0.006826456636190414, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2377.8958435058594, |
|
"epoch": 0.07257142857142856, |
|
"grad_norm": 0.22264918684959412, |
|
"kl": 0.0008573532104492188, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": 0.0667, |
|
"num_tokens": 17012724.0, |
|
"reward": 0.2236809842288494, |
|
"reward_std": 0.4961891621351242, |
|
"rewards/cosine_scaled_reward": 0.11184047814458609, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2564.7500610351562, |
|
"epoch": 0.07314285714285715, |
|
"grad_norm": 0.306539386510849, |
|
"kl": 0.0009932518005371094, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": -0.0337, |
|
"num_tokens": 17141448.0, |
|
"reward": -0.004740983247756958, |
|
"reward_std": 0.3173178732395172, |
|
"rewards/cosine_scaled_reward": -0.002370491623878479, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2082.166717529297, |
|
"epoch": 0.07371428571428572, |
|
"grad_norm": 0.3070489764213562, |
|
"kl": 0.0011272430419921875, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": -0.0082, |
|
"num_tokens": 17247344.0, |
|
"reward": -0.18208786100149155, |
|
"reward_std": 0.5415353253483772, |
|
"rewards/cosine_scaled_reward": -0.09104393050074577, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1620.7291717529297, |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 0.3711329698562622, |
|
"kl": 0.0028362274169921875, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.008, |
|
"num_tokens": 17330167.0, |
|
"reward": -0.04559193179011345, |
|
"reward_std": 0.6725184172391891, |
|
"rewards/cosine_scaled_reward": -0.022795964032411575, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2907.8333587646484, |
|
"epoch": 0.07485714285714286, |
|
"grad_norm": 0.22048690915107727, |
|
"kl": 0.0009622573852539062, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": 0.0646, |
|
"num_tokens": 17475575.0, |
|
"reward": -0.6151376739144325, |
|
"reward_std": 0.3845426104962826, |
|
"rewards/cosine_scaled_reward": -0.30756882205605507, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2430.7083740234375, |
|
"epoch": 0.07542857142857143, |
|
"grad_norm": 0.22665299475193024, |
|
"kl": 0.0009012222290039062, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.0695, |
|
"num_tokens": 17598141.0, |
|
"reward": -0.2976334486156702, |
|
"reward_std": 0.5366611061617732, |
|
"rewards/cosine_scaled_reward": -0.14881671639159322, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2548.6458435058594, |
|
"epoch": 0.076, |
|
"grad_norm": 0.3216173052787781, |
|
"kl": 0.0010004043579101562, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0916, |
|
"num_tokens": 17726752.0, |
|
"reward": 0.17475611716508865, |
|
"reward_std": 0.47678545862436295, |
|
"rewards/cosine_scaled_reward": 0.08737805113196373, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3280.7500610351562, |
|
"epoch": 0.07657142857142857, |
|
"grad_norm": 0.17866647243499756, |
|
"kl": 0.0007352828979492188, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": -0.003, |
|
"num_tokens": 17890468.0, |
|
"reward": 0.4249248839914799, |
|
"reward_std": 0.7319479957222939, |
|
"rewards/cosine_scaled_reward": 0.21246242709457874, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2719.062530517578, |
|
"epoch": 0.07714285714285714, |
|
"grad_norm": 0.34077975153923035, |
|
"kl": 0.0010766983032226562, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0249, |
|
"num_tokens": 18027487.0, |
|
"reward": -0.6892034411430359, |
|
"reward_std": 0.43735230527818203, |
|
"rewards/cosine_scaled_reward": -0.34460172057151794, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1582.0833587646484, |
|
"epoch": 0.07771428571428571, |
|
"grad_norm": 0.5860525965690613, |
|
"kl": 0.0029449462890625, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": -0.1361, |
|
"num_tokens": 18108455.0, |
|
"reward": -0.3233466073870659, |
|
"reward_std": 0.4398806467652321, |
|
"rewards/cosine_scaled_reward": -0.1616733018308878, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2537.8333435058594, |
|
"epoch": 0.07828571428571429, |
|
"grad_norm": 0.21415218710899353, |
|
"kl": 0.000827789306640625, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": -0.0168, |
|
"num_tokens": 18236475.0, |
|
"reward": -0.059672433882951736, |
|
"reward_std": 0.7180322706699371, |
|
"rewards/cosine_scaled_reward": -0.02983621321618557, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2120.3750076293945, |
|
"epoch": 0.07885714285714286, |
|
"grad_norm": 0.29080891609191895, |
|
"kl": 0.0012989044189453125, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0494, |
|
"num_tokens": 18343401.0, |
|
"reward": -0.08782655745744705, |
|
"reward_std": 0.12043035682290792, |
|
"rewards/cosine_scaled_reward": -0.04391326941549778, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2405.2291870117188, |
|
"epoch": 0.07942857142857143, |
|
"grad_norm": 0.22457921504974365, |
|
"kl": 0.0009593963623046875, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0361, |
|
"num_tokens": 18464648.0, |
|
"reward": -0.41086670011281967, |
|
"reward_std": 0.4968971386551857, |
|
"rewards/cosine_scaled_reward": -0.20543334260582924, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2780.2291717529297, |
|
"epoch": 0.08, |
|
"grad_norm": 0.2105841189622879, |
|
"kl": 0.0010890960693359375, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.0146, |
|
"num_tokens": 18604483.0, |
|
"reward": 0.13771556317806244, |
|
"reward_std": 0.30897051841020584, |
|
"rewards/cosine_scaled_reward": 0.06885778903961182, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2002.2708740234375, |
|
"epoch": 0.08057142857142857, |
|
"grad_norm": 0.3650009334087372, |
|
"kl": 0.002017974853515625, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": 0.0459, |
|
"num_tokens": 18705872.0, |
|
"reward": -0.379656158387661, |
|
"reward_std": 0.5402833297848701, |
|
"rewards/cosine_scaled_reward": -0.1898280642926693, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2488.7708740234375, |
|
"epoch": 0.08114285714285714, |
|
"grad_norm": 0.2864922285079956, |
|
"kl": 0.001262664794921875, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": -0.0564, |
|
"num_tokens": 18831261.0, |
|
"reward": 0.011970575898885727, |
|
"reward_std": 0.7028900012373924, |
|
"rewards/cosine_scaled_reward": 0.0059852879494428635, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2584.7084350585938, |
|
"epoch": 0.08171428571428571, |
|
"grad_norm": 0.3126732409000397, |
|
"kl": 0.0011653900146484375, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.0683, |
|
"num_tokens": 18961471.0, |
|
"reward": 0.24577251449227333, |
|
"reward_std": 0.6674134684726596, |
|
"rewards/cosine_scaled_reward": 0.12288625724613667, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2151.1041870117188, |
|
"epoch": 0.08228571428571428, |
|
"grad_norm": 0.37290921807289124, |
|
"kl": 0.0016574859619140625, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0363, |
|
"num_tokens": 19070388.0, |
|
"reward": -0.1530948244035244, |
|
"reward_std": 0.6015914604067802, |
|
"rewards/cosine_scaled_reward": -0.076547397300601, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1968.2084197998047, |
|
"epoch": 0.08285714285714285, |
|
"grad_norm": 0.35119813680648804, |
|
"kl": 0.002681732177734375, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": -0.0567, |
|
"num_tokens": 19169698.0, |
|
"reward": -0.20278839766979218, |
|
"reward_std": 0.47369804978370667, |
|
"rewards/cosine_scaled_reward": -0.10139419510960579, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1638.3125457763672, |
|
"epoch": 0.08342857142857144, |
|
"grad_norm": 0.3501497507095337, |
|
"kl": 0.0024051666259765625, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": 0.009, |
|
"num_tokens": 19253065.0, |
|
"reward": 0.25620073080062866, |
|
"reward_std": 0.8665501922369003, |
|
"rewards/cosine_scaled_reward": 0.12810037285089493, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2041.5417175292969, |
|
"epoch": 0.084, |
|
"grad_norm": 0.29272228479385376, |
|
"kl": 0.0031538009643554688, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": -0.0467, |
|
"num_tokens": 19356735.0, |
|
"reward": 0.015910595655441284, |
|
"reward_std": 0.7093758508563042, |
|
"rewards/cosine_scaled_reward": 0.007955307140946388, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1694.2500305175781, |
|
"epoch": 0.08457142857142858, |
|
"grad_norm": 0.36988532543182373, |
|
"kl": 0.002410888671875, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.0963, |
|
"num_tokens": 19444131.0, |
|
"reward": -0.25489600747823715, |
|
"reward_std": 0.4693850204348564, |
|
"rewards/cosine_scaled_reward": -0.12744800373911858, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1455.7917022705078, |
|
"epoch": 0.08514285714285715, |
|
"grad_norm": 0.3559054434299469, |
|
"kl": 0.00274658203125, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": 0.0878, |
|
"num_tokens": 19519601.0, |
|
"reward": -0.10238776355981827, |
|
"reward_std": 0.636756157502532, |
|
"rewards/cosine_scaled_reward": -0.05119386687874794, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2705.2500228881836, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.3573072850704193, |
|
"kl": 0.0050811767578125, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0188, |
|
"num_tokens": 19655657.0, |
|
"reward": -0.20403114520013332, |
|
"reward_std": 0.5940973423421383, |
|
"rewards/cosine_scaled_reward": -0.10201557632535696, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2180.812545776367, |
|
"epoch": 0.08628571428571429, |
|
"grad_norm": 0.2828698456287384, |
|
"kl": 0.0015001296997070312, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": -0.0244, |
|
"num_tokens": 19766528.0, |
|
"reward": 0.11930293589830399, |
|
"reward_std": 0.6097396910190582, |
|
"rewards/cosine_scaled_reward": 0.05965147539973259, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2904.4166870117188, |
|
"epoch": 0.08685714285714285, |
|
"grad_norm": 0.26096129417419434, |
|
"kl": 0.00130462646484375, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.1732, |
|
"num_tokens": 19912480.0, |
|
"reward": -0.3498671278357506, |
|
"reward_std": 0.394152645021677, |
|
"rewards/cosine_scaled_reward": -0.1749335676431656, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2408.5208740234375, |
|
"epoch": 0.08742857142857142, |
|
"grad_norm": 0.35171744227409363, |
|
"kl": 0.0016956329345703125, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.1131, |
|
"num_tokens": 20034233.0, |
|
"reward": 0.07599013298749924, |
|
"reward_std": 0.41721872985363007, |
|
"rewards/cosine_scaled_reward": 0.037995072081685066, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.3750610351562, |
|
"epoch": 0.088, |
|
"grad_norm": 0.26374199986457825, |
|
"kl": 0.001861572265625, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": -0.0558, |
|
"num_tokens": 20171267.0, |
|
"reward": -0.2839723117649555, |
|
"reward_std": 0.6639672666788101, |
|
"rewards/cosine_scaled_reward": -0.14198614470660686, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2960.8958435058594, |
|
"epoch": 0.08857142857142856, |
|
"grad_norm": 0.27562668919563293, |
|
"kl": 0.0019378662109375, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": -0.0435, |
|
"num_tokens": 20318922.0, |
|
"reward": -0.46872561052441597, |
|
"reward_std": 0.42786915227770805, |
|
"rewards/cosine_scaled_reward": -0.2343627940863371, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1580.7917404174805, |
|
"epoch": 0.08914285714285715, |
|
"grad_norm": 0.499097615480423, |
|
"kl": 0.005889892578125, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": -0.0042, |
|
"num_tokens": 20400236.0, |
|
"reward": 0.6059545688331127, |
|
"reward_std": 0.690888412296772, |
|
"rewards/cosine_scaled_reward": 0.3029772713780403, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2041.2708435058594, |
|
"epoch": 0.08971428571428572, |
|
"grad_norm": 0.4625336229801178, |
|
"kl": 0.005961418151855469, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.1281, |
|
"num_tokens": 20504961.0, |
|
"reward": 0.08923859149217606, |
|
"reward_std": 0.16652610152959824, |
|
"rewards/cosine_scaled_reward": 0.04461930692195892, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2572.312530517578, |
|
"epoch": 0.09028571428571429, |
|
"grad_norm": 0.25713205337524414, |
|
"kl": 0.0018138885498046875, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.0261, |
|
"num_tokens": 20635176.0, |
|
"reward": -0.24785784073174, |
|
"reward_std": 0.6590173244476318, |
|
"rewards/cosine_scaled_reward": -0.1239289166405797, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3366.6875, |
|
"epoch": 0.09085714285714286, |
|
"grad_norm": 0.21494215726852417, |
|
"kl": 0.00112152099609375, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": -0.0031, |
|
"num_tokens": 20803401.0, |
|
"reward": -0.47726341150701046, |
|
"reward_std": 0.34933631122112274, |
|
"rewards/cosine_scaled_reward": -0.23863169085234404, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3498.0208740234375, |
|
"epoch": 0.09142857142857143, |
|
"grad_norm": 0.19158728420734406, |
|
"kl": 0.0010175704956054688, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": -0.0137, |
|
"num_tokens": 20977630.0, |
|
"reward": -0.23914687521755695, |
|
"reward_std": 0.37878062576055527, |
|
"rewards/cosine_scaled_reward": -0.11957343760877848, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2862.8959350585938, |
|
"epoch": 0.092, |
|
"grad_norm": 0.2647199034690857, |
|
"kl": 0.002674102783203125, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": 0.0488, |
|
"num_tokens": 21120425.0, |
|
"reward": -0.28584553534165025, |
|
"reward_std": 0.7033149749040604, |
|
"rewards/cosine_scaled_reward": -0.14292275649495423, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2344.0208740234375, |
|
"epoch": 0.09257142857142857, |
|
"grad_norm": 0.27368515729904175, |
|
"kl": 0.0023193359375, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.0967, |
|
"num_tokens": 21238050.0, |
|
"reward": 0.3530673161149025, |
|
"reward_std": 0.8069795817136765, |
|
"rewards/cosine_scaled_reward": 0.1765336561948061, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3091.416717529297, |
|
"epoch": 0.09314285714285714, |
|
"grad_norm": 0.2311529964208603, |
|
"kl": 0.0021657943725585938, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": 0.0074, |
|
"num_tokens": 21392414.0, |
|
"reward": -0.3498072102665901, |
|
"reward_std": 0.36920344084501266, |
|
"rewards/cosine_scaled_reward": -0.17490360513329506, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3042.8126220703125, |
|
"epoch": 0.09371428571428571, |
|
"grad_norm": 0.22658418118953705, |
|
"kl": 0.00127410888671875, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0962, |
|
"num_tokens": 21544385.0, |
|
"reward": 0.0001346580684185028, |
|
"reward_std": 0.6785394810140133, |
|
"rewards/cosine_scaled_reward": 6.732158362865448e-05, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3391.354248046875, |
|
"epoch": 0.09428571428571429, |
|
"grad_norm": 0.2044445425271988, |
|
"kl": 0.0012722015380859375, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0174, |
|
"num_tokens": 21714214.0, |
|
"reward": -0.43216387182474136, |
|
"reward_std": 0.4510202333331108, |
|
"rewards/cosine_scaled_reward": -0.2160819210112095, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3543.7708740234375, |
|
"epoch": 0.09485714285714286, |
|
"grad_norm": 0.20412862300872803, |
|
"kl": 0.0009775161743164062, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0178, |
|
"num_tokens": 21891155.0, |
|
"reward": -0.5908009447157383, |
|
"reward_std": 0.4042652491480112, |
|
"rewards/cosine_scaled_reward": -0.2954004658386111, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1522.666732788086, |
|
"epoch": 0.09542857142857143, |
|
"grad_norm": 0.3152174651622772, |
|
"kl": 0.00511932373046875, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": -0.0191, |
|
"num_tokens": 21969823.0, |
|
"reward": 0.7545666880905628, |
|
"reward_std": 0.8821780234575272, |
|
"rewards/cosine_scaled_reward": 0.3772833216935396, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2335.8333587646484, |
|
"epoch": 0.096, |
|
"grad_norm": 0.33240070939064026, |
|
"kl": 0.003902435302734375, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": 0.0588, |
|
"num_tokens": 22087907.0, |
|
"reward": -0.26911586057394743, |
|
"reward_std": 0.5829970799386501, |
|
"rewards/cosine_scaled_reward": -0.13455793377943337, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1914.791748046875, |
|
"epoch": 0.09657142857142857, |
|
"grad_norm": 0.35050421953201294, |
|
"kl": 0.005504608154296875, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.1012, |
|
"num_tokens": 22185385.0, |
|
"reward": -0.21728778630495071, |
|
"reward_std": 0.8187869340181351, |
|
"rewards/cosine_scaled_reward": -0.10864389315247536, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3486.1250610351562, |
|
"epoch": 0.09714285714285714, |
|
"grad_norm": 0.21712802350521088, |
|
"kl": 0.0020456314086914062, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": -0.0069, |
|
"num_tokens": 22358287.0, |
|
"reward": -0.2807197757065296, |
|
"reward_std": 0.188842561095953, |
|
"rewards/cosine_scaled_reward": -0.1403598841279745, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2643.2083435058594, |
|
"epoch": 0.09771428571428571, |
|
"grad_norm": 0.25833019614219666, |
|
"kl": 0.0024633407592773438, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": -0.1294, |
|
"num_tokens": 22491605.0, |
|
"reward": 0.01894190162420273, |
|
"reward_std": 0.37402310594916344, |
|
"rewards/cosine_scaled_reward": 0.009470956400036812, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2838.8958740234375, |
|
"epoch": 0.09828571428571428, |
|
"grad_norm": 0.2890698313713074, |
|
"kl": 0.0052337646484375, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.0445, |
|
"num_tokens": 22633668.0, |
|
"reward": -0.39469024166464806, |
|
"reward_std": 0.5306678973138332, |
|
"rewards/cosine_scaled_reward": -0.19734511990100145, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1792.7916870117188, |
|
"epoch": 0.09885714285714285, |
|
"grad_norm": 0.34095853567123413, |
|
"kl": 0.0036678314208984375, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.2055, |
|
"num_tokens": 22724762.0, |
|
"reward": -0.05847650859504938, |
|
"reward_std": 0.7935373112559319, |
|
"rewards/cosine_scaled_reward": -0.029238261049613357, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3167.6458740234375, |
|
"epoch": 0.09942857142857142, |
|
"grad_norm": 0.2256677895784378, |
|
"kl": 0.00197601318359375, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0598, |
|
"num_tokens": 22882653.0, |
|
"reward": -0.525588646531105, |
|
"reward_std": 0.44627620652318, |
|
"rewards/cosine_scaled_reward": -0.2627943083643913, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1800.0000457763672, |
|
"epoch": 0.1, |
|
"grad_norm": 0.38881924748420715, |
|
"kl": 0.008832931518554688, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0274, |
|
"num_tokens": 22975461.0, |
|
"reward": 0.41337180882692337, |
|
"reward_std": 0.621935173869133, |
|
"rewards/cosine_scaled_reward": 0.20668590441346169, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2512.2084045410156, |
|
"epoch": 0.10057142857142858, |
|
"grad_norm": 0.24792905151844025, |
|
"kl": 0.00196075439453125, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": -0.0595, |
|
"num_tokens": 23101543.0, |
|
"reward": 0.18181858723983169, |
|
"reward_std": 0.6874982379376888, |
|
"rewards/cosine_scaled_reward": 0.09090929350350052, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2633.8333740234375, |
|
"epoch": 0.10114285714285715, |
|
"grad_norm": 0.3130466043949127, |
|
"kl": 0.001682281494140625, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0074, |
|
"num_tokens": 23234291.0, |
|
"reward": 0.03136664628982544, |
|
"reward_std": 0.5966363772749901, |
|
"rewards/cosine_scaled_reward": 0.015683308243751526, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3336.229248046875, |
|
"epoch": 0.10171428571428572, |
|
"grad_norm": 0.17930154502391815, |
|
"kl": 0.0008959770202636719, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.0608, |
|
"num_tokens": 23400322.0, |
|
"reward": -0.12859635055065155, |
|
"reward_std": 0.7636773735284805, |
|
"rewards/cosine_scaled_reward": -0.06429817155003548, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2887.4583435058594, |
|
"epoch": 0.10228571428571429, |
|
"grad_norm": 0.24900512397289276, |
|
"kl": 0.0021228790283203125, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": -0.0046, |
|
"num_tokens": 23545100.0, |
|
"reward": -0.3140909820795059, |
|
"reward_std": 0.46572665125131607, |
|
"rewards/cosine_scaled_reward": -0.15704548731446266, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2308.4583435058594, |
|
"epoch": 0.10285714285714286, |
|
"grad_norm": 0.29636770486831665, |
|
"kl": 0.0054950714111328125, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": -0.0668, |
|
"num_tokens": 23662218.0, |
|
"reward": -0.2494659647345543, |
|
"reward_std": 0.7115907035768032, |
|
"rewards/cosine_scaled_reward": -0.124732980504632, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2766.729232788086, |
|
"epoch": 0.10342857142857143, |
|
"grad_norm": 0.23259904980659485, |
|
"kl": 0.005695343017578125, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": -0.0073, |
|
"num_tokens": 23801273.0, |
|
"reward": -0.15345774590969086, |
|
"reward_std": 0.5345512442290783, |
|
"rewards/cosine_scaled_reward": -0.07672888785600662, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3253.6458740234375, |
|
"epoch": 0.104, |
|
"grad_norm": 0.23866048455238342, |
|
"kl": 0.0014324188232421875, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0049, |
|
"num_tokens": 23964000.0, |
|
"reward": -0.5079273246228695, |
|
"reward_std": 0.6336031965911388, |
|
"rewards/cosine_scaled_reward": -0.2539636502042413, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2357.2291870117188, |
|
"epoch": 0.10457142857142857, |
|
"grad_norm": 0.31123295426368713, |
|
"kl": 0.00495147705078125, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.1906, |
|
"num_tokens": 24084575.0, |
|
"reward": -0.5395753756165504, |
|
"reward_std": 0.2657657843083143, |
|
"rewards/cosine_scaled_reward": -0.2697876766324043, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1866.9792175292969, |
|
"epoch": 0.10514285714285715, |
|
"grad_norm": 0.3134949803352356, |
|
"kl": 0.00649261474609375, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": -0.0113, |
|
"num_tokens": 24179290.0, |
|
"reward": -0.09402483701705933, |
|
"reward_std": 0.6055268943309784, |
|
"rewards/cosine_scaled_reward": -0.047012414783239365, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2454.770835876465, |
|
"epoch": 0.10571428571428572, |
|
"grad_norm": 0.44388100504875183, |
|
"kl": 0.001873016357421875, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0984, |
|
"num_tokens": 24301895.0, |
|
"reward": -0.41640862822532654, |
|
"reward_std": 0.4775813100859523, |
|
"rewards/cosine_scaled_reward": -0.20820431411266327, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2527.3750915527344, |
|
"epoch": 0.10628571428571429, |
|
"grad_norm": 0.22966258227825165, |
|
"kl": 0.00251007080078125, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0335, |
|
"num_tokens": 24431105.0, |
|
"reward": 0.3885076344013214, |
|
"reward_std": 0.8602791130542755, |
|
"rewards/cosine_scaled_reward": 0.1942538060247898, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3435.3958740234375, |
|
"epoch": 0.10685714285714286, |
|
"grad_norm": 0.22783932089805603, |
|
"kl": 0.001224517822265625, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": -0.0104, |
|
"num_tokens": 24602028.0, |
|
"reward": -0.35544631630182266, |
|
"reward_std": 0.3423473574221134, |
|
"rewards/cosine_scaled_reward": -0.17772315442562103, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2700.3125610351562, |
|
"epoch": 0.10742857142857143, |
|
"grad_norm": 0.2991270422935486, |
|
"kl": 0.003947257995605469, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.1736, |
|
"num_tokens": 24738075.0, |
|
"reward": 0.23966709151864052, |
|
"reward_std": 0.7574465498328209, |
|
"rewards/cosine_scaled_reward": 0.11983353085815907, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3063.5001220703125, |
|
"epoch": 0.108, |
|
"grad_norm": 0.21379563212394714, |
|
"kl": 0.0010519027709960938, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0489, |
|
"num_tokens": 24892311.0, |
|
"reward": -0.047808293253183365, |
|
"reward_std": 0.9623388051986694, |
|
"rewards/cosine_scaled_reward": -0.023904146626591682, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3313.4583740234375, |
|
"epoch": 0.10857142857142857, |
|
"grad_norm": 0.20801162719726562, |
|
"kl": 0.001056671142578125, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.1006, |
|
"num_tokens": 25058197.0, |
|
"reward": -0.4445067085325718, |
|
"reward_std": 0.3523157760500908, |
|
"rewards/cosine_scaled_reward": -0.2222533505409956, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2248.000045776367, |
|
"epoch": 0.10914285714285714, |
|
"grad_norm": 0.45326536893844604, |
|
"kl": 0.003253936767578125, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0083, |
|
"num_tokens": 25171705.0, |
|
"reward": -0.3526854105293751, |
|
"reward_std": 0.45292405039072037, |
|
"rewards/cosine_scaled_reward": -0.17634270247071981, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2677.645896911621, |
|
"epoch": 0.10971428571428571, |
|
"grad_norm": 0.26036742329597473, |
|
"kl": 0.008434295654296875, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0529, |
|
"num_tokens": 25305920.0, |
|
"reward": 0.11686116084456444, |
|
"reward_std": 0.6998385563492775, |
|
"rewards/cosine_scaled_reward": 0.0584305664524436, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3415.604248046875, |
|
"epoch": 0.11028571428571429, |
|
"grad_norm": 0.17603135108947754, |
|
"kl": 0.0010662078857421875, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0194, |
|
"num_tokens": 25476889.0, |
|
"reward": -0.3952821143902838, |
|
"reward_std": 0.296985674649477, |
|
"rewards/cosine_scaled_reward": -0.1976410405477509, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2293.4167098999023, |
|
"epoch": 0.11085714285714286, |
|
"grad_norm": 0.3645835220813751, |
|
"kl": 0.006656646728515625, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": 0.0665, |
|
"num_tokens": 25592205.0, |
|
"reward": 0.070842613466084, |
|
"reward_std": 0.631361898034811, |
|
"rewards/cosine_scaled_reward": 0.03542129183188081, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2965.6458740234375, |
|
"epoch": 0.11142857142857143, |
|
"grad_norm": 0.24365754425525665, |
|
"kl": 0.0016307830810546875, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.0612, |
|
"num_tokens": 25740496.0, |
|
"reward": -0.3104175217449665, |
|
"reward_std": 0.5912666730582714, |
|
"rewards/cosine_scaled_reward": -0.15520876459777355, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2565.3750228881836, |
|
"epoch": 0.112, |
|
"grad_norm": 0.2600879669189453, |
|
"kl": 0.00702667236328125, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": -0.0485, |
|
"num_tokens": 25869826.0, |
|
"reward": -0.014622047543525696, |
|
"reward_std": 0.34931765496730804, |
|
"rewards/cosine_scaled_reward": -0.007311023771762848, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2068.354202270508, |
|
"epoch": 0.11257142857142857, |
|
"grad_norm": 0.48206502199172974, |
|
"kl": 0.0059070587158203125, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.1763, |
|
"num_tokens": 25974819.0, |
|
"reward": -0.04546727240085602, |
|
"reward_std": 0.39029328897595406, |
|
"rewards/cosine_scaled_reward": -0.022733643651008606, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1956.2708740234375, |
|
"epoch": 0.11314285714285714, |
|
"grad_norm": 0.46816733479499817, |
|
"kl": 0.0056858062744140625, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": 0.0688, |
|
"num_tokens": 26075332.0, |
|
"reward": 0.2702416032552719, |
|
"reward_std": 0.3293491117656231, |
|
"rewards/cosine_scaled_reward": 0.13512080535292625, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3000.666748046875, |
|
"epoch": 0.11371428571428571, |
|
"grad_norm": 0.2764574885368347, |
|
"kl": 0.0014142990112304688, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": -0.0134, |
|
"num_tokens": 26224608.0, |
|
"reward": 0.04078002646565437, |
|
"reward_std": 0.5825254544615746, |
|
"rewards/cosine_scaled_reward": 0.020389998331665993, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2605.0625915527344, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.386100172996521, |
|
"kl": 0.003597259521484375, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0811, |
|
"num_tokens": 26356503.0, |
|
"reward": -0.17602870613336563, |
|
"reward_std": 0.5460017845034599, |
|
"rewards/cosine_scaled_reward": -0.08801434934139252, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2886.8958740234375, |
|
"epoch": 0.11485714285714285, |
|
"grad_norm": 0.25738444924354553, |
|
"kl": 0.0018377304077148438, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": -0.0744, |
|
"num_tokens": 26501878.0, |
|
"reward": -0.44855231791734695, |
|
"reward_std": 0.4905274584889412, |
|
"rewards/cosine_scaled_reward": -0.22427614964544773, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2285.625045776367, |
|
"epoch": 0.11542857142857142, |
|
"grad_norm": 0.32238876819610596, |
|
"kl": 0.0074291229248046875, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.0146, |
|
"num_tokens": 26616928.0, |
|
"reward": 0.08226745203137398, |
|
"reward_std": 0.9037965089082718, |
|
"rewards/cosine_scaled_reward": 0.041133725084364414, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3112.8333740234375, |
|
"epoch": 0.116, |
|
"grad_norm": 0.1928633451461792, |
|
"kl": 0.00113677978515625, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": -0.026, |
|
"num_tokens": 26772284.0, |
|
"reward": 0.10418719984591007, |
|
"reward_std": 0.6046584714204073, |
|
"rewards/cosine_scaled_reward": 0.052093599922955036, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2544.8958740234375, |
|
"epoch": 0.11657142857142858, |
|
"grad_norm": 0.28456971049308777, |
|
"kl": 0.0077342987060546875, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.0073, |
|
"num_tokens": 26900391.0, |
|
"reward": -0.42413394153118134, |
|
"reward_std": 0.3923846688121557, |
|
"rewards/cosine_scaled_reward": -0.21206695958971977, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2439.0208587646484, |
|
"epoch": 0.11714285714285715, |
|
"grad_norm": 0.2515351176261902, |
|
"kl": 0.003063201904296875, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.0386, |
|
"num_tokens": 27022972.0, |
|
"reward": -0.41302137821912766, |
|
"reward_std": 0.5369090847671032, |
|
"rewards/cosine_scaled_reward": -0.20651067793369293, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2899.2500915527344, |
|
"epoch": 0.11771428571428572, |
|
"grad_norm": 0.24134834110736847, |
|
"kl": 0.0021295547485351562, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": 0.0364, |
|
"num_tokens": 27167764.0, |
|
"reward": -0.4214736092835665, |
|
"reward_std": 0.4597649797797203, |
|
"rewards/cosine_scaled_reward": -0.21073680510744452, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2665.9583587646484, |
|
"epoch": 0.11828571428571429, |
|
"grad_norm": 0.27409258484840393, |
|
"kl": 0.005873680114746094, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.0453, |
|
"num_tokens": 27301262.0, |
|
"reward": -0.13723902963101864, |
|
"reward_std": 0.5910945013165474, |
|
"rewards/cosine_scaled_reward": -0.06861951481550932, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2065.4583740234375, |
|
"epoch": 0.11885714285714286, |
|
"grad_norm": 0.3946564793586731, |
|
"kl": 0.018975257873535156, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.0449, |
|
"num_tokens": 27406500.0, |
|
"reward": 0.5526407100260258, |
|
"reward_std": 0.3575515812262893, |
|
"rewards/cosine_scaled_reward": 0.2763203550130129, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2564.06254196167, |
|
"epoch": 0.11942857142857143, |
|
"grad_norm": 0.36452898383140564, |
|
"kl": 0.015535354614257812, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.0593, |
|
"num_tokens": 27536463.0, |
|
"reward": 0.1716044805943966, |
|
"reward_std": 0.2996017700061202, |
|
"rewards/cosine_scaled_reward": 0.08580224774777889, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2995.9376220703125, |
|
"epoch": 0.12, |
|
"grad_norm": 0.22107850015163422, |
|
"kl": 0.001888275146484375, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.0598, |
|
"num_tokens": 27686388.0, |
|
"reward": -0.5479435250163078, |
|
"reward_std": 0.3200632855296135, |
|
"rewards/cosine_scaled_reward": -0.2739717550575733, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2689.4791717529297, |
|
"epoch": 0.12057142857142857, |
|
"grad_norm": 0.2601456940174103, |
|
"kl": 0.005894660949707031, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.0078, |
|
"num_tokens": 27821411.0, |
|
"reward": -0.06815922260284424, |
|
"reward_std": 0.5627651736140251, |
|
"rewards/cosine_scaled_reward": -0.03407961130142212, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3288.8959350585938, |
|
"epoch": 0.12114285714285715, |
|
"grad_norm": 0.25300368666648865, |
|
"kl": 0.0014171600341796875, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": 0.0609, |
|
"num_tokens": 27984318.0, |
|
"reward": 0.17400704324245453, |
|
"reward_std": 0.720283254981041, |
|
"rewards/cosine_scaled_reward": 0.08700351975858212, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2185.479217529297, |
|
"epoch": 0.12171428571428572, |
|
"grad_norm": 0.28523921966552734, |
|
"kl": 0.0068721771240234375, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": 0.004, |
|
"num_tokens": 28094729.0, |
|
"reward": 0.2066820189356804, |
|
"reward_std": 0.7146667540073395, |
|
"rewards/cosine_scaled_reward": 0.10334100387990475, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2242.687545776367, |
|
"epoch": 0.12228571428571429, |
|
"grad_norm": 0.4000585079193115, |
|
"kl": 0.008604049682617188, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.1593, |
|
"num_tokens": 28208882.0, |
|
"reward": -0.39366581034846604, |
|
"reward_std": 0.4671928398311138, |
|
"rewards/cosine_scaled_reward": -0.19683289778186008, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2578.375030517578, |
|
"epoch": 0.12285714285714286, |
|
"grad_norm": 0.26247313618659973, |
|
"kl": 0.005038261413574219, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.1202, |
|
"num_tokens": 28338476.0, |
|
"reward": -0.4542321562767029, |
|
"reward_std": 0.4457564279437065, |
|
"rewards/cosine_scaled_reward": -0.2271160762757063, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 836.9583511352539, |
|
"epoch": 0.12342857142857143, |
|
"grad_norm": 0.6995976567268372, |
|
"kl": 0.0274505615234375, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.3925, |
|
"num_tokens": 28384326.0, |
|
"reward": 1.4526022970676422, |
|
"reward_std": 0.5801738128066063, |
|
"rewards/cosine_scaled_reward": 0.7263011261820793, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3564.6458740234375, |
|
"epoch": 0.124, |
|
"grad_norm": 0.18853364884853363, |
|
"kl": 0.0008840560913085938, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": 0.0079, |
|
"num_tokens": 28562401.0, |
|
"reward": -0.1634646449238062, |
|
"reward_std": 0.4835543856024742, |
|
"rewards/cosine_scaled_reward": -0.0817323224619031, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2188.062545776367, |
|
"epoch": 0.12457142857142857, |
|
"grad_norm": 0.3206729292869568, |
|
"kl": 0.007763862609863281, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.0321, |
|
"num_tokens": 28673212.0, |
|
"reward": -0.21583660691976547, |
|
"reward_std": 0.4847491458058357, |
|
"rewards/cosine_scaled_reward": -0.10791829600930214, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1354.5625762939453, |
|
"epoch": 0.12514285714285714, |
|
"grad_norm": 0.7568894624710083, |
|
"kl": 0.0127105712890625, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.2123, |
|
"num_tokens": 28743739.0, |
|
"reward": 0.5299456119537354, |
|
"reward_std": 0.6226745247840881, |
|
"rewards/cosine_scaled_reward": 0.2649728059768677, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.729217529297, |
|
"epoch": 0.12571428571428572, |
|
"grad_norm": 0.2767828404903412, |
|
"kl": 0.0012798309326171875, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": -0.0269, |
|
"num_tokens": 28863102.0, |
|
"reward": 0.4944605454802513, |
|
"reward_std": 0.5324547663331032, |
|
"rewards/cosine_scaled_reward": 0.24723027274012566, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1487.9791717529297, |
|
"epoch": 0.12628571428571428, |
|
"grad_norm": 0.5411548614501953, |
|
"kl": 0.02813243865966797, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": 0.1265, |
|
"num_tokens": 28939709.0, |
|
"reward": 0.3169099148362875, |
|
"reward_std": 0.6918673776090145, |
|
"rewards/cosine_scaled_reward": 0.15845494996756315, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1893.2708740234375, |
|
"epoch": 0.12685714285714286, |
|
"grad_norm": 0.4862631857395172, |
|
"kl": 0.016809463500976562, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.1021, |
|
"num_tokens": 29035914.0, |
|
"reward": -0.06954247504472733, |
|
"reward_std": 0.5530165806412697, |
|
"rewards/cosine_scaled_reward": -0.03477124497294426, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2748.5625, |
|
"epoch": 0.12742857142857142, |
|
"grad_norm": 0.30448853969573975, |
|
"kl": 0.005116462707519531, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0692, |
|
"num_tokens": 29173857.0, |
|
"reward": -0.2174304649233818, |
|
"reward_std": 0.4413216896355152, |
|
"rewards/cosine_scaled_reward": -0.1087152324616909, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3276.4166870117188, |
|
"epoch": 0.128, |
|
"grad_norm": 0.18361811339855194, |
|
"kl": 0.0010099411010742188, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.0613, |
|
"num_tokens": 29336993.0, |
|
"reward": 0.06714647263288498, |
|
"reward_std": 0.6574617028236389, |
|
"rewards/cosine_scaled_reward": 0.033573225140571594, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2702.625030517578, |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.2830963730812073, |
|
"kl": 0.002536773681640625, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": -0.0118, |
|
"num_tokens": 29473259.0, |
|
"reward": -0.10639284551143646, |
|
"reward_std": 0.5328052043914795, |
|
"rewards/cosine_scaled_reward": -0.05319641903042793, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1861.312515258789, |
|
"epoch": 0.12914285714285714, |
|
"grad_norm": 0.35156944394111633, |
|
"kl": 0.015664100646972656, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0346, |
|
"num_tokens": 29568650.0, |
|
"reward": 0.3703688979148865, |
|
"reward_std": 0.33065274357795715, |
|
"rewards/cosine_scaled_reward": 0.18518445640802383, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1838.8125305175781, |
|
"epoch": 0.12971428571428573, |
|
"grad_norm": 0.3623930513858795, |
|
"kl": 0.016778945922851562, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.1119, |
|
"num_tokens": 29662589.0, |
|
"reward": 0.3070980906486511, |
|
"reward_std": 0.5127528607845306, |
|
"rewards/cosine_scaled_reward": 0.15354903042316437, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1082.6041717529297, |
|
"epoch": 0.13028571428571428, |
|
"grad_norm": 0.5364865064620972, |
|
"kl": 0.018810272216796875, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.0448, |
|
"num_tokens": 29719618.0, |
|
"reward": 0.6576689593493938, |
|
"reward_std": 0.6490252837538719, |
|
"rewards/cosine_scaled_reward": 0.3288344731554389, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2231.6458740234375, |
|
"epoch": 0.13085714285714287, |
|
"grad_norm": 0.30130934715270996, |
|
"kl": 0.009735107421875, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": -0.0144, |
|
"num_tokens": 29833121.0, |
|
"reward": -0.20264260238036513, |
|
"reward_std": 0.4636539947241545, |
|
"rewards/cosine_scaled_reward": -0.10132130864076316, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2730.1041870117188, |
|
"epoch": 0.13142857142857142, |
|
"grad_norm": 0.36225757002830505, |
|
"kl": 0.0050067901611328125, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": -0.1013, |
|
"num_tokens": 29971474.0, |
|
"reward": 0.15790478512644768, |
|
"reward_std": 0.900160625576973, |
|
"rewards/cosine_scaled_reward": 0.07895238231867552, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2743.7083740234375, |
|
"epoch": 0.132, |
|
"grad_norm": 0.21701067686080933, |
|
"kl": 0.0045013427734375, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0773, |
|
"num_tokens": 30108548.0, |
|
"reward": -0.1372587690129876, |
|
"reward_std": 0.5728438459336758, |
|
"rewards/cosine_scaled_reward": -0.06862937705591321, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2708.0833740234375, |
|
"epoch": 0.13257142857142856, |
|
"grad_norm": 0.48722752928733826, |
|
"kl": 0.011949539184570312, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": -0.0551, |
|
"num_tokens": 30244260.0, |
|
"reward": 0.018355626612901688, |
|
"reward_std": 0.46119677275419235, |
|
"rewards/cosine_scaled_reward": 0.009177813306450844, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.2708740234375, |
|
"epoch": 0.13314285714285715, |
|
"grad_norm": 0.1710912138223648, |
|
"kl": 0.0009517669677734375, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.0049, |
|
"num_tokens": 30421249.0, |
|
"reward": -0.30245864391326904, |
|
"reward_std": 0.2900640666484833, |
|
"rewards/cosine_scaled_reward": -0.15122931078076363, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3123.9375610351562, |
|
"epoch": 0.1337142857142857, |
|
"grad_norm": 0.19327852129936218, |
|
"kl": 0.00162506103515625, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0671, |
|
"num_tokens": 30578182.0, |
|
"reward": -0.25677131395787, |
|
"reward_std": 0.6208681277930737, |
|
"rewards/cosine_scaled_reward": -0.128385656978935, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1931.9792175292969, |
|
"epoch": 0.13428571428571429, |
|
"grad_norm": 0.46432214975357056, |
|
"kl": 0.024862289428710938, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0701, |
|
"num_tokens": 30677037.0, |
|
"reward": 0.25881527364254, |
|
"reward_std": 0.3069061152637005, |
|
"rewards/cosine_scaled_reward": 0.12940763682127, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1024.5208740234375, |
|
"epoch": 0.13485714285714287, |
|
"grad_norm": 0.551213264465332, |
|
"kl": 0.0158843994140625, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.2681, |
|
"num_tokens": 30732298.0, |
|
"reward": 0.17941563576459885, |
|
"reward_std": 0.547722615301609, |
|
"rewards/cosine_scaled_reward": 0.08970782160758972, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1741.9791870117188, |
|
"epoch": 0.13542857142857143, |
|
"grad_norm": 0.5294864773750305, |
|
"kl": 0.012974739074707031, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": 0.35, |
|
"num_tokens": 30821337.0, |
|
"reward": -0.2729988917708397, |
|
"reward_std": 0.39208219945430756, |
|
"rewards/cosine_scaled_reward": -0.13649944216012955, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2661.2708740234375, |
|
"epoch": 0.136, |
|
"grad_norm": 0.26540040969848633, |
|
"kl": 0.008977890014648438, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.1287, |
|
"num_tokens": 30955702.0, |
|
"reward": -0.24963749200105667, |
|
"reward_std": 0.5366819277405739, |
|
"rewards/cosine_scaled_reward": -0.12481874227523804, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2663.8958740234375, |
|
"epoch": 0.13657142857142857, |
|
"grad_norm": 0.5480060577392578, |
|
"kl": 0.012514114379882812, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.1785, |
|
"num_tokens": 31089365.0, |
|
"reward": -0.02775234915316105, |
|
"reward_std": 0.7676347196102142, |
|
"rewards/cosine_scaled_reward": -0.013876182027161121, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2176.229248046875, |
|
"epoch": 0.13714285714285715, |
|
"grad_norm": 0.28263622522354126, |
|
"kl": 0.003826141357421875, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0034, |
|
"num_tokens": 31199812.0, |
|
"reward": -0.4636247009038925, |
|
"reward_std": 0.3770550861954689, |
|
"rewards/cosine_scaled_reward": -0.23181235045194626, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1963.645839691162, |
|
"epoch": 0.1377142857142857, |
|
"grad_norm": 0.5095245838165283, |
|
"kl": 0.02035999298095703, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.0311, |
|
"num_tokens": 31299575.0, |
|
"reward": 0.524737037718296, |
|
"reward_std": 0.8398763090372086, |
|
"rewards/cosine_scaled_reward": 0.2623685207217932, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1382857142857143, |
|
"grad_norm": 0.21239568293094635, |
|
"kl": 0.0011444091796875, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.0, |
|
"num_tokens": 31477739.0, |
|
"reward": -0.3269985783845186, |
|
"reward_std": 0.2089610155671835, |
|
"rewards/cosine_scaled_reward": -0.1634992891922593, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2510.937530517578, |
|
"epoch": 0.13885714285714285, |
|
"grad_norm": 0.32022956013679504, |
|
"kl": 0.00478363037109375, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0972, |
|
"num_tokens": 31603616.0, |
|
"reward": -0.44686760660260916, |
|
"reward_std": 0.3707558251917362, |
|
"rewards/cosine_scaled_reward": -0.22343379561789334, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2716.604248046875, |
|
"epoch": 0.13942857142857143, |
|
"grad_norm": 0.3231089115142822, |
|
"kl": 0.0021686553955078125, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": -0.0145, |
|
"num_tokens": 31740217.0, |
|
"reward": -0.5142710842192173, |
|
"reward_std": 0.6101813912391663, |
|
"rewards/cosine_scaled_reward": -0.25713553559035063, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2832.625030517578, |
|
"epoch": 0.14, |
|
"grad_norm": 0.26809558272361755, |
|
"kl": 0.0017642974853515625, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0903, |
|
"num_tokens": 31881463.0, |
|
"reward": -0.1661408469080925, |
|
"reward_std": 0.7787005566060543, |
|
"rewards/cosine_scaled_reward": -0.08307042345404625, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2692.8541717529297, |
|
"epoch": 0.14057142857142857, |
|
"grad_norm": 0.2669066786766052, |
|
"kl": 0.009328842163085938, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0709, |
|
"num_tokens": 32016624.0, |
|
"reward": -0.19150156527757645, |
|
"reward_std": 0.509680725634098, |
|
"rewards/cosine_scaled_reward": -0.09575077798217535, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2888.9375, |
|
"epoch": 0.14114285714285715, |
|
"grad_norm": 0.217579185962677, |
|
"kl": 0.0030298233032226562, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": -0.1061, |
|
"num_tokens": 32162217.0, |
|
"reward": 0.31709786131978035, |
|
"reward_std": 0.7779325805604458, |
|
"rewards/cosine_scaled_reward": 0.15854893065989017, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1323.1041870117188, |
|
"epoch": 0.1417142857142857, |
|
"grad_norm": 0.6014187335968018, |
|
"kl": 0.03565692901611328, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0756, |
|
"num_tokens": 32231006.0, |
|
"reward": 0.6246988326311111, |
|
"reward_std": 0.35837205685675144, |
|
"rewards/cosine_scaled_reward": 0.3123493976891041, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2613.6458740234375, |
|
"epoch": 0.1422857142857143, |
|
"grad_norm": 0.29411664605140686, |
|
"kl": 0.0029964447021484375, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.2094, |
|
"num_tokens": 32362737.0, |
|
"reward": -0.4900950863957405, |
|
"reward_std": 0.5070604905486107, |
|
"rewards/cosine_scaled_reward": -0.24504754319787025, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2731.5208740234375, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.35469648241996765, |
|
"kl": 0.017116546630859375, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": -0.0102, |
|
"num_tokens": 32499598.0, |
|
"reward": -0.21433956921100616, |
|
"reward_std": 0.520557913929224, |
|
"rewards/cosine_scaled_reward": -0.10716977342963219, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1934.125015258789, |
|
"epoch": 0.14342857142857143, |
|
"grad_norm": 0.3160950839519501, |
|
"kl": 0.009817123413085938, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": 0.127, |
|
"num_tokens": 32598400.0, |
|
"reward": -0.07275501638650894, |
|
"reward_std": 0.4598498921841383, |
|
"rewards/cosine_scaled_reward": -0.03637750819325447, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2064.5834045410156, |
|
"epoch": 0.144, |
|
"grad_norm": 0.3414526581764221, |
|
"kl": 0.00769805908203125, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": 0.0573, |
|
"num_tokens": 32703140.0, |
|
"reward": -0.3573624864220619, |
|
"reward_std": 0.4409428536891937, |
|
"rewards/cosine_scaled_reward": -0.17868124693632126, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2815.083335876465, |
|
"epoch": 0.14457142857142857, |
|
"grad_norm": 0.3612518012523651, |
|
"kl": 0.007879257202148438, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": -0.0483, |
|
"num_tokens": 32843784.0, |
|
"reward": -0.22203051671385765, |
|
"reward_std": 0.31091106310486794, |
|
"rewards/cosine_scaled_reward": -0.11101526208221912, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2616.479248046875, |
|
"epoch": 0.14514285714285713, |
|
"grad_norm": 0.3855963945388794, |
|
"kl": 0.011937141418457031, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.1246, |
|
"num_tokens": 32975375.0, |
|
"reward": 0.18154123798012733, |
|
"reward_std": 0.5934804044663906, |
|
"rewards/cosine_scaled_reward": 0.09077062457799911, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2412.333335876465, |
|
"epoch": 0.1457142857142857, |
|
"grad_norm": 0.34006571769714355, |
|
"kl": 0.01013946533203125, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": -0.0014, |
|
"num_tokens": 33097635.0, |
|
"reward": 0.23491641879081726, |
|
"reward_std": 0.5346547961235046, |
|
"rewards/cosine_scaled_reward": 0.11745821312069893, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3411.3541870117188, |
|
"epoch": 0.1462857142857143, |
|
"grad_norm": 0.19116432964801788, |
|
"kl": 0.0012416839599609375, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.0547, |
|
"num_tokens": 33267932.0, |
|
"reward": -0.4405994936823845, |
|
"reward_std": 0.43207642808556557, |
|
"rewards/cosine_scaled_reward": -0.22029974684119225, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2541.4583435058594, |
|
"epoch": 0.14685714285714285, |
|
"grad_norm": 0.3120291233062744, |
|
"kl": 0.0090484619140625, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.0088, |
|
"num_tokens": 33395922.0, |
|
"reward": -0.40020735282450914, |
|
"reward_std": 0.6490463241934776, |
|
"rewards/cosine_scaled_reward": -0.20010367268696427, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1818.3958587646484, |
|
"epoch": 0.14742857142857144, |
|
"grad_norm": 0.6874107122421265, |
|
"kl": 0.04048919677734375, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.1263, |
|
"num_tokens": 33488953.0, |
|
"reward": 0.3148918077349663, |
|
"reward_std": 0.6025057537481189, |
|
"rewards/cosine_scaled_reward": 0.15744590386748314, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2497.2916870117188, |
|
"epoch": 0.148, |
|
"grad_norm": 0.3090062141418457, |
|
"kl": 0.0051898956298828125, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": -0.0475, |
|
"num_tokens": 33614655.0, |
|
"reward": -0.1998734101653099, |
|
"reward_std": 0.5551594458520412, |
|
"rewards/cosine_scaled_reward": -0.09993669763207436, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3108.2291870117188, |
|
"epoch": 0.14857142857142858, |
|
"grad_norm": 0.20045077800750732, |
|
"kl": 0.00176239013671875, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": -0.095, |
|
"num_tokens": 33770126.0, |
|
"reward": -0.24349116533994675, |
|
"reward_std": 0.4448055624961853, |
|
"rewards/cosine_scaled_reward": -0.12174558266997337, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3407.041748046875, |
|
"epoch": 0.14914285714285713, |
|
"grad_norm": 0.20637670159339905, |
|
"kl": 0.0013484954833984375, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0355, |
|
"num_tokens": 33939832.0, |
|
"reward": -0.44314368814229965, |
|
"reward_std": 0.4703112803399563, |
|
"rewards/cosine_scaled_reward": -0.22157184407114983, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2691.125045776367, |
|
"epoch": 0.14971428571428572, |
|
"grad_norm": 0.2881263196468353, |
|
"kl": 0.00722503662109375, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.0162, |
|
"num_tokens": 34074154.0, |
|
"reward": 0.07448863238096237, |
|
"reward_std": 0.6951295547187328, |
|
"rewards/cosine_scaled_reward": 0.03724431432783604, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2771.6041717529297, |
|
"epoch": 0.15028571428571427, |
|
"grad_norm": 0.35452720522880554, |
|
"kl": 0.0109100341796875, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": -0.012, |
|
"num_tokens": 34213515.0, |
|
"reward": -0.10311572067439556, |
|
"reward_std": 0.3219672627747059, |
|
"rewards/cosine_scaled_reward": -0.05155786033719778, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3548.666748046875, |
|
"epoch": 0.15085714285714286, |
|
"grad_norm": 0.20419248938560486, |
|
"kl": 0.0011997222900390625, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0187, |
|
"num_tokens": 34389839.0, |
|
"reward": -0.3663049042224884, |
|
"reward_std": 0.6324282512068748, |
|
"rewards/cosine_scaled_reward": -0.1831524483859539, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3260.1875, |
|
"epoch": 0.15142857142857144, |
|
"grad_norm": 0.23959365487098694, |
|
"kl": 0.0013904571533203125, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": -0.0252, |
|
"num_tokens": 34552136.0, |
|
"reward": -0.2764420807361603, |
|
"reward_std": 0.49818455800414085, |
|
"rewards/cosine_scaled_reward": -0.13822103291749954, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2710.437530517578, |
|
"epoch": 0.152, |
|
"grad_norm": 0.32057079672813416, |
|
"kl": 0.0076694488525390625, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": -0.0256, |
|
"num_tokens": 34688285.0, |
|
"reward": -0.3459478300064802, |
|
"reward_std": 0.25663014128804207, |
|
"rewards/cosine_scaled_reward": -0.17297391314059496, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1547.208396911621, |
|
"epoch": 0.15257142857142858, |
|
"grad_norm": 0.4645328223705292, |
|
"kl": 0.012868881225585938, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.113, |
|
"num_tokens": 34767999.0, |
|
"reward": -0.11789099872112274, |
|
"reward_std": 0.5525611527264118, |
|
"rewards/cosine_scaled_reward": -0.05894550122320652, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2182.375, |
|
"epoch": 0.15314285714285714, |
|
"grad_norm": 0.3756212294101715, |
|
"kl": 0.010714530944824219, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.1363, |
|
"num_tokens": 34878669.0, |
|
"reward": -0.077213354408741, |
|
"reward_std": 0.57977394759655, |
|
"rewards/cosine_scaled_reward": -0.0386066734790802, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3128.6251220703125, |
|
"epoch": 0.15371428571428572, |
|
"grad_norm": 0.1624217927455902, |
|
"kl": 0.0017490386962890625, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": -0.0542, |
|
"num_tokens": 35034243.0, |
|
"reward": -0.375546395778656, |
|
"reward_std": 0.2080207783728838, |
|
"rewards/cosine_scaled_reward": -0.18777319695800543, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2074.270839691162, |
|
"epoch": 0.15428571428571428, |
|
"grad_norm": 0.48845088481903076, |
|
"kl": 0.02855682373046875, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": -0.1012, |
|
"num_tokens": 35139796.0, |
|
"reward": 0.26241855323314667, |
|
"reward_std": 0.4325704537332058, |
|
"rewards/cosine_scaled_reward": 0.13120926916599274, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2039.5416870117188, |
|
"epoch": 0.15485714285714286, |
|
"grad_norm": 0.27519840002059937, |
|
"kl": 0.004604339599609375, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": -0.1227, |
|
"num_tokens": 35244654.0, |
|
"reward": -0.13348775170743465, |
|
"reward_std": 0.5455809384584427, |
|
"rewards/cosine_scaled_reward": -0.06674387538805604, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2399.0625915527344, |
|
"epoch": 0.15542857142857142, |
|
"grad_norm": 0.3173102140426636, |
|
"kl": 0.0048809051513671875, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.1177, |
|
"num_tokens": 35365593.0, |
|
"reward": -0.08320139348506927, |
|
"reward_std": 0.7119211666285992, |
|
"rewards/cosine_scaled_reward": -0.04160069301724434, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3164.75, |
|
"epoch": 0.156, |
|
"grad_norm": 0.22468267381191254, |
|
"kl": 0.001804351806640625, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.0294, |
|
"num_tokens": 35523117.0, |
|
"reward": -0.3206188827753067, |
|
"reward_std": 0.5574362277984619, |
|
"rewards/cosine_scaled_reward": -0.16030943393707275, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2586.1666870117188, |
|
"epoch": 0.15657142857142858, |
|
"grad_norm": 0.3104928731918335, |
|
"kl": 0.0055084228515625, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": 0.0949, |
|
"num_tokens": 35653757.0, |
|
"reward": -0.13973749428987503, |
|
"reward_std": 0.3990135118365288, |
|
"rewards/cosine_scaled_reward": -0.06986876204609871, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2072.541717529297, |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.41220641136169434, |
|
"kl": 0.013563156127929688, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0363, |
|
"num_tokens": 35759299.0, |
|
"reward": -0.4124254733324051, |
|
"reward_std": 0.5271046534180641, |
|
"rewards/cosine_scaled_reward": -0.20621273759752512, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1188.1250457763672, |
|
"epoch": 0.15771428571428572, |
|
"grad_norm": 0.4833068251609802, |
|
"kl": 0.0210723876953125, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": -0.0615, |
|
"num_tokens": 35822281.0, |
|
"reward": 0.5711349472403526, |
|
"reward_std": 0.7510395795106888, |
|
"rewards/cosine_scaled_reward": 0.28556746058166027, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2676.9375915527344, |
|
"epoch": 0.15828571428571428, |
|
"grad_norm": 0.28584277629852295, |
|
"kl": 0.0041713714599609375, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0217, |
|
"num_tokens": 35956750.0, |
|
"reward": 0.05125083029270172, |
|
"reward_std": 0.6981733292341232, |
|
"rewards/cosine_scaled_reward": 0.025625411421060562, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2614.9375, |
|
"epoch": 0.15885714285714286, |
|
"grad_norm": 0.29526305198669434, |
|
"kl": 0.002593994140625, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.0156, |
|
"num_tokens": 36088627.0, |
|
"reward": -0.3907754272222519, |
|
"reward_std": 0.5090431272983551, |
|
"rewards/cosine_scaled_reward": -0.19538771361112595, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2388.791748046875, |
|
"epoch": 0.15942857142857142, |
|
"grad_norm": 0.32363101840019226, |
|
"kl": 0.0035810470581054688, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.1454, |
|
"num_tokens": 36209013.0, |
|
"reward": 0.351345656439662, |
|
"reward_std": 0.7279459312558174, |
|
"rewards/cosine_scaled_reward": 0.1756728133186698, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1682.5, |
|
"epoch": 0.16, |
|
"grad_norm": 0.4550730884075165, |
|
"kl": 0.0157623291015625, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": -0.0517, |
|
"num_tokens": 36295233.0, |
|
"reward": 0.032998040318489075, |
|
"reward_std": 0.4106169492006302, |
|
"rewards/cosine_scaled_reward": 0.016499027609825134, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2025.9166793823242, |
|
"epoch": 0.16057142857142856, |
|
"grad_norm": 0.4892213046550751, |
|
"kl": 0.01910400390625, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.2053, |
|
"num_tokens": 36398009.0, |
|
"reward": 0.2391605954617262, |
|
"reward_std": 0.40038658678531647, |
|
"rewards/cosine_scaled_reward": 0.1195802828297019, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2755.166717529297, |
|
"epoch": 0.16114285714285714, |
|
"grad_norm": 0.3091196119785309, |
|
"kl": 0.00545501708984375, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": -0.0154, |
|
"num_tokens": 36536653.0, |
|
"reward": -0.05713912099599838, |
|
"reward_std": 0.6415699534118176, |
|
"rewards/cosine_scaled_reward": -0.028569556772708893, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2594.9583587646484, |
|
"epoch": 0.16171428571428573, |
|
"grad_norm": 0.3001720607280731, |
|
"kl": 0.009159088134765625, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.1765, |
|
"num_tokens": 36666971.0, |
|
"reward": -0.5184054747223854, |
|
"reward_std": 0.3386564552783966, |
|
"rewards/cosine_scaled_reward": -0.2592027336359024, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2470.2291717529297, |
|
"epoch": 0.16228571428571428, |
|
"grad_norm": 0.43797507882118225, |
|
"kl": 0.0067348480224609375, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": 0.0402, |
|
"num_tokens": 36791950.0, |
|
"reward": 0.5365435220301151, |
|
"reward_std": 0.8577289432287216, |
|
"rewards/cosine_scaled_reward": 0.26827176474034786, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3468.2709350585938, |
|
"epoch": 0.16285714285714287, |
|
"grad_norm": 0.20558501780033112, |
|
"kl": 0.0013113021850585938, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.059, |
|
"num_tokens": 36964079.0, |
|
"reward": -0.32812320441007614, |
|
"reward_std": 0.5332767590880394, |
|
"rewards/cosine_scaled_reward": -0.16406160034239292, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2288.3125610351562, |
|
"epoch": 0.16342857142857142, |
|
"grad_norm": 0.24818028509616852, |
|
"kl": 0.002147674560546875, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": -0.0275, |
|
"num_tokens": 37080458.0, |
|
"reward": 0.6154246423393488, |
|
"reward_std": 0.8250385522842407, |
|
"rewards/cosine_scaled_reward": 0.30771232303231955, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2368.520896911621, |
|
"epoch": 0.164, |
|
"grad_norm": 0.2976325750350952, |
|
"kl": 0.007648468017578125, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0946, |
|
"num_tokens": 37200447.0, |
|
"reward": 0.4257568195462227, |
|
"reward_std": 0.9012775421142578, |
|
"rewards/cosine_scaled_reward": 0.21287840977311134, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2584.395896911621, |
|
"epoch": 0.16457142857142856, |
|
"grad_norm": 0.26449525356292725, |
|
"kl": 0.0072422027587890625, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.0002, |
|
"num_tokens": 37330510.0, |
|
"reward": -0.43290044367313385, |
|
"reward_std": 0.4573391415178776, |
|
"rewards/cosine_scaled_reward": -0.21645020693540573, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2852.7083740234375, |
|
"epoch": 0.16514285714285715, |
|
"grad_norm": 0.23969526588916779, |
|
"kl": 0.0036468505859375, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": 0.1021, |
|
"num_tokens": 37473044.0, |
|
"reward": -0.14536024630069733, |
|
"reward_std": 0.7508940920233727, |
|
"rewards/cosine_scaled_reward": -0.07268010824918747, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2355.1250915527344, |
|
"epoch": 0.1657142857142857, |
|
"grad_norm": 0.2979152202606201, |
|
"kl": 0.0029964447021484375, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.1276, |
|
"num_tokens": 37591598.0, |
|
"reward": -0.27352139353752136, |
|
"reward_std": 0.5863078981637955, |
|
"rewards/cosine_scaled_reward": -0.13676068745553493, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3103.0833740234375, |
|
"epoch": 0.1662857142857143, |
|
"grad_norm": 0.24183134734630585, |
|
"kl": 0.0017833709716796875, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.0017, |
|
"num_tokens": 37746126.0, |
|
"reward": -0.4938964769244194, |
|
"reward_std": 0.20535914227366447, |
|
"rewards/cosine_scaled_reward": -0.24694822914898396, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2742.666748046875, |
|
"epoch": 0.16685714285714287, |
|
"grad_norm": 0.3063388764858246, |
|
"kl": 0.00244903564453125, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": 0.0973, |
|
"num_tokens": 37883798.0, |
|
"reward": 0.00018896162509918213, |
|
"reward_std": 0.8789166212081909, |
|
"rewards/cosine_scaled_reward": 9.44770872592926e-05, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1510.7708587646484, |
|
"epoch": 0.16742857142857143, |
|
"grad_norm": 0.48416784405708313, |
|
"kl": 0.020442962646484375, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": -0.0533, |
|
"num_tokens": 37961991.0, |
|
"reward": 0.5087592005729675, |
|
"reward_std": 0.6853836588561535, |
|
"rewards/cosine_scaled_reward": 0.2543795704841614, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3275.375, |
|
"epoch": 0.168, |
|
"grad_norm": 0.2511613368988037, |
|
"kl": 0.0014057159423828125, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": 0.0496, |
|
"num_tokens": 38125461.0, |
|
"reward": -0.48528067022562027, |
|
"reward_std": 0.555925726890564, |
|
"rewards/cosine_scaled_reward": -0.24264032766222954, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3450.3334350585938, |
|
"epoch": 0.16857142857142857, |
|
"grad_norm": 0.2048538625240326, |
|
"kl": 0.001056671142578125, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0367, |
|
"num_tokens": 38297365.0, |
|
"reward": -0.2943028609151952, |
|
"reward_std": 0.4288143813610077, |
|
"rewards/cosine_scaled_reward": -0.1471514304575976, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3004.9166870117188, |
|
"epoch": 0.16914285714285715, |
|
"grad_norm": 0.2455970197916031, |
|
"kl": 0.001949310302734375, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0845, |
|
"num_tokens": 38447913.0, |
|
"reward": -0.4560352563858032, |
|
"reward_std": 0.5781249962747097, |
|
"rewards/cosine_scaled_reward": -0.22801762074232101, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2529.437545776367, |
|
"epoch": 0.1697142857142857, |
|
"grad_norm": 0.29431456327438354, |
|
"kl": 0.008335113525390625, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0079, |
|
"num_tokens": 38575026.0, |
|
"reward": 0.16207153722643852, |
|
"reward_std": 0.46869122236967087, |
|
"rewards/cosine_scaled_reward": 0.08103577420115471, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1855.666732788086, |
|
"epoch": 0.1702857142857143, |
|
"grad_norm": 0.5596351623535156, |
|
"kl": 0.015949249267578125, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": -0.1605, |
|
"num_tokens": 38670062.0, |
|
"reward": -0.20284654013812542, |
|
"reward_std": 0.5799939371645451, |
|
"rewards/cosine_scaled_reward": -0.10142326634377241, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2124.8333740234375, |
|
"epoch": 0.17085714285714285, |
|
"grad_norm": 0.49901139736175537, |
|
"kl": 0.0066089630126953125, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.2655, |
|
"num_tokens": 38778390.0, |
|
"reward": -0.23837678879499435, |
|
"reward_std": 0.5732098147273064, |
|
"rewards/cosine_scaled_reward": -0.11918839067220688, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2721.2709045410156, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.27538150548934937, |
|
"kl": 0.001575469970703125, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0042, |
|
"num_tokens": 38915167.0, |
|
"reward": -0.44970532320439816, |
|
"reward_std": 0.338998232036829, |
|
"rewards/cosine_scaled_reward": -0.22485265973955393, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2737.937545776367, |
|
"epoch": 0.172, |
|
"grad_norm": 0.36887478828430176, |
|
"kl": 0.017746925354003906, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.0646, |
|
"num_tokens": 39053920.0, |
|
"reward": -0.25950850173830986, |
|
"reward_std": 0.16469359770417213, |
|
"rewards/cosine_scaled_reward": -0.12975424528121948, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2430.8958740234375, |
|
"epoch": 0.17257142857142857, |
|
"grad_norm": 0.29716700315475464, |
|
"kl": 0.012701034545898438, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0492, |
|
"num_tokens": 39176351.0, |
|
"reward": 0.08118153735995293, |
|
"reward_std": 0.26172393187880516, |
|
"rewards/cosine_scaled_reward": 0.04059076961129904, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2375.4166870117188, |
|
"epoch": 0.17314285714285715, |
|
"grad_norm": 0.29471853375434875, |
|
"kl": 0.00960540771484375, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": 0.0308, |
|
"num_tokens": 39296539.0, |
|
"reward": -0.1111888438463211, |
|
"reward_std": 0.5966824367642403, |
|
"rewards/cosine_scaled_reward": -0.05559442937374115, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3561.3958740234375, |
|
"epoch": 0.1737142857142857, |
|
"grad_norm": 0.20573833584785461, |
|
"kl": 0.0010528564453125, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0075, |
|
"num_tokens": 39473918.0, |
|
"reward": -0.32946310192346573, |
|
"reward_std": 0.5383296981453896, |
|
"rewards/cosine_scaled_reward": -0.16473154537379742, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2468.7084045410156, |
|
"epoch": 0.1742857142857143, |
|
"grad_norm": 0.2480940818786621, |
|
"kl": 0.0030117034912109375, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.0284, |
|
"num_tokens": 39598176.0, |
|
"reward": 0.03428598493337631, |
|
"reward_std": 0.5898709297180176, |
|
"rewards/cosine_scaled_reward": 0.01714298501610756, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1914.5209045410156, |
|
"epoch": 0.17485714285714285, |
|
"grad_norm": 0.4037153124809265, |
|
"kl": 0.009237289428710938, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.1171, |
|
"num_tokens": 39695629.0, |
|
"reward": 0.6262711547315121, |
|
"reward_std": 0.7676540613174438, |
|
"rewards/cosine_scaled_reward": 0.31313556246459484, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3471.2084350585938, |
|
"epoch": 0.17542857142857143, |
|
"grad_norm": 0.18108604848384857, |
|
"kl": 0.0008935928344726562, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.0372, |
|
"num_tokens": 39868547.0, |
|
"reward": -0.5682974830269814, |
|
"reward_std": 0.362131267786026, |
|
"rewards/cosine_scaled_reward": -0.28414873220026493, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1946.3333435058594, |
|
"epoch": 0.176, |
|
"grad_norm": 0.427402138710022, |
|
"kl": 0.012170791625976562, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.0688, |
|
"num_tokens": 39967107.0, |
|
"reward": 0.6227611526846886, |
|
"reward_std": 0.776022270321846, |
|
"rewards/cosine_scaled_reward": 0.3113805763423443, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3468.5833740234375, |
|
"epoch": 0.17657142857142857, |
|
"grad_norm": 0.21143701672554016, |
|
"kl": 0.0014905929565429688, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0234, |
|
"num_tokens": 40140067.0, |
|
"reward": -0.3016166687011719, |
|
"reward_std": 0.5722797363996506, |
|
"rewards/cosine_scaled_reward": -0.15080832783132792, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2890.5834350585938, |
|
"epoch": 0.17714285714285713, |
|
"grad_norm": 0.22804640233516693, |
|
"kl": 0.0019989013671875, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": -0.0995, |
|
"num_tokens": 40285223.0, |
|
"reward": 0.00922529399394989, |
|
"reward_std": 0.8645202741026878, |
|
"rewards/cosine_scaled_reward": 0.004612648859620094, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2540.4792098999023, |
|
"epoch": 0.1777142857142857, |
|
"grad_norm": 0.4666746258735657, |
|
"kl": 0.009735107421875, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.0154, |
|
"num_tokens": 40413298.0, |
|
"reward": -0.29096972569823265, |
|
"reward_std": 0.3306393250823021, |
|
"rewards/cosine_scaled_reward": -0.14548486191779375, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2994.437530517578, |
|
"epoch": 0.1782857142857143, |
|
"grad_norm": 0.35905346274375916, |
|
"kl": 0.0028743743896484375, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": -0.0335, |
|
"num_tokens": 40562971.0, |
|
"reward": -0.363456416875124, |
|
"reward_std": 0.3647861182689667, |
|
"rewards/cosine_scaled_reward": -0.18172819539904594, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2380.0833587646484, |
|
"epoch": 0.17885714285714285, |
|
"grad_norm": 0.4951138198375702, |
|
"kl": 0.0026788711547851562, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": -0.0585, |
|
"num_tokens": 40683791.0, |
|
"reward": -0.10693264147266746, |
|
"reward_std": 0.5804431773722172, |
|
"rewards/cosine_scaled_reward": -0.05346631887368858, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2596.604248046875, |
|
"epoch": 0.17942857142857144, |
|
"grad_norm": 0.3246864974498749, |
|
"kl": 0.0043659210205078125, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": 0.2706, |
|
"num_tokens": 40814932.0, |
|
"reward": 0.4242757335305214, |
|
"reward_std": 1.1075529158115387, |
|
"rewards/cosine_scaled_reward": 0.21213785372674465, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2499.791778564453, |
|
"epoch": 0.18, |
|
"grad_norm": 0.27629679441452026, |
|
"kl": 0.0028667449951171875, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0475, |
|
"num_tokens": 40940766.0, |
|
"reward": -0.3353143408894539, |
|
"reward_std": 0.637674517929554, |
|
"rewards/cosine_scaled_reward": -0.16765715926885605, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3229.0416870117188, |
|
"epoch": 0.18057142857142858, |
|
"grad_norm": 0.23208899796009064, |
|
"kl": 0.001750946044921875, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": -0.0575, |
|
"num_tokens": 41101688.0, |
|
"reward": -0.3000589460134506, |
|
"reward_std": 0.4578623231500387, |
|
"rewards/cosine_scaled_reward": -0.15002946369349957, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2577.125030517578, |
|
"epoch": 0.18114285714285713, |
|
"grad_norm": 0.232590913772583, |
|
"kl": 0.0039806365966796875, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": -0.0006, |
|
"num_tokens": 41231234.0, |
|
"reward": 0.20818227902054787, |
|
"reward_std": 0.40563616156578064, |
|
"rewards/cosine_scaled_reward": 0.10409114044159651, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.3750610351562, |
|
"epoch": 0.18171428571428572, |
|
"grad_norm": 0.26052311062812805, |
|
"kl": 0.003261566162109375, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.0403, |
|
"num_tokens": 41349608.0, |
|
"reward": 0.13113708421587944, |
|
"reward_std": 0.5818060413002968, |
|
"rewards/cosine_scaled_reward": 0.06556854210793972, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2511.3541870117188, |
|
"epoch": 0.18228571428571427, |
|
"grad_norm": 0.31813371181488037, |
|
"kl": 0.0041961669921875, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": 0.0482, |
|
"num_tokens": 41476213.0, |
|
"reward": 0.3437543660402298, |
|
"reward_std": 0.48744695633649826, |
|
"rewards/cosine_scaled_reward": 0.1718771643936634, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2694.854232788086, |
|
"epoch": 0.18285714285714286, |
|
"grad_norm": 0.23167455196380615, |
|
"kl": 0.00571441650390625, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0731, |
|
"num_tokens": 41612898.0, |
|
"reward": 0.04213899374008179, |
|
"reward_std": 0.5815633870661259, |
|
"rewards/cosine_scaled_reward": 0.021069496870040894, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.18342857142857144, |
|
"grad_norm": 0.19942978024482727, |
|
"kl": 0.00109100341796875, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0, |
|
"num_tokens": 41791110.0, |
|
"reward": -0.4013543911278248, |
|
"reward_std": 0.21282335743308067, |
|
"rewards/cosine_scaled_reward": -0.2006771918386221, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2271.437545776367, |
|
"epoch": 0.184, |
|
"grad_norm": 0.33824867010116577, |
|
"kl": 0.013680458068847656, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0733, |
|
"num_tokens": 41905971.0, |
|
"reward": -0.16819965280592442, |
|
"reward_std": 0.7291267365217209, |
|
"rewards/cosine_scaled_reward": -0.08409981848672032, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2732.0416717529297, |
|
"epoch": 0.18457142857142858, |
|
"grad_norm": 0.3594345450401306, |
|
"kl": 0.011373519897460938, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.0273, |
|
"num_tokens": 42042989.0, |
|
"reward": -0.20197953283786774, |
|
"reward_std": 0.6335406377911568, |
|
"rewards/cosine_scaled_reward": -0.10098976641893387, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1872.7292022705078, |
|
"epoch": 0.18514285714285714, |
|
"grad_norm": 0.28887686133384705, |
|
"kl": 0.0067596435546875, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": -0.096, |
|
"num_tokens": 42138424.0, |
|
"reward": -0.19923657178878784, |
|
"reward_std": 0.6053799614310265, |
|
"rewards/cosine_scaled_reward": -0.09961828961968422, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2770.062545776367, |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.3336716890335083, |
|
"kl": 0.003292083740234375, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0489, |
|
"num_tokens": 42277711.0, |
|
"reward": -0.5184944495558739, |
|
"reward_std": 0.2977650426328182, |
|
"rewards/cosine_scaled_reward": -0.25924722105264664, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1340.2916717529297, |
|
"epoch": 0.18628571428571428, |
|
"grad_norm": 0.6301169991493225, |
|
"kl": 0.020992279052734375, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.0975, |
|
"num_tokens": 42347565.0, |
|
"reward": -0.12958931922912598, |
|
"reward_std": 0.37651485204696655, |
|
"rewards/cosine_scaled_reward": -0.06479465775191784, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2393.8333587646484, |
|
"epoch": 0.18685714285714286, |
|
"grad_norm": 0.34227386116981506, |
|
"kl": 0.010608673095703125, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.2647, |
|
"num_tokens": 42467905.0, |
|
"reward": -0.35234155505895615, |
|
"reward_std": 0.40670277923345566, |
|
"rewards/cosine_scaled_reward": -0.17617077007889748, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2259.1250228881836, |
|
"epoch": 0.18742857142857142, |
|
"grad_norm": 0.4104394018650055, |
|
"kl": 0.013408660888671875, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0312, |
|
"num_tokens": 42581863.0, |
|
"reward": -0.2085840255022049, |
|
"reward_std": 0.539259634912014, |
|
"rewards/cosine_scaled_reward": -0.10429202765226364, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3043.4375610351562, |
|
"epoch": 0.188, |
|
"grad_norm": 0.2447911649942398, |
|
"kl": 0.0016994476318359375, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.0559, |
|
"num_tokens": 42734560.0, |
|
"reward": 0.12149301916360855, |
|
"reward_std": 1.0036925375461578, |
|
"rewards/cosine_scaled_reward": 0.0607465049251914, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1997.8541717529297, |
|
"epoch": 0.18857142857142858, |
|
"grad_norm": 0.4679599106311798, |
|
"kl": 0.019840240478515625, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0583, |
|
"num_tokens": 42836877.0, |
|
"reward": 0.17294897139072418, |
|
"reward_std": 0.8219190053641796, |
|
"rewards/cosine_scaled_reward": 0.0864744782447815, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1952.9167289733887, |
|
"epoch": 0.18914285714285714, |
|
"grad_norm": 0.45652568340301514, |
|
"kl": 0.020021438598632812, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.0989, |
|
"num_tokens": 42936137.0, |
|
"reward": 0.13750150427222252, |
|
"reward_std": 0.17961042560636997, |
|
"rewards/cosine_scaled_reward": 0.06875075213611126, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2125.4166717529297, |
|
"epoch": 0.18971428571428572, |
|
"grad_norm": 0.3350309431552887, |
|
"kl": 0.014866828918457031, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": -0.0255, |
|
"num_tokens": 43043629.0, |
|
"reward": -0.17669325042515993, |
|
"reward_std": 0.4804052673280239, |
|
"rewards/cosine_scaled_reward": -0.08834662148728967, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2099.1250228881836, |
|
"epoch": 0.19028571428571428, |
|
"grad_norm": 0.5369983315467834, |
|
"kl": 0.010837554931640625, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.2866, |
|
"num_tokens": 43150087.0, |
|
"reward": -0.18136774376034737, |
|
"reward_std": 0.4798622354865074, |
|
"rewards/cosine_scaled_reward": -0.09068387281149626, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3412.8125610351562, |
|
"epoch": 0.19085714285714286, |
|
"grad_norm": 0.1947650909423828, |
|
"kl": 0.0010356903076171875, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": -0.0183, |
|
"num_tokens": 43320226.0, |
|
"reward": -0.28891574777662754, |
|
"reward_std": 0.44802433252334595, |
|
"rewards/cosine_scaled_reward": -0.14445787388831377, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1843.6666870117188, |
|
"epoch": 0.19142857142857142, |
|
"grad_norm": 0.4157560467720032, |
|
"kl": 0.0060596466064453125, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.1137, |
|
"num_tokens": 43414110.0, |
|
"reward": -0.3695136718451977, |
|
"reward_std": 0.4990896135568619, |
|
"rewards/cosine_scaled_reward": -0.18475682754069567, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2307.500045776367, |
|
"epoch": 0.192, |
|
"grad_norm": 0.29402440786361694, |
|
"kl": 0.0069637298583984375, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0892, |
|
"num_tokens": 43530186.0, |
|
"reward": -0.007013067603111267, |
|
"reward_std": 0.6947371922433376, |
|
"rewards/cosine_scaled_reward": -0.0035065338015556335, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2586.0208740234375, |
|
"epoch": 0.19257142857142856, |
|
"grad_norm": 0.48218098282814026, |
|
"kl": 0.01905059814453125, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0188, |
|
"num_tokens": 43660219.0, |
|
"reward": 0.34220655262470245, |
|
"reward_std": 0.47233958914875984, |
|
"rewards/cosine_scaled_reward": 0.17110328003764153, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2399.0209350585938, |
|
"epoch": 0.19314285714285714, |
|
"grad_norm": 0.3615526258945465, |
|
"kl": 0.007982254028320312, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.1552, |
|
"num_tokens": 43782692.0, |
|
"reward": 0.5692258452763781, |
|
"reward_std": 1.0823566317558289, |
|
"rewards/cosine_scaled_reward": 0.28461292263818905, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3281.166748046875, |
|
"epoch": 0.19371428571428573, |
|
"grad_norm": 0.20542646944522858, |
|
"kl": 0.0014972686767578125, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.0219, |
|
"num_tokens": 43947064.0, |
|
"reward": -0.37549396604299545, |
|
"reward_std": 0.42432290129363537, |
|
"rewards/cosine_scaled_reward": -0.18774697184562683, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1948.2083358764648, |
|
"epoch": 0.19428571428571428, |
|
"grad_norm": 0.5580980777740479, |
|
"kl": 0.015346527099609375, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.1066, |
|
"num_tokens": 44046482.0, |
|
"reward": -0.09587634727358818, |
|
"reward_std": 0.37748729810118675, |
|
"rewards/cosine_scaled_reward": -0.04793817549943924, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1392.2917022705078, |
|
"epoch": 0.19485714285714287, |
|
"grad_norm": 0.4597311019897461, |
|
"kl": 0.021306991577148438, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.2272, |
|
"num_tokens": 44118388.0, |
|
"reward": 0.12345625646412373, |
|
"reward_std": 0.5400365628302097, |
|
"rewards/cosine_scaled_reward": 0.06172813195735216, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2098.729232788086, |
|
"epoch": 0.19542857142857142, |
|
"grad_norm": 0.5311233401298523, |
|
"kl": 0.016597747802734375, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.1392, |
|
"num_tokens": 44224671.0, |
|
"reward": 0.09169729612767696, |
|
"reward_std": 0.7804772108793259, |
|
"rewards/cosine_scaled_reward": 0.04584864107891917, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1821.8333587646484, |
|
"epoch": 0.196, |
|
"grad_norm": 0.37737464904785156, |
|
"kl": 0.00925445556640625, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": 0.1748, |
|
"num_tokens": 44317867.0, |
|
"reward": -0.17036119103431702, |
|
"reward_std": 0.8656125217676163, |
|
"rewards/cosine_scaled_reward": -0.08518059551715851, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2364.9167098999023, |
|
"epoch": 0.19657142857142856, |
|
"grad_norm": 0.420017808675766, |
|
"kl": 0.022002220153808594, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": 0.0984, |
|
"num_tokens": 44436783.0, |
|
"reward": -0.07778553664684296, |
|
"reward_std": 0.6604952029883862, |
|
"rewards/cosine_scaled_reward": -0.03889276832342148, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1328.0, |
|
"epoch": 0.19714285714285715, |
|
"grad_norm": 0.6512780785560608, |
|
"kl": 0.020231246948242188, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0694, |
|
"num_tokens": 44506143.0, |
|
"reward": 0.46496348083019257, |
|
"reward_std": 0.42599966563284397, |
|
"rewards/cosine_scaled_reward": 0.23248173296451569, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2967.9791870117188, |
|
"epoch": 0.1977142857142857, |
|
"grad_norm": 0.27706509828567505, |
|
"kl": 0.00400543212890625, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": 0.2109, |
|
"num_tokens": 44655182.0, |
|
"reward": -0.2751462832093239, |
|
"reward_std": 0.48510361462831497, |
|
"rewards/cosine_scaled_reward": -0.13757313415408134, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2848.125, |
|
"epoch": 0.1982857142857143, |
|
"grad_norm": 0.22808820009231567, |
|
"kl": 0.00789642333984375, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": -0.0171, |
|
"num_tokens": 44798228.0, |
|
"reward": -0.2193898782134056, |
|
"reward_std": 0.39861927926540375, |
|
"rewards/cosine_scaled_reward": -0.10969493724405766, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2773.1666870117188, |
|
"epoch": 0.19885714285714284, |
|
"grad_norm": 0.2631858289241791, |
|
"kl": 0.0020580291748046875, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": -0.0332, |
|
"num_tokens": 44937184.0, |
|
"reward": -0.23417676240205765, |
|
"reward_std": 0.4104016348719597, |
|
"rewards/cosine_scaled_reward": -0.11708838120102882, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3129.2084350585938, |
|
"epoch": 0.19942857142857143, |
|
"grad_norm": 0.2336844503879547, |
|
"kl": 0.001857757568359375, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.0764, |
|
"num_tokens": 45093458.0, |
|
"reward": -0.11799243092536926, |
|
"reward_std": 0.512191891670227, |
|
"rewards/cosine_scaled_reward": -0.05899622291326523, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1597.0625457763672, |
|
"epoch": 0.2, |
|
"grad_norm": 0.3072298467159271, |
|
"kl": 0.0224151611328125, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.1081, |
|
"num_tokens": 45175661.0, |
|
"reward": 0.5722237005829811, |
|
"reward_std": 0.33796822652220726, |
|
"rewards/cosine_scaled_reward": 0.2861118447035551, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2634.166717529297, |
|
"epoch": 0.20057142857142857, |
|
"grad_norm": 0.2797272205352783, |
|
"kl": 0.003326416015625, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.2635, |
|
"num_tokens": 45307765.0, |
|
"reward": -0.06509780511260033, |
|
"reward_std": 0.7609989158809185, |
|
"rewards/cosine_scaled_reward": -0.03254890255630016, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1977.9375, |
|
"epoch": 0.20114285714285715, |
|
"grad_norm": 0.4983066916465759, |
|
"kl": 0.020462989807128906, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": 0.0084, |
|
"num_tokens": 45408382.0, |
|
"reward": 0.057065196335315704, |
|
"reward_std": 0.47916316613554955, |
|
"rewards/cosine_scaled_reward": 0.028532586991786957, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2337.979217529297, |
|
"epoch": 0.2017142857142857, |
|
"grad_norm": 0.30659788846969604, |
|
"kl": 0.005828857421875, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.1821, |
|
"num_tokens": 45527085.0, |
|
"reward": -0.3150169067084789, |
|
"reward_std": 0.7280392572283745, |
|
"rewards/cosine_scaled_reward": -0.15750844962894917, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2067.145835876465, |
|
"epoch": 0.2022857142857143, |
|
"grad_norm": 0.3004125952720642, |
|
"kl": 0.017545700073242188, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": 0.0771, |
|
"num_tokens": 45631768.0, |
|
"reward": -0.013154599815607071, |
|
"reward_std": 0.4110375605523586, |
|
"rewards/cosine_scaled_reward": -0.006577307358384132, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2189.7917098999023, |
|
"epoch": 0.20285714285714285, |
|
"grad_norm": 0.4463716447353363, |
|
"kl": 0.0159759521484375, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0418, |
|
"num_tokens": 45742506.0, |
|
"reward": 0.08110996335744858, |
|
"reward_std": 0.33503045327961445, |
|
"rewards/cosine_scaled_reward": 0.040554989129304886, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2110.8542098999023, |
|
"epoch": 0.20342857142857143, |
|
"grad_norm": 0.37933966517448425, |
|
"kl": 0.02321624755859375, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": -0.0443, |
|
"num_tokens": 45849299.0, |
|
"reward": 0.5204504579305649, |
|
"reward_std": 0.5081758014857769, |
|
"rewards/cosine_scaled_reward": 0.26022522151470184, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3009.8958740234375, |
|
"epoch": 0.204, |
|
"grad_norm": 0.22186511754989624, |
|
"kl": 0.0023040771484375, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": 0.0384, |
|
"num_tokens": 46000422.0, |
|
"reward": -0.5638711154460907, |
|
"reward_std": 0.42032088339328766, |
|
"rewards/cosine_scaled_reward": -0.28193555772304535, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2124.437530517578, |
|
"epoch": 0.20457142857142857, |
|
"grad_norm": 0.48417916893959045, |
|
"kl": 0.012571334838867188, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.2297, |
|
"num_tokens": 46108431.0, |
|
"reward": 0.3528481721878052, |
|
"reward_std": 0.594680666923523, |
|
"rewards/cosine_scaled_reward": 0.17642410099506378, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2094.8542098999023, |
|
"epoch": 0.20514285714285715, |
|
"grad_norm": 0.4315508306026459, |
|
"kl": 0.013898849487304688, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.053, |
|
"num_tokens": 46214660.0, |
|
"reward": -0.27906909096054733, |
|
"reward_std": 0.774463415145874, |
|
"rewards/cosine_scaled_reward": -0.1395345416967757, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2567.1250610351562, |
|
"epoch": 0.2057142857142857, |
|
"grad_norm": 0.285454660654068, |
|
"kl": 0.009855270385742188, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0508, |
|
"num_tokens": 46344038.0, |
|
"reward": -0.10896847397089005, |
|
"reward_std": 0.5441469997167587, |
|
"rewards/cosine_scaled_reward": -0.05448423605412245, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2514.5, |
|
"epoch": 0.2062857142857143, |
|
"grad_norm": 0.26314839720726013, |
|
"kl": 0.0074367523193359375, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": 0.043, |
|
"num_tokens": 46471574.0, |
|
"reward": 0.004594132304191589, |
|
"reward_std": 0.6141269728541374, |
|
"rewards/cosine_scaled_reward": 0.002297069877386093, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2410.2708740234375, |
|
"epoch": 0.20685714285714285, |
|
"grad_norm": 0.2571261525154114, |
|
"kl": 0.0035762786865234375, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": 0.0212, |
|
"num_tokens": 46592595.0, |
|
"reward": 0.23570144176483154, |
|
"reward_std": 0.9038522392511368, |
|
"rewards/cosine_scaled_reward": 0.11785072460770607, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2346.1250610351562, |
|
"epoch": 0.20742857142857143, |
|
"grad_norm": 0.4718479812145233, |
|
"kl": 0.0143280029296875, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0335, |
|
"num_tokens": 46710693.0, |
|
"reward": -0.08351481426507235, |
|
"reward_std": 0.6729232966899872, |
|
"rewards/cosine_scaled_reward": -0.041757403407245874, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3191.6458740234375, |
|
"epoch": 0.208, |
|
"grad_norm": 0.25456610321998596, |
|
"kl": 0.0029468536376953125, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": -0.0697, |
|
"num_tokens": 46869184.0, |
|
"reward": -0.24707527458667755, |
|
"reward_std": 0.4364708364009857, |
|
"rewards/cosine_scaled_reward": -0.12353762984275818, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2315.5209045410156, |
|
"epoch": 0.20857142857142857, |
|
"grad_norm": 0.33200034499168396, |
|
"kl": 0.00350189208984375, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": -0.0, |
|
"num_tokens": 46985621.0, |
|
"reward": -0.40703647769987583, |
|
"reward_std": 0.5758531466126442, |
|
"rewards/cosine_scaled_reward": -0.20351823465898633, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2923.937530517578, |
|
"epoch": 0.20914285714285713, |
|
"grad_norm": 0.23750042915344238, |
|
"kl": 0.003543853759765625, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": -0.013, |
|
"num_tokens": 47132150.0, |
|
"reward": -0.4153160899877548, |
|
"reward_std": 0.33962608128786087, |
|
"rewards/cosine_scaled_reward": -0.2076580412685871, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2619.208396911621, |
|
"epoch": 0.20971428571428571, |
|
"grad_norm": 0.4440971612930298, |
|
"kl": 0.019357681274414062, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.0553, |
|
"num_tokens": 47263032.0, |
|
"reward": 0.14148210734128952, |
|
"reward_std": 0.9161304086446762, |
|
"rewards/cosine_scaled_reward": 0.0707410629838705, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2354.354248046875, |
|
"epoch": 0.2102857142857143, |
|
"grad_norm": 0.31312811374664307, |
|
"kl": 0.00652313232421875, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0197, |
|
"num_tokens": 47382209.0, |
|
"reward": 0.4818605841137469, |
|
"reward_std": 1.0587524473667145, |
|
"rewards/cosine_scaled_reward": 0.24093026970513165, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2897.875, |
|
"epoch": 0.21085714285714285, |
|
"grad_norm": 0.22899067401885986, |
|
"kl": 0.0037288665771484375, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": -0.0322, |
|
"num_tokens": 47527295.0, |
|
"reward": -0.12967145442962646, |
|
"reward_std": 0.5554239340126514, |
|
"rewards/cosine_scaled_reward": -0.06483572721481323, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2788.3958740234375, |
|
"epoch": 0.21142857142857144, |
|
"grad_norm": 0.2895820140838623, |
|
"kl": 0.003108978271484375, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0391, |
|
"num_tokens": 47666670.0, |
|
"reward": -0.20442558825016022, |
|
"reward_std": 0.4632416293025017, |
|
"rewards/cosine_scaled_reward": -0.10221279412508011, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2021.9166793823242, |
|
"epoch": 0.212, |
|
"grad_norm": 0.46538272500038147, |
|
"kl": 0.022820472717285156, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0362, |
|
"num_tokens": 47769830.0, |
|
"reward": 0.32230387814342976, |
|
"reward_std": 0.5243347375653684, |
|
"rewards/cosine_scaled_reward": 0.16115193953737617, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2881.1666717529297, |
|
"epoch": 0.21257142857142858, |
|
"grad_norm": 0.25469550490379333, |
|
"kl": 0.002590179443359375, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": -0.0075, |
|
"num_tokens": 47913934.0, |
|
"reward": -0.4967922382056713, |
|
"reward_std": 0.17979411222040653, |
|
"rewards/cosine_scaled_reward": -0.24839610792696476, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.0000610351562, |
|
"epoch": 0.21314285714285713, |
|
"grad_norm": 0.26110002398490906, |
|
"kl": 0.00124359130859375, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": -0.0101, |
|
"num_tokens": 48061966.0, |
|
"reward": 0.28749898076057434, |
|
"reward_std": 0.6488082036376, |
|
"rewards/cosine_scaled_reward": 0.14374948665499687, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3053.4375610351562, |
|
"epoch": 0.21371428571428572, |
|
"grad_norm": 0.22372771799564362, |
|
"kl": 0.002582550048828125, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.1397, |
|
"num_tokens": 48215599.0, |
|
"reward": -0.5365136712789536, |
|
"reward_std": 0.4280487932264805, |
|
"rewards/cosine_scaled_reward": -0.2682568356394768, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1970.3333358764648, |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.4923068583011627, |
|
"kl": 0.022411346435546875, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": -0.1989, |
|
"num_tokens": 48315947.0, |
|
"reward": 0.056493550539016724, |
|
"reward_std": 0.5003200061619282, |
|
"rewards/cosine_scaled_reward": 0.028246776200830936, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2141.437515258789, |
|
"epoch": 0.21485714285714286, |
|
"grad_norm": 0.2769676744937897, |
|
"kl": 0.0077648162841796875, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": -0.1635, |
|
"num_tokens": 48423980.0, |
|
"reward": 0.3381408303976059, |
|
"reward_std": 0.4275904409587383, |
|
"rewards/cosine_scaled_reward": 0.16907040774822235, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2033.104248046875, |
|
"epoch": 0.21542857142857144, |
|
"grad_norm": 0.3097473680973053, |
|
"kl": 0.0059566497802734375, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.2215, |
|
"num_tokens": 48527305.0, |
|
"reward": -0.06256039813160896, |
|
"reward_std": 0.6639396920800209, |
|
"rewards/cosine_scaled_reward": -0.03128020092844963, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.1875, |
|
"epoch": 0.216, |
|
"grad_norm": 0.26714199781417847, |
|
"kl": 0.0024471282958984375, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": -0.0197, |
|
"num_tokens": 48674110.0, |
|
"reward": -0.44828396290540695, |
|
"reward_std": 0.32541000843048096, |
|
"rewards/cosine_scaled_reward": -0.22414197400212288, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2753.9792098999023, |
|
"epoch": 0.21657142857142858, |
|
"grad_norm": 0.46519750356674194, |
|
"kl": 0.012783050537109375, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": -0.0604, |
|
"num_tokens": 48811893.0, |
|
"reward": -0.3965173475444317, |
|
"reward_std": 0.294017824344337, |
|
"rewards/cosine_scaled_reward": -0.19825865840539336, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2460.7708587646484, |
|
"epoch": 0.21714285714285714, |
|
"grad_norm": 0.38828662037849426, |
|
"kl": 0.015163421630859375, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0109, |
|
"num_tokens": 48935722.0, |
|
"reward": 0.1380084827542305, |
|
"reward_std": 0.838990144431591, |
|
"rewards/cosine_scaled_reward": 0.06900423765182495, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3124.3333740234375, |
|
"epoch": 0.21771428571428572, |
|
"grad_norm": 0.2345580905675888, |
|
"kl": 0.0048999786376953125, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.0489, |
|
"num_tokens": 49091774.0, |
|
"reward": -0.5142972506582737, |
|
"reward_std": 0.43132054805755615, |
|
"rewards/cosine_scaled_reward": -0.2571486262604594, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2855.9375610351562, |
|
"epoch": 0.21828571428571428, |
|
"grad_norm": 0.2583065629005432, |
|
"kl": 0.0037899017333984375, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": -0.0191, |
|
"num_tokens": 49234511.0, |
|
"reward": -0.32289815321564674, |
|
"reward_std": 0.5398626290261745, |
|
"rewards/cosine_scaled_reward": -0.16144907008856535, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.1041870117188, |
|
"epoch": 0.21885714285714286, |
|
"grad_norm": 0.2668103873729706, |
|
"kl": 0.003749847412109375, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": -0.0061, |
|
"num_tokens": 49353196.0, |
|
"reward": 0.04249673895537853, |
|
"reward_std": 0.5773097351193428, |
|
"rewards/cosine_scaled_reward": 0.02124837739393115, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3301.041748046875, |
|
"epoch": 0.21942857142857142, |
|
"grad_norm": 0.21543462574481964, |
|
"kl": 0.0013751983642578125, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": -0.0099, |
|
"num_tokens": 49517442.0, |
|
"reward": -0.13908865815028548, |
|
"reward_std": 0.3829270862042904, |
|
"rewards/cosine_scaled_reward": -0.06954432907514274, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1987.6667098999023, |
|
"epoch": 0.22, |
|
"grad_norm": 0.48938560485839844, |
|
"kl": 0.023218154907226562, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0919, |
|
"num_tokens": 49618466.0, |
|
"reward": 0.08302738517522812, |
|
"reward_std": 0.382623303681612, |
|
"rewards/cosine_scaled_reward": 0.041513677686452866, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2939.6458740234375, |
|
"epoch": 0.22057142857142858, |
|
"grad_norm": 0.28820157051086426, |
|
"kl": 0.0025806427001953125, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": -0.034, |
|
"num_tokens": 49765545.0, |
|
"reward": -0.2633480429649353, |
|
"reward_std": 0.5663026869297028, |
|
"rewards/cosine_scaled_reward": -0.13167402520775795, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1900.6250228881836, |
|
"epoch": 0.22114285714285714, |
|
"grad_norm": 0.6309099197387695, |
|
"kl": 0.03489875793457031, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": 0.0862, |
|
"num_tokens": 49862631.0, |
|
"reward": 0.8468033410608768, |
|
"reward_std": 0.44973103795200586, |
|
"rewards/cosine_scaled_reward": 0.4234016705304384, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1040.0833587646484, |
|
"epoch": 0.22171428571428572, |
|
"grad_norm": 0.49885639548301697, |
|
"kl": 0.030452728271484375, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": -0.0405, |
|
"num_tokens": 49917811.0, |
|
"reward": 0.8224863847717643, |
|
"reward_std": 0.9109100252389908, |
|
"rewards/cosine_scaled_reward": 0.41124319238588214, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1138.8541717529297, |
|
"epoch": 0.22228571428571428, |
|
"grad_norm": 0.5563756227493286, |
|
"kl": 0.0333251953125, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": -0.022, |
|
"num_tokens": 49977864.0, |
|
"reward": 0.6305245533585548, |
|
"reward_std": 0.6203858032822609, |
|
"rewards/cosine_scaled_reward": 0.3152622692286968, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2377.1666870117188, |
|
"epoch": 0.22285714285714286, |
|
"grad_norm": 0.2759566605091095, |
|
"kl": 0.0078125, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.1241, |
|
"num_tokens": 50098244.0, |
|
"reward": -0.42858413606882095, |
|
"reward_std": 0.5400056019425392, |
|
"rewards/cosine_scaled_reward": -0.21429206058382988, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2341.0833587646484, |
|
"epoch": 0.22342857142857142, |
|
"grad_norm": 0.4552629888057709, |
|
"kl": 0.01029205322265625, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": -0.0161, |
|
"num_tokens": 50215944.0, |
|
"reward": -0.3929029032588005, |
|
"reward_std": 0.24793048202991486, |
|
"rewards/cosine_scaled_reward": -0.19645144790410995, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3172.9583435058594, |
|
"epoch": 0.224, |
|
"grad_norm": 0.22934892773628235, |
|
"kl": 0.0015659332275390625, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.0007, |
|
"num_tokens": 50374774.0, |
|
"reward": -0.2935205027461052, |
|
"reward_std": 0.3377592619508505, |
|
"rewards/cosine_scaled_reward": -0.146760243922472, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2505.979217529297, |
|
"epoch": 0.22457142857142856, |
|
"grad_norm": 0.2346705198287964, |
|
"kl": 0.004924774169921875, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.0914, |
|
"num_tokens": 50501181.0, |
|
"reward": -0.4504806846380234, |
|
"reward_std": 0.5376848392188549, |
|
"rewards/cosine_scaled_reward": -0.2252403423190117, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1801.2708358764648, |
|
"epoch": 0.22514285714285714, |
|
"grad_norm": 0.4237222373485565, |
|
"kl": 0.02034759521484375, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": -0.1076, |
|
"num_tokens": 50593294.0, |
|
"reward": 0.2521429820917547, |
|
"reward_std": 0.720090851187706, |
|
"rewards/cosine_scaled_reward": 0.12607147614471614, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3169.479248046875, |
|
"epoch": 0.2257142857142857, |
|
"grad_norm": 0.20394039154052734, |
|
"kl": 0.00328826904296875, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.1086, |
|
"num_tokens": 50751405.0, |
|
"reward": -0.14242761582136154, |
|
"reward_std": 0.6643783301115036, |
|
"rewards/cosine_scaled_reward": -0.07121380046010017, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1952.1875915527344, |
|
"epoch": 0.22628571428571428, |
|
"grad_norm": 0.313944548368454, |
|
"kl": 0.006565093994140625, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": 0.185, |
|
"num_tokens": 50851554.0, |
|
"reward": -0.08985854685306549, |
|
"reward_std": 0.3944113999605179, |
|
"rewards/cosine_scaled_reward": -0.04492926225066185, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2726.75, |
|
"epoch": 0.22685714285714287, |
|
"grad_norm": 0.31909239292144775, |
|
"kl": 0.003192901611328125, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": 0.0178, |
|
"num_tokens": 50988486.0, |
|
"reward": 0.4016297087073326, |
|
"reward_std": 0.4423007359728217, |
|
"rewards/cosine_scaled_reward": 0.20081482455134392, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2519.9584045410156, |
|
"epoch": 0.22742857142857142, |
|
"grad_norm": 0.3040643334388733, |
|
"kl": 0.0029087066650390625, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0979, |
|
"num_tokens": 51115648.0, |
|
"reward": 0.3028845489025116, |
|
"reward_std": 0.7478218302130699, |
|
"rewards/cosine_scaled_reward": 0.15144225861877203, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2820.8125, |
|
"epoch": 0.228, |
|
"grad_norm": 0.38658609986305237, |
|
"kl": 0.008646011352539062, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": 0.0512, |
|
"num_tokens": 51257047.0, |
|
"reward": -0.2828827500343323, |
|
"reward_std": 0.5114560127258301, |
|
"rewards/cosine_scaled_reward": -0.14144137874245644, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 805.1458549499512, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.6325322985649109, |
|
"kl": 0.02809906005859375, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0474, |
|
"num_tokens": 51301778.0, |
|
"reward": 1.05329729616642, |
|
"reward_std": 0.6592238266021013, |
|
"rewards/cosine_scaled_reward": 0.52664864808321, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2285.5209045410156, |
|
"epoch": 0.22914285714285715, |
|
"grad_norm": 0.2752256691455841, |
|
"kl": 0.0121612548828125, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": 0.1225, |
|
"num_tokens": 51417255.0, |
|
"reward": -0.1001143604516983, |
|
"reward_std": 0.5159653499722481, |
|
"rewards/cosine_scaled_reward": -0.05005717650055885, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1967.4375457763672, |
|
"epoch": 0.2297142857142857, |
|
"grad_norm": 0.5139122009277344, |
|
"kl": 0.01324462890625, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.0994, |
|
"num_tokens": 51518244.0, |
|
"reward": -0.23355354368686676, |
|
"reward_std": 0.4070703499019146, |
|
"rewards/cosine_scaled_reward": -0.11677676998078823, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1201.2916793823242, |
|
"epoch": 0.2302857142857143, |
|
"grad_norm": 0.6655187010765076, |
|
"kl": 0.034503936767578125, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": -0.0482, |
|
"num_tokens": 51581546.0, |
|
"reward": -0.011656701564788818, |
|
"reward_std": 0.6958346888422966, |
|
"rewards/cosine_scaled_reward": -0.005828343331813812, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2547.125030517578, |
|
"epoch": 0.23085714285714284, |
|
"grad_norm": 0.32773658633232117, |
|
"kl": 0.00426483154296875, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.1827, |
|
"num_tokens": 51710132.0, |
|
"reward": -0.30341653153300285, |
|
"reward_std": 0.6679345816373825, |
|
"rewards/cosine_scaled_reward": -0.15170826390385628, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2246.2917098999023, |
|
"epoch": 0.23142857142857143, |
|
"grad_norm": 0.4948166608810425, |
|
"kl": 0.020887374877929688, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.2173, |
|
"num_tokens": 51823558.0, |
|
"reward": 0.32773495465517044, |
|
"reward_std": 0.43168997671455145, |
|
"rewards/cosine_scaled_reward": 0.16386747732758522, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2673.000030517578, |
|
"epoch": 0.232, |
|
"grad_norm": 0.4358680546283722, |
|
"kl": 0.006679534912109375, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.116, |
|
"num_tokens": 51958126.0, |
|
"reward": -0.44332827627658844, |
|
"reward_std": 0.464891217648983, |
|
"rewards/cosine_scaled_reward": -0.22166412882506847, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1675.5000228881836, |
|
"epoch": 0.23257142857142857, |
|
"grad_norm": 0.5162947773933411, |
|
"kl": 0.015869140625, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.1665, |
|
"num_tokens": 52043758.0, |
|
"reward": 0.42337319999933243, |
|
"reward_std": 0.5919698104262352, |
|
"rewards/cosine_scaled_reward": 0.21168660186231136, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1417.083366394043, |
|
"epoch": 0.23314285714285715, |
|
"grad_norm": 0.585673451423645, |
|
"kl": 0.019334793090820312, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.1973, |
|
"num_tokens": 52117886.0, |
|
"reward": 0.12191886268556118, |
|
"reward_std": 0.744186770170927, |
|
"rewards/cosine_scaled_reward": 0.06095943506807089, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2180.645866394043, |
|
"epoch": 0.2337142857142857, |
|
"grad_norm": 0.3679039776325226, |
|
"kl": 0.011453628540039062, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": 0.0241, |
|
"num_tokens": 52228245.0, |
|
"reward": 0.07945476472377777, |
|
"reward_std": 0.7460962496697903, |
|
"rewards/cosine_scaled_reward": 0.039727382361888885, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3364.2291870117188, |
|
"epoch": 0.2342857142857143, |
|
"grad_norm": 0.1959182769060135, |
|
"kl": 0.0014972686767578125, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0497, |
|
"num_tokens": 52396376.0, |
|
"reward": -0.09703963249921799, |
|
"reward_std": 0.6702800244092941, |
|
"rewards/cosine_scaled_reward": -0.048519810661673546, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2733.8333740234375, |
|
"epoch": 0.23485714285714285, |
|
"grad_norm": 0.4071482717990875, |
|
"kl": 0.0033740997314453125, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": 0.0787, |
|
"num_tokens": 52533924.0, |
|
"reward": -0.032442666590213776, |
|
"reward_std": 0.6601308509707451, |
|
"rewards/cosine_scaled_reward": -0.01622132584452629, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3497.3958740234375, |
|
"epoch": 0.23542857142857143, |
|
"grad_norm": 0.20792892575263977, |
|
"kl": 0.0020923614501953125, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.0532, |
|
"num_tokens": 52708939.0, |
|
"reward": -0.6876078844070435, |
|
"reward_std": 0.24106604978442192, |
|
"rewards/cosine_scaled_reward": -0.34380391985177994, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2646.354232788086, |
|
"epoch": 0.236, |
|
"grad_norm": 0.25247225165367126, |
|
"kl": 0.010334014892578125, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0482, |
|
"num_tokens": 52842144.0, |
|
"reward": 0.3487217575311661, |
|
"reward_std": 0.8396928831934929, |
|
"rewards/cosine_scaled_reward": 0.17436087876558304, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2190.041732788086, |
|
"epoch": 0.23657142857142857, |
|
"grad_norm": 0.37403154373168945, |
|
"kl": 0.017669677734375, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.0377, |
|
"num_tokens": 52953050.0, |
|
"reward": 0.02339554950594902, |
|
"reward_std": 0.5981272980570793, |
|
"rewards/cosine_scaled_reward": 0.011697770096361637, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2704.625045776367, |
|
"epoch": 0.23714285714285716, |
|
"grad_norm": 0.5488267540931702, |
|
"kl": 0.01036834716796875, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0983, |
|
"num_tokens": 53088536.0, |
|
"reward": -0.4232526607811451, |
|
"reward_std": 0.331368088722229, |
|
"rewards/cosine_scaled_reward": -0.21162631921470165, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3500.1458740234375, |
|
"epoch": 0.2377142857142857, |
|
"grad_norm": 0.20574693381786346, |
|
"kl": 0.0016450881958007812, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": -0.0222, |
|
"num_tokens": 53262651.0, |
|
"reward": -0.3312137499451637, |
|
"reward_std": 0.22738385573029518, |
|
"rewards/cosine_scaled_reward": -0.16560687310993671, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1754.87504196167, |
|
"epoch": 0.2382857142857143, |
|
"grad_norm": 0.785120964050293, |
|
"kl": 0.019487380981445312, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.1738, |
|
"num_tokens": 53353089.0, |
|
"reward": 0.30004075169563293, |
|
"reward_std": 0.3757967611309141, |
|
"rewards/cosine_scaled_reward": 0.15002036094665527, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2226.2916870117188, |
|
"epoch": 0.23885714285714285, |
|
"grad_norm": 0.5053194165229797, |
|
"kl": 0.01377105712890625, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": 0.1283, |
|
"num_tokens": 53465675.0, |
|
"reward": -0.448532085865736, |
|
"reward_std": 0.38258010521531105, |
|
"rewards/cosine_scaled_reward": -0.22426604200154543, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2314.875015258789, |
|
"epoch": 0.23942857142857144, |
|
"grad_norm": 0.3890438973903656, |
|
"kl": 0.005889892578125, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.018, |
|
"num_tokens": 53582981.0, |
|
"reward": -0.3278130046091974, |
|
"reward_std": 0.30072507075965405, |
|
"rewards/cosine_scaled_reward": -0.1639064911287278, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1690.1250228881836, |
|
"epoch": 0.24, |
|
"grad_norm": 0.5274333357810974, |
|
"kl": 0.02419281005859375, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0839, |
|
"num_tokens": 53669915.0, |
|
"reward": 0.08054505288600922, |
|
"reward_std": 0.3769830437377095, |
|
"rewards/cosine_scaled_reward": 0.040272533893585205, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3035.104217529297, |
|
"epoch": 0.24057142857142857, |
|
"grad_norm": 0.25986582040786743, |
|
"kl": 0.002590179443359375, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0036, |
|
"num_tokens": 53821504.0, |
|
"reward": -0.3971610963344574, |
|
"reward_std": 0.42149341851472855, |
|
"rewards/cosine_scaled_reward": -0.1985805444419384, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3569.7916870117188, |
|
"epoch": 0.24114285714285713, |
|
"grad_norm": 0.18861441314220428, |
|
"kl": 0.0012664794921875, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0098, |
|
"num_tokens": 53999874.0, |
|
"reward": -0.4841906800866127, |
|
"reward_std": 0.3270874619483948, |
|
"rewards/cosine_scaled_reward": -0.24209534004330635, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2044.2083587646484, |
|
"epoch": 0.24171428571428571, |
|
"grad_norm": 0.3931296765804291, |
|
"kl": 0.015514373779296875, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0304, |
|
"num_tokens": 54103636.0, |
|
"reward": 0.31378229707479477, |
|
"reward_std": 0.7438326478004456, |
|
"rewards/cosine_scaled_reward": 0.15689115412533283, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3155.916748046875, |
|
"epoch": 0.2422857142857143, |
|
"grad_norm": 0.23850037157535553, |
|
"kl": 0.0032806396484375, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": 0.0152, |
|
"num_tokens": 54260976.0, |
|
"reward": -0.1346125192940235, |
|
"reward_std": 0.6563430987298489, |
|
"rewards/cosine_scaled_reward": -0.06730626057833433, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3406.041748046875, |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 0.18366895616054535, |
|
"kl": 0.001506805419921875, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0637, |
|
"num_tokens": 54430154.0, |
|
"reward": -0.7047783136367798, |
|
"reward_std": 0.22173772007226944, |
|
"rewards/cosine_scaled_reward": -0.3523891530930996, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2864.2083435058594, |
|
"epoch": 0.24342857142857144, |
|
"grad_norm": 0.23228085041046143, |
|
"kl": 0.0048847198486328125, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": -0.0251, |
|
"num_tokens": 54574260.0, |
|
"reward": -0.05515944957733154, |
|
"reward_std": 0.19446351379156113, |
|
"rewards/cosine_scaled_reward": -0.027579709887504578, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3079.7709350585938, |
|
"epoch": 0.244, |
|
"grad_norm": 0.21749088168144226, |
|
"kl": 0.0016574859619140625, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": -0.035, |
|
"num_tokens": 54728161.0, |
|
"reward": -0.31441882718354464, |
|
"reward_std": 0.4984753504395485, |
|
"rewards/cosine_scaled_reward": -0.15720941359177232, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2344.812530517578, |
|
"epoch": 0.24457142857142858, |
|
"grad_norm": 0.26461076736450195, |
|
"kl": 0.00640869140625, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": -0.0287, |
|
"num_tokens": 54847000.0, |
|
"reward": 0.19997850060462952, |
|
"reward_std": 0.6211978904902935, |
|
"rewards/cosine_scaled_reward": 0.09998924285173416, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2461.812530517578, |
|
"epoch": 0.24514285714285713, |
|
"grad_norm": 0.3109036982059479, |
|
"kl": 0.00499725341796875, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.13, |
|
"num_tokens": 54971407.0, |
|
"reward": -0.4674905724823475, |
|
"reward_std": 0.33745603263378143, |
|
"rewards/cosine_scaled_reward": -0.2337452843785286, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3317.6459350585938, |
|
"epoch": 0.24571428571428572, |
|
"grad_norm": 0.21356281638145447, |
|
"kl": 0.001575469970703125, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0489, |
|
"num_tokens": 55136462.0, |
|
"reward": 0.11995480954647064, |
|
"reward_std": 0.8040067255496979, |
|
"rewards/cosine_scaled_reward": 0.05997740477323532, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24628571428571427, |
|
"grad_norm": 0.17928774654865265, |
|
"kl": 0.0009527206420898438, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0, |
|
"num_tokens": 55315274.0, |
|
"reward": -0.5093374960124493, |
|
"reward_std": 0.20916565880179405, |
|
"rewards/cosine_scaled_reward": -0.25466873310506344, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2002.8542098999023, |
|
"epoch": 0.24685714285714286, |
|
"grad_norm": 0.45603108406066895, |
|
"kl": 0.013976097106933594, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": -0.0071, |
|
"num_tokens": 55416811.0, |
|
"reward": 0.010444401763379574, |
|
"reward_std": 0.6464076600968838, |
|
"rewards/cosine_scaled_reward": 0.005222200881689787, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1662.4166870117188, |
|
"epoch": 0.24742857142857144, |
|
"grad_norm": 0.3496110737323761, |
|
"kl": 0.015367507934570312, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": -0.0874, |
|
"num_tokens": 55502091.0, |
|
"reward": 0.2004665769636631, |
|
"reward_std": 0.4371606968343258, |
|
"rewards/cosine_scaled_reward": 0.10023329593241215, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3065.2500610351562, |
|
"epoch": 0.248, |
|
"grad_norm": 0.2298591136932373, |
|
"kl": 0.0030975341796875, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": -0.0334, |
|
"num_tokens": 55655283.0, |
|
"reward": 0.04710858315229416, |
|
"reward_std": 0.42717816680669785, |
|
"rewards/cosine_scaled_reward": 0.02355429343879223, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2757.7291717529297, |
|
"epoch": 0.24857142857142858, |
|
"grad_norm": 0.4120491147041321, |
|
"kl": 0.012348175048828125, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": -0.0036, |
|
"num_tokens": 55794170.0, |
|
"reward": 0.13828672468662262, |
|
"reward_std": 0.4802464433014393, |
|
"rewards/cosine_scaled_reward": 0.06914335861802101, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2647.8125228881836, |
|
"epoch": 0.24914285714285714, |
|
"grad_norm": 0.40533170104026794, |
|
"kl": 0.017702102661132812, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.1134, |
|
"num_tokens": 55927697.0, |
|
"reward": 0.27192703634500504, |
|
"reward_std": 0.8968725055456161, |
|
"rewards/cosine_scaled_reward": 0.13596352562308311, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1862.0208778381348, |
|
"epoch": 0.24971428571428572, |
|
"grad_norm": 0.9058906435966492, |
|
"kl": 0.04045867919921875, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": 0.0546, |
|
"num_tokens": 56022282.0, |
|
"reward": 0.15178360044956207, |
|
"reward_std": 0.6174656040966511, |
|
"rewards/cosine_scaled_reward": 0.07589179277420044, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2569.291717529297, |
|
"epoch": 0.2502857142857143, |
|
"grad_norm": 0.25029805302619934, |
|
"kl": 0.007534027099609375, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.0397, |
|
"num_tokens": 56151440.0, |
|
"reward": -0.22741758823394775, |
|
"reward_std": 0.537294939160347, |
|
"rewards/cosine_scaled_reward": -0.11370879039168358, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1111.958381652832, |
|
"epoch": 0.25085714285714283, |
|
"grad_norm": 0.45296990871429443, |
|
"kl": 0.02793121337890625, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": 0.0986, |
|
"num_tokens": 56210862.0, |
|
"reward": -0.31671690940856934, |
|
"reward_std": 0.350432638078928, |
|
"rewards/cosine_scaled_reward": -0.15835845470428467, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1919.1458587646484, |
|
"epoch": 0.25142857142857145, |
|
"grad_norm": 0.4494737386703491, |
|
"kl": 0.011789321899414062, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0479, |
|
"num_tokens": 56309089.0, |
|
"reward": 0.27276327461004257, |
|
"reward_std": 0.5138226337730885, |
|
"rewards/cosine_scaled_reward": 0.1363816224038601, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3518.625, |
|
"epoch": 0.252, |
|
"grad_norm": 0.19750487804412842, |
|
"kl": 0.0019445419311523438, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": 0.0253, |
|
"num_tokens": 56485063.0, |
|
"reward": -0.358419805765152, |
|
"reward_std": 0.32525915279984474, |
|
"rewards/cosine_scaled_reward": -0.1792098954319954, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2622.8541870117188, |
|
"epoch": 0.25257142857142856, |
|
"grad_norm": 0.331227570772171, |
|
"kl": 0.011693954467773438, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.0307, |
|
"num_tokens": 56617584.0, |
|
"reward": 0.23914362490177155, |
|
"reward_std": 0.5278170146048069, |
|
"rewards/cosine_scaled_reward": 0.11957181245088577, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3383.729248046875, |
|
"epoch": 0.25314285714285717, |
|
"grad_norm": 0.1942642778158188, |
|
"kl": 0.00176239013671875, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.049, |
|
"num_tokens": 56786303.0, |
|
"reward": -0.6218864843249321, |
|
"reward_std": 0.2640235126018524, |
|
"rewards/cosine_scaled_reward": -0.31094324216246605, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3413.5833740234375, |
|
"epoch": 0.2537142857142857, |
|
"grad_norm": 0.1867051124572754, |
|
"kl": 0.0013294219970703125, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0401, |
|
"num_tokens": 56956251.0, |
|
"reward": -0.6013847589492798, |
|
"reward_std": 0.32602211087942123, |
|
"rewards/cosine_scaled_reward": -0.3006923794746399, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2585.5833740234375, |
|
"epoch": 0.2542857142857143, |
|
"grad_norm": 0.3707902133464813, |
|
"kl": 0.01132965087890625, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0547, |
|
"num_tokens": 57086335.0, |
|
"reward": 0.8475865125656128, |
|
"reward_std": 0.9629704058170319, |
|
"rewards/cosine_scaled_reward": 0.4237932413816452, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1880.5209045410156, |
|
"epoch": 0.25485714285714284, |
|
"grad_norm": 0.5581598877906799, |
|
"kl": 0.020392417907714844, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": 0.19, |
|
"num_tokens": 57182540.0, |
|
"reward": -0.18871257454156876, |
|
"reward_std": 0.3908994784578681, |
|
"rewards/cosine_scaled_reward": -0.09435627982020378, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2767.1041717529297, |
|
"epoch": 0.25542857142857145, |
|
"grad_norm": 0.4027818739414215, |
|
"kl": 0.0086669921875, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.051, |
|
"num_tokens": 57320965.0, |
|
"reward": -0.5574091803282499, |
|
"reward_std": 0.29374449513852596, |
|
"rewards/cosine_scaled_reward": -0.27870458643883467, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2482.3125610351562, |
|
"epoch": 0.256, |
|
"grad_norm": 0.1986325979232788, |
|
"kl": 0.0056400299072265625, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": 0.0354, |
|
"num_tokens": 57446116.0, |
|
"reward": 0.4690280854701996, |
|
"reward_std": 0.40538226813077927, |
|
"rewards/cosine_scaled_reward": 0.2345140352845192, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2598.9375610351562, |
|
"epoch": 0.25657142857142856, |
|
"grad_norm": 0.2474825382232666, |
|
"kl": 0.003582000732421875, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.1014, |
|
"num_tokens": 57578473.0, |
|
"reward": 0.4089887887239456, |
|
"reward_std": 0.8240047469735146, |
|
"rewards/cosine_scaled_reward": 0.20449439622461796, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1180.7708740234375, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.5087565183639526, |
|
"kl": 0.02840423583984375, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.2658, |
|
"num_tokens": 57640442.0, |
|
"reward": 0.25451022386550903, |
|
"reward_std": 0.42495069094002247, |
|
"rewards/cosine_scaled_reward": 0.1272551193833351, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2582.8541870117188, |
|
"epoch": 0.25771428571428573, |
|
"grad_norm": 0.47060224413871765, |
|
"kl": 0.0055389404296875, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.0476, |
|
"num_tokens": 57771091.0, |
|
"reward": -0.6611975803971291, |
|
"reward_std": 0.29068057239055634, |
|
"rewards/cosine_scaled_reward": -0.3305987734347582, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1834.93754196167, |
|
"epoch": 0.2582857142857143, |
|
"grad_norm": 0.6933012008666992, |
|
"kl": 0.04656410217285156, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.1749, |
|
"num_tokens": 57864196.0, |
|
"reward": -0.06097408011555672, |
|
"reward_std": 0.3460182901471853, |
|
"rewards/cosine_scaled_reward": -0.03048703959211707, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2453.6875, |
|
"epoch": 0.25885714285714284, |
|
"grad_norm": 0.5309619307518005, |
|
"kl": 0.019407272338867188, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.1398, |
|
"num_tokens": 57987841.0, |
|
"reward": -0.14627531357109547, |
|
"reward_std": 0.40048458334058523, |
|
"rewards/cosine_scaled_reward": -0.07313766423612833, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2865.8958740234375, |
|
"epoch": 0.25942857142857145, |
|
"grad_norm": 0.2787575423717499, |
|
"kl": 0.0030364990234375, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0809, |
|
"num_tokens": 58131152.0, |
|
"reward": -0.03645841544494033, |
|
"reward_std": 0.538176491856575, |
|
"rewards/cosine_scaled_reward": -0.01822919282130897, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2737.3125610351562, |
|
"epoch": 0.26, |
|
"grad_norm": 0.2434437870979309, |
|
"kl": 0.0023136138916015625, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": -0.09, |
|
"num_tokens": 58269443.0, |
|
"reward": -0.0827246904373169, |
|
"reward_std": 0.6058220788836479, |
|
"rewards/cosine_scaled_reward": -0.041362347081303596, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3515.9166870117188, |
|
"epoch": 0.26057142857142856, |
|
"grad_norm": 0.22416526079177856, |
|
"kl": 0.0019941329956054688, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 58444267.0, |
|
"reward": -0.3426622897386551, |
|
"reward_std": 0.2587408199906349, |
|
"rewards/cosine_scaled_reward": -0.17133113741874695, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2765.5416717529297, |
|
"epoch": 0.2611428571428571, |
|
"grad_norm": 0.3764567971229553, |
|
"kl": 0.018613815307617188, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": -0.0034, |
|
"num_tokens": 58582509.0, |
|
"reward": -0.07094722986221313, |
|
"reward_std": 0.6748832985758781, |
|
"rewards/cosine_scaled_reward": -0.035473618656396866, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1633.041732788086, |
|
"epoch": 0.26171428571428573, |
|
"grad_norm": 0.49085304141044617, |
|
"kl": 0.01363372802734375, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.4777, |
|
"num_tokens": 58666343.0, |
|
"reward": -0.2368110716342926, |
|
"reward_std": 0.48651882261037827, |
|
"rewards/cosine_scaled_reward": -0.118405532091856, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2421.395896911621, |
|
"epoch": 0.2622857142857143, |
|
"grad_norm": 0.40670979022979736, |
|
"kl": 0.016485214233398438, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.1308, |
|
"num_tokens": 58789170.0, |
|
"reward": -0.010643558576703072, |
|
"reward_std": 0.8577997833490372, |
|
"rewards/cosine_scaled_reward": -0.005321786738932133, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2716.1875, |
|
"epoch": 0.26285714285714284, |
|
"grad_norm": 0.40470758080482483, |
|
"kl": 0.01605224609375, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0628, |
|
"num_tokens": 58925859.0, |
|
"reward": 0.18111078813672066, |
|
"reward_std": 0.2746178447268903, |
|
"rewards/cosine_scaled_reward": 0.09055539406836033, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2102.7916870117188, |
|
"epoch": 0.2634285714285714, |
|
"grad_norm": 0.6008480191230774, |
|
"kl": 0.015697479248046875, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.2683, |
|
"num_tokens": 59032085.0, |
|
"reward": -0.23030911991372705, |
|
"reward_std": 0.6799526810646057, |
|
"rewards/cosine_scaled_reward": -0.11515455157496035, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3317.604248046875, |
|
"epoch": 0.264, |
|
"grad_norm": 0.18618419766426086, |
|
"kl": 0.0010318756103515625, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.0053, |
|
"num_tokens": 59198026.0, |
|
"reward": 0.1968272104859352, |
|
"reward_std": 0.4609448295086622, |
|
"rewards/cosine_scaled_reward": 0.09841361455619335, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3123.4168090820312, |
|
"epoch": 0.26457142857142857, |
|
"grad_norm": 0.19308748841285706, |
|
"kl": 0.0020313262939453125, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": 0.0844, |
|
"num_tokens": 59354562.0, |
|
"reward": -0.48374156653881073, |
|
"reward_std": 0.6329436413943768, |
|
"rewards/cosine_scaled_reward": -0.24187077954411507, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2666.5625, |
|
"epoch": 0.2651428571428571, |
|
"grad_norm": 0.29937809705734253, |
|
"kl": 0.010850906372070312, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.0164, |
|
"num_tokens": 59489061.0, |
|
"reward": 0.12404083646833897, |
|
"reward_std": 0.621801532804966, |
|
"rewards/cosine_scaled_reward": 0.06202041823416948, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.354248046875, |
|
"epoch": 0.26571428571428574, |
|
"grad_norm": 0.20473025739192963, |
|
"kl": 0.0012063980102539062, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0185, |
|
"num_tokens": 59658794.0, |
|
"reward": -0.1353940162807703, |
|
"reward_std": 0.5671922750771046, |
|
"rewards/cosine_scaled_reward": -0.06769700860604644, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2224.750045776367, |
|
"epoch": 0.2662857142857143, |
|
"grad_norm": 0.28893810510635376, |
|
"kl": 0.014387130737304688, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.1965, |
|
"num_tokens": 59772038.0, |
|
"reward": -0.47738330624997616, |
|
"reward_std": 0.32058568112552166, |
|
"rewards/cosine_scaled_reward": -0.23869163822382689, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3208.9791870117188, |
|
"epoch": 0.26685714285714285, |
|
"grad_norm": 0.24921633303165436, |
|
"kl": 0.0014972686767578125, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.0734, |
|
"num_tokens": 59932489.0, |
|
"reward": -0.10125970467925072, |
|
"reward_std": 0.4448701348155737, |
|
"rewards/cosine_scaled_reward": -0.050629859790205956, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2674.4375228881836, |
|
"epoch": 0.2674285714285714, |
|
"grad_norm": 0.4345049262046814, |
|
"kl": 0.012060165405273438, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": -0.0264, |
|
"num_tokens": 60067174.0, |
|
"reward": 0.15898653864860535, |
|
"reward_std": 0.5897807292640209, |
|
"rewards/cosine_scaled_reward": 0.07949326187372208, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2993.8958740234375, |
|
"epoch": 0.268, |
|
"grad_norm": 0.19168232381343842, |
|
"kl": 0.0040569305419921875, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.1375, |
|
"num_tokens": 60216449.0, |
|
"reward": -0.49420715123414993, |
|
"reward_std": 0.4377583935856819, |
|
"rewards/cosine_scaled_reward": -0.24710355699062347, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1846.8958587646484, |
|
"epoch": 0.26857142857142857, |
|
"grad_norm": 0.6746844053268433, |
|
"kl": 0.0324859619140625, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.1055, |
|
"num_tokens": 60309816.0, |
|
"reward": 0.983911968767643, |
|
"reward_std": 0.7739077722653747, |
|
"rewards/cosine_scaled_reward": 0.49195596762001514, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2508.1666870117188, |
|
"epoch": 0.26914285714285713, |
|
"grad_norm": 0.2162114381790161, |
|
"kl": 0.004150390625, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": -0.049, |
|
"num_tokens": 60436376.0, |
|
"reward": -0.14057038724422455, |
|
"reward_std": 0.5592290014028549, |
|
"rewards/cosine_scaled_reward": -0.07028519362211227, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3250.7916870117188, |
|
"epoch": 0.26971428571428574, |
|
"grad_norm": 0.24621860682964325, |
|
"kl": 0.0020599365234375, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": 0.0899, |
|
"num_tokens": 60598222.0, |
|
"reward": -0.35493073239922523, |
|
"reward_std": 0.4200459411367774, |
|
"rewards/cosine_scaled_reward": -0.17746536619961262, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2814.0208587646484, |
|
"epoch": 0.2702857142857143, |
|
"grad_norm": 0.3132895529270172, |
|
"kl": 0.00676727294921875, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.1305, |
|
"num_tokens": 60740639.0, |
|
"reward": -0.14024419710040092, |
|
"reward_std": 0.6155532822012901, |
|
"rewards/cosine_scaled_reward": -0.07012210600078106, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1528.2917022705078, |
|
"epoch": 0.27085714285714285, |
|
"grad_norm": 0.2247830480337143, |
|
"kl": 0.00786590576171875, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.0938, |
|
"num_tokens": 60819421.0, |
|
"reward": 0.2218828909099102, |
|
"reward_std": 0.6827406734228134, |
|
"rewards/cosine_scaled_reward": 0.11094144824892282, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2696.4584197998047, |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.2896968722343445, |
|
"kl": 0.00408172607421875, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.1801, |
|
"num_tokens": 60955163.0, |
|
"reward": 0.13359621167182922, |
|
"reward_std": 0.7966333255171776, |
|
"rewards/cosine_scaled_reward": 0.06679810583591461, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3057.5, |
|
"epoch": 0.272, |
|
"grad_norm": 0.21167173981666565, |
|
"kl": 0.0032901763916015625, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.0928, |
|
"num_tokens": 61108295.0, |
|
"reward": -0.6038347482681274, |
|
"reward_std": 0.3766014724969864, |
|
"rewards/cosine_scaled_reward": -0.30191735550761223, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2832.604217529297, |
|
"epoch": 0.2725714285714286, |
|
"grad_norm": 0.28865718841552734, |
|
"kl": 0.007213592529296875, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": -0.0879, |
|
"num_tokens": 61251244.0, |
|
"reward": -0.4621117692440748, |
|
"reward_std": 0.3360144942998886, |
|
"rewards/cosine_scaled_reward": -0.23105588043108582, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3392.6458740234375, |
|
"epoch": 0.27314285714285713, |
|
"grad_norm": 0.18357343971729279, |
|
"kl": 0.001720428466796875, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": -0.032, |
|
"num_tokens": 61420799.0, |
|
"reward": -0.3251003101468086, |
|
"reward_std": 0.37241343408823013, |
|
"rewards/cosine_scaled_reward": -0.16255014389753342, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2110.25004196167, |
|
"epoch": 0.2737142857142857, |
|
"grad_norm": 0.452695369720459, |
|
"kl": 0.0191802978515625, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.1311, |
|
"num_tokens": 61527719.0, |
|
"reward": 0.4394497722387314, |
|
"reward_std": 0.30791839864104986, |
|
"rewards/cosine_scaled_reward": 0.2197248861193657, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3178.5208740234375, |
|
"epoch": 0.2742857142857143, |
|
"grad_norm": 0.2389601320028305, |
|
"kl": 0.002948760986328125, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0376, |
|
"num_tokens": 61687032.0, |
|
"reward": 0.2565951645374298, |
|
"reward_std": 0.6847127676010132, |
|
"rewards/cosine_scaled_reward": 0.1282975897192955, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2711.5625228881836, |
|
"epoch": 0.27485714285714286, |
|
"grad_norm": 0.6793060302734375, |
|
"kl": 0.02097320556640625, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": -0.0007, |
|
"num_tokens": 61823763.0, |
|
"reward": -0.17994186095893383, |
|
"reward_std": 0.5131379179656506, |
|
"rewards/cosine_scaled_reward": -0.08997092954814434, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2249.0833435058594, |
|
"epoch": 0.2754285714285714, |
|
"grad_norm": 0.33344191312789917, |
|
"kl": 0.0082855224609375, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.1291, |
|
"num_tokens": 61938103.0, |
|
"reward": 0.008612923324108124, |
|
"reward_std": 0.5992633532732725, |
|
"rewards/cosine_scaled_reward": 0.004306461662054062, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2914.4791870117188, |
|
"epoch": 0.276, |
|
"grad_norm": 0.26747846603393555, |
|
"kl": 0.0048503875732421875, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.061, |
|
"num_tokens": 62083878.0, |
|
"reward": -0.18974144756793976, |
|
"reward_std": 0.6735084727406502, |
|
"rewards/cosine_scaled_reward": -0.09487072005867958, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1817.7292289733887, |
|
"epoch": 0.2765714285714286, |
|
"grad_norm": 0.5921136736869812, |
|
"kl": 0.02447509765625, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.0267, |
|
"num_tokens": 62176649.0, |
|
"reward": -0.4410746842622757, |
|
"reward_std": 0.301199066452682, |
|
"rewards/cosine_scaled_reward": -0.22053734213113785, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1706.4791870117188, |
|
"epoch": 0.27714285714285714, |
|
"grad_norm": 0.36696869134902954, |
|
"kl": 0.01856231689453125, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.2368, |
|
"num_tokens": 62263576.0, |
|
"reward": 0.08185825496912003, |
|
"reward_std": 0.23910308256745338, |
|
"rewards/cosine_scaled_reward": 0.04092914238572121, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1779.6250534057617, |
|
"epoch": 0.2777142857142857, |
|
"grad_norm": 0.44206833839416504, |
|
"kl": 0.017059326171875, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.2164, |
|
"num_tokens": 62354842.0, |
|
"reward": -0.1533157378435135, |
|
"reward_std": 0.4944304316304624, |
|
"rewards/cosine_scaled_reward": -0.07665786519646645, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1240.7708358764648, |
|
"epoch": 0.2782857142857143, |
|
"grad_norm": 0.6005072593688965, |
|
"kl": 0.02964019775390625, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.2567, |
|
"num_tokens": 62420027.0, |
|
"reward": 0.3417629040777683, |
|
"reward_std": 0.5913028866052628, |
|
"rewards/cosine_scaled_reward": 0.17088143806904554, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3259.6041870117188, |
|
"epoch": 0.27885714285714286, |
|
"grad_norm": 0.2043127417564392, |
|
"kl": 0.0017719268798828125, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.0477, |
|
"num_tokens": 62583244.0, |
|
"reward": 0.000857822597026825, |
|
"reward_std": 0.6294433549046516, |
|
"rewards/cosine_scaled_reward": 0.0004289112985134125, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2507.875045776367, |
|
"epoch": 0.2794285714285714, |
|
"grad_norm": 0.41613826155662537, |
|
"kl": 0.0082244873046875, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.171, |
|
"num_tokens": 62708254.0, |
|
"reward": -0.3011748939752579, |
|
"reward_std": 0.6403844729065895, |
|
"rewards/cosine_scaled_reward": -0.15058743953704834, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2246.9166870117188, |
|
"epoch": 0.28, |
|
"grad_norm": 0.3775407671928406, |
|
"kl": 0.00557708740234375, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.2481, |
|
"num_tokens": 62821314.0, |
|
"reward": 0.13066441006958485, |
|
"reward_std": 0.39571982994675636, |
|
"rewards/cosine_scaled_reward": 0.06533220689743757, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2470.5417098999023, |
|
"epoch": 0.2805714285714286, |
|
"grad_norm": 0.34662094712257385, |
|
"kl": 0.010288238525390625, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.1097, |
|
"num_tokens": 62945936.0, |
|
"reward": -0.09428609162569046, |
|
"reward_std": 0.7784368544816971, |
|
"rewards/cosine_scaled_reward": -0.04714304953813553, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3139.291748046875, |
|
"epoch": 0.28114285714285714, |
|
"grad_norm": 0.23511607944965363, |
|
"kl": 0.0025539398193359375, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.0161, |
|
"num_tokens": 63102874.0, |
|
"reward": -0.3465424180030823, |
|
"reward_std": 0.5582562312483788, |
|
"rewards/cosine_scaled_reward": -0.17327120155096054, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2491.479232788086, |
|
"epoch": 0.2817142857142857, |
|
"grad_norm": 0.3599470555782318, |
|
"kl": 0.013715744018554688, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.0382, |
|
"num_tokens": 63228357.0, |
|
"reward": 0.5691114738583565, |
|
"reward_std": 1.0396928787231445, |
|
"rewards/cosine_scaled_reward": 0.2845557164400816, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2203.6667098999023, |
|
"epoch": 0.2822857142857143, |
|
"grad_norm": 0.40486791729927063, |
|
"kl": 0.01262664794921875, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": -0.0022, |
|
"num_tokens": 63339497.0, |
|
"reward": 0.44267672300338745, |
|
"reward_std": 0.6116999462246895, |
|
"rewards/cosine_scaled_reward": 0.22133836150169373, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3396.854248046875, |
|
"epoch": 0.28285714285714286, |
|
"grad_norm": 0.18696698546409607, |
|
"kl": 0.001537322998046875, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": -0.024, |
|
"num_tokens": 63508690.0, |
|
"reward": -0.15206171572208405, |
|
"reward_std": 0.6857472285628319, |
|
"rewards/cosine_scaled_reward": -0.07603085786104202, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1400.7708587646484, |
|
"epoch": 0.2834285714285714, |
|
"grad_norm": 0.508706271648407, |
|
"kl": 0.018621444702148438, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.2565, |
|
"num_tokens": 63581303.0, |
|
"reward": -0.016182963736355305, |
|
"reward_std": 0.6838839948177338, |
|
"rewards/cosine_scaled_reward": -0.008091474417597055, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3447.9583740234375, |
|
"epoch": 0.284, |
|
"grad_norm": 0.20476852357387543, |
|
"kl": 0.0011720657348632812, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 63753261.0, |
|
"reward": -0.3728942945599556, |
|
"reward_std": 0.4060557000339031, |
|
"rewards/cosine_scaled_reward": -0.18644713005051017, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2509.0833587646484, |
|
"epoch": 0.2845714285714286, |
|
"grad_norm": 0.3915875554084778, |
|
"kl": 0.0099945068359375, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.1323, |
|
"num_tokens": 63881005.0, |
|
"reward": -0.10937303304672241, |
|
"reward_std": 0.6931461840867996, |
|
"rewards/cosine_scaled_reward": -0.054686516523361206, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2609.8959197998047, |
|
"epoch": 0.28514285714285714, |
|
"grad_norm": 0.31072404980659485, |
|
"kl": 0.005950927734375, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": 0.0247, |
|
"num_tokens": 64012160.0, |
|
"reward": 0.3292629097122699, |
|
"reward_std": 0.9715245068073273, |
|
"rewards/cosine_scaled_reward": 0.16463144560111687, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1381.4583435058594, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.41521623730659485, |
|
"kl": 0.023681640625, |
|
"learning_rate": 1e-07, |
|
"loss": 0.032, |
|
"num_tokens": 64084134.0, |
|
"reward": -0.11208531260490417, |
|
"reward_std": 0.5163028538227081, |
|
"rewards/cosine_scaled_reward": -0.05604265257716179, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.04327104251924902, |
|
"train_runtime": 47844.8241, |
|
"train_samples_per_second": 0.502, |
|
"train_steps_per_second": 0.01 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |