OpenRS-RLoRA-LoftQ-R32-Cosine-Len / trainer_state.json
colinpannikkat's picture
Model save
ea356b9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2857142857142857,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 3140.2083435058594,
"epoch": 0.0005714285714285715,
"grad_norm": 0.18732130527496338,
"kl": 0.1341552734375,
"learning_rate": 0.0,
"loss": -0.0125,
"reward": -0.22849145717918873,
"reward_std": 0.40205543488264084,
"rewards/cosine_len_reward": -0.22849145717918873,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 3231.1666870117188,
"epoch": 0.001142857142857143,
"grad_norm": 0.2461026906967163,
"kl": 0.05010986328125,
"learning_rate": 2e-08,
"loss": 0.0599,
"reward": -0.4702305719256401,
"reward_std": 0.45737794041633606,
"rewards/cosine_len_reward": -0.4702305719256401,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 3447.7500610351562,
"epoch": 0.0017142857142857142,
"grad_norm": 0.190439373254776,
"kl": 0.0457763671875,
"learning_rate": 4e-08,
"loss": -0.0022,
"reward": 0.24990134686231613,
"reward_std": 0.4683762863278389,
"rewards/cosine_len_reward": 0.24990134686231613,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 3227.5000610351562,
"epoch": 0.002285714285714286,
"grad_norm": 0.23573388159275055,
"kl": 0.044189453125,
"learning_rate": 6e-08,
"loss": -0.0779,
"reward": -0.3011130467057228,
"reward_std": 0.5483239553868771,
"rewards/cosine_len_reward": -0.3011130467057228,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 3382.2083740234375,
"epoch": 0.002857142857142857,
"grad_norm": 0.1897883266210556,
"kl": 0.0562744140625,
"learning_rate": 8e-08,
"loss": -0.0596,
"reward": -0.2980368435382843,
"reward_std": 0.5630971193313599,
"rewards/cosine_len_reward": -0.2980368435382843,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 3024.5,
"epoch": 0.0034285714285714284,
"grad_norm": 0.34085553884506226,
"kl": 0.044189453125,
"learning_rate": 1e-07,
"loss": 0.1698,
"reward": 0.12611429148819298,
"reward_std": 0.3041386976838112,
"rewards/cosine_len_reward": 0.12611429148819298,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 3208.4166870117188,
"epoch": 0.004,
"grad_norm": 0.21276699006557465,
"kl": 0.052337646484375,
"learning_rate": 1.2e-07,
"loss": 0.0942,
"reward": 0.17614510841667652,
"reward_std": 0.5236565172672272,
"rewards/cosine_len_reward": 0.17614510841667652,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 3275.1666870117188,
"epoch": 0.004571428571428572,
"grad_norm": 0.19224224984645844,
"kl": 0.0458984375,
"learning_rate": 1.4e-07,
"loss": -0.0598,
"reward": -0.38038296496961266,
"reward_std": 0.4509882442653179,
"rewards/cosine_len_reward": -0.38038296496961266,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 3164.7083740234375,
"epoch": 0.005142857142857143,
"grad_norm": 0.1741683930158615,
"kl": 0.0439453125,
"learning_rate": 1.6e-07,
"loss": -0.0354,
"reward": -0.2483000010251999,
"reward_std": 0.3471040166914463,
"rewards/cosine_len_reward": -0.2483000010251999,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2916.0834350585938,
"epoch": 0.005714285714285714,
"grad_norm": 0.4565255641937256,
"kl": 0.23822021484375,
"learning_rate": 1.8e-07,
"loss": -0.1471,
"reward": -0.1595626100897789,
"reward_std": 0.5751340016722679,
"rewards/cosine_len_reward": -0.1595626100897789,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2857.2083435058594,
"epoch": 0.006285714285714286,
"grad_norm": 0.269521027803421,
"kl": 0.05877685546875,
"learning_rate": 2e-07,
"loss": 0.0606,
"reward": -0.10372118093073368,
"reward_std": 0.45142534002661705,
"rewards/cosine_len_reward": -0.10372118093073368,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 3136.9583740234375,
"epoch": 0.006857142857142857,
"grad_norm": 0.1991637796163559,
"kl": 0.04522705078125,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.0514,
"reward": 0.20943743363022804,
"reward_std": 0.46535836160182953,
"rewards/cosine_len_reward": 0.20943743363022804,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 3189.9166870117188,
"epoch": 0.0074285714285714285,
"grad_norm": 0.27295684814453125,
"kl": 0.040252685546875,
"learning_rate": 2.4e-07,
"loss": 0.1719,
"reward": -0.01165345311164856,
"reward_std": 0.5004820078611374,
"rewards/cosine_len_reward": -0.01165345311164856,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 3201.5833740234375,
"epoch": 0.008,
"grad_norm": 0.16425774991512299,
"kl": 0.04278564453125,
"learning_rate": 2.6e-07,
"loss": 0.0225,
"reward": -0.11870134994387627,
"reward_std": 0.5749331563711166,
"rewards/cosine_len_reward": -0.11870134994387627,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2892.041717529297,
"epoch": 0.008571428571428572,
"grad_norm": 0.18019337952136993,
"kl": 0.0435791015625,
"learning_rate": 2.8e-07,
"loss": 0.0287,
"reward": -0.08168663457036018,
"reward_std": 0.37698886543512344,
"rewards/cosine_len_reward": -0.08168663457036018,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 3394.0416870117188,
"epoch": 0.009142857142857144,
"grad_norm": 0.20076414942741394,
"kl": 0.0494384765625,
"learning_rate": 3e-07,
"loss": -0.0377,
"reward": -0.3673112988471985,
"reward_std": 0.5469748228788376,
"rewards/cosine_len_reward": -0.3673112988471985,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.009714285714285713,
"grad_norm": 0.2074183225631714,
"kl": 0.04840087890625,
"learning_rate": 3.2e-07,
"loss": 0.0002,
"reward": -0.08958722651004791,
"reward_std": 0.46227796375751495,
"rewards/cosine_len_reward": -0.08958722651004791,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3327.9166870117188,
"epoch": 0.010285714285714285,
"grad_norm": 0.16970673203468323,
"kl": 0.04620361328125,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0541,
"reward": -0.06295697297900915,
"reward_std": 0.48097309097647667,
"rewards/cosine_len_reward": -0.06295697297900915,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 3291.2083740234375,
"epoch": 0.010857142857142857,
"grad_norm": 0.18424074351787567,
"kl": 0.04925537109375,
"learning_rate": 3.6e-07,
"loss": -0.0548,
"reward": -0.2277919389307499,
"reward_std": 0.48374123871326447,
"rewards/cosine_len_reward": -0.2277919389307499,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 2776.0833740234375,
"epoch": 0.011428571428571429,
"grad_norm": 0.1781274378299713,
"kl": 0.04132080078125,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0761,
"reward": -0.4576909616589546,
"reward_std": 0.5087253600358963,
"rewards/cosine_len_reward": -0.4576909616589546,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2673.541717529297,
"epoch": 0.012,
"grad_norm": 0.21896369755268097,
"kl": 0.06439208984375,
"learning_rate": 4e-07,
"loss": 0.0382,
"reward": -0.40826891735196114,
"reward_std": 0.475625216960907,
"rewards/cosine_len_reward": -0.40826891735196114,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 3570.5,
"epoch": 0.012571428571428572,
"grad_norm": 2.473532199859619,
"kl": 0.65576171875,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0095,
"reward": 0.19061553291976452,
"reward_std": 0.43087163195014,
"rewards/cosine_len_reward": 0.19061553291976452,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 3022.9166870117188,
"epoch": 0.013142857142857144,
"grad_norm": 0.30063244700431824,
"kl": 0.045166015625,
"learning_rate": 4.3999999999999997e-07,
"loss": -0.0909,
"reward": -0.22615898214280605,
"reward_std": 0.6225969791412354,
"rewards/cosine_len_reward": -0.22615898214280605,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 3317.2083740234375,
"epoch": 0.013714285714285714,
"grad_norm": 0.19970481097698212,
"kl": 0.0474853515625,
"learning_rate": 4.6e-07,
"loss": 0.0273,
"reward": -0.07718010246753693,
"reward_std": 0.3550200453028083,
"rewards/cosine_len_reward": -0.07718010246753693,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 2408.7083740234375,
"epoch": 0.014285714285714285,
"grad_norm": 0.25198450684547424,
"kl": 0.04376220703125,
"learning_rate": 4.8e-07,
"loss": -0.1531,
"reward": -0.3547450974583626,
"reward_std": 0.4954472631216049,
"rewards/cosine_len_reward": -0.3547450974583626,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 3286.1666870117188,
"epoch": 0.014857142857142857,
"grad_norm": 0.1706409901380539,
"kl": 0.0460205078125,
"learning_rate": 5e-07,
"loss": -0.0278,
"reward": -0.12194318068213761,
"reward_std": 0.5183630883693695,
"rewards/cosine_len_reward": -0.12194318068213761,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.015428571428571429,
"grad_norm": 0.15060991048812866,
"kl": 0.04339599609375,
"learning_rate": 5.2e-07,
"loss": 0.0002,
"reward": -0.11912490613758564,
"reward_std": 0.41010989993810654,
"rewards/cosine_len_reward": -0.11912490613758564,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 3351.2083740234375,
"epoch": 0.016,
"grad_norm": 0.17934846878051758,
"kl": 0.04510498046875,
"learning_rate": 5.4e-07,
"loss": -0.0621,
"reward": 0.05903707444667816,
"reward_std": 0.5544452294707298,
"rewards/cosine_len_reward": 0.05903707444667816,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2895.625,
"epoch": 0.01657142857142857,
"grad_norm": 0.18568940460681915,
"kl": 0.04815673828125,
"learning_rate": 5.6e-07,
"loss": 0.0543,
"reward": -0.010081298649311066,
"reward_std": 0.36467570811510086,
"rewards/cosine_len_reward": -0.010081298649311066,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.017142857142857144,
"grad_norm": 0.15779373049736023,
"kl": 0.04766845703125,
"learning_rate": 5.8e-07,
"loss": 0.0002,
"reward": -0.28753336891531944,
"reward_std": 0.46697998046875,
"rewards/cosine_len_reward": -0.28753336891531944,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2773.000030517578,
"epoch": 0.017714285714285714,
"grad_norm": 0.33190807700157166,
"kl": 0.05267333984375,
"learning_rate": 6e-07,
"loss": 0.1689,
"reward": -0.45986294001340866,
"reward_std": 0.38838284835219383,
"rewards/cosine_len_reward": -0.45986294001340866,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2857.4166870117188,
"epoch": 0.018285714285714287,
"grad_norm": 0.19518494606018066,
"kl": 0.04412841796875,
"learning_rate": 6.2e-07,
"loss": -0.1331,
"reward": 0.06440184079110622,
"reward_std": 0.5917607620358467,
"rewards/cosine_len_reward": 0.06440184079110622,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.018857142857142857,
"grad_norm": 0.15161477029323578,
"kl": 0.05377197265625,
"learning_rate": 6.4e-07,
"loss": 0.0002,
"reward": 0.06449370458722115,
"reward_std": 0.40188299119472504,
"rewards/cosine_len_reward": 0.06449370458722115,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 3228.4583740234375,
"epoch": 0.019428571428571427,
"grad_norm": 0.19298215210437775,
"kl": 0.04290771484375,
"learning_rate": 6.6e-07,
"loss": 0.0274,
"reward": -0.1688922978937626,
"reward_std": 0.34010200947523117,
"rewards/cosine_len_reward": -0.1688922978937626,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3495.125,
"epoch": 0.02,
"grad_norm": 0.18180347979068756,
"kl": 0.0584716796875,
"learning_rate": 6.800000000000001e-07,
"loss": -0.029,
"reward": -0.40583060681819916,
"reward_std": 0.5360103398561478,
"rewards/cosine_len_reward": -0.40583060681819916,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 3421.25,
"epoch": 0.02057142857142857,
"grad_norm": 0.17146086692810059,
"kl": 0.0543212890625,
"learning_rate": 7e-07,
"loss": -0.021,
"reward": 0.3642941191792488,
"reward_std": 0.3542026989161968,
"rewards/cosine_len_reward": 0.3642941191792488,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.021142857142857144,
"grad_norm": 0.14214502274990082,
"kl": 0.04010009765625,
"learning_rate": 7.2e-07,
"loss": 0.0002,
"reward": -0.07616325793787837,
"reward_std": 0.3850269839167595,
"rewards/cosine_len_reward": -0.07616325793787837,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3495.291748046875,
"epoch": 0.021714285714285714,
"grad_norm": 0.17879678308963776,
"kl": 0.055908203125,
"learning_rate": 7.4e-07,
"loss": 0.0359,
"reward": -0.40084876120090485,
"reward_std": 0.44661080837249756,
"rewards/cosine_len_reward": -0.40084876120090485,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 2343.0834045410156,
"epoch": 0.022285714285714287,
"grad_norm": 0.24563190340995789,
"kl": 0.0567626953125,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0236,
"reward": -0.11119658034294844,
"reward_std": 0.45050790160894394,
"rewards/cosine_len_reward": -0.11119658034294844,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.022857142857142857,
"grad_norm": 0.17550677061080933,
"kl": 0.0445556640625,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0002,
"reward": -0.3701670579612255,
"reward_std": 0.3519079238176346,
"rewards/cosine_len_reward": -0.3701670579612255,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 3079.7083740234375,
"epoch": 0.023428571428571427,
"grad_norm": 0.17628754675388336,
"kl": 0.047119140625,
"learning_rate": 8e-07,
"loss": -0.0516,
"reward": -0.6531281769275665,
"reward_std": 0.3740066960453987,
"rewards/cosine_len_reward": -0.6531281769275665,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2641.125030517578,
"epoch": 0.024,
"grad_norm": 0.36992311477661133,
"kl": 0.043609619140625,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0454,
"reward": -0.12115837261080742,
"reward_std": 0.3918045926839113,
"rewards/cosine_len_reward": -0.12115837261080742,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.02457142857142857,
"grad_norm": 0.1853361576795578,
"kl": 0.0462646484375,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0002,
"reward": -0.1932130679488182,
"reward_std": 0.4540579691529274,
"rewards/cosine_len_reward": -0.1932130679488182,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 3507.7500610351562,
"epoch": 0.025142857142857144,
"grad_norm": 0.17624704539775848,
"kl": 0.04443359375,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0284,
"reward": -0.25109320878982544,
"reward_std": 0.5501705221831799,
"rewards/cosine_len_reward": -0.25109320878982544,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3504.2083740234375,
"epoch": 0.025714285714285714,
"grad_norm": 0.19684357941150665,
"kl": 0.0775146484375,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0359,
"reward": -0.3305511474609375,
"reward_std": 0.37591269612312317,
"rewards/cosine_len_reward": -0.3305511474609375,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 3419.6666870117188,
"epoch": 0.026285714285714287,
"grad_norm": 0.18291282653808594,
"kl": 0.047119140625,
"learning_rate": 9e-07,
"loss": -0.0574,
"reward": -0.3297368660569191,
"reward_std": 0.31174512580037117,
"rewards/cosine_len_reward": -0.3297368660569191,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2836.0,
"epoch": 0.026857142857142857,
"grad_norm": 0.365488737821579,
"kl": 0.04034423828125,
"learning_rate": 9.2e-07,
"loss": 0.0762,
"reward": -0.35849449783563614,
"reward_std": 0.47992704063653946,
"rewards/cosine_len_reward": -0.35849449783563614,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2915.9583740234375,
"epoch": 0.027428571428571427,
"grad_norm": 0.17787908017635345,
"kl": 0.052734375,
"learning_rate": 9.399999999999999e-07,
"loss": -0.0863,
"reward": -0.3561765216290951,
"reward_std": 0.6259109601378441,
"rewards/cosine_len_reward": -0.3561765216290951,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2647.7083740234375,
"epoch": 0.028,
"grad_norm": 0.2575705647468567,
"kl": 0.05340576171875,
"learning_rate": 9.6e-07,
"loss": -0.108,
"reward": -0.4050520211458206,
"reward_std": 0.6429588124155998,
"rewards/cosine_len_reward": -0.4050520211458206,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 3576.875,
"epoch": 0.02857142857142857,
"grad_norm": 0.15978150069713593,
"kl": 0.04681396484375,
"learning_rate": 9.8e-07,
"loss": -0.0,
"reward": -0.0018655331805348396,
"reward_std": 0.3724985048174858,
"rewards/cosine_len_reward": -0.0018655331805348396,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2881.9583435058594,
"epoch": 0.029142857142857144,
"grad_norm": 0.1981167048215866,
"kl": 0.038238525390625,
"learning_rate": 1e-06,
"loss": 0.0753,
"reward": -0.14606139063835144,
"reward_std": 0.4776982143521309,
"rewards/cosine_len_reward": -0.14606139063835144,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3474.4583740234375,
"epoch": 0.029714285714285714,
"grad_norm": 0.1720048487186432,
"kl": 0.0439453125,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0049,
"reward": -0.47205302864313126,
"reward_std": 0.389950018376112,
"rewards/cosine_len_reward": -0.47205302864313126,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 3330.2916870117188,
"epoch": 0.030285714285714287,
"grad_norm": 0.18484823405742645,
"kl": 0.042236328125,
"learning_rate": 9.999561358041868e-07,
"loss": -0.0542,
"reward": 0.1891404390335083,
"reward_std": 0.26611476950347424,
"rewards/cosine_len_reward": 0.1891404390335083,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3254.375,
"epoch": 0.030857142857142857,
"grad_norm": 0.19311083853244781,
"kl": 0.0550537109375,
"learning_rate": 9.999013075636804e-07,
"loss": -0.0485,
"reward": -0.4130496457219124,
"reward_std": 0.39373762160539627,
"rewards/cosine_len_reward": -0.4130496457219124,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.03142857142857143,
"grad_norm": 0.15826281905174255,
"kl": 0.06268310546875,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0003,
"reward": -0.4410124532878399,
"reward_std": 0.3935729172080755,
"rewards/cosine_len_reward": -0.4410124532878399,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 3478.875,
"epoch": 0.032,
"grad_norm": 0.1909007728099823,
"kl": 0.06378173828125,
"learning_rate": 9.997258721585931e-07,
"loss": -0.0334,
"reward": -0.14306456223130226,
"reward_std": 0.5048741772770882,
"rewards/cosine_len_reward": -0.14306456223130226,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 3030.9583740234375,
"epoch": 0.03257142857142857,
"grad_norm": 0.16624057292938232,
"kl": 0.04339599609375,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0213,
"reward": -0.2449845364317298,
"reward_std": 0.2843635231256485,
"rewards/cosine_len_reward": -0.2449845364317298,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.03314285714285714,
"grad_norm": 0.14864130318164825,
"kl": 0.04376220703125,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0002,
"reward": -0.477820735424757,
"reward_std": 0.42124854028224945,
"rewards/cosine_len_reward": -0.477820735424757,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.03371428571428572,
"grad_norm": 0.17053081095218658,
"kl": 0.05047607421875,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0002,
"reward": -0.018036723136901855,
"reward_std": 0.4528404772281647,
"rewards/cosine_len_reward": -0.018036723136901855,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3503.9583740234375,
"epoch": 0.03428571428571429,
"grad_norm": 0.15964706242084503,
"kl": 0.040802001953125,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0065,
"reward": -0.16116868564859033,
"reward_std": 0.40593869611620903,
"rewards/cosine_len_reward": -0.16116868564859033,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2942.5833740234375,
"epoch": 0.03485714285714286,
"grad_norm": 0.25859034061431885,
"kl": 0.05389404296875,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0849,
"reward": -0.3965909481048584,
"reward_std": 0.4511411488056183,
"rewards/cosine_len_reward": -0.3965909481048584,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2976.75,
"epoch": 0.03542857142857143,
"grad_norm": 0.21166636049747467,
"kl": 0.073974609375,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0526,
"reward": -0.46443600207567215,
"reward_std": 0.3938862681388855,
"rewards/cosine_len_reward": -0.46443600207567215,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.036,
"grad_norm": 0.15653888881206512,
"kl": 0.04681396484375,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0002,
"reward": 0.19495274312794209,
"reward_std": 0.27957216277718544,
"rewards/cosine_len_reward": 0.19495274312794209,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.036571428571428574,
"grad_norm": 0.14757102727890015,
"kl": 0.045074462890625,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0002,
"reward": -0.21563971042633057,
"reward_std": 0.33880844712257385,
"rewards/cosine_len_reward": -0.21563971042633057,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.037142857142857144,
"grad_norm": 0.16783742606639862,
"kl": 0.04986572265625,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0002,
"reward": 0.1148218410089612,
"reward_std": 0.48075347393751144,
"rewards/cosine_len_reward": 0.1148218410089612,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 3331.2083740234375,
"epoch": 0.037714285714285714,
"grad_norm": 0.13948343694210052,
"kl": 0.04083251953125,
"learning_rate": 9.975348529157229e-07,
"loss": -0.0053,
"reward": -0.0602837149053812,
"reward_std": 0.5337617173790932,
"rewards/cosine_len_reward": -0.0602837149053812,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2811.375,
"epoch": 0.038285714285714284,
"grad_norm": 0.2400464117527008,
"kl": 0.05047607421875,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0394,
"reward": -0.29330265894532204,
"reward_std": 0.3466031104326248,
"rewards/cosine_len_reward": -0.29330265894532204,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.038857142857142854,
"grad_norm": 0.14053481817245483,
"kl": 0.03558349609375,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0001,
"reward": -0.15899624675512314,
"reward_std": 0.4388071522116661,
"rewards/cosine_len_reward": -0.15899624675512314,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 3306.6666870117188,
"epoch": 0.03942857142857143,
"grad_norm": 0.1672668159008026,
"kl": 0.0478515625,
"learning_rate": 9.964516155915151e-07,
"loss": -0.0384,
"reward": -0.5151599273085594,
"reward_std": 0.3366116564720869,
"rewards/cosine_len_reward": -0.5151599273085594,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 3380.9583740234375,
"epoch": 0.04,
"grad_norm": 0.20518527925014496,
"kl": 0.0628662109375,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0704,
"reward": -0.2993215322494507,
"reward_std": 0.4852989763021469,
"rewards/cosine_len_reward": -0.2993215322494507,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2935.3750915527344,
"epoch": 0.04057142857142857,
"grad_norm": 0.21485304832458496,
"kl": 0.050140380859375,
"learning_rate": 9.956206309337066e-07,
"loss": 0.1084,
"reward": -0.3775600343942642,
"reward_std": 0.2518395222723484,
"rewards/cosine_len_reward": -0.3775600343942642,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 3209.5000610351562,
"epoch": 0.04114285714285714,
"grad_norm": 0.18990004062652588,
"kl": 0.05255126953125,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0459,
"reward": -0.15486798901110888,
"reward_std": 0.564708050340414,
"rewards/cosine_len_reward": -0.15486798901110888,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2904.9583435058594,
"epoch": 0.04171428571428572,
"grad_norm": 0.24567438662052155,
"kl": 0.0462646484375,
"learning_rate": 9.947027716509488e-07,
"loss": 0.061,
"reward": -0.44290701299905777,
"reward_std": 0.3936958834528923,
"rewards/cosine_len_reward": -0.44290701299905777,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 3459.375,
"epoch": 0.04228571428571429,
"grad_norm": 0.13506685197353363,
"kl": 0.04144287109375,
"learning_rate": 9.942113192828444e-07,
"loss": -0.031,
"reward": -0.12243828363716602,
"reward_std": 0.40473101660609245,
"rewards/cosine_len_reward": -0.12243828363716602,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 3118.5000610351562,
"epoch": 0.04285714285714286,
"grad_norm": 0.15806534886360168,
"kl": 0.045806884765625,
"learning_rate": 9.93698216681727e-07,
"loss": 0.034,
"reward": -0.3190483935177326,
"reward_std": 0.5546181797981262,
"rewards/cosine_len_reward": -0.3190483935177326,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3512.0416870117188,
"epoch": 0.04342857142857143,
"grad_norm": 0.15939749777317047,
"kl": 0.0576171875,
"learning_rate": 9.931634888554935e-07,
"loss": -0.043,
"reward": -0.38086188584566116,
"reward_std": 0.37592547200620174,
"rewards/cosine_len_reward": -0.38086188584566116,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 3427.5416870117188,
"epoch": 0.044,
"grad_norm": 0.15605805814266205,
"kl": 0.0501708984375,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0109,
"reward": -0.1746644452214241,
"reward_std": 0.49196697771549225,
"rewards/cosine_len_reward": -0.1746644452214241,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 3170.1250610351562,
"epoch": 0.044571428571428574,
"grad_norm": 0.15445668995380402,
"kl": 0.050537109375,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0035,
"reward": -0.20145024731755257,
"reward_std": 0.47183725237846375,
"rewards/cosine_len_reward": -0.20145024731755257,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2967.25,
"epoch": 0.045142857142857144,
"grad_norm": 0.17259816825389862,
"kl": 0.0391845703125,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0579,
"reward": -0.2730537634342909,
"reward_std": 0.35801637917757034,
"rewards/cosine_len_reward": -0.2730537634342909,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.045714285714285714,
"grad_norm": 0.1690661758184433,
"kl": 0.04119873046875,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0002,
"reward": -0.12683004513382912,
"reward_std": 0.4240909740328789,
"rewards/cosine_len_reward": -0.12683004513382912,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3565.375,
"epoch": 0.046285714285714284,
"grad_norm": 0.1551636904478073,
"kl": 0.0504150390625,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0052,
"reward": 0.41808657720685005,
"reward_std": 0.4268548539839685,
"rewards/cosine_len_reward": 0.41808657720685005,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 3496.7083740234375,
"epoch": 0.046857142857142854,
"grad_norm": 0.16650335490703583,
"kl": 0.04388427734375,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0324,
"reward": -0.5734090358018875,
"reward_std": 0.5235396586358547,
"rewards/cosine_len_reward": -0.5734090358018875,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 2126.4166870117188,
"epoch": 0.04742857142857143,
"grad_norm": 0.2955755293369293,
"kl": 0.045166015625,
"learning_rate": 9.888172094375033e-07,
"loss": -0.0123,
"reward": -0.059800997376441956,
"reward_std": 0.57283616065979,
"rewards/cosine_len_reward": -0.059800997376441956,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 3072.9166870117188,
"epoch": 0.048,
"grad_norm": 0.18237070739269257,
"kl": 0.047119140625,
"learning_rate": 9.881105062929221e-07,
"loss": 0.032,
"reward": 0.17192097008228302,
"reward_std": 0.2965017929673195,
"rewards/cosine_len_reward": 0.17192097008228302,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 3147.625,
"epoch": 0.04857142857142857,
"grad_norm": 0.2028658390045166,
"kl": 0.0504150390625,
"learning_rate": 9.873824502603459e-07,
"loss": -0.1166,
"reward": -0.07770865596830845,
"reward_std": 0.5760391876101494,
"rewards/cosine_len_reward": -0.07770865596830845,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 3556.5833740234375,
"epoch": 0.04914285714285714,
"grad_norm": 0.1445421427488327,
"kl": 0.04150390625,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0109,
"reward": -0.3070959039032459,
"reward_std": 0.37722079269587994,
"rewards/cosine_len_reward": -0.3070959039032459,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 3544.0416870117188,
"epoch": 0.04971428571428571,
"grad_norm": 0.15989543497562408,
"kl": 0.052490234375,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0039,
"reward": 0.007470171898603439,
"reward_std": 0.6530721038579941,
"rewards/cosine_len_reward": 0.007470171898603439,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 3037.25,
"epoch": 0.05028571428571429,
"grad_norm": 0.19774754345417023,
"kl": 0.050048828125,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0108,
"reward": -0.5706376060843468,
"reward_std": 0.3964765667915344,
"rewards/cosine_len_reward": -0.5706376060843468,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 3323.0416870117188,
"epoch": 0.05085714285714286,
"grad_norm": 0.16871894896030426,
"kl": 0.04931640625,
"learning_rate": 9.8425742251254e-07,
"loss": -0.0064,
"reward": -0.3148947209119797,
"reward_std": 0.49598074331879616,
"rewards/cosine_len_reward": -0.3148947209119797,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 3020.5833740234375,
"epoch": 0.05142857142857143,
"grad_norm": 0.15906846523284912,
"kl": 0.035430908203125,
"learning_rate": 9.83423155058946e-07,
"loss": -0.1082,
"reward": -0.3871347066015005,
"reward_std": 0.48989518731832504,
"rewards/cosine_len_reward": -0.3871347066015005,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3493.125,
"epoch": 0.052,
"grad_norm": 0.158765509724617,
"kl": 0.047607421875,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0357,
"reward": -0.019672296941280365,
"reward_std": 0.5390463247895241,
"rewards/cosine_len_reward": -0.019672296941280365,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2866.375,
"epoch": 0.052571428571428575,
"grad_norm": 0.3300051689147949,
"kl": 0.05999755859375,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0759,
"reward": -0.32447104528546333,
"reward_std": 0.4015946201980114,
"rewards/cosine_len_reward": -0.32447104528546333,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2927.4166870117188,
"epoch": 0.053142857142857144,
"grad_norm": 0.2505541741847992,
"kl": 0.0513916015625,
"learning_rate": 9.807937738894303e-07,
"loss": -0.0618,
"reward": -0.26003802567720413,
"reward_std": 0.44223659485578537,
"rewards/cosine_len_reward": -0.26003802567720413,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 3462.6250610351562,
"epoch": 0.053714285714285714,
"grad_norm": 0.13466289639472961,
"kl": 0.037841796875,
"learning_rate": 9.798752629550546e-07,
"loss": -0.0284,
"reward": -0.3509189058095217,
"reward_std": 0.46187154948711395,
"rewards/cosine_len_reward": -0.3509189058095217,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2668.5834045410156,
"epoch": 0.054285714285714284,
"grad_norm": 0.40417179465293884,
"kl": 0.057861328125,
"learning_rate": 9.78935800506826e-07,
"loss": 0.1304,
"reward": -0.12394729629158974,
"reward_std": 0.36398765817284584,
"rewards/cosine_len_reward": -0.12394729629158974,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.054857142857142854,
"grad_norm": 0.15475483238697052,
"kl": 0.04119873046875,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0002,
"reward": 0.05401550233364105,
"reward_std": 0.4927036911249161,
"rewards/cosine_len_reward": 0.05401550233364105,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 3166.9166870117188,
"epoch": 0.05542857142857143,
"grad_norm": 0.18519708514213562,
"kl": 0.05224609375,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0336,
"reward": -0.1515663117170334,
"reward_std": 0.4508054330945015,
"rewards/cosine_len_reward": -0.1515663117170334,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 2021.6667175292969,
"epoch": 0.056,
"grad_norm": 0.5753190517425537,
"kl": 0.047119140625,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0775,
"reward": -0.10128153627738357,
"reward_std": 0.5495500713586807,
"rewards/cosine_len_reward": -0.10128153627738357,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2799.875,
"epoch": 0.05657142857142857,
"grad_norm": 0.25073686242103577,
"kl": 0.04339599609375,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0462,
"reward": 0.16878609731793404,
"reward_std": 0.4969979338347912,
"rewards/cosine_len_reward": 0.16878609731793404,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2547.6666717529297,
"epoch": 0.05714285714285714,
"grad_norm": 0.2086445391178131,
"kl": 0.0418701171875,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0808,
"reward": 0.23498320078942925,
"reward_std": 0.4619377665221691,
"rewards/cosine_len_reward": 0.23498320078942925,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 3272.291748046875,
"epoch": 0.05771428571428571,
"grad_norm": 0.16273322701454163,
"kl": 0.04364013671875,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0951,
"reward": -5.926936864852905e-06,
"reward_std": 0.4691709503531456,
"rewards/cosine_len_reward": -5.926936864852905e-06,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 3564.25,
"epoch": 0.05828571428571429,
"grad_norm": 0.16961455345153809,
"kl": 0.05303955078125,
"learning_rate": 9.717768952713511e-07,
"loss": 0.007,
"reward": -0.39636145159602165,
"reward_std": 0.3456159494817257,
"rewards/cosine_len_reward": -0.39636145159602165,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.05885714285714286,
"grad_norm": 0.17157597839832306,
"kl": 0.0433349609375,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0002,
"reward": -0.5884420499205589,
"reward_std": 0.35518577694892883,
"rewards/cosine_len_reward": -0.5884420499205589,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.05942857142857143,
"grad_norm": 0.14763562381267548,
"kl": 0.05126953125,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0002,
"reward": -0.1686297096312046,
"reward_std": 0.43287331238389015,
"rewards/cosine_len_reward": -0.1686297096312046,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 3580.0416870117188,
"epoch": 0.06,
"grad_norm": 0.14612889289855957,
"kl": 0.04803466796875,
"learning_rate": 9.683994186497132e-07,
"loss": -0.0009,
"reward": -0.3449864834547043,
"reward_std": 0.30942362174391747,
"rewards/cosine_len_reward": -0.3449864834547043,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 3102.7083435058594,
"epoch": 0.060571428571428575,
"grad_norm": 0.19139307737350464,
"kl": 0.04766845703125,
"learning_rate": 9.672327345550543e-07,
"loss": -0.0008,
"reward": -0.35323648154735565,
"reward_std": 0.4123990163207054,
"rewards/cosine_len_reward": -0.35323648154735565,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2927.375,
"epoch": 0.061142857142857145,
"grad_norm": 0.19999508559703827,
"kl": 0.0523681640625,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0263,
"reward": 0.09981860686093569,
"reward_std": 0.3403412587940693,
"rewards/cosine_len_reward": 0.09981860686093569,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2892.0833740234375,
"epoch": 0.061714285714285715,
"grad_norm": 0.20375655591487885,
"kl": 0.04254150390625,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0582,
"reward": -0.0642486959695816,
"reward_std": 0.5786372274160385,
"rewards/cosine_len_reward": -0.0642486959695816,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 3558.125,
"epoch": 0.062285714285714285,
"grad_norm": 0.15504595637321472,
"kl": 0.051025390625,
"learning_rate": 9.636109026648554e-07,
"loss": 0.014,
"reward": -0.164622500538826,
"reward_std": 0.3372262194752693,
"rewards/cosine_len_reward": -0.164622500538826,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 3348.0833740234375,
"epoch": 0.06285714285714286,
"grad_norm": 0.14603567123413086,
"kl": 0.041534423828125,
"learning_rate": 9.623632283030077e-07,
"loss": -0.0193,
"reward": -0.2948532775044441,
"reward_std": 0.39745140075683594,
"rewards/cosine_len_reward": -0.2948532775044441,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2819.0833435058594,
"epoch": 0.06342857142857143,
"grad_norm": 0.24989280104637146,
"kl": 0.0501708984375,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0304,
"reward": 0.09465552493929863,
"reward_std": 0.4290134608745575,
"rewards/cosine_len_reward": 0.09465552493929863,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.064,
"grad_norm": 0.1428801417350769,
"kl": 0.04595947265625,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0002,
"reward": -0.19034046679735184,
"reward_std": 0.2815712224692106,
"rewards/cosine_len_reward": -0.19034046679735184,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 3333.75,
"epoch": 0.06457142857142857,
"grad_norm": 0.17772957682609558,
"kl": 0.0450439453125,
"learning_rate": 9.58499865339809e-07,
"loss": -0.0675,
"reward": 0.15639985352754593,
"reward_std": 0.49782028794288635,
"rewards/cosine_len_reward": 0.15639985352754593,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 3200.2916870117188,
"epoch": 0.06514285714285714,
"grad_norm": 0.18176570534706116,
"kl": 0.041351318359375,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0359,
"reward": -0.23854705691337585,
"reward_std": 0.3703417256474495,
"rewards/cosine_len_reward": -0.23854705691337585,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 3426.2083740234375,
"epoch": 0.06571428571428571,
"grad_norm": 0.18212103843688965,
"kl": 0.05426025390625,
"learning_rate": 9.55824636882301e-07,
"loss": -0.0068,
"reward": 0.19186732172966003,
"reward_std": 0.5083862394094467,
"rewards/cosine_len_reward": 0.19186732172966003,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.06628571428571428,
"grad_norm": 0.1472100466489792,
"kl": 0.04229736328125,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0002,
"reward": -0.6675661206245422,
"reward_std": 0.25827275589108467,
"rewards/cosine_len_reward": -0.6675661206245422,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 3405.0,
"epoch": 0.06685714285714285,
"grad_norm": 0.1627156138420105,
"kl": 0.04766845703125,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0332,
"reward": -0.04678164981305599,
"reward_std": 0.4391244202852249,
"rewards/cosine_len_reward": -0.04678164981305599,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 3078.125,
"epoch": 0.06742857142857143,
"grad_norm": 0.23983289301395416,
"kl": 0.0458984375,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0603,
"reward": -0.4750605896115303,
"reward_std": 0.4151216857135296,
"rewards/cosine_len_reward": -0.4750605896115303,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2418.000030517578,
"epoch": 0.068,
"grad_norm": 0.24952396750450134,
"kl": 0.05657958984375,
"learning_rate": 9.502373679810839e-07,
"loss": -0.0338,
"reward": -0.32502793427556753,
"reward_std": 0.4044921174645424,
"rewards/cosine_len_reward": -0.32502793427556753,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 3572.25,
"epoch": 0.06857142857142857,
"grad_norm": 0.15841135382652283,
"kl": 0.04815673828125,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0057,
"reward": -0.2742099305614829,
"reward_std": 0.6057566404342651,
"rewards/cosine_len_reward": -0.2742099305614829,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 3224.6666870117188,
"epoch": 0.06914285714285714,
"grad_norm": 0.17755034565925598,
"kl": 0.0517578125,
"learning_rate": 9.473264167865171e-07,
"loss": -0.0588,
"reward": 0.009542322251945734,
"reward_std": 0.46895354986190796,
"rewards/cosine_len_reward": 0.009542322251945734,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 2758.4166870117188,
"epoch": 0.06971428571428571,
"grad_norm": 0.20300975441932678,
"kl": 0.0401611328125,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0569,
"reward": -0.16051488742232323,
"reward_std": 0.5097866654396057,
"rewards/cosine_len_reward": -0.16051488742232323,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 3386.9583740234375,
"epoch": 0.07028571428571428,
"grad_norm": 0.14953473210334778,
"kl": 0.045745849609375,
"learning_rate": 9.443380060197385e-07,
"loss": -0.0194,
"reward": -0.28279081732034683,
"reward_std": 0.46559938788414,
"rewards/cosine_len_reward": -0.28279081732034683,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 3543.1666870117188,
"epoch": 0.07085714285714285,
"grad_norm": 0.13742747902870178,
"kl": 0.041259765625,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0186,
"reward": 0.1831405507400632,
"reward_std": 0.4129592701792717,
"rewards/cosine_len_reward": 0.1831405507400632,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2738.7916870117188,
"epoch": 0.07142857142857142,
"grad_norm": 0.22325678169727325,
"kl": 0.0482177734375,
"learning_rate": 9.412727182773486e-07,
"loss": -0.1429,
"reward": -0.3428646810352802,
"reward_std": 0.418590746819973,
"rewards/cosine_len_reward": -0.3428646810352802,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 2154.0,
"epoch": 0.072,
"grad_norm": 0.28196725249290466,
"kl": 0.04034423828125,
"learning_rate": 9.397114317029974e-07,
"loss": 0.1723,
"reward": -0.04585587605834007,
"reward_std": 0.5331225916743279,
"rewards/cosine_len_reward": -0.04585587605834007,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 3005.375030517578,
"epoch": 0.07257142857142856,
"grad_norm": 0.23298974335193634,
"kl": 0.043731689453125,
"learning_rate": 9.381311511432658e-07,
"loss": -0.1325,
"reward": -0.21542668342590332,
"reward_std": 0.49729710817337036,
"rewards/cosine_len_reward": -0.21542668342590332,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 3290.0000610351562,
"epoch": 0.07314285714285715,
"grad_norm": 0.1574346274137497,
"kl": 0.047027587890625,
"learning_rate": 9.36531953618799e-07,
"loss": -0.0311,
"reward": 0.13029874116182327,
"reward_std": 0.4678277000784874,
"rewards/cosine_len_reward": 0.13029874116182327,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2900.3750610351562,
"epoch": 0.07371428571428572,
"grad_norm": 0.1946721225976944,
"kl": 0.03863525390625,
"learning_rate": 9.34913917072228e-07,
"loss": -0.0883,
"reward": 0.22734229266643524,
"reward_std": 0.49264590442180634,
"rewards/cosine_len_reward": 0.22734229266643524,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 3336.4583740234375,
"epoch": 0.07428571428571429,
"grad_norm": 0.16337695717811584,
"kl": 0.0472412109375,
"learning_rate": 9.332771203643714e-07,
"loss": -0.1032,
"reward": -0.0558868944644928,
"reward_std": 0.22906366735696793,
"rewards/cosine_len_reward": -0.0558868944644928,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 3318.0416870117188,
"epoch": 0.07485714285714286,
"grad_norm": 0.14785107970237732,
"kl": 0.04193115234375,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0209,
"reward": -0.4462656928226352,
"reward_std": 0.4424574077129364,
"rewards/cosine_len_reward": -0.4462656928226352,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 3420.75,
"epoch": 0.07542857142857143,
"grad_norm": 0.15938332676887512,
"kl": 0.04559326171875,
"learning_rate": 9.299475664759068e-07,
"loss": -0.0455,
"reward": -0.19067499786615372,
"reward_std": 0.2114762719720602,
"rewards/cosine_len_reward": -0.19067499786615372,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 3216.5416870117188,
"epoch": 0.076,
"grad_norm": 21.750301361083984,
"kl": 7.40423583984375,
"learning_rate": 9.282549715730579e-07,
"loss": -0.0148,
"reward": 0.1352168396115303,
"reward_std": 0.30277941189706326,
"rewards/cosine_len_reward": 0.1352168396115303,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 3362.2500610351562,
"epoch": 0.07657142857142857,
"grad_norm": 0.14318473637104034,
"kl": 0.04864501953125,
"learning_rate": 9.265439410565328e-07,
"loss": -0.0521,
"reward": 0.48804809525609016,
"reward_std": 0.4641268514096737,
"rewards/cosine_len_reward": 0.48804809525609016,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 3025.2083435058594,
"epoch": 0.07714285714285714,
"grad_norm": 0.17564912140369415,
"kl": 0.05517578125,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0415,
"reward": -0.44774627685546875,
"reward_std": 0.5005357041954994,
"rewards/cosine_len_reward": -0.44774627685546875,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 2600.541748046875,
"epoch": 0.07771428571428571,
"grad_norm": 0.2430720180273056,
"kl": 0.07049560546875,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0399,
"reward": 0.02273743972182274,
"reward_std": 0.6690217182040215,
"rewards/cosine_len_reward": 0.02273743972182274,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 3021.1666870117188,
"epoch": 0.07828571428571429,
"grad_norm": 0.1892845779657364,
"kl": 0.0400390625,
"learning_rate": 9.213010742252327e-07,
"loss": -0.0509,
"reward": -0.007634974084794521,
"reward_std": 0.4675633981823921,
"rewards/cosine_len_reward": -0.007634974084794521,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2813.9166870117188,
"epoch": 0.07885714285714286,
"grad_norm": 0.17883385717868805,
"kl": 0.04425048828125,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0573,
"reward": -0.4329497180879116,
"reward_std": 0.447217158973217,
"rewards/cosine_len_reward": -0.4329497180879116,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 3572.0,
"epoch": 0.07942857142857143,
"grad_norm": 0.14457197487354279,
"kl": 0.04534912109375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0069,
"reward": -0.40829066932201385,
"reward_std": 0.3393409103155136,
"rewards/cosine_len_reward": -0.40829066932201385,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 3333.2916870117188,
"epoch": 0.08,
"grad_norm": 0.14431318640708923,
"kl": 0.04229736328125,
"learning_rate": 9.158953424711624e-07,
"loss": -0.0213,
"reward": -0.23191562667489052,
"reward_std": 0.45371272414922714,
"rewards/cosine_len_reward": -0.23191562667489052,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 3125.2500610351562,
"epoch": 0.08057142857142857,
"grad_norm": 0.18807615339756012,
"kl": 0.056884765625,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0654,
"reward": -0.58438640832901,
"reward_std": 0.34158289059996605,
"rewards/cosine_len_reward": -0.58438640832901,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 3470.5416870117188,
"epoch": 0.08114285714285714,
"grad_norm": 0.16417664289474487,
"kl": 0.054443359375,
"learning_rate": 9.122022088101613e-07,
"loss": -0.0156,
"reward": 0.4055279679596424,
"reward_std": 0.5106773134320974,
"rewards/cosine_len_reward": 0.4055279679596424,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2767.625,
"epoch": 0.08171428571428571,
"grad_norm": 0.23091278970241547,
"kl": 0.046142578125,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0155,
"reward": -0.025373689830303192,
"reward_std": 0.3311509620398283,
"rewards/cosine_len_reward": -0.025373689830303192,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 3129.4583740234375,
"epoch": 0.08228571428571428,
"grad_norm": 0.21718049049377441,
"kl": 0.04827880859375,
"learning_rate": 9.084384631108882e-07,
"loss": -0.0761,
"reward": -0.16653983620926738,
"reward_std": 0.6158961765468121,
"rewards/cosine_len_reward": -0.16653983620926738,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 3168.875,
"epoch": 0.08285714285714285,
"grad_norm": 0.17413252592086792,
"kl": 0.06317138671875,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0265,
"reward": 0.03458546567708254,
"reward_std": 0.6630469337105751,
"rewards/cosine_len_reward": 0.03458546567708254,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2882.125030517578,
"epoch": 0.08342857142857144,
"grad_norm": 0.34923824667930603,
"kl": 0.0794677734375,
"learning_rate": 9.046048391230247e-07,
"loss": -0.1138,
"reward": -0.3882593959569931,
"reward_std": 0.4065255671739578,
"rewards/cosine_len_reward": -0.3882593959569931,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 3010.625030517578,
"epoch": 0.084,
"grad_norm": 0.16546739637851715,
"kl": 0.04425048828125,
"learning_rate": 9.026620557966279e-07,
"loss": -0.0449,
"reward": -0.2956245392560959,
"reward_std": 0.2463199496269226,
"rewards/cosine_len_reward": -0.2956245392560959,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 3277.625,
"epoch": 0.08457142857142858,
"grad_norm": 0.1968696564435959,
"kl": 0.04840087890625,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0347,
"reward": -0.1628122478723526,
"reward_std": 0.32364944741129875,
"rewards/cosine_len_reward": -0.1628122478723526,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 3504.0416870117188,
"epoch": 0.08514285714285715,
"grad_norm": 0.17119912803173065,
"kl": 0.06097412109375,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0393,
"reward": -0.14941053837537766,
"reward_std": 0.3042156994342804,
"rewards/cosine_len_reward": -0.14941053837537766,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 3513.0,
"epoch": 0.08571428571428572,
"grad_norm": 0.16584385931491852,
"kl": 0.059814453125,
"learning_rate": 8.967309592491052e-07,
"loss": -0.0111,
"reward": 0.037046159617602825,
"reward_std": 0.5206618458032608,
"rewards/cosine_len_reward": 0.037046159617602825,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 3420.7500610351562,
"epoch": 0.08628571428571429,
"grad_norm": 0.16107913851737976,
"kl": 0.03912353515625,
"learning_rate": 8.9471999940354e-07,
"loss": -0.0009,
"reward": -0.03606244549155235,
"reward_std": 0.24746908619999886,
"rewards/cosine_len_reward": -0.03606244549155235,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 3524.4166870117188,
"epoch": 0.08685714285714285,
"grad_norm": 0.16076341271400452,
"kl": 0.0489501953125,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0361,
"reward": -0.19618698582053185,
"reward_std": 0.46029967814683914,
"rewards/cosine_len_reward": -0.19618698582053185,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 3083.5833435058594,
"epoch": 0.08742857142857142,
"grad_norm": 0.1417928785085678,
"kl": 0.039093017578125,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0074,
"reward": -0.4691794868558645,
"reward_std": 0.4594339244067669,
"rewards/cosine_len_reward": -0.4691794868558645,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.088,
"grad_norm": 0.16717113554477692,
"kl": 0.05596923828125,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0002,
"reward": -0.15261722169816494,
"reward_std": 0.3467218354344368,
"rewards/cosine_len_reward": -0.15261722169816494,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.08857142857142856,
"grad_norm": 0.149485781788826,
"kl": 0.044677734375,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0002,
"reward": -0.22241253405809402,
"reward_std": 0.3932184986770153,
"rewards/cosine_len_reward": -0.22241253405809402,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 2931.7084045410156,
"epoch": 0.08914285714285715,
"grad_norm": 0.17892643809318542,
"kl": 0.0556640625,
"learning_rate": 8.844151714648274e-07,
"loss": -0.0795,
"reward": 0.15793364495038986,
"reward_std": 0.44282783567905426,
"rewards/cosine_len_reward": 0.15793364495038986,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2960.9166870117188,
"epoch": 0.08971428571428572,
"grad_norm": 0.24480818212032318,
"kl": 0.0452880859375,
"learning_rate": 8.823049032816478e-07,
"loss": -0.1566,
"reward": -0.31062010303139687,
"reward_std": 0.4297754764556885,
"rewards/cosine_len_reward": -0.31062010303139687,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 3432.125,
"epoch": 0.09028571428571429,
"grad_norm": 0.15342198312282562,
"kl": 0.04534912109375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0076,
"reward": -0.1336694210767746,
"reward_std": 0.5960573703050613,
"rewards/cosine_len_reward": -0.1336694210767746,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3393.541748046875,
"epoch": 0.09085714285714286,
"grad_norm": 0.289773166179657,
"kl": 0.1083984375,
"learning_rate": 8.780358823396352e-07,
"loss": -0.0277,
"reward": -0.14455506764352322,
"reward_std": 0.4936875104904175,
"rewards/cosine_len_reward": -0.14455506764352322,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09142857142857143,
"grad_norm": 0.14998309314250946,
"kl": 0.04327392578125,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0002,
"reward": 0.18973015248775482,
"reward_std": 0.40343762934207916,
"rewards/cosine_len_reward": 0.18973015248775482,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 3510.75,
"epoch": 0.092,
"grad_norm": 0.15721499919891357,
"kl": 0.052490234375,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0232,
"reward": -0.1421234980225563,
"reward_std": 0.47652409970760345,
"rewards/cosine_len_reward": -0.1421234980225563,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 3427.9583740234375,
"epoch": 0.09257142857142857,
"grad_norm": 0.14637959003448486,
"kl": 0.037109375,
"learning_rate": 8.715127058347614e-07,
"loss": -0.0183,
"reward": -0.11180838942527771,
"reward_std": 0.6244986057281494,
"rewards/cosine_len_reward": -0.11180838942527771,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09314285714285714,
"grad_norm": 0.1724317967891693,
"kl": 0.0538330078125,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0002,
"reward": -0.024596035480499268,
"reward_std": 0.39021917432546616,
"rewards/cosine_len_reward": -0.024596035480499268,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 3473.7083740234375,
"epoch": 0.09371428571428571,
"grad_norm": 0.13588137924671173,
"kl": 0.04315185546875,
"learning_rate": 8.670853944836176e-07,
"loss": -0.0452,
"reward": -0.1590297818183899,
"reward_std": 0.4862016811966896,
"rewards/cosine_len_reward": -0.1590297818183899,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09428571428571429,
"grad_norm": 0.14206375181674957,
"kl": 0.0416259765625,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0002,
"reward": -0.3889569491147995,
"reward_std": 0.2624204456806183,
"rewards/cosine_len_reward": -0.3889569491147995,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09485714285714286,
"grad_norm": 0.14916691184043884,
"kl": 0.04949951171875,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0002,
"reward": -0.7786369696259499,
"reward_std": 0.22933637350797653,
"rewards/cosine_len_reward": -0.7786369696259499,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 3278.1251220703125,
"epoch": 0.09542857142857143,
"grad_norm": 0.14883826673030853,
"kl": 0.04718017578125,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0132,
"reward": -0.41120439767837524,
"reward_std": 0.5235109552741051,
"rewards/cosine_len_reward": -0.41120439767837524,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 3423.8333740234375,
"epoch": 0.096,
"grad_norm": 0.14527034759521484,
"kl": 0.043701171875,
"learning_rate": 8.580461976679099e-07,
"loss": -0.0132,
"reward": -0.5333987874910235,
"reward_std": 0.3139747306704521,
"rewards/cosine_len_reward": -0.5333987874910235,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 3220.625,
"epoch": 0.09657142857142857,
"grad_norm": 0.1992996782064438,
"kl": 0.043212890625,
"learning_rate": 8.557485869176825e-07,
"loss": -0.0149,
"reward": -0.04628082364797592,
"reward_std": 0.4247637018561363,
"rewards/cosine_len_reward": -0.04628082364797592,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09714285714285714,
"grad_norm": 0.15683424472808838,
"kl": 0.0523681640625,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0002,
"reward": 0.5332798510789871,
"reward_std": 0.2842548470944166,
"rewards/cosine_len_reward": 0.5332798510789871,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09771428571428571,
"grad_norm": 0.14955052733421326,
"kl": 0.03985595703125,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0002,
"reward": 0.32699378207325935,
"reward_std": 0.3331494480371475,
"rewards/cosine_len_reward": 0.32699378207325935,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 3279.125,
"epoch": 0.09828571428571428,
"grad_norm": 0.18859492242336273,
"kl": 0.06658935546875,
"learning_rate": 8.487667956935087e-07,
"loss": -0.0026,
"reward": -0.15557575225830078,
"reward_std": 0.26120322197675705,
"rewards/cosine_len_reward": -0.15557575225830078,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 3291.4583740234375,
"epoch": 0.09885714285714285,
"grad_norm": 0.18678006529808044,
"kl": 0.05291748046875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0472,
"reward": -0.348556749522686,
"reward_std": 0.4245801493525505,
"rewards/cosine_len_reward": -0.348556749522686,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09942857142857142,
"grad_norm": 0.1602144092321396,
"kl": 0.0545654296875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0002,
"reward": -0.13419683277606964,
"reward_std": 0.42287082970142365,
"rewards/cosine_len_reward": -0.13419683277606964,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 3479.416748046875,
"epoch": 0.1,
"grad_norm": 0.13922035694122314,
"kl": 0.03887939453125,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0199,
"reward": -0.4109889939427376,
"reward_std": 0.4904426783323288,
"rewards/cosine_len_reward": -0.4109889939427376,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 3444.8333740234375,
"epoch": 0.10057142857142858,
"grad_norm": 0.16923050582408905,
"kl": 0.048248291015625,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0326,
"reward": 0.014319989830255508,
"reward_std": 0.4780692644417286,
"rewards/cosine_len_reward": 0.014319989830255508,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 3200.0416870117188,
"epoch": 0.10114285714285715,
"grad_norm": 0.1463550180196762,
"kl": 0.04083251953125,
"learning_rate": 8.368407953869103e-07,
"loss": -0.0459,
"reward": -0.2681840620934963,
"reward_std": 0.4939410388469696,
"rewards/cosine_len_reward": -0.2681840620934963,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 3515.1666870117188,
"epoch": 0.10171428571428572,
"grad_norm": 0.1453145146369934,
"kl": 0.0435791015625,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0372,
"reward": -0.3437151834368706,
"reward_std": 0.5837994962930679,
"rewards/cosine_len_reward": -0.3437151834368706,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 3500.5833740234375,
"epoch": 0.10228571428571429,
"grad_norm": 0.1510867178440094,
"kl": 0.0496826171875,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0206,
"reward": -0.25809749960899353,
"reward_std": 0.5083489567041397,
"rewards/cosine_len_reward": -0.25809749960899353,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.10285714285714286,
"grad_norm": 0.15222983062267303,
"kl": 0.04559326171875,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0002,
"reward": 0.055209167301654816,
"reward_std": 0.4925672523677349,
"rewards/cosine_len_reward": 0.055209167301654816,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 3583.5,
"epoch": 0.10342857142857143,
"grad_norm": 0.1334083080291748,
"kl": 0.039703369140625,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0003,
"reward": -0.10784891247749329,
"reward_std": 0.3197548817843199,
"rewards/cosine_len_reward": -0.10784891247749329,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.104,
"grad_norm": 0.1473800241947174,
"kl": 0.048828125,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0002,
"reward": -0.5589503794908524,
"reward_std": 0.3745484910905361,
"rewards/cosine_len_reward": -0.5589503794908524,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 3420.6666870117188,
"epoch": 0.10457142857142857,
"grad_norm": 0.17142295837402344,
"kl": 0.04559326171875,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0476,
"reward": -0.03992478083819151,
"reward_std": 0.6503849253058434,
"rewards/cosine_len_reward": -0.03992478083819151,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 3193.7083740234375,
"epoch": 0.10514285714285715,
"grad_norm": 0.18599683046340942,
"kl": 0.05780029296875,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0201,
"reward": -0.1778981164097786,
"reward_std": 0.38635776191949844,
"rewards/cosine_len_reward": -0.1778981164097786,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2720.625,
"epoch": 0.10571428571428572,
"grad_norm": 0.390965074300766,
"kl": 0.08233642578125,
"learning_rate": 8.170384989716657e-07,
"loss": 0.1901,
"reward": -0.395973265171051,
"reward_std": 0.42710288241505623,
"rewards/cosine_len_reward": -0.395973265171051,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2766.4583587646484,
"epoch": 0.10628571428571429,
"grad_norm": 0.2267504781484604,
"kl": 0.04254150390625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0364,
"reward": -0.3062758632004261,
"reward_std": 0.33072756230831146,
"rewards/cosine_len_reward": -0.3062758632004261,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 3509.25,
"epoch": 0.10685714285714286,
"grad_norm": 0.16118666529655457,
"kl": 0.05560302734375,
"learning_rate": 8.119553365707802e-07,
"loss": -0.0053,
"reward": -0.18290760926902294,
"reward_std": 0.38534967601299286,
"rewards/cosine_len_reward": -0.18290760926902294,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 3558.6666870117188,
"epoch": 0.10742857142857143,
"grad_norm": 0.1462363302707672,
"kl": 0.04461669921875,
"learning_rate": 8.093945422764069e-07,
"loss": 0.009,
"reward": -0.3126313886605203,
"reward_std": 0.3192543825134635,
"rewards/cosine_len_reward": -0.3126313886605203,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.108,
"grad_norm": 0.14533929526805878,
"kl": 0.03973388671875,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0002,
"reward": -0.4464322179555893,
"reward_std": 0.336882084608078,
"rewards/cosine_len_reward": -0.4464322179555893,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 3254.5416870117188,
"epoch": 0.10857142857142857,
"grad_norm": 0.16986821591854095,
"kl": 0.0482177734375,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0441,
"reward": -0.5396054945886135,
"reward_std": 0.3040096387267113,
"rewards/cosine_len_reward": -0.5396054945886135,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 3005.4583435058594,
"epoch": 0.10914285714285714,
"grad_norm": 0.2879737317562103,
"kl": 0.0799560546875,
"learning_rate": 8.01636806561836e-07,
"loss": 0.1028,
"reward": -0.04980655759572983,
"reward_std": 0.3840261846780777,
"rewards/cosine_len_reward": -0.04980655759572983,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 3162.1666870117188,
"epoch": 0.10971428571428571,
"grad_norm": 0.18578751385211945,
"kl": 0.0562744140625,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0364,
"reward": -0.09453568607568741,
"reward_std": 0.5176538079977036,
"rewards/cosine_len_reward": -0.09453568607568741,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.11028571428571429,
"grad_norm": 0.14698241651058197,
"kl": 0.03900146484375,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0002,
"reward": -0.20598075166344643,
"reward_std": 0.39230766519904137,
"rewards/cosine_len_reward": -0.20598075166344643,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2715.1666870117188,
"epoch": 0.11085714285714286,
"grad_norm": 0.27284160256385803,
"kl": 0.06475830078125,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0727,
"reward": -0.30249132215976715,
"reward_std": 0.6033280193805695,
"rewards/cosine_len_reward": -0.30249132215976715,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 3347.3333740234375,
"epoch": 0.11142857142857143,
"grad_norm": 0.17070569097995758,
"kl": 0.059814453125,
"learning_rate": 7.911220577405484e-07,
"loss": -0.028,
"reward": -0.11190107837319374,
"reward_std": 0.5215081870555878,
"rewards/cosine_len_reward": -0.11190107837319374,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2826.0,
"epoch": 0.112,
"grad_norm": 0.20371770858764648,
"kl": 0.040924072265625,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0097,
"reward": -0.24280931055545807,
"reward_std": 0.24882662668824196,
"rewards/cosine_len_reward": -0.24280931055545807,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 3179.5416870117188,
"epoch": 0.11257142857142857,
"grad_norm": 0.1733676791191101,
"kl": 0.04510498046875,
"learning_rate": 7.857936576865356e-07,
"loss": -0.0353,
"reward": -0.5386351570487022,
"reward_std": 0.47516174614429474,
"rewards/cosine_len_reward": -0.5386351570487022,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 3113.3333435058594,
"epoch": 0.11314285714285714,
"grad_norm": 0.2758618891239166,
"kl": 0.04486083984375,
"learning_rate": 7.831121542179086e-07,
"loss": -0.0971,
"reward": 0.38623735681176186,
"reward_std": 0.37613956816494465,
"rewards/cosine_len_reward": 0.38623735681176186,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 3521.166748046875,
"epoch": 0.11371428571428571,
"grad_norm": 0.16294990479946136,
"kl": 0.0465087890625,
"learning_rate": 7.804192891917571e-07,
"loss": -0.0159,
"reward": -0.03587418794631958,
"reward_std": 0.39774488657712936,
"rewards/cosine_len_reward": -0.03587418794631958,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 3434.5833740234375,
"epoch": 0.11428571428571428,
"grad_norm": 0.18072271347045898,
"kl": 0.05572509765625,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0029,
"reward": -0.29647984355688095,
"reward_std": 0.43220172449946404,
"rewards/cosine_len_reward": -0.29647984355688095,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 3440.9583740234375,
"epoch": 0.11485714285714285,
"grad_norm": 0.18761895596981049,
"kl": 0.05572509765625,
"learning_rate": 7.75e-07,
"loss": -0.0399,
"reward": 0.26733987778425217,
"reward_std": 0.4860861897468567,
"rewards/cosine_len_reward": 0.26733987778425217,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 3017.9583435058594,
"epoch": 0.11542857142857142,
"grad_norm": 0.2005700170993805,
"kl": 0.046875,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0871,
"reward": -0.1812760317698121,
"reward_std": 0.5165529623627663,
"rewards/cosine_len_reward": -0.1812760317698121,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 3487.3333740234375,
"epoch": 0.116,
"grad_norm": 0.15808533132076263,
"kl": 0.08843994140625,
"learning_rate": 7.695368466124296e-07,
"loss": -0.0573,
"reward": 0.20985493808984756,
"reward_std": 0.2973843924701214,
"rewards/cosine_len_reward": 0.20985493808984756,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2917.250030517578,
"epoch": 0.11657142857142858,
"grad_norm": 0.15920044481754303,
"kl": 0.040771484375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0565,
"reward": -0.1725541353225708,
"reward_std": 0.26636107824742794,
"rewards/cosine_len_reward": -0.1725541353225708,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 3479.0416870117188,
"epoch": 0.11714285714285715,
"grad_norm": 0.16037701070308685,
"kl": 0.04156494140625,
"learning_rate": 7.640308940816239e-07,
"loss": -0.0214,
"reward": 0.1938185803592205,
"reward_std": 0.42734283953905106,
"rewards/cosine_len_reward": 0.1938185803592205,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 3439.6250610351562,
"epoch": 0.11771428571428572,
"grad_norm": 0.15823422372341156,
"kl": 0.0565185546875,
"learning_rate": 7.612622032536507e-07,
"loss": -0.0167,
"reward": 0.1364712193608284,
"reward_std": 0.47612153738737106,
"rewards/cosine_len_reward": 0.1364712193608284,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 3275.8333740234375,
"epoch": 0.11828571428571429,
"grad_norm": 0.16698057949543,
"kl": 0.04461669921875,
"learning_rate": 7.584832158039378e-07,
"loss": -0.0623,
"reward": -0.13615961745381355,
"reward_std": 0.25652020424604416,
"rewards/cosine_len_reward": -0.13615961745381355,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2980.6250610351562,
"epoch": 0.11885714285714286,
"grad_norm": 0.1898704171180725,
"kl": 0.0433349609375,
"learning_rate": 7.556940671764124e-07,
"loss": 0.1447,
"reward": -0.31068204157054424,
"reward_std": 0.48038899153470993,
"rewards/cosine_len_reward": -0.31068204157054424,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2793.1666717529297,
"epoch": 0.11942857142857143,
"grad_norm": 0.34890925884246826,
"kl": 0.043212890625,
"learning_rate": 7.528948933102438e-07,
"loss": 0.1303,
"reward": -0.18676769733428955,
"reward_std": 0.4481005147099495,
"rewards/cosine_len_reward": -0.18676769733428955,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.12,
"grad_norm": 0.15249201655387878,
"kl": 0.05181884765625,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0002,
"reward": -0.07446567714214325,
"reward_std": 0.18282443098723888,
"rewards/cosine_len_reward": -0.07446567714214325,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 2958.8333740234375,
"epoch": 0.12057142857142857,
"grad_norm": 0.17982329428195953,
"kl": 0.04461669921875,
"learning_rate": 7.472670160550848e-07,
"loss": -0.1281,
"reward": 0.04834838956594467,
"reward_std": 0.4822767600417137,
"rewards/cosine_len_reward": 0.04834838956594467,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 3580.625,
"epoch": 0.12114285714285715,
"grad_norm": 0.17494414746761322,
"kl": 0.05645751953125,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0015,
"reward": 0.2551577538251877,
"reward_std": 0.322468139231205,
"rewards/cosine_len_reward": 0.2551577538251877,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 3130.0416870117188,
"epoch": 0.12171428571428572,
"grad_norm": 0.22192618250846863,
"kl": 0.0361328125,
"learning_rate": 7.416006812042827e-07,
"loss": -0.0966,
"reward": 0.09590141475200653,
"reward_std": 0.3808777518570423,
"rewards/cosine_len_reward": 0.09590141475200653,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 3035.8333435058594,
"epoch": 0.12228571428571429,
"grad_norm": 0.23687028884887695,
"kl": 0.04388427734375,
"learning_rate": 7.387534371007797e-07,
"loss": -0.0639,
"reward": -0.2096049189567566,
"reward_std": 0.36091630533337593,
"rewards/cosine_len_reward": -0.2096049189567566,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 3564.6666870117188,
"epoch": 0.12285714285714286,
"grad_norm": 0.16293585300445557,
"kl": 0.0419921875,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0036,
"reward": -0.3932174853980541,
"reward_std": 0.4780453070998192,
"rewards/cosine_len_reward": -0.3932174853980541,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 2151.041748046875,
"epoch": 0.12342857142857143,
"grad_norm": 0.18021531403064728,
"kl": 0.046142578125,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0426,
"reward": -0.43433932960033417,
"reward_std": 0.39597445726394653,
"rewards/cosine_len_reward": -0.43433932960033417,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.124,
"grad_norm": 0.1701132208108902,
"kl": 0.0526123046875,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0002,
"reward": 0.27941954880952835,
"reward_std": 0.33769937977194786,
"rewards/cosine_len_reward": 0.27941954880952835,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 3329.75,
"epoch": 0.12457142857142857,
"grad_norm": 0.164340078830719,
"kl": 0.0550537109375,
"learning_rate": 7.27273859315928e-07,
"loss": -0.05,
"reward": -0.12782394886016846,
"reward_std": 0.5567526817321777,
"rewards/cosine_len_reward": -0.12782394886016846,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 2744.3750610351562,
"epoch": 0.12514285714285714,
"grad_norm": 0.22663144767284393,
"kl": 0.05523681640625,
"learning_rate": 7.243820139034464e-07,
"loss": -0.0076,
"reward": -0.43699468672275543,
"reward_std": 0.4379548355937004,
"rewards/cosine_len_reward": -0.43699468672275543,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 3289.916748046875,
"epoch": 0.12571428571428572,
"grad_norm": 0.15933476388454437,
"kl": 0.04522705078125,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0215,
"reward": 0.18807393498718739,
"reward_std": 0.5197709389030933,
"rewards/cosine_len_reward": 0.18807393498718739,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 2958.1666870117188,
"epoch": 0.12628571428571428,
"grad_norm": 0.21169209480285645,
"kl": 0.0587158203125,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0842,
"reward": -0.10242819041013718,
"reward_std": 0.5134933441877365,
"rewards/cosine_len_reward": -0.10242819041013718,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 2638.3750610351562,
"epoch": 0.12685714285714286,
"grad_norm": 0.3018430769443512,
"kl": 0.04803466796875,
"learning_rate": 7.156560487081051e-07,
"loss": -0.1641,
"reward": -0.17160223424434662,
"reward_std": 0.40449361503124237,
"rewards/cosine_len_reward": -0.17160223424434662,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2908.2916870117188,
"epoch": 0.12742857142857142,
"grad_norm": 0.4184667766094208,
"kl": 0.0513916015625,
"learning_rate": 7.127310565369415e-07,
"loss": -0.1032,
"reward": -0.2708446606993675,
"reward_std": 0.4833526462316513,
"rewards/cosine_len_reward": -0.2708446606993675,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.128,
"grad_norm": 0.14835132658481598,
"kl": 0.0413818359375,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0002,
"reward": -0.06757992756320164,
"reward_std": 0.6502542048692703,
"rewards/cosine_len_reward": -0.06757992756320164,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 3400.1250610351562,
"epoch": 0.12857142857142856,
"grad_norm": 0.15779976546764374,
"kl": 0.04864501953125,
"learning_rate": 7.068574212948169e-07,
"loss": -0.0015,
"reward": -0.3096113298088312,
"reward_std": 0.5966300740838051,
"rewards/cosine_len_reward": -0.3096113298088312,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2957.0000610351562,
"epoch": 0.12914285714285714,
"grad_norm": 0.2116604745388031,
"kl": 0.04412841796875,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0589,
"reward": 0.3183956618886441,
"reward_std": 0.5148555189371109,
"rewards/cosine_len_reward": 0.3183956618886441,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2699.5416870117188,
"epoch": 0.12971428571428573,
"grad_norm": 0.2158554196357727,
"kl": 0.036590576171875,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0252,
"reward": -0.019072268158197403,
"reward_std": 0.51263727247715,
"rewards/cosine_len_reward": -0.019072268158197403,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 2288.250030517578,
"epoch": 0.13028571428571428,
"grad_norm": 0.4335751235485077,
"kl": 0.04742431640625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.1996,
"reward": -0.08665098808705807,
"reward_std": 0.4403616450726986,
"rewards/cosine_len_reward": -0.08665098808705807,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 3032.75,
"epoch": 0.13085714285714287,
"grad_norm": 0.17546547949314117,
"kl": 0.04461669921875,
"learning_rate": 6.950195628537299e-07,
"loss": -0.0213,
"reward": -0.4365268647670746,
"reward_std": 0.30412301421165466,
"rewards/cosine_len_reward": -0.4365268647670746,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.13142857142857142,
"grad_norm": 0.14412453770637512,
"kl": 0.04443359375,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0002,
"reward": 0.007472768425941467,
"reward_std": 0.3222753778100014,
"rewards/cosine_len_reward": 0.007472768425941467,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 3375.6666870117188,
"epoch": 0.132,
"grad_norm": 0.21483556926250458,
"kl": 0.04730224609375,
"learning_rate": 6.890576474687263e-07,
"loss": -0.0604,
"reward": 0.14519068226218224,
"reward_std": 0.3973130788654089,
"rewards/cosine_len_reward": 0.14519068226218224,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2880.75,
"epoch": 0.13257142857142856,
"grad_norm": 0.19802089035511017,
"kl": 0.04461669921875,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0559,
"reward": -0.46153586730360985,
"reward_std": 0.4743086025118828,
"rewards/cosine_len_reward": -0.46153586730360985,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 3466.0416870117188,
"epoch": 0.13314285714285715,
"grad_norm": 0.14607660472393036,
"kl": 0.04266357421875,
"learning_rate": 6.83068622519821e-07,
"loss": -0.0599,
"reward": -0.0832620239816606,
"reward_std": 0.37259791046380997,
"rewards/cosine_len_reward": -0.0832620239816606,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1337142857142857,
"grad_norm": 0.15555140376091003,
"kl": 0.0460205078125,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0002,
"reward": -0.141191266477108,
"reward_std": 0.2727678678929806,
"rewards/cosine_len_reward": -0.141191266477108,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2840.375,
"epoch": 0.13428571428571429,
"grad_norm": 0.2309923619031906,
"kl": 0.04388427734375,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0619,
"reward": -0.030156303197145462,
"reward_std": 0.4152325987815857,
"rewards/cosine_len_reward": -0.030156303197145462,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 2298.25,
"epoch": 0.13485714285714287,
"grad_norm": 0.43432554602622986,
"kl": 0.0498046875,
"learning_rate": 6.740368101176495e-07,
"loss": -0.1391,
"reward": -0.2793803792446852,
"reward_std": 0.5643313974142075,
"rewards/cosine_len_reward": -0.2793803792446852,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2497.625,
"epoch": 0.13542857142857143,
"grad_norm": 0.29664841294288635,
"kl": 0.06695556640625,
"learning_rate": 6.710139192768694e-07,
"loss": -0.0854,
"reward": -0.17586842365562916,
"reward_std": 0.5680941194295883,
"rewards/cosine_len_reward": -0.17586842365562916,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 3128.7083740234375,
"epoch": 0.136,
"grad_norm": 0.21420085430145264,
"kl": 0.0462646484375,
"learning_rate": 6.679851303883891e-07,
"loss": -0.0854,
"reward": -0.4128701612353325,
"reward_std": 0.3054894767701626,
"rewards/cosine_len_reward": -0.4128701612353325,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.13657142857142857,
"grad_norm": 0.1544465720653534,
"kl": 0.05194091796875,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0002,
"reward": 0.15613190457224846,
"reward_std": 0.3620890751481056,
"rewards/cosine_len_reward": 0.15613190457224846,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.13714285714285715,
"grad_norm": 0.17545334994792938,
"kl": 0.04376220703125,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0002,
"reward": -0.3966597355902195,
"reward_std": 0.3349955417215824,
"rewards/cosine_len_reward": -0.3966597355902195,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2736.4583740234375,
"epoch": 0.1377142857142857,
"grad_norm": 0.2892467975616455,
"kl": 0.04608154296875,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0983,
"reward": 0.0027789995074272156,
"reward_std": 0.4166061468422413,
"rewards/cosine_len_reward": 0.0027789995074272156,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1382857142857143,
"grad_norm": 0.15071091055870056,
"kl": 0.05181884765625,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0002,
"reward": 3.524124622344971e-05,
"reward_std": 0.23610319755971432,
"rewards/cosine_len_reward": 3.524124622344971e-05,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 3580.7916870117188,
"epoch": 0.13885714285714285,
"grad_norm": 0.17169542610645294,
"kl": 0.05706787109375,
"learning_rate": 6.527578915497951e-07,
"loss": -0.0004,
"reward": -0.4086134284734726,
"reward_std": 0.34414974600076675,
"rewards/cosine_len_reward": -0.4086134284734726,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 3362.0416870117188,
"epoch": 0.13942857142857143,
"grad_norm": 0.28142961859703064,
"kl": 0.0596923828125,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0377,
"reward": -0.5328942588530481,
"reward_std": 0.45538509637117386,
"rewards/cosine_len_reward": -0.5328942588530481,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 3371.9583740234375,
"epoch": 0.14,
"grad_norm": 0.16989277303218842,
"kl": 0.0560302734375,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0021,
"reward": -0.057370007038116455,
"reward_std": 0.44972486793994904,
"rewards/cosine_len_reward": -0.057370007038116455,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 3018.541717529297,
"epoch": 0.14057142857142857,
"grad_norm": 0.14576825499534607,
"kl": 0.03741455078125,
"learning_rate": 6.435602608679916e-07,
"loss": -0.015,
"reward": 0.28845351678319275,
"reward_std": 0.40407272428274155,
"rewards/cosine_len_reward": 0.28845351678319275,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.14114285714285715,
"grad_norm": 0.1530689150094986,
"kl": 0.0460205078125,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0002,
"reward": 0.1336237620562315,
"reward_std": 0.47701434791088104,
"rewards/cosine_len_reward": 0.1336237620562315,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 2176.7500228881836,
"epoch": 0.1417142857142857,
"grad_norm": 0.38253119587898254,
"kl": 0.04736328125,
"learning_rate": 6.374054580489873e-07,
"loss": 0.1374,
"reward": -0.14296885952353477,
"reward_std": 0.45761511474847794,
"rewards/cosine_len_reward": -0.14296885952353477,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 3580.25,
"epoch": 0.1422857142857143,
"grad_norm": 0.16433897614479065,
"kl": 0.04766845703125,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0014,
"reward": -0.33391520008444786,
"reward_std": 0.3410606235265732,
"rewards/cosine_len_reward": -0.33391520008444786,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 3267.875,
"epoch": 0.14285714285714285,
"grad_norm": 0.17220118641853333,
"kl": 0.04205322265625,
"learning_rate": 6.31233615362752e-07,
"loss": -0.0927,
"reward": -0.2974054589867592,
"reward_std": 0.5179209262132645,
"rewards/cosine_len_reward": -0.2974054589867592,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2834.875030517578,
"epoch": 0.14342857142857143,
"grad_norm": 0.1579933613538742,
"kl": 0.042724609375,
"learning_rate": 6.281416799501187e-07,
"loss": -0.1072,
"reward": -0.3689499036408961,
"reward_std": 0.48299194872379303,
"rewards/cosine_len_reward": -0.3689499036408961,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 3071.5833740234375,
"epoch": 0.144,
"grad_norm": 0.265338659286499,
"kl": 0.05572509765625,
"learning_rate": 6.25045936022246e-07,
"loss": -0.0647,
"reward": -0.3644823618233204,
"reward_std": 0.4328817129135132,
"rewards/cosine_len_reward": -0.3644823618233204,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 3208.8333740234375,
"epoch": 0.14457142857142857,
"grad_norm": 0.17507553100585938,
"kl": 0.045654296875,
"learning_rate": 6.219465344613258e-07,
"loss": -0.0457,
"reward": 0.07047359831631184,
"reward_std": 0.47896186634898186,
"rewards/cosine_len_reward": 0.07047359831631184,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 3244.2916870117188,
"epoch": 0.14514285714285713,
"grad_norm": 0.13542579114437103,
"kl": 0.03094482421875,
"learning_rate": 6.188436263278172e-07,
"loss": -0.0562,
"reward": 0.38170455396175385,
"reward_std": 0.4014411121606827,
"rewards/cosine_len_reward": 0.38170455396175385,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 3375.375,
"epoch": 0.1457142857142857,
"grad_norm": 0.1907050907611847,
"kl": 0.06121826171875,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0057,
"reward": -0.2342095747590065,
"reward_std": 0.5328280627727509,
"rewards/cosine_len_reward": -0.2342095747590065,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1462857142857143,
"grad_norm": 0.14469757676124573,
"kl": 0.03936767578125,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0002,
"reward": -0.2734826896339655,
"reward_std": 0.2763890288770199,
"rewards/cosine_len_reward": -0.2734826896339655,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 3096.2916870117188,
"epoch": 0.14685714285714285,
"grad_norm": 1.1167607307434082,
"kl": 0.1475830078125,
"learning_rate": 6.095153756157051e-07,
"loss": 0.1404,
"reward": -0.2988658621907234,
"reward_std": 0.4727318063378334,
"rewards/cosine_len_reward": -0.2988658621907234,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2151.0,
"epoch": 0.14742857142857144,
"grad_norm": 0.2602831721305847,
"kl": 0.03936767578125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.142,
"reward": 0.09373210370540619,
"reward_std": 0.3661756291985512,
"rewards/cosine_len_reward": 0.09373210370540619,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.148,
"grad_norm": 0.15572789311408997,
"kl": 0.04443359375,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0002,
"reward": -0.016586612910032272,
"reward_std": 0.45464975386857986,
"rewards/cosine_len_reward": -0.016586612910032272,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 3412.0000610351562,
"epoch": 0.14857142857142858,
"grad_norm": 0.18364061415195465,
"kl": 0.04327392578125,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0325,
"reward": 0.19075123965740204,
"reward_std": 0.40447286888957024,
"rewards/cosine_len_reward": 0.19075123965740204,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.14914285714285713,
"grad_norm": 0.13612648844718933,
"kl": 0.041015625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0002,
"reward": -0.20534592866897583,
"reward_std": 0.292750746011734,
"rewards/cosine_len_reward": -0.20534592866897583,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 3564.125,
"epoch": 0.14971428571428572,
"grad_norm": 0.14559589326381683,
"kl": 0.0447998046875,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0092,
"reward": -0.18081504851579666,
"reward_std": 0.4650590941309929,
"rewards/cosine_len_reward": -0.18081504851579666,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 3468.375,
"epoch": 0.15028571428571427,
"grad_norm": 0.17928771674633026,
"kl": 0.05291748046875,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0121,
"reward": 0.09083682298660278,
"reward_std": 0.3736903816461563,
"rewards/cosine_len_reward": 0.09083682298660278,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.15085714285714286,
"grad_norm": 0.20904290676116943,
"kl": 0.05096435546875,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0002,
"reward": -0.08812252432107925,
"reward_std": 0.37937742099165916,
"rewards/cosine_len_reward": -0.08812252432107925,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3428.0416870117188,
"epoch": 0.15142857142857144,
"grad_norm": 0.1781691014766693,
"kl": 0.0589599609375,
"learning_rate": 5.845235626570683e-07,
"loss": -0.0085,
"reward": -0.18174926191568375,
"reward_std": 0.5525826513767242,
"rewards/cosine_len_reward": -0.18174926191568375,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 3207.125,
"epoch": 0.152,
"grad_norm": 0.16391095519065857,
"kl": 0.055908203125,
"learning_rate": 5.813904131848564e-07,
"loss": 0.01,
"reward": -0.08231775928288698,
"reward_std": 0.4217005968093872,
"rewards/cosine_len_reward": -0.08231775928288698,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2443.0834045410156,
"epoch": 0.15257142857142858,
"grad_norm": 0.5960751175880432,
"kl": 0.064697265625,
"learning_rate": 5.78255733788191e-07,
"loss": 0.3249,
"reward": -0.4397448003292084,
"reward_std": 0.4261315129697323,
"rewards/cosine_len_reward": -0.4397448003292084,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2792.7916717529297,
"epoch": 0.15314285714285714,
"grad_norm": 0.37113648653030396,
"kl": 0.04974365234375,
"learning_rate": 5.751196772469237e-07,
"loss": 0.1018,
"reward": 0.23075676709413528,
"reward_std": 0.3936547078192234,
"rewards/cosine_len_reward": 0.23075676709413528,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 3461.5833740234375,
"epoch": 0.15371428571428572,
"grad_norm": 0.15312907099723816,
"kl": 0.0408935546875,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0456,
"reward": 0.018859659554436803,
"reward_std": 0.36391543596982956,
"rewards/cosine_len_reward": 0.018859659554436803,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2649.0833740234375,
"epoch": 0.15428571428571428,
"grad_norm": 0.2657376527786255,
"kl": 0.05450439453125,
"learning_rate": 5.688440441781398e-07,
"loss": -0.0693,
"reward": -0.2678499221801758,
"reward_std": 0.4750388078391552,
"rewards/cosine_len_reward": -0.2678499221801758,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 3246.2084350585938,
"epoch": 0.15485714285714286,
"grad_norm": 0.26408547163009644,
"kl": 0.06634521484375,
"learning_rate": 5.657047735161255e-07,
"loss": 0.1012,
"reward": -0.10909051727503538,
"reward_std": 0.4644903093576431,
"rewards/cosine_len_reward": -0.10909051727503538,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2596.5416870117188,
"epoch": 0.15542857142857142,
"grad_norm": 0.3016497492790222,
"kl": 0.0518798828125,
"learning_rate": 5.625647374256061e-07,
"loss": 0.2311,
"reward": -0.2824300043284893,
"reward_std": 0.4353151246905327,
"rewards/cosine_len_reward": -0.2824300043284893,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 3439.3333740234375,
"epoch": 0.156,
"grad_norm": 0.18352919816970825,
"kl": 0.05572509765625,
"learning_rate": 5.594240889475106e-07,
"loss": -0.0308,
"reward": -0.41946647968143225,
"reward_std": 0.5297495797276497,
"rewards/cosine_len_reward": -0.41946647968143225,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.15657142857142858,
"grad_norm": 0.14747214317321777,
"kl": 0.0426025390625,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0002,
"reward": -0.2709335945546627,
"reward_std": 0.3659312203526497,
"rewards/cosine_len_reward": -0.2709335945546627,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 3159.541748046875,
"epoch": 0.15714285714285714,
"grad_norm": 0.21950587630271912,
"kl": 0.05682373046875,
"learning_rate": 5.531415671340826e-07,
"loss": -0.0847,
"reward": -0.36149609088897705,
"reward_std": 0.4367978870868683,
"rewards/cosine_len_reward": -0.36149609088897705,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 2158.1666717529297,
"epoch": 0.15771428571428572,
"grad_norm": 0.2495608627796173,
"kl": 0.0460205078125,
"learning_rate": 5.5e-07,
"loss": 0.07,
"reward": -0.10891957813873887,
"reward_std": 0.31222014874219894,
"rewards/cosine_len_reward": -0.10891957813873887,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 3555.7916870117188,
"epoch": 0.15828571428571428,
"grad_norm": 0.15767265856266022,
"kl": 0.04681396484375,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0018,
"reward": -0.11694742739200592,
"reward_std": 0.5289106294512749,
"rewards/cosine_len_reward": -0.11694742739200592,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 3408.4166870117188,
"epoch": 0.15885714285714286,
"grad_norm": 0.15594230592250824,
"kl": 0.0472412109375,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0096,
"reward": -0.6330656111240387,
"reward_std": 0.4878097102046013,
"rewards/cosine_len_reward": -0.6330656111240387,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 3456.7916870117188,
"epoch": 0.15942857142857142,
"grad_norm": 0.17065085470676422,
"kl": 0.0494384765625,
"learning_rate": 5.405759110524894e-07,
"loss": -0.0232,
"reward": -0.011678516864776611,
"reward_std": 0.29044996201992035,
"rewards/cosine_len_reward": -0.011678516864776611,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 3121.3333740234375,
"epoch": 0.16,
"grad_norm": 0.18095840513706207,
"kl": 0.04248046875,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0204,
"reward": 0.11361887771636248,
"reward_std": 0.4011372458189726,
"rewards/cosine_len_reward": 0.11361887771636248,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 2895.125030517578,
"epoch": 0.16057142857142856,
"grad_norm": 0.2143956571817398,
"kl": 0.059814453125,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0687,
"reward": -0.08828364498913288,
"reward_std": 0.6478094309568405,
"rewards/cosine_len_reward": -0.08828364498913288,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 3567.1666870117188,
"epoch": 0.16114285714285714,
"grad_norm": 0.188002809882164,
"kl": 0.05224609375,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0041,
"reward": 0.028641201555728912,
"reward_std": 0.5138514451682568,
"rewards/cosine_len_reward": 0.028641201555728912,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2583.625030517578,
"epoch": 0.16171428571428573,
"grad_norm": 0.2364453375339508,
"kl": 0.05224609375,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0869,
"reward": -0.26807255670428276,
"reward_std": 0.41767074167728424,
"rewards/cosine_len_reward": -0.26807255670428276,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2788.9166717529297,
"epoch": 0.16228571428571428,
"grad_norm": 0.3112829029560089,
"kl": 0.04931640625,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0558,
"reward": 0.36127352714538574,
"reward_std": 0.25031092017889023,
"rewards/cosine_len_reward": 0.36127352714538574,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 3566.25,
"epoch": 0.16285714285714287,
"grad_norm": 0.14432963728904724,
"kl": 0.053955078125,
"learning_rate": 5.21744266211809e-07,
"loss": -0.0087,
"reward": -0.09366314113140106,
"reward_std": 0.4134976416826248,
"rewards/cosine_len_reward": -0.09366314113140106,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 3367.0,
"epoch": 0.16342857142857142,
"grad_norm": 0.13774707913398743,
"kl": 0.037109375,
"learning_rate": 5.186095868151436e-07,
"loss": -0.0435,
"reward": 0.15960774943232536,
"reward_std": 0.44187677651643753,
"rewards/cosine_len_reward": 0.15960774943232536,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 3458.6250610351562,
"epoch": 0.164,
"grad_norm": 0.1523509919643402,
"kl": 0.0465087890625,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0327,
"reward": -0.1596720740199089,
"reward_std": 0.29411060363054276,
"rewards/cosine_len_reward": -0.1596720740199089,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 3469.75,
"epoch": 0.16457142857142856,
"grad_norm": 0.17270562052726746,
"kl": 0.05035400390625,
"learning_rate": 5.123449705004581e-07,
"loss": 0.07,
"reward": -0.7143011689186096,
"reward_std": 0.25607092306017876,
"rewards/cosine_len_reward": -0.7143011689186096,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 3468.0416870117188,
"epoch": 0.16514285714285715,
"grad_norm": 0.19703444838523865,
"kl": 0.0517578125,
"learning_rate": 5.09215338910999e-07,
"loss": -0.0199,
"reward": -0.009370148181915283,
"reward_std": 0.26325560361146927,
"rewards/cosine_len_reward": -0.009370148181915283,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 3348.5000610351562,
"epoch": 0.1657142857142857,
"grad_norm": 0.26584550738334656,
"kl": 0.106689453125,
"learning_rate": 5.060876951083828e-07,
"loss": -0.1073,
"reward": 0.4936791881918907,
"reward_std": 0.5171967372298241,
"rewards/cosine_len_reward": 0.4936791881918907,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 3520.2916870117188,
"epoch": 0.1662857142857143,
"grad_norm": 0.160246804356575,
"kl": 0.04925537109375,
"learning_rate": 5.02962191529556e-07,
"loss": 0.019,
"reward": 0.06787654012441635,
"reward_std": 0.4259056970477104,
"rewards/cosine_len_reward": 0.06787654012441635,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 3394.7083740234375,
"epoch": 0.16685714285714287,
"grad_norm": 0.16499929130077362,
"kl": 0.0457763671875,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0067,
"reward": -0.37206324748694897,
"reward_std": 0.6219586506485939,
"rewards/cosine_len_reward": -0.37206324748694897,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 3028.2500610351562,
"epoch": 0.16742857142857143,
"grad_norm": 0.1728057563304901,
"kl": 0.05438232421875,
"learning_rate": 4.967182142620745e-07,
"loss": -0.0489,
"reward": -0.24254203587770462,
"reward_std": 0.5258737653493881,
"rewards/cosine_len_reward": -0.24254203587770462,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 3453.7500610351562,
"epoch": 0.168,
"grad_norm": 0.16823387145996094,
"kl": 0.05694580078125,
"learning_rate": 4.93600044896063e-07,
"loss": -0.0096,
"reward": -0.27283355966210365,
"reward_std": 0.4063083231449127,
"rewards/cosine_len_reward": -0.27283355966210365,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 3561.1666870117188,
"epoch": 0.16857142857142857,
"grad_norm": 0.14513014256954193,
"kl": 0.0462646484375,
"learning_rate": 4.904846243842949e-07,
"loss": -0.0056,
"reward": 0.28293178975582123,
"reward_std": 0.30718712508678436,
"rewards/cosine_len_reward": 0.28293178975582123,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 3301.2916870117188,
"epoch": 0.16914285714285715,
"grad_norm": 0.18542294204235077,
"kl": 0.07574462890625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0151,
"reward": -0.4336223527789116,
"reward_std": 0.4073316380381584,
"rewards/cosine_len_reward": -0.4336223527789116,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 3456.4583740234375,
"epoch": 0.1697142857142857,
"grad_norm": 0.1608293056488037,
"kl": 0.04888916015625,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0333,
"reward": 0.384184792637825,
"reward_std": 0.2665172927081585,
"rewards/cosine_len_reward": 0.384184792637825,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 3206.0416870117188,
"epoch": 0.1702857142857143,
"grad_norm": 0.19832450151443481,
"kl": 0.07696533203125,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0752,
"reward": -0.399520443752408,
"reward_std": 0.32169621996581554,
"rewards/cosine_len_reward": -0.399520443752408,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 3512.625,
"epoch": 0.17085714285714285,
"grad_norm": 0.21216793358325958,
"kl": 0.06488037109375,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0376,
"reward": 0.398956298828125,
"reward_std": 0.3917335644364357,
"rewards/cosine_len_reward": 0.398956298828125,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 3413.25,
"epoch": 0.17142857142857143,
"grad_norm": 0.16395436227321625,
"kl": 0.053955078125,
"learning_rate": 4.749540639777539e-07,
"loss": -0.015,
"reward": -0.36241818219423294,
"reward_std": 0.4449329450726509,
"rewards/cosine_len_reward": -0.36241818219423294,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 3202.125,
"epoch": 0.172,
"grad_norm": 0.15313901007175446,
"kl": 0.04730224609375,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0129,
"reward": -0.2640495439991355,
"reward_std": 0.47315043210983276,
"rewards/cosine_len_reward": -0.2640495439991355,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2909.0833435058594,
"epoch": 0.17257142857142857,
"grad_norm": 27.085668563842773,
"kl": 21.65618896484375,
"learning_rate": 4.68766384637248e-07,
"loss": 0.1262,
"reward": 0.044873252511024475,
"reward_std": 0.23454123549163342,
"rewards/cosine_len_reward": 0.044873252511024475,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 3122.625,
"epoch": 0.17314285714285715,
"grad_norm": 0.1699696183204651,
"kl": 0.043304443359375,
"learning_rate": 4.656784084364238e-07,
"loss": -0.0568,
"reward": -0.30118887685239315,
"reward_std": 0.4637632668018341,
"rewards/cosine_len_reward": -0.30118887685239315,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1737142857142857,
"grad_norm": 0.19118285179138184,
"kl": 0.04522705078125,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0002,
"reward": -0.37123019248247147,
"reward_std": 0.4171289950609207,
"rewards/cosine_len_reward": -0.37123019248247147,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 3526.9583740234375,
"epoch": 0.1742857142857143,
"grad_norm": 0.14197200536727905,
"kl": 0.0484619140625,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0266,
"reward": -0.04980198014527559,
"reward_std": 0.5029722154140472,
"rewards/cosine_len_reward": -0.04980198014527559,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 2224.3750228881836,
"epoch": 0.17485714285714285,
"grad_norm": 0.5903677940368652,
"kl": 0.15130615234375,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.2378,
"reward": 0.26856766641139984,
"reward_std": 0.42505303025245667,
"rewards/cosine_len_reward": 0.26856766641139984,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17542857142857143,
"grad_norm": 0.1757889688014984,
"kl": 0.0382080078125,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0002,
"reward": -0.4401230961084366,
"reward_std": 0.23055214434862137,
"rewards/cosine_len_reward": -0.4401230961084366,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2553.7083740234375,
"epoch": 0.176,
"grad_norm": 0.2097814977169037,
"kl": 0.0811767578125,
"learning_rate": 4.503031760712397e-07,
"loss": -0.1351,
"reward": -0.36412402987480164,
"reward_std": 0.49435050785541534,
"rewards/cosine_len_reward": -0.36412402987480164,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 3548.8333740234375,
"epoch": 0.17657142857142857,
"grad_norm": 0.16092661023139954,
"kl": 0.05072021484375,
"learning_rate": 4.4724210845020494e-07,
"loss": -0.0036,
"reward": -0.14657190442085266,
"reward_std": 0.38866011798381805,
"rewards/cosine_len_reward": -0.14657190442085266,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.17714285714285713,
"grad_norm": 0.13816198706626892,
"kl": 0.03985595703125,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0002,
"reward": -0.41758012771606445,
"reward_std": 0.37766269221901894,
"rewards/cosine_len_reward": -0.41758012771606445,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 3376.0833740234375,
"epoch": 0.1777142857142857,
"grad_norm": 0.18006065487861633,
"kl": 0.05718994140625,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0754,
"reward": 0.1647863369435072,
"reward_std": 0.48134757578372955,
"rewards/cosine_len_reward": 0.1647863369435072,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 3477.916748046875,
"epoch": 0.1782857142857143,
"grad_norm": 0.17141719162464142,
"kl": 0.0614013671875,
"learning_rate": 4.3808955077581546e-07,
"loss": -0.0411,
"reward": -0.01686130464076996,
"reward_std": 0.40112806484103203,
"rewards/cosine_len_reward": -0.01686130464076996,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2756.375,
"epoch": 0.17885714285714285,
"grad_norm": 0.31155240535736084,
"kl": 0.050994873046875,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0399,
"reward": 0.356310099363327,
"reward_std": 0.22038070112466812,
"rewards/cosine_len_reward": 0.356310099363327,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 3327.0833740234375,
"epoch": 0.17942857142857144,
"grad_norm": 0.15625828504562378,
"kl": 0.0460205078125,
"learning_rate": 4.3201486961161093e-07,
"loss": -0.0522,
"reward": -0.1617246214300394,
"reward_std": 0.2959202714264393,
"rewards/cosine_len_reward": -0.1617246214300394,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 3233.1666870117188,
"epoch": 0.18,
"grad_norm": 0.21146373450756073,
"kl": 0.05609130859375,
"learning_rate": 4.2898608072313045e-07,
"loss": -0.0672,
"reward": -0.5065985713154078,
"reward_std": 0.29883245564997196,
"rewards/cosine_len_reward": -0.5065985713154078,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.18057142857142858,
"grad_norm": 0.16894738376140594,
"kl": 0.05133056640625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0002,
"reward": -0.08826442807912827,
"reward_std": 0.45449625700712204,
"rewards/cosine_len_reward": -0.08826442807912827,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 3471.666748046875,
"epoch": 0.18114285714285713,
"grad_norm": 0.16621477901935577,
"kl": 0.05419921875,
"learning_rate": 4.2294634442070553e-07,
"loss": -0.0353,
"reward": -0.05168744921684265,
"reward_std": 0.4507448337972164,
"rewards/cosine_len_reward": -0.05168744921684265,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 3045.0416870117188,
"epoch": 0.18171428571428572,
"grad_norm": 0.3005415201187134,
"kl": 0.05413818359375,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.081,
"reward": -0.2615435868501663,
"reward_std": 0.42054056003689766,
"rewards/cosine_len_reward": -0.2615435868501663,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 3559.5416870117188,
"epoch": 0.18228571428571427,
"grad_norm": 0.15960033237934113,
"kl": 0.04742431640625,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0034,
"reward": -0.23648555018007755,
"reward_std": 0.5849247425794601,
"rewards/cosine_len_reward": -0.23648555018007755,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 3215.4583740234375,
"epoch": 0.18285714285714286,
"grad_norm": 0.20692546665668488,
"kl": 0.044647216796875,
"learning_rate": 4.1393354916230005e-07,
"loss": -0.057,
"reward": -0.3042381815612316,
"reward_std": 0.6242840066552162,
"rewards/cosine_len_reward": -0.3042381815612316,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.18342857142857144,
"grad_norm": 0.16232453286647797,
"kl": 0.05303955078125,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0002,
"reward": 0.05929779075086117,
"reward_std": 0.25769692938774824,
"rewards/cosine_len_reward": 0.05929779075086117,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 3570.0416870117188,
"epoch": 0.184,
"grad_norm": 0.14924933016300201,
"kl": 0.043487548828125,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0055,
"reward": -0.3419235274195671,
"reward_std": 0.47414593398571014,
"rewards/cosine_len_reward": -0.3419235274195671,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.18457142857142858,
"grad_norm": 0.15801897644996643,
"kl": 0.0462646484375,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0002,
"reward": -0.17604749649763107,
"reward_std": 0.455319419503212,
"rewards/cosine_len_reward": -0.17604749649763107,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.18514285714285714,
"grad_norm": 0.1491517424583435,
"kl": 0.0474853515625,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0002,
"reward": -0.2877863794565201,
"reward_std": 0.37373417615890503,
"rewards/cosine_len_reward": -0.2877863794565201,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2878.7083435058594,
"epoch": 0.18571428571428572,
"grad_norm": 0.4391452670097351,
"kl": 0.04974365234375,
"learning_rate": 3.9904679361238526e-07,
"loss": -0.1851,
"reward": 0.12535587698221207,
"reward_std": 0.30845265835523605,
"rewards/cosine_len_reward": 0.12535587698221207,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2754.0416870117188,
"epoch": 0.18628571428571428,
"grad_norm": 0.35371315479278564,
"kl": 0.05859375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0695,
"reward": 0.3398248925805092,
"reward_std": 0.35160839185118675,
"rewards/cosine_len_reward": 0.3398248925805092,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 3322.4166870117188,
"epoch": 0.18685714285714286,
"grad_norm": 0.17805320024490356,
"kl": 0.04644775390625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0978,
"reward": -0.2754373177886009,
"reward_std": 0.3834121897816658,
"rewards/cosine_len_reward": -0.2754373177886009,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 3382.7083740234375,
"epoch": 0.18742857142857142,
"grad_norm": 0.2152082622051239,
"kl": 0.0640869140625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0142,
"reward": -0.6704437732696533,
"reward_std": 0.29631161503493786,
"rewards/cosine_len_reward": -0.6704437732696533,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 3507.0,
"epoch": 0.188,
"grad_norm": 0.15424416959285736,
"kl": 0.0408935546875,
"learning_rate": 3.872689434630585e-07,
"loss": -0.0165,
"reward": -0.30218280851840973,
"reward_std": 0.45681217312812805,
"rewards/cosine_len_reward": -0.30218280851840973,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 3086.7916870117188,
"epoch": 0.18857142857142858,
"grad_norm": 0.15681946277618408,
"kl": 0.04803466796875,
"learning_rate": 3.843439512918949e-07,
"loss": -0.0145,
"reward": -0.3932885080575943,
"reward_std": 0.38081324100494385,
"rewards/cosine_len_reward": -0.3932885080575943,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2938.0833435058594,
"epoch": 0.18914285714285714,
"grad_norm": 0.20311184227466583,
"kl": 0.0517578125,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0564,
"reward": -0.21995433419942856,
"reward_std": 0.4448351748287678,
"rewards/cosine_len_reward": -0.21995433419942856,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 3013.5833740234375,
"epoch": 0.18971428571428572,
"grad_norm": 0.19754794239997864,
"kl": 0.042724609375,
"learning_rate": 3.785183306423767e-07,
"loss": -0.1519,
"reward": 0.14706944674253464,
"reward_std": 0.4487891271710396,
"rewards/cosine_len_reward": 0.14706944674253464,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 3266.75,
"epoch": 0.19028571428571428,
"grad_norm": 0.17275254428386688,
"kl": 0.05133056640625,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0115,
"reward": -0.12366479635238647,
"reward_std": 0.4474521279335022,
"rewards/cosine_len_reward": -0.12366479635238647,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.19085714285714286,
"grad_norm": 0.15105217695236206,
"kl": 0.04327392578125,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0002,
"reward": -0.2032340094447136,
"reward_std": 0.3833343982696533,
"rewards/cosine_len_reward": -0.2032340094447136,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 3215.0000610351562,
"epoch": 0.19142857142857142,
"grad_norm": 0.20102421939373016,
"kl": 0.0518798828125,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0936,
"reward": 0.031060203909873962,
"reward_std": 0.5999341458082199,
"rewards/cosine_len_reward": 0.031060203909873962,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 3227.0000610351562,
"epoch": 0.192,
"grad_norm": 0.166560098528862,
"kl": 0.04205322265625,
"learning_rate": 3.6696851061588994e-07,
"loss": -0.0589,
"reward": -0.11390832741744816,
"reward_std": 0.5906789004802704,
"rewards/cosine_len_reward": -0.11390832741744816,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 3075.7916870117188,
"epoch": 0.19257142857142856,
"grad_norm": 0.15513859689235687,
"kl": 0.045654296875,
"learning_rate": 3.641030065789562e-07,
"loss": -0.0208,
"reward": -0.22384709864854813,
"reward_std": 0.3124929741024971,
"rewards/cosine_len_reward": -0.22384709864854813,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2809.666717529297,
"epoch": 0.19314285714285714,
"grad_norm": 0.2667958736419678,
"kl": 0.0615234375,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0492,
"reward": -0.3067406304180622,
"reward_std": 0.43594905734062195,
"rewards/cosine_len_reward": -0.3067406304180622,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.19371428571428573,
"grad_norm": 0.14318394660949707,
"kl": 0.0484619140625,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0002,
"reward": -0.3350971192121506,
"reward_std": 0.31668924540281296,
"rewards/cosine_len_reward": -0.3350971192121506,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2548.875030517578,
"epoch": 0.19428571428571428,
"grad_norm": 0.26427459716796875,
"kl": 0.04827880859375,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0543,
"reward": 0.14779387041926384,
"reward_std": 0.6089051365852356,
"rewards/cosine_len_reward": 0.14779387041926384,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 2463.750030517578,
"epoch": 0.19485714285714287,
"grad_norm": 0.2228541374206543,
"kl": 0.0516357421875,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0836,
"reward": -0.48010372975841165,
"reward_std": 0.2938494123518467,
"rewards/cosine_len_reward": -0.48010372975841165,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 3194.5000610351562,
"epoch": 0.19542857142857142,
"grad_norm": 0.1965860277414322,
"kl": 0.05621337890625,
"learning_rate": 3.4991416936678276e-07,
"loss": -0.0477,
"reward": -0.40905338898301125,
"reward_std": 0.3798966519534588,
"rewards/cosine_len_reward": -0.40905338898301125,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 3555.6666870117188,
"epoch": 0.196,
"grad_norm": 0.16845592856407166,
"kl": 0.0560302734375,
"learning_rate": 3.471051066897562e-07,
"loss": -0.0108,
"reward": -0.3629562482237816,
"reward_std": 0.31892314925789833,
"rewards/cosine_len_reward": -0.3629562482237816,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 3099.8750610351562,
"epoch": 0.19657142857142856,
"grad_norm": 0.1927381455898285,
"kl": 0.05059814453125,
"learning_rate": 3.4430593282358777e-07,
"loss": -0.016,
"reward": -0.2542693614959717,
"reward_std": 0.4295632019639015,
"rewards/cosine_len_reward": -0.2542693614959717,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 2577.9583740234375,
"epoch": 0.19714285714285715,
"grad_norm": 0.22457395493984222,
"kl": 0.0718994140625,
"learning_rate": 3.4151678419606233e-07,
"loss": -0.1906,
"reward": -0.30330940335989,
"reward_std": 0.50710579007864,
"rewards/cosine_len_reward": -0.30330940335989,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.1977142857142857,
"grad_norm": 0.1638084501028061,
"kl": 0.046417236328125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0002,
"reward": 0.060419052839279175,
"reward_std": 0.3038778752088547,
"rewards/cosine_len_reward": 0.060419052839279175,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 3490.2916870117188,
"epoch": 0.1982857142857143,
"grad_norm": 0.14090070128440857,
"kl": 0.0477294921875,
"learning_rate": 3.359691059183761e-07,
"loss": -0.0544,
"reward": -0.15175998210906982,
"reward_std": 0.4236246980726719,
"rewards/cosine_len_reward": -0.15175998210906982,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 3236.5833740234375,
"epoch": 0.19885714285714284,
"grad_norm": 0.17946301400661469,
"kl": 0.0538330078125,
"learning_rate": 3.3321084665422803e-07,
"loss": -0.0636,
"reward": -0.06013108603656292,
"reward_std": 0.5346257090568542,
"rewards/cosine_len_reward": -0.06013108603656292,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 3556.9583740234375,
"epoch": 0.19942857142857143,
"grad_norm": 0.1601073294878006,
"kl": 0.04962158203125,
"learning_rate": 3.3046315338757026e-07,
"loss": -0.0029,
"reward": -0.21468165516853333,
"reward_std": 0.5103632658720016,
"rewards/cosine_len_reward": -0.21468165516853333,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2724.125030517578,
"epoch": 0.2,
"grad_norm": 0.2437424212694168,
"kl": 0.05059814453125,
"learning_rate": 3.2772616003709616e-07,
"loss": -0.03,
"reward": 0.16293304320424795,
"reward_std": 0.6034757569432259,
"rewards/cosine_len_reward": 0.16293304320424795,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.20057142857142857,
"grad_norm": 0.15608130395412445,
"kl": 0.05377197265625,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0002,
"reward": -0.49736592173576355,
"reward_std": 0.3650210574269295,
"rewards/cosine_len_reward": -0.49736592173576355,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2824.000030517578,
"epoch": 0.20114285714285715,
"grad_norm": 0.19428031146526337,
"kl": 0.05194091796875,
"learning_rate": 3.222848061454764e-07,
"loss": 0.2626,
"reward": -0.3008427929598838,
"reward_std": 0.41808854788541794,
"rewards/cosine_len_reward": -0.3008427929598838,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 3445.9583740234375,
"epoch": 0.2017142857142857,
"grad_norm": 0.16469155251979828,
"kl": 0.05267333984375,
"learning_rate": 3.195807108082429e-07,
"loss": -0.0527,
"reward": -0.13214807212352753,
"reward_std": 0.49090687185525894,
"rewards/cosine_len_reward": -0.13214807212352753,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 3180.875,
"epoch": 0.2022857142857143,
"grad_norm": 0.1331019401550293,
"kl": 0.0361328125,
"learning_rate": 3.168878457820915e-07,
"loss": -0.0633,
"reward": -0.04561649262905121,
"reward_std": 0.42332185059785843,
"rewards/cosine_len_reward": -0.04561649262905121,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 3065.4583435058594,
"epoch": 0.20285714285714285,
"grad_norm": 0.20506833493709564,
"kl": 0.06689453125,
"learning_rate": 3.142063423134644e-07,
"loss": -0.0127,
"reward": -0.10782808437943459,
"reward_std": 0.36676693707704544,
"rewards/cosine_len_reward": -0.10782808437943459,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 3181.1666870117188,
"epoch": 0.20342857142857143,
"grad_norm": 0.14900951087474823,
"kl": 0.03961181640625,
"learning_rate": 3.115363310950578e-07,
"loss": -0.0347,
"reward": -0.19400886073708534,
"reward_std": 0.4912572205066681,
"rewards/cosine_len_reward": -0.19400886073708534,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 3115.7083740234375,
"epoch": 0.204,
"grad_norm": 0.5810701251029968,
"kl": 0.333984375,
"learning_rate": 3.0887794225945143e-07,
"loss": -0.0253,
"reward": -0.2779521383345127,
"reward_std": 0.48514702171087265,
"rewards/cosine_len_reward": -0.2779521383345127,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 3074.9166870117188,
"epoch": 0.20457142857142857,
"grad_norm": 0.21780595183372498,
"kl": 0.0684814453125,
"learning_rate": 3.062313053727671e-07,
"loss": -0.0599,
"reward": -0.5599739253520966,
"reward_std": 0.34018975496292114,
"rewards/cosine_len_reward": -0.5599739253520966,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 3332.5833740234375,
"epoch": 0.20514285714285715,
"grad_norm": 0.17007119953632355,
"kl": 0.05792236328125,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0268,
"reward": -0.22800956666469574,
"reward_std": 0.48570629209280014,
"rewards/cosine_len_reward": -0.22800956666469574,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2057142857142857,
"grad_norm": 0.15876568853855133,
"kl": 0.0408935546875,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0002,
"reward": 0.08246973995119333,
"reward_std": 0.6036019250750542,
"rewards/cosine_len_reward": 0.08246973995119333,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 3412.2916870117188,
"epoch": 0.2062857142857143,
"grad_norm": 0.14641037583351135,
"kl": 0.047119140625,
"learning_rate": 2.9836319343816397e-07,
"loss": -0.003,
"reward": 0.05859617702662945,
"reward_std": 0.4323427379131317,
"rewards/cosine_len_reward": 0.05859617702662945,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 3474.25,
"epoch": 0.20685714285714285,
"grad_norm": 0.1528189331293106,
"kl": 0.04168701171875,
"learning_rate": 2.9576484845877793e-07,
"loss": -0.0358,
"reward": -0.4896080791950226,
"reward_std": 0.37986551597714424,
"rewards/cosine_len_reward": -0.4896080791950226,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 3300.0000610351562,
"epoch": 0.20742857142857143,
"grad_norm": 0.20170490443706512,
"kl": 0.0927734375,
"learning_rate": 2.931788945420058e-07,
"loss": -0.0207,
"reward": 0.16002619452774525,
"reward_std": 0.5198325589299202,
"rewards/cosine_len_reward": 0.16002619452774525,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 3578.4166870117188,
"epoch": 0.208,
"grad_norm": 0.17275533080101013,
"kl": 0.04876708984375,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0011,
"reward": -0.2858371250331402,
"reward_std": 0.4636296257376671,
"rewards/cosine_len_reward": -0.2858371250331402,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 3389.6666870117188,
"epoch": 0.20857142857142857,
"grad_norm": 0.19140967726707458,
"kl": 0.05511474609375,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0506,
"reward": -0.24161814898252487,
"reward_std": 0.5207706317305565,
"rewards/cosine_len_reward": -0.24161814898252487,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 3495.041748046875,
"epoch": 0.20914285714285713,
"grad_norm": 0.15582628548145294,
"kl": 0.04608154296875,
"learning_rate": 2.854966364683872e-07,
"loss": 0.024,
"reward": 0.0705061387270689,
"reward_std": 0.29681421583518386,
"rewards/cosine_len_reward": 0.0705061387270689,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2878.250030517578,
"epoch": 0.20971428571428571,
"grad_norm": 0.26657283306121826,
"kl": 0.05078125,
"learning_rate": 2.829615010283344e-07,
"loss": 0.055,
"reward": -0.19209666550159454,
"reward_std": 0.5073798671364784,
"rewards/cosine_len_reward": -0.19209666550159454,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 2952.625,
"epoch": 0.2102857142857143,
"grad_norm": 0.2453869879245758,
"kl": 0.04644775390625,
"learning_rate": 2.8043938066798645e-07,
"loss": -0.0595,
"reward": -0.4701330562820658,
"reward_std": 0.43810437619686127,
"rewards/cosine_len_reward": -0.4701330562820658,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 3518.25,
"epoch": 0.21085714285714285,
"grad_norm": 0.17311729490756989,
"kl": 0.04888916015625,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0191,
"reward": -0.45571963116526604,
"reward_std": 0.40335123240947723,
"rewards/cosine_len_reward": -0.45571963116526604,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 3502.0,
"epoch": 0.21142857142857144,
"grad_norm": 0.1664201319217682,
"kl": 0.04937744140625,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0327,
"reward": -0.38726338744163513,
"reward_std": 0.5652562528848648,
"rewards/cosine_len_reward": -0.38726338744163513,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 3288.4583740234375,
"epoch": 0.212,
"grad_norm": 0.17351403832435608,
"kl": 0.042816162109375,
"learning_rate": 2.729523361034538e-07,
"loss": 0.045,
"reward": 0.021196894347667694,
"reward_std": 0.579230286180973,
"rewards/cosine_len_reward": 0.021196894347667694,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21257142857142858,
"grad_norm": 1.5477689504623413,
"kl": 0.093994140625,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0004,
"reward": -0.0014846324920654297,
"reward_std": 0.4312394857406616,
"rewards/cosine_len_reward": -0.0014846324920654297,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 3127.2916870117188,
"epoch": 0.21314285714285713,
"grad_norm": 0.1729487031698227,
"kl": 0.04461669921875,
"learning_rate": 2.6802828488599294e-07,
"loss": -0.0303,
"reward": -0.3757053539156914,
"reward_std": 0.5460054390132427,
"rewards/cosine_len_reward": -0.3757053539156914,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21371428571428572,
"grad_norm": 0.18273600935935974,
"kl": 0.0545654296875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0002,
"reward": -0.4427555501461029,
"reward_std": 0.4617629870772362,
"rewards/cosine_len_reward": -0.4427555501461029,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 3204.4166870117188,
"epoch": 0.21428571428571427,
"grad_norm": 0.26651179790496826,
"kl": 0.09613037109375,
"learning_rate": 2.631592046130896e-07,
"loss": -0.0441,
"reward": 0.39611528790555894,
"reward_std": 0.3069186918437481,
"rewards/cosine_len_reward": 0.39611528790555894,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21485714285714286,
"grad_norm": 0.28572776913642883,
"kl": 0.137451171875,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0006,
"reward": -0.3743774890899658,
"reward_std": 0.4345959648489952,
"rewards/cosine_len_reward": -0.3743774890899658,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 3067.9583740234375,
"epoch": 0.21542857142857144,
"grad_norm": 0.20133064687252045,
"kl": 0.052734375,
"learning_rate": 2.583460445215911e-07,
"loss": -0.0541,
"reward": -0.36722417175769806,
"reward_std": 0.46697434037923813,
"rewards/cosine_len_reward": -0.36722417175769806,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 3256.8750610351562,
"epoch": 0.216,
"grad_norm": 0.19015897810459137,
"kl": 0.06170654296875,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0618,
"reward": -0.18510014936327934,
"reward_std": 0.5434570163488388,
"rewards/cosine_len_reward": -0.18510014936327934,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 3358.0833740234375,
"epoch": 0.21657142857142858,
"grad_norm": 0.16775275766849518,
"kl": 0.052734375,
"learning_rate": 2.5358974294659373e-07,
"loss": -0.0717,
"reward": -0.20583120733499527,
"reward_std": 0.1991448849439621,
"rewards/cosine_len_reward": -0.20583120733499527,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 3357.666748046875,
"epoch": 0.21714285714285714,
"grad_norm": 0.1613640934228897,
"kl": 0.049560546875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0323,
"reward": -0.1545943170785904,
"reward_std": 0.5345202684402466,
"rewards/cosine_len_reward": -0.1545943170785904,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 3260.0000610351562,
"epoch": 0.21771428571428572,
"grad_norm": 0.1736854612827301,
"kl": 0.0679931640625,
"learning_rate": 2.488912271385139e-07,
"loss": -0.038,
"reward": -0.4681055396795273,
"reward_std": 0.4248203635215759,
"rewards/cosine_len_reward": -0.4681055396795273,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21828571428571428,
"grad_norm": 0.18592111766338348,
"kl": 0.0501708984375,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0002,
"reward": -0.39792611449956894,
"reward_std": 0.32762938365340233,
"rewards/cosine_len_reward": -0.39792611449956894,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 3378.3750610351562,
"epoch": 0.21885714285714286,
"grad_norm": 0.22236335277557373,
"kl": 0.070556640625,
"learning_rate": 2.4425141308231765e-07,
"loss": -0.0429,
"reward": -0.10803265869617462,
"reward_std": 0.3375392761081457,
"rewards/cosine_len_reward": -0.10803265869617462,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.21942857142857142,
"grad_norm": 0.15268638730049133,
"kl": 0.0458984375,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0002,
"reward": 0.11043217405676842,
"reward_std": 0.46337635442614555,
"rewards/cosine_len_reward": 0.11043217405676842,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2849.875045776367,
"epoch": 0.22,
"grad_norm": 0.29308760166168213,
"kl": 0.0633544921875,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0655,
"reward": -0.3381078392267227,
"reward_std": 0.5556919574737549,
"rewards/cosine_len_reward": -0.3381078392267227,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 3408.9166870117188,
"epoch": 0.22057142857142858,
"grad_norm": 0.18074069917201996,
"kl": 0.05487060546875,
"learning_rate": 2.374037332934512e-07,
"loss": -0.0111,
"reward": -0.31479221396148205,
"reward_std": 0.43774885684251785,
"rewards/cosine_len_reward": -0.31479221396148205,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 2028.0000114440918,
"epoch": 0.22114285714285714,
"grad_norm": 0.345003217458725,
"kl": 0.05218505859375,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0685,
"reward": 0.2636047229170799,
"reward_std": 0.26575359515845776,
"rewards/cosine_len_reward": 0.2636047229170799,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 2304.9583740234375,
"epoch": 0.22171428571428572,
"grad_norm": 0.2163887768983841,
"kl": 0.0439453125,
"learning_rate": 2.3291460551638237e-07,
"loss": -0.0088,
"reward": -0.5689196065068245,
"reward_std": 0.40281252190470695,
"rewards/cosine_len_reward": -0.5689196065068245,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1966.8333435058594,
"epoch": 0.22228571428571428,
"grad_norm": 0.23669986426830292,
"kl": 0.0467529296875,
"learning_rate": 2.306931685585657e-07,
"loss": 0.1076,
"reward": -0.19248197972774506,
"reward_std": 0.2740873768925667,
"rewards/cosine_len_reward": -0.19248197972774506,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 3330.8333740234375,
"epoch": 0.22285714285714286,
"grad_norm": 0.19466285407543182,
"kl": 0.05242919921875,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0745,
"reward": -0.25136643648147583,
"reward_std": 0.5177683755755424,
"rewards/cosine_len_reward": -0.25136643648147583,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.22342857142857142,
"grad_norm": 0.18898797035217285,
"kl": 0.05535888671875,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0002,
"reward": 0.15870603919029236,
"reward_std": 0.42898403853178024,
"rewards/cosine_len_reward": 0.15870603919029236,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 3555.2916870117188,
"epoch": 0.224,
"grad_norm": 0.165949285030365,
"kl": 0.0509033203125,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0141,
"reward": -0.4548497349023819,
"reward_std": 0.24412627145648003,
"rewards/cosine_len_reward": -0.4548497349023819,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 3505.5416870117188,
"epoch": 0.22457142857142856,
"grad_norm": 0.16940677165985107,
"kl": 0.04827880859375,
"learning_rate": 2.2196411766036487e-07,
"loss": -0.0325,
"reward": -0.019142277538776398,
"reward_std": 0.49893130362033844,
"rewards/cosine_len_reward": -0.019142277538776398,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 2805.875,
"epoch": 0.22514285714285714,
"grad_norm": 0.2236856073141098,
"kl": 0.04705810546875,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0315,
"reward": 0.17413194477558136,
"reward_std": 0.37101365998387337,
"rewards/cosine_len_reward": 0.17413194477558136,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2257142857142857,
"grad_norm": 0.1488247662782669,
"kl": 0.04559326171875,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0002,
"reward": -0.16203116870019585,
"reward_std": 0.4705766849219799,
"rewards/cosine_len_reward": -0.16203116870019585,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.22628571428571428,
"grad_norm": 0.13519521057605743,
"kl": 0.03631591796875,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0001,
"reward": 0.14726969599723816,
"reward_std": 0.3467975091189146,
"rewards/cosine_len_reward": 0.14726969599723816,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 3178.6666870117188,
"epoch": 0.22685714285714287,
"grad_norm": 0.26082149147987366,
"kl": 0.05474853515625,
"learning_rate": 2.134908592756607e-07,
"loss": -0.0699,
"reward": 0.2510442901402712,
"reward_std": 0.4601794481277466,
"rewards/cosine_len_reward": 0.2510442901402712,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 3374.0833740234375,
"epoch": 0.22742857142857142,
"grad_norm": 0.16580724716186523,
"kl": 0.04669189453125,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.0645,
"reward": -0.13105885684490204,
"reward_std": 0.4312235489487648,
"rewards/cosine_len_reward": -0.13105885684490204,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 3359.25,
"epoch": 0.228,
"grad_norm": 0.19662907719612122,
"kl": 0.06268310546875,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0101,
"reward": -0.20190414786338806,
"reward_std": 0.47945015504956245,
"rewards/cosine_len_reward": -0.20190414786338806,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 2060.4583587646484,
"epoch": 0.22857142857142856,
"grad_norm": 0.35796305537223816,
"kl": 0.057525634765625,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0891,
"reward": -0.038805801421403885,
"reward_std": 0.3311319947242737,
"rewards/cosine_len_reward": -0.038805801421403885,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 3128.6666870117188,
"epoch": 0.22914285714285715,
"grad_norm": 0.25352492928504944,
"kl": 0.0992431640625,
"learning_rate": 2.0528000059645995e-07,
"loss": -0.1331,
"reward": 0.3173316791653633,
"reward_std": 0.5900417268276215,
"rewards/cosine_len_reward": 0.3173316791653633,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2864.75,
"epoch": 0.2297142857142857,
"grad_norm": 0.18959416449069977,
"kl": 0.04730224609375,
"learning_rate": 2.032690407508949e-07,
"loss": -0.011,
"reward": 0.1726340614259243,
"reward_std": 0.4275776147842407,
"rewards/cosine_len_reward": 0.1726340614259243,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2573.000045776367,
"epoch": 0.2302857142857143,
"grad_norm": 0.22201809287071228,
"kl": 0.05474853515625,
"learning_rate": 2.0127498008311922e-07,
"loss": -0.0644,
"reward": 0.020148977637290955,
"reward_std": 0.6248219758272171,
"rewards/cosine_len_reward": 0.020148977637290955,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 3304.7083740234375,
"epoch": 0.23085714285714284,
"grad_norm": 0.22055773437023163,
"kl": 0.05517578125,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0566,
"reward": -0.24377129971981049,
"reward_std": 0.42593977600336075,
"rewards/cosine_len_reward": -0.24377129971981049,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2901.875,
"epoch": 0.23142857142857143,
"grad_norm": 0.25622329115867615,
"kl": 0.0494384765625,
"learning_rate": 1.9733794420337213e-07,
"loss": -0.1542,
"reward": 0.20779071189463139,
"reward_std": 0.4233979359269142,
"rewards/cosine_len_reward": 0.20779071189463139,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 3525.3333740234375,
"epoch": 0.232,
"grad_norm": 0.19775983691215515,
"kl": 0.064697265625,
"learning_rate": 1.9539516087697517e-07,
"loss": -0.0078,
"reward": -0.3867212012410164,
"reward_std": 0.3607660289853811,
"rewards/cosine_len_reward": -0.3867212012410164,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 3248.125,
"epoch": 0.23257142857142857,
"grad_norm": 0.19077526032924652,
"kl": 0.0391845703125,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0377,
"reward": 0.05661339312791824,
"reward_std": 0.5278025027364492,
"rewards/cosine_len_reward": 0.05661339312791824,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 3040.2500610351562,
"epoch": 0.23314285714285715,
"grad_norm": 0.2332611382007599,
"kl": 0.0648193359375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0701,
"reward": -0.5832930132746696,
"reward_std": 0.33875271677970886,
"rewards/cosine_len_reward": -0.5832930132746696,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 3566.0833740234375,
"epoch": 0.2337142857142857,
"grad_norm": 0.18132203817367554,
"kl": 0.0550537109375,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0092,
"reward": -0.1184474304318428,
"reward_std": 0.6181638091802597,
"rewards/cosine_len_reward": -0.1184474304318428,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2342857142857143,
"grad_norm": 0.15652252733707428,
"kl": 0.0477294921875,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0002,
"reward": -0.5223753824830055,
"reward_std": 0.3389586843550205,
"rewards/cosine_len_reward": -0.5223753824830055,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.23485714285714285,
"grad_norm": 0.15396270155906677,
"kl": 0.04498291015625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0002,
"reward": 0.004480212926864624,
"reward_std": 0.32227758690714836,
"rewards/cosine_len_reward": 0.004480212926864624,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.23542857142857143,
"grad_norm": 0.1605028212070465,
"kl": 0.0478515625,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0002,
"reward": -0.28013300243765116,
"reward_std": 0.4526229053735733,
"rewards/cosine_len_reward": -0.28013300243765116,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 3207.0833740234375,
"epoch": 0.236,
"grad_norm": 0.1472114473581314,
"kl": 0.04119873046875,
"learning_rate": 1.822847957491922e-07,
"loss": -0.022,
"reward": -0.6025111824274063,
"reward_std": 0.3982557747513056,
"rewards/cosine_len_reward": -0.6025111824274063,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2968.875,
"epoch": 0.23657142857142857,
"grad_norm": 0.279211163520813,
"kl": 0.0579833984375,
"learning_rate": 1.804828558898332e-07,
"loss": -0.1383,
"reward": 0.30589140206575394,
"reward_std": 0.3166845068335533,
"rewards/cosine_len_reward": 0.30589140206575394,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 3109.1666870117188,
"epoch": 0.23714285714285716,
"grad_norm": 0.22959719598293304,
"kl": 0.04766845703125,
"learning_rate": 1.7869892577476722e-07,
"loss": -0.1332,
"reward": -0.19372307881712914,
"reward_std": 0.3859623149037361,
"rewards/cosine_len_reward": -0.19372307881712914,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 3371.1666870117188,
"epoch": 0.2377142857142857,
"grad_norm": 0.1638830453157425,
"kl": 0.04876708984375,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0842,
"reward": 0.24745731800794601,
"reward_std": 0.33830365166068077,
"rewards/cosine_len_reward": 0.24745731800794601,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2687.7917098999023,
"epoch": 0.2382857142857143,
"grad_norm": 0.25798413157463074,
"kl": 0.04632568359375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0659,
"reward": 0.22542408853769302,
"reward_std": 0.3038127450272441,
"rewards/cosine_len_reward": 0.22542408853769302,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2884.8333740234375,
"epoch": 0.23885714285714285,
"grad_norm": 0.4755244255065918,
"kl": 0.090576171875,
"learning_rate": 1.7345605894346726e-07,
"loss": -0.1926,
"reward": -0.580606535077095,
"reward_std": 0.5118565671145916,
"rewards/cosine_len_reward": -0.580606535077095,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 3318.2916870117188,
"epoch": 0.23942857142857144,
"grad_norm": 0.202090322971344,
"kl": 0.04998779296875,
"learning_rate": 1.7174502842694212e-07,
"loss": -0.0433,
"reward": 0.20114058069884777,
"reward_std": 0.3687387742102146,
"rewards/cosine_len_reward": 0.20114058069884777,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 2232.416717529297,
"epoch": 0.24,
"grad_norm": 0.2654793858528137,
"kl": 0.0609130859375,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0579,
"reward": -0.08468323387205601,
"reward_std": 0.60483318567276,
"rewards/cosine_len_reward": -0.08468323387205601,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 3315.125,
"epoch": 0.24057142857142857,
"grad_norm": 0.18679861724376678,
"kl": 0.06512451171875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0097,
"reward": -0.22448206320405006,
"reward_std": 0.5287001729011536,
"rewards/cosine_len_reward": -0.22448206320405006,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24114285714285713,
"grad_norm": 0.14611122012138367,
"kl": 0.04083251953125,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0002,
"reward": -0.1785850077867508,
"reward_std": 0.42264287918806076,
"rewards/cosine_len_reward": -0.1785850077867508,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2946.4583435058594,
"epoch": 0.24171428571428571,
"grad_norm": 0.202953040599823,
"kl": 0.0562744140625,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0495,
"reward": -0.3013099692761898,
"reward_std": 0.47750524431467056,
"rewards/cosine_len_reward": -0.3013099692761898,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 3452.375,
"epoch": 0.2422857142857143,
"grad_norm": 0.2135123759508133,
"kl": 0.04937744140625,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0447,
"reward": -0.1179568525403738,
"reward_std": 0.5295333191752434,
"rewards/cosine_len_reward": -0.1179568525403738,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 3552.375,
"epoch": 0.24285714285714285,
"grad_norm": 0.1585531085729599,
"kl": 0.05169677734375,
"learning_rate": 1.6186884885673413e-07,
"loss": -0.0117,
"reward": -0.48371345549821854,
"reward_std": 0.2882954329252243,
"rewards/cosine_len_reward": -0.48371345549821854,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 3492.0416870117188,
"epoch": 0.24342857142857144,
"grad_norm": 0.16782714426517487,
"kl": 0.04248046875,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0336,
"reward": -0.3401487283408642,
"reward_std": 0.548667848110199,
"rewards/cosine_len_reward": -0.3401487283408642,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 3474.9166870117188,
"epoch": 0.244,
"grad_norm": 0.17093884944915771,
"kl": 0.04315185546875,
"learning_rate": 1.5872728172265146e-07,
"loss": -0.0182,
"reward": 0.2449711412191391,
"reward_std": 0.5114858150482178,
"rewards/cosine_len_reward": 0.2449711412191391,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 3381.041748046875,
"epoch": 0.24457142857142858,
"grad_norm": 0.18249650299549103,
"kl": 0.052978515625,
"learning_rate": 1.5718506522858572e-07,
"loss": -0.0526,
"reward": -0.26586161740124226,
"reward_std": 0.5688638612627983,
"rewards/cosine_len_reward": -0.26586161740124226,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 3452.7083740234375,
"epoch": 0.24514285714285713,
"grad_norm": 0.1674465388059616,
"kl": 0.05694580078125,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0578,
"reward": -0.3403562903404236,
"reward_std": 0.3575546946376562,
"rewards/cosine_len_reward": -0.3403562903404236,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24571428571428572,
"grad_norm": 0.2020343542098999,
"kl": 0.05108642578125,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0002,
"reward": -0.3109110891819,
"reward_std": 0.4139351099729538,
"rewards/cosine_len_reward": -0.3109110891819,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.24628571428571427,
"grad_norm": 0.143864706158638,
"kl": 0.05267333984375,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0002,
"reward": -0.1861107312142849,
"reward_std": 0.2637110576033592,
"rewards/cosine_len_reward": -0.1861107312142849,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 2981.25,
"epoch": 0.24685714285714286,
"grad_norm": 0.165711909532547,
"kl": 0.04681396484375,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0404,
"reward": 0.12767831981182098,
"reward_std": 0.3957044407725334,
"rewards/cosine_len_reward": 0.12767831981182098,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2924.9166870117188,
"epoch": 0.24742857142857144,
"grad_norm": 0.17230506241321564,
"kl": 0.04913330078125,
"learning_rate": 1.4976263201891613e-07,
"loss": -0.0391,
"reward": 0.0737846726551652,
"reward_std": 0.7179519534111023,
"rewards/cosine_len_reward": 0.0737846726551652,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.248,
"grad_norm": 0.16101869940757751,
"kl": 0.0555419921875,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0002,
"reward": 0.1376335695385933,
"reward_std": 0.39823680371046066,
"rewards/cosine_len_reward": 0.1376335695385933,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 3123.3333435058594,
"epoch": 0.24857142857142858,
"grad_norm": 0.15367208421230316,
"kl": 0.041259765625,
"learning_rate": 1.469297078922642e-07,
"loss": -0.0384,
"reward": 0.09720689244568348,
"reward_std": 0.44290298968553543,
"rewards/cosine_len_reward": 0.09720689244568348,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 3015.2916870117188,
"epoch": 0.24914285714285714,
"grad_norm": 0.2917621433734894,
"kl": 0.0430908203125,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0654,
"reward": -0.12698917463421822,
"reward_std": 0.37568875774741173,
"rewards/cosine_len_reward": -0.12698917463421822,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 2433.7501220703125,
"epoch": 0.24971428571428572,
"grad_norm": 0.46212059259414673,
"kl": 0.04766845703125,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.2189,
"reward": -0.24191192165017128,
"reward_std": 0.44904495403170586,
"rewards/cosine_len_reward": -0.24191192165017128,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 3474.6666870117188,
"epoch": 0.2502857142857143,
"grad_norm": 0.1538762003183365,
"kl": 0.0439453125,
"learning_rate": 1.4282782639029128e-07,
"loss": -0.0377,
"reward": -0.14186367020010948,
"reward_std": 0.39842598885297775,
"rewards/cosine_len_reward": -0.14186367020010948,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 3523.875,
"epoch": 0.25085714285714283,
"grad_norm": 0.15350596606731415,
"kl": 0.0535888671875,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.013,
"reward": -0.34710339456796646,
"reward_std": 0.28258360363543034,
"rewards/cosine_len_reward": -0.34710339456796646,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 3099.8334045410156,
"epoch": 0.25142857142857145,
"grad_norm": 0.1954686939716339,
"kl": 0.06451416015625,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0499,
"reward": -0.3662155866622925,
"reward_std": 0.3874107152223587,
"rewards/cosine_len_reward": -0.3662155866622925,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.252,
"grad_norm": 0.15524186193943024,
"kl": 0.0484619140625,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0002,
"reward": -0.24688275158405304,
"reward_std": 0.4402740001678467,
"rewards/cosine_len_reward": -0.24688275158405304,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2872.0416870117188,
"epoch": 0.25257142857142856,
"grad_norm": 0.41410180926322937,
"kl": 0.04583740234375,
"learning_rate": 1.3763677169699217e-07,
"loss": -0.1527,
"reward": -0.30389176309108734,
"reward_std": 0.35049962252378464,
"rewards/cosine_len_reward": -0.30389176309108734,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.25314285714285717,
"grad_norm": 0.13994863629341125,
"kl": 0.0460205078125,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0002,
"reward": -0.45876485109329224,
"reward_std": 0.27763616293668747,
"rewards/cosine_len_reward": -0.45876485109329224,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2537142857142857,
"grad_norm": 0.13927753269672394,
"kl": 0.0404052734375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0002,
"reward": -0.5448080375790596,
"reward_std": 0.423710398375988,
"rewards/cosine_len_reward": -0.5448080375790596,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 3210.5833740234375,
"epoch": 0.2542857142857143,
"grad_norm": 0.16514094173908234,
"kl": 0.050933837890625,
"learning_rate": 1.3395428487445914e-07,
"loss": -0.0006,
"reward": 0.08599076699465513,
"reward_std": 0.44797009229660034,
"rewards/cosine_len_reward": 0.08599076699465513,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2722.500030517578,
"epoch": 0.25485714285714284,
"grad_norm": 0.18482115864753723,
"kl": 0.0404052734375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.1305,
"reward": -0.2829571203328669,
"reward_std": 0.4319111257791519,
"rewards/cosine_len_reward": -0.2829571203328669,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2910.4583435058594,
"epoch": 0.25542857142857145,
"grad_norm": 0.2328081727027893,
"kl": 0.0455322265625,
"learning_rate": 1.316005813502869e-07,
"loss": -0.0287,
"reward": -0.2549123764038086,
"reward_std": 0.4166012778878212,
"rewards/cosine_len_reward": -0.2549123764038086,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 3550.5833740234375,
"epoch": 0.256,
"grad_norm": 0.15128394961357117,
"kl": 0.04803466796875,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0127,
"reward": 0.059171320259338245,
"reward_std": 0.4689246341586113,
"rewards/cosine_len_reward": 0.059171320259338245,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 3580.125,
"epoch": 0.25657142857142856,
"grad_norm": 0.13547244668006897,
"kl": 0.03497314453125,
"learning_rate": 1.2932844562179352e-07,
"loss": -0.0015,
"reward": 0.269029151648283,
"reward_std": 0.42868663370609283,
"rewards/cosine_len_reward": 0.269029151648283,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 2534.791748046875,
"epoch": 0.2571428571428571,
"grad_norm": 0.21846917271614075,
"kl": 0.042938232421875,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0232,
"reward": -0.10483485460281372,
"reward_std": 0.653083398938179,
"rewards/cosine_len_reward": -0.10483485460281372,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 3452.7083740234375,
"epoch": 0.25771428571428573,
"grad_norm": 0.19205856323242188,
"kl": 0.0594482421875,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0789,
"reward": -0.721703439950943,
"reward_std": 0.2940813582390547,
"rewards/cosine_len_reward": -0.721703439950943,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2950.625030517578,
"epoch": 0.2582857142857143,
"grad_norm": 0.20902574062347412,
"kl": 0.04718017578125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.1,
"reward": -0.05610589450225234,
"reward_std": 0.5669073164463043,
"rewards/cosine_len_reward": -0.05610589450225234,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2547.6666717529297,
"epoch": 0.25885714285714284,
"grad_norm": 0.31278106570243835,
"kl": 0.0537109375,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0898,
"reward": 0.06469499785453081,
"reward_std": 0.386950358748436,
"rewards/cosine_len_reward": 0.06469499785453081,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 3400.6666870117188,
"epoch": 0.25942857142857145,
"grad_norm": 0.26481106877326965,
"kl": 0.12164306640625,
"learning_rate": 1.2400783294793668e-07,
"loss": -0.0344,
"reward": -0.10013716202229261,
"reward_std": 0.6173229813575745,
"rewards/cosine_len_reward": -0.10013716202229261,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 3558.375,
"epoch": 0.26,
"grad_norm": 0.16176006197929382,
"kl": 0.041015625,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0146,
"reward": -0.06885599717497826,
"reward_std": 0.5056197047233582,
"rewards/cosine_len_reward": -0.06885599717497826,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.26057142857142856,
"grad_norm": 0.15790623426437378,
"kl": 0.052734375,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0002,
"reward": 0.1165997963398695,
"reward_std": 0.4357607886195183,
"rewards/cosine_len_reward": 0.1165997963398695,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2898.375,
"epoch": 0.2611428571428571,
"grad_norm": 0.2058410793542862,
"kl": 0.06005859375,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0056,
"reward": -0.03989528864622116,
"reward_std": 0.46769294142723083,
"rewards/cosine_len_reward": -0.03989528864622116,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 3425.5,
"epoch": 0.26171428571428573,
"grad_norm": 0.18089525401592255,
"kl": 0.051483154296875,
"learning_rate": 1.2012473704494537e-07,
"loss": -0.0597,
"reward": -0.03286702465265989,
"reward_std": 0.5668174773454666,
"rewards/cosine_len_reward": -0.03286702465265989,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2622857142857143,
"grad_norm": 0.1464470624923706,
"kl": 0.04827880859375,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0002,
"reward": -0.1251504085958004,
"reward_std": 0.2749813590198755,
"rewards/cosine_len_reward": -0.1251504085958004,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 3265.3333740234375,
"epoch": 0.26285714285714284,
"grad_norm": 0.2260807752609253,
"kl": 0.05755615234375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.002,
"reward": 0.15173575282096863,
"reward_std": 0.4503812901675701,
"rewards/cosine_len_reward": 0.15173575282096863,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2819.500030517578,
"epoch": 0.2634285714285714,
"grad_norm": 0.2082054316997528,
"kl": 0.054931640625,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0659,
"reward": 0.03663429245352745,
"reward_std": 0.38886918127536774,
"rewards/cosine_len_reward": 0.03663429245352745,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 3554.375,
"epoch": 0.264,
"grad_norm": 0.13131263852119446,
"kl": 0.03497314453125,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0134,
"reward": 0.02211976982653141,
"reward_std": 0.3899141475558281,
"rewards/cosine_len_reward": 0.02211976982653141,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.26457142857142857,
"grad_norm": 0.17710186541080475,
"kl": 0.04364013671875,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0002,
"reward": -0.16693101823329926,
"reward_std": 0.42890677601099014,
"rewards/cosine_len_reward": -0.16693101823329926,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 3220.0833740234375,
"epoch": 0.2651428571428571,
"grad_norm": 0.22023098170757294,
"kl": 0.0413818359375,
"learning_rate": 1.1492947512799328e-07,
"loss": -0.0812,
"reward": -0.09500008448958397,
"reward_std": 0.38959069550037384,
"rewards/cosine_len_reward": -0.09500008448958397,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.26571428571428574,
"grad_norm": 0.15240216255187988,
"kl": 0.0389404296875,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0002,
"reward": 0.16528711840510368,
"reward_std": 0.5445774495601654,
"rewards/cosine_len_reward": 0.16528711840510368,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 3150.625,
"epoch": 0.2662857142857143,
"grad_norm": 0.16584616899490356,
"kl": 0.04217529296875,
"learning_rate": 1.1336692317580158e-07,
"loss": -0.0149,
"reward": -0.257739894092083,
"reward_std": 0.25873103737831116,
"rewards/cosine_len_reward": -0.257739894092083,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 3531.7916870117188,
"epoch": 0.26685714285714285,
"grad_norm": 0.15734802186489105,
"kl": 0.043212890625,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0251,
"reward": 0.20713631808757782,
"reward_std": 0.4699949249625206,
"rewards/cosine_len_reward": 0.20713631808757782,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 3358.5833740234375,
"epoch": 0.2674285714285714,
"grad_norm": 0.15957476198673248,
"kl": 0.05712890625,
"learning_rate": 1.1188949370707787e-07,
"loss": -0.049,
"reward": -0.12391296029090881,
"reward_std": 0.5305506736040115,
"rewards/cosine_len_reward": -0.12391296029090881,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 3467.5,
"epoch": 0.268,
"grad_norm": 0.16685207188129425,
"kl": 0.04998779296875,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0354,
"reward": -0.40232421085238457,
"reward_std": 0.4523722641170025,
"rewards/cosine_len_reward": -0.40232421085238457,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2356.9166870117188,
"epoch": 0.26857142857142857,
"grad_norm": 0.2733165919780731,
"kl": 0.0599365234375,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.1523,
"reward": -0.20090949651785195,
"reward_std": 0.5469123795628548,
"rewards/cosine_len_reward": -0.20090949651785195,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 3320.0416870117188,
"epoch": 0.26914285714285713,
"grad_norm": 0.1729491502046585,
"kl": 0.04193115234375,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.0024,
"reward": 0.02039976231753826,
"reward_std": 0.4906746745109558,
"rewards/cosine_len_reward": 0.02039976231753826,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 3436.1666870117188,
"epoch": 0.26971428571428574,
"grad_norm": 0.15716099739074707,
"kl": 0.0533447265625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0387,
"reward": 0.10714660119265318,
"reward_std": 0.4147900193929672,
"rewards/cosine_len_reward": 0.10714660119265318,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2971.4583435058594,
"epoch": 0.2702857142857143,
"grad_norm": 0.20368468761444092,
"kl": 0.0540771484375,
"learning_rate": 1.0857018009286381e-07,
"loss": -0.0079,
"reward": -0.6853213012218475,
"reward_std": 0.3286122828722,
"rewards/cosine_len_reward": -0.6853213012218475,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2670.0416870117188,
"epoch": 0.27085714285714285,
"grad_norm": 0.17690664529800415,
"kl": 0.03338623046875,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0268,
"reward": -0.29098084941506386,
"reward_std": 0.49564552307128906,
"rewards/cosine_len_reward": -0.29098084941506386,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 3349.6250610351562,
"epoch": 0.2714285714285714,
"grad_norm": 0.19052566587924957,
"kl": 0.05389404296875,
"learning_rate": 1.0739283813397639e-07,
"loss": -0.029,
"reward": -0.1653405874967575,
"reward_std": 0.5682762004435062,
"rewards/cosine_len_reward": -0.1653405874967575,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.272,
"grad_norm": 0.15981078147888184,
"kl": 0.05010986328125,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0002,
"reward": -0.6135335117578506,
"reward_std": 0.3161969594657421,
"rewards/cosine_len_reward": -0.6135335117578506,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 3550.7916870117188,
"epoch": 0.2725714285714286,
"grad_norm": 0.16642750799655914,
"kl": 0.04486083984375,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0194,
"reward": -0.25540533661842346,
"reward_std": 0.48319850862026215,
"rewards/cosine_len_reward": -0.25540533661842346,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.27314285714285713,
"grad_norm": 0.16831888258457184,
"kl": 0.04388427734375,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0002,
"reward": 0.19778522849082947,
"reward_std": 0.35869789123535156,
"rewards/cosine_len_reward": 0.19778522849082947,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 3169.6666870117188,
"epoch": 0.2737142857142857,
"grad_norm": 0.16316814720630646,
"kl": 0.05181884765625,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0321,
"reward": -0.1417078822851181,
"reward_std": 0.4882785305380821,
"rewards/cosine_len_reward": -0.1417078822851181,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 3529.0833740234375,
"epoch": 0.2742857142857143,
"grad_norm": 0.14413844048976898,
"kl": 0.05267333984375,
"learning_rate": 1.0482745016665526e-07,
"loss": -0.0048,
"reward": -0.21695643290877342,
"reward_std": 0.43684104457497597,
"rewards/cosine_len_reward": -0.21695643290877342,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 2982.5416870117188,
"epoch": 0.27485714285714286,
"grad_norm": 0.7589179277420044,
"kl": 0.05596923828125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.1907,
"reward": -0.02579532004892826,
"reward_std": 0.24179279431700706,
"rewards/cosine_len_reward": -0.02579532004892826,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 3213.5833740234375,
"epoch": 0.2754285714285714,
"grad_norm": 0.1643751859664917,
"kl": 0.048980712890625,
"learning_rate": 1.0395300688680625e-07,
"loss": -0.1096,
"reward": 0.004770293831825256,
"reward_std": 0.44362664967775345,
"rewards/cosine_len_reward": 0.004770293831825256,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 3459.0416870117188,
"epoch": 0.276,
"grad_norm": 0.1669885367155075,
"kl": 0.05377197265625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0103,
"reward": -0.5875770300626755,
"reward_std": 0.31428810209035873,
"rewards/cosine_len_reward": -0.5875770300626755,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 2685.000030517578,
"epoch": 0.2765714285714286,
"grad_norm": 0.33783161640167236,
"kl": 0.0472412109375,
"learning_rate": 1.0316552135205837e-07,
"loss": -0.0571,
"reward": -0.06883000582456589,
"reward_std": 0.30373527109622955,
"rewards/cosine_len_reward": -0.06883000582456589,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 3374.7916870117188,
"epoch": 0.27714285714285714,
"grad_norm": 0.15891632437705994,
"kl": 0.047607421875,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.016,
"reward": -0.2871231231838465,
"reward_std": 0.5418589785695076,
"rewards/cosine_len_reward": -0.2871231231838465,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2790.208335876465,
"epoch": 0.2777142857142857,
"grad_norm": 0.28455618023872375,
"kl": 0.047607421875,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0522,
"reward": 0.011902973055839539,
"reward_std": 0.3104940876364708,
"rewards/cosine_len_reward": 0.011902973055839539,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2883.5,
"epoch": 0.2782857142857143,
"grad_norm": 0.20438960194587708,
"kl": 0.041229248046875,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0462,
"reward": 0.34762556478381157,
"reward_std": 0.29484141059219837,
"rewards/cosine_len_reward": 0.34762556478381157,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 3420.1666870117188,
"epoch": 0.27885714285714286,
"grad_norm": 0.13923093676567078,
"kl": 0.0538330078125,
"learning_rate": 1.0185202062281336e-07,
"loss": -0.0637,
"reward": -0.3226715254713781,
"reward_std": 0.4678328037261963,
"rewards/cosine_len_reward": -0.3226715254713781,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.2794285714285714,
"grad_norm": 0.22048485279083252,
"kl": 0.1324462890625,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0005,
"reward": -0.08934877812862396,
"reward_std": 0.6139358580112457,
"rewards/cosine_len_reward": -0.08934877812862396,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2766.4166870117188,
"epoch": 0.28,
"grad_norm": 0.21742548048496246,
"kl": 0.06146240234375,
"learning_rate": 1.013262614978859e-07,
"loss": -0.1495,
"reward": -0.2719786809757352,
"reward_std": 0.5813306570053101,
"rewards/cosine_len_reward": -0.2719786809757352,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2932.4583435058594,
"epoch": 0.2805714285714286,
"grad_norm": 0.4229942262172699,
"kl": 0.0546875,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.2389,
"reward": -0.3160274773836136,
"reward_std": 0.4782961308956146,
"rewards/cosine_len_reward": -0.3160274773836136,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 2788.375,
"epoch": 0.28114285714285714,
"grad_norm": 0.5186205506324768,
"kl": 0.08941650390625,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.2067,
"reward": 0.012615039944648743,
"reward_std": 0.5839090943336487,
"rewards/cosine_len_reward": 0.012615039944648743,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2743.9583587646484,
"epoch": 0.2817142857142857,
"grad_norm": 0.25046536326408386,
"kl": 0.077392578125,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0376,
"reward": -0.17700890451669693,
"reward_std": 0.6207068264484406,
"rewards/cosine_len_reward": -0.17700890451669693,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2884.7083740234375,
"epoch": 0.2822857142857143,
"grad_norm": 0.27387452125549316,
"kl": 0.05712890625,
"learning_rate": 1.005372381963547e-07,
"loss": -0.1072,
"reward": -0.5983569696545601,
"reward_std": 0.48679885268211365,
"rewards/cosine_len_reward": -0.5983569696545601,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 3555.75,
"epoch": 0.28285714285714286,
"grad_norm": 0.14310041069984436,
"kl": 0.04302978515625,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0063,
"reward": -0.03963024541735649,
"reward_std": 0.41995228826999664,
"rewards/cosine_len_reward": -0.03963024541735649,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 2138.916732788086,
"epoch": 0.2834285714285714,
"grad_norm": 0.4353350102901459,
"kl": 0.059326171875,
"learning_rate": 1.002741278414069e-07,
"loss": -0.0066,
"reward": 0.10497748292982578,
"reward_std": 0.4895341917872429,
"rewards/cosine_len_reward": 0.10497748292982578,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 3536.7916870117188,
"epoch": 0.284,
"grad_norm": 0.1279255449771881,
"kl": 0.037567138671875,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0205,
"reward": 0.0027112215757369995,
"reward_std": 0.41501184925436974,
"rewards/cosine_len_reward": 0.0027112215757369995,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 3007.2916870117188,
"epoch": 0.2845714285714286,
"grad_norm": 0.37080392241477966,
"kl": 0.1373291015625,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0912,
"reward": -0.23442494124174118,
"reward_std": 0.4831595793366432,
"rewards/cosine_len_reward": -0.23442494124174118,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 3276.4166870117188,
"epoch": 0.28514285714285714,
"grad_norm": 0.21131351590156555,
"kl": 0.0936279296875,
"learning_rate": 1.000438641958131e-07,
"loss": -0.0781,
"reward": -0.07970089465379715,
"reward_std": 0.351084902882576,
"rewards/cosine_len_reward": -0.07970089465379715,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 3468.1666870117188,
"epoch": 0.2857142857142857,
"grad_norm": 0.14872434735298157,
"kl": 0.051025390625,
"learning_rate": 1.0001096618257236e-07,
"loss": -0.0709,
"reward": 0.30408234894275665,
"reward_std": 0.49906710535287857,
"rewards/cosine_len_reward": 0.30408234894275665,
"step": 500
},
{
"epoch": 0.2857142857142857,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.002130194778990699,
"train_runtime": 27178.7627,
"train_samples_per_second": 0.442,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}