|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2857142857142857, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3140.2083435058594, |
|
"epoch": 0.0005714285714285715, |
|
"grad_norm": 0.18732130527496338, |
|
"kl": 0.1341552734375, |
|
"learning_rate": 0.0, |
|
"loss": -0.0125, |
|
"reward": -0.22849145717918873, |
|
"reward_std": 0.40205543488264084, |
|
"rewards/cosine_len_reward": -0.22849145717918873, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3231.1666870117188, |
|
"epoch": 0.001142857142857143, |
|
"grad_norm": 0.2461026906967163, |
|
"kl": 0.05010986328125, |
|
"learning_rate": 2e-08, |
|
"loss": 0.0599, |
|
"reward": -0.4702305719256401, |
|
"reward_std": 0.45737794041633606, |
|
"rewards/cosine_len_reward": -0.4702305719256401, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3447.7500610351562, |
|
"epoch": 0.0017142857142857142, |
|
"grad_norm": 0.190439373254776, |
|
"kl": 0.0457763671875, |
|
"learning_rate": 4e-08, |
|
"loss": -0.0022, |
|
"reward": 0.24990134686231613, |
|
"reward_std": 0.4683762863278389, |
|
"rewards/cosine_len_reward": 0.24990134686231613, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3227.5000610351562, |
|
"epoch": 0.002285714285714286, |
|
"grad_norm": 0.23573388159275055, |
|
"kl": 0.044189453125, |
|
"learning_rate": 6e-08, |
|
"loss": -0.0779, |
|
"reward": -0.3011130467057228, |
|
"reward_std": 0.5483239553868771, |
|
"rewards/cosine_len_reward": -0.3011130467057228, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3382.2083740234375, |
|
"epoch": 0.002857142857142857, |
|
"grad_norm": 0.1897883266210556, |
|
"kl": 0.0562744140625, |
|
"learning_rate": 8e-08, |
|
"loss": -0.0596, |
|
"reward": -0.2980368435382843, |
|
"reward_std": 0.5630971193313599, |
|
"rewards/cosine_len_reward": -0.2980368435382843, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3024.5, |
|
"epoch": 0.0034285714285714284, |
|
"grad_norm": 0.34085553884506226, |
|
"kl": 0.044189453125, |
|
"learning_rate": 1e-07, |
|
"loss": 0.1698, |
|
"reward": 0.12611429148819298, |
|
"reward_std": 0.3041386976838112, |
|
"rewards/cosine_len_reward": 0.12611429148819298, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3208.4166870117188, |
|
"epoch": 0.004, |
|
"grad_norm": 0.21276699006557465, |
|
"kl": 0.052337646484375, |
|
"learning_rate": 1.2e-07, |
|
"loss": 0.0942, |
|
"reward": 0.17614510841667652, |
|
"reward_std": 0.5236565172672272, |
|
"rewards/cosine_len_reward": 0.17614510841667652, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3275.1666870117188, |
|
"epoch": 0.004571428571428572, |
|
"grad_norm": 0.19224224984645844, |
|
"kl": 0.0458984375, |
|
"learning_rate": 1.4e-07, |
|
"loss": -0.0598, |
|
"reward": -0.38038296496961266, |
|
"reward_std": 0.4509882442653179, |
|
"rewards/cosine_len_reward": -0.38038296496961266, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3164.7083740234375, |
|
"epoch": 0.005142857142857143, |
|
"grad_norm": 0.1741683930158615, |
|
"kl": 0.0439453125, |
|
"learning_rate": 1.6e-07, |
|
"loss": -0.0354, |
|
"reward": -0.2483000010251999, |
|
"reward_std": 0.3471040166914463, |
|
"rewards/cosine_len_reward": -0.2483000010251999, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2916.0834350585938, |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 0.4565255641937256, |
|
"kl": 0.23822021484375, |
|
"learning_rate": 1.8e-07, |
|
"loss": -0.1471, |
|
"reward": -0.1595626100897789, |
|
"reward_std": 0.5751340016722679, |
|
"rewards/cosine_len_reward": -0.1595626100897789, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2857.2083435058594, |
|
"epoch": 0.006285714285714286, |
|
"grad_norm": 0.269521027803421, |
|
"kl": 0.05877685546875, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0606, |
|
"reward": -0.10372118093073368, |
|
"reward_std": 0.45142534002661705, |
|
"rewards/cosine_len_reward": -0.10372118093073368, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3136.9583740234375, |
|
"epoch": 0.006857142857142857, |
|
"grad_norm": 0.1991637796163559, |
|
"kl": 0.04522705078125, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": -0.0514, |
|
"reward": 0.20943743363022804, |
|
"reward_std": 0.46535836160182953, |
|
"rewards/cosine_len_reward": 0.20943743363022804, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3189.9166870117188, |
|
"epoch": 0.0074285714285714285, |
|
"grad_norm": 0.27295684814453125, |
|
"kl": 0.040252685546875, |
|
"learning_rate": 2.4e-07, |
|
"loss": 0.1719, |
|
"reward": -0.01165345311164856, |
|
"reward_std": 0.5004820078611374, |
|
"rewards/cosine_len_reward": -0.01165345311164856, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3201.5833740234375, |
|
"epoch": 0.008, |
|
"grad_norm": 0.16425774991512299, |
|
"kl": 0.04278564453125, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.0225, |
|
"reward": -0.11870134994387627, |
|
"reward_std": 0.5749331563711166, |
|
"rewards/cosine_len_reward": -0.11870134994387627, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2892.041717529297, |
|
"epoch": 0.008571428571428572, |
|
"grad_norm": 0.18019337952136993, |
|
"kl": 0.0435791015625, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.0287, |
|
"reward": -0.08168663457036018, |
|
"reward_std": 0.37698886543512344, |
|
"rewards/cosine_len_reward": -0.08168663457036018, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3394.0416870117188, |
|
"epoch": 0.009142857142857144, |
|
"grad_norm": 0.20076414942741394, |
|
"kl": 0.0494384765625, |
|
"learning_rate": 3e-07, |
|
"loss": -0.0377, |
|
"reward": -0.3673112988471985, |
|
"reward_std": 0.5469748228788376, |
|
"rewards/cosine_len_reward": -0.3673112988471985, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.009714285714285713, |
|
"grad_norm": 0.2074183225631714, |
|
"kl": 0.04840087890625, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.0002, |
|
"reward": -0.08958722651004791, |
|
"reward_std": 0.46227796375751495, |
|
"rewards/cosine_len_reward": -0.08958722651004791, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3327.9166870117188, |
|
"epoch": 0.010285714285714285, |
|
"grad_norm": 0.16970673203468323, |
|
"kl": 0.04620361328125, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": -0.0541, |
|
"reward": -0.06295697297900915, |
|
"reward_std": 0.48097309097647667, |
|
"rewards/cosine_len_reward": -0.06295697297900915, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3291.2083740234375, |
|
"epoch": 0.010857142857142857, |
|
"grad_norm": 0.18424074351787567, |
|
"kl": 0.04925537109375, |
|
"learning_rate": 3.6e-07, |
|
"loss": -0.0548, |
|
"reward": -0.2277919389307499, |
|
"reward_std": 0.48374123871326447, |
|
"rewards/cosine_len_reward": -0.2277919389307499, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2776.0833740234375, |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 0.1781274378299713, |
|
"kl": 0.04132080078125, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 0.0761, |
|
"reward": -0.4576909616589546, |
|
"reward_std": 0.5087253600358963, |
|
"rewards/cosine_len_reward": -0.4576909616589546, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2673.541717529297, |
|
"epoch": 0.012, |
|
"grad_norm": 0.21896369755268097, |
|
"kl": 0.06439208984375, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0382, |
|
"reward": -0.40826891735196114, |
|
"reward_std": 0.475625216960907, |
|
"rewards/cosine_len_reward": -0.40826891735196114, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3570.5, |
|
"epoch": 0.012571428571428572, |
|
"grad_norm": 2.473532199859619, |
|
"kl": 0.65576171875, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0095, |
|
"reward": 0.19061553291976452, |
|
"reward_std": 0.43087163195014, |
|
"rewards/cosine_len_reward": 0.19061553291976452, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3022.9166870117188, |
|
"epoch": 0.013142857142857144, |
|
"grad_norm": 0.30063244700431824, |
|
"kl": 0.045166015625, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": -0.0909, |
|
"reward": -0.22615898214280605, |
|
"reward_std": 0.6225969791412354, |
|
"rewards/cosine_len_reward": -0.22615898214280605, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3317.2083740234375, |
|
"epoch": 0.013714285714285714, |
|
"grad_norm": 0.19970481097698212, |
|
"kl": 0.0474853515625, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0273, |
|
"reward": -0.07718010246753693, |
|
"reward_std": 0.3550200453028083, |
|
"rewards/cosine_len_reward": -0.07718010246753693, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2408.7083740234375, |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.25198450684547424, |
|
"kl": 0.04376220703125, |
|
"learning_rate": 4.8e-07, |
|
"loss": -0.1531, |
|
"reward": -0.3547450974583626, |
|
"reward_std": 0.4954472631216049, |
|
"rewards/cosine_len_reward": -0.3547450974583626, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3286.1666870117188, |
|
"epoch": 0.014857142857142857, |
|
"grad_norm": 0.1706409901380539, |
|
"kl": 0.0460205078125, |
|
"learning_rate": 5e-07, |
|
"loss": -0.0278, |
|
"reward": -0.12194318068213761, |
|
"reward_std": 0.5183630883693695, |
|
"rewards/cosine_len_reward": -0.12194318068213761, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.015428571428571429, |
|
"grad_norm": 0.15060991048812866, |
|
"kl": 0.04339599609375, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0002, |
|
"reward": -0.11912490613758564, |
|
"reward_std": 0.41010989993810654, |
|
"rewards/cosine_len_reward": -0.11912490613758564, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3351.2083740234375, |
|
"epoch": 0.016, |
|
"grad_norm": 0.17934846878051758, |
|
"kl": 0.04510498046875, |
|
"learning_rate": 5.4e-07, |
|
"loss": -0.0621, |
|
"reward": 0.05903707444667816, |
|
"reward_std": 0.5544452294707298, |
|
"rewards/cosine_len_reward": 0.05903707444667816, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2895.625, |
|
"epoch": 0.01657142857142857, |
|
"grad_norm": 0.18568940460681915, |
|
"kl": 0.04815673828125, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0543, |
|
"reward": -0.010081298649311066, |
|
"reward_std": 0.36467570811510086, |
|
"rewards/cosine_len_reward": -0.010081298649311066, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 0.15779373049736023, |
|
"kl": 0.04766845703125, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0002, |
|
"reward": -0.28753336891531944, |
|
"reward_std": 0.46697998046875, |
|
"rewards/cosine_len_reward": -0.28753336891531944, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2773.000030517578, |
|
"epoch": 0.017714285714285714, |
|
"grad_norm": 0.33190807700157166, |
|
"kl": 0.05267333984375, |
|
"learning_rate": 6e-07, |
|
"loss": 0.1689, |
|
"reward": -0.45986294001340866, |
|
"reward_std": 0.38838284835219383, |
|
"rewards/cosine_len_reward": -0.45986294001340866, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2857.4166870117188, |
|
"epoch": 0.018285714285714287, |
|
"grad_norm": 0.19518494606018066, |
|
"kl": 0.04412841796875, |
|
"learning_rate": 6.2e-07, |
|
"loss": -0.1331, |
|
"reward": 0.06440184079110622, |
|
"reward_std": 0.5917607620358467, |
|
"rewards/cosine_len_reward": 0.06440184079110622, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.018857142857142857, |
|
"grad_norm": 0.15161477029323578, |
|
"kl": 0.05377197265625, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0002, |
|
"reward": 0.06449370458722115, |
|
"reward_std": 0.40188299119472504, |
|
"rewards/cosine_len_reward": 0.06449370458722115, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3228.4583740234375, |
|
"epoch": 0.019428571428571427, |
|
"grad_norm": 0.19298215210437775, |
|
"kl": 0.04290771484375, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0274, |
|
"reward": -0.1688922978937626, |
|
"reward_std": 0.34010200947523117, |
|
"rewards/cosine_len_reward": -0.1688922978937626, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3495.125, |
|
"epoch": 0.02, |
|
"grad_norm": 0.18180347979068756, |
|
"kl": 0.0584716796875, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": -0.029, |
|
"reward": -0.40583060681819916, |
|
"reward_std": 0.5360103398561478, |
|
"rewards/cosine_len_reward": -0.40583060681819916, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3421.25, |
|
"epoch": 0.02057142857142857, |
|
"grad_norm": 0.17146086692810059, |
|
"kl": 0.0543212890625, |
|
"learning_rate": 7e-07, |
|
"loss": -0.021, |
|
"reward": 0.3642941191792488, |
|
"reward_std": 0.3542026989161968, |
|
"rewards/cosine_len_reward": 0.3642941191792488, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.021142857142857144, |
|
"grad_norm": 0.14214502274990082, |
|
"kl": 0.04010009765625, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0002, |
|
"reward": -0.07616325793787837, |
|
"reward_std": 0.3850269839167595, |
|
"rewards/cosine_len_reward": -0.07616325793787837, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3495.291748046875, |
|
"epoch": 0.021714285714285714, |
|
"grad_norm": 0.17879678308963776, |
|
"kl": 0.055908203125, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0359, |
|
"reward": -0.40084876120090485, |
|
"reward_std": 0.44661080837249756, |
|
"rewards/cosine_len_reward": -0.40084876120090485, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2343.0834045410156, |
|
"epoch": 0.022285714285714287, |
|
"grad_norm": 0.24563190340995789, |
|
"kl": 0.0567626953125, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0236, |
|
"reward": -0.11119658034294844, |
|
"reward_std": 0.45050790160894394, |
|
"rewards/cosine_len_reward": -0.11119658034294844, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 0.17550677061080933, |
|
"kl": 0.0445556640625, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0002, |
|
"reward": -0.3701670579612255, |
|
"reward_std": 0.3519079238176346, |
|
"rewards/cosine_len_reward": -0.3701670579612255, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3079.7083740234375, |
|
"epoch": 0.023428571428571427, |
|
"grad_norm": 0.17628754675388336, |
|
"kl": 0.047119140625, |
|
"learning_rate": 8e-07, |
|
"loss": -0.0516, |
|
"reward": -0.6531281769275665, |
|
"reward_std": 0.3740066960453987, |
|
"rewards/cosine_len_reward": -0.6531281769275665, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2641.125030517578, |
|
"epoch": 0.024, |
|
"grad_norm": 0.36992311477661133, |
|
"kl": 0.043609619140625, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0454, |
|
"reward": -0.12115837261080742, |
|
"reward_std": 0.3918045926839113, |
|
"rewards/cosine_len_reward": -0.12115837261080742, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.02457142857142857, |
|
"grad_norm": 0.1853361576795578, |
|
"kl": 0.0462646484375, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.0002, |
|
"reward": -0.1932130679488182, |
|
"reward_std": 0.4540579691529274, |
|
"rewards/cosine_len_reward": -0.1932130679488182, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3507.7500610351562, |
|
"epoch": 0.025142857142857144, |
|
"grad_norm": 0.17624704539775848, |
|
"kl": 0.04443359375, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0284, |
|
"reward": -0.25109320878982544, |
|
"reward_std": 0.5501705221831799, |
|
"rewards/cosine_len_reward": -0.25109320878982544, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3504.2083740234375, |
|
"epoch": 0.025714285714285714, |
|
"grad_norm": 0.19684357941150665, |
|
"kl": 0.0775146484375, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0359, |
|
"reward": -0.3305511474609375, |
|
"reward_std": 0.37591269612312317, |
|
"rewards/cosine_len_reward": -0.3305511474609375, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3419.6666870117188, |
|
"epoch": 0.026285714285714287, |
|
"grad_norm": 0.18291282653808594, |
|
"kl": 0.047119140625, |
|
"learning_rate": 9e-07, |
|
"loss": -0.0574, |
|
"reward": -0.3297368660569191, |
|
"reward_std": 0.31174512580037117, |
|
"rewards/cosine_len_reward": -0.3297368660569191, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2836.0, |
|
"epoch": 0.026857142857142857, |
|
"grad_norm": 0.365488737821579, |
|
"kl": 0.04034423828125, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0762, |
|
"reward": -0.35849449783563614, |
|
"reward_std": 0.47992704063653946, |
|
"rewards/cosine_len_reward": -0.35849449783563614, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2915.9583740234375, |
|
"epoch": 0.027428571428571427, |
|
"grad_norm": 0.17787908017635345, |
|
"kl": 0.052734375, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": -0.0863, |
|
"reward": -0.3561765216290951, |
|
"reward_std": 0.6259109601378441, |
|
"rewards/cosine_len_reward": -0.3561765216290951, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2647.7083740234375, |
|
"epoch": 0.028, |
|
"grad_norm": 0.2575705647468567, |
|
"kl": 0.05340576171875, |
|
"learning_rate": 9.6e-07, |
|
"loss": -0.108, |
|
"reward": -0.4050520211458206, |
|
"reward_std": 0.6429588124155998, |
|
"rewards/cosine_len_reward": -0.4050520211458206, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3576.875, |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.15978150069713593, |
|
"kl": 0.04681396484375, |
|
"learning_rate": 9.8e-07, |
|
"loss": -0.0, |
|
"reward": -0.0018655331805348396, |
|
"reward_std": 0.3724985048174858, |
|
"rewards/cosine_len_reward": -0.0018655331805348396, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2881.9583435058594, |
|
"epoch": 0.029142857142857144, |
|
"grad_norm": 0.1981167048215866, |
|
"kl": 0.038238525390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0753, |
|
"reward": -0.14606139063835144, |
|
"reward_std": 0.4776982143521309, |
|
"rewards/cosine_len_reward": -0.14606139063835144, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3474.4583740234375, |
|
"epoch": 0.029714285714285714, |
|
"grad_norm": 0.1720048487186432, |
|
"kl": 0.0439453125, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.0049, |
|
"reward": -0.47205302864313126, |
|
"reward_std": 0.389950018376112, |
|
"rewards/cosine_len_reward": -0.47205302864313126, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3330.2916870117188, |
|
"epoch": 0.030285714285714287, |
|
"grad_norm": 0.18484823405742645, |
|
"kl": 0.042236328125, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": -0.0542, |
|
"reward": 0.1891404390335083, |
|
"reward_std": 0.26611476950347424, |
|
"rewards/cosine_len_reward": 0.1891404390335083, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3254.375, |
|
"epoch": 0.030857142857142857, |
|
"grad_norm": 0.19311083853244781, |
|
"kl": 0.0550537109375, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": -0.0485, |
|
"reward": -0.4130496457219124, |
|
"reward_std": 0.39373762160539627, |
|
"rewards/cosine_len_reward": -0.4130496457219124, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.03142857142857143, |
|
"grad_norm": 0.15826281905174255, |
|
"kl": 0.06268310546875, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": 0.0003, |
|
"reward": -0.4410124532878399, |
|
"reward_std": 0.3935729172080755, |
|
"rewards/cosine_len_reward": -0.4410124532878399, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3478.875, |
|
"epoch": 0.032, |
|
"grad_norm": 0.1909007728099823, |
|
"kl": 0.06378173828125, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": -0.0334, |
|
"reward": -0.14306456223130226, |
|
"reward_std": 0.5048741772770882, |
|
"rewards/cosine_len_reward": -0.14306456223130226, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3030.9583740234375, |
|
"epoch": 0.03257142857142857, |
|
"grad_norm": 0.16624057292938232, |
|
"kl": 0.04339599609375, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0213, |
|
"reward": -0.2449845364317298, |
|
"reward_std": 0.2843635231256485, |
|
"rewards/cosine_len_reward": -0.2449845364317298, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.03314285714285714, |
|
"grad_norm": 0.14864130318164825, |
|
"kl": 0.04376220703125, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0002, |
|
"reward": -0.477820735424757, |
|
"reward_std": 0.42124854028224945, |
|
"rewards/cosine_len_reward": -0.477820735424757, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.03371428571428572, |
|
"grad_norm": 0.17053081095218658, |
|
"kl": 0.05047607421875, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0002, |
|
"reward": -0.018036723136901855, |
|
"reward_std": 0.4528404772281647, |
|
"rewards/cosine_len_reward": -0.018036723136901855, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3503.9583740234375, |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 0.15964706242084503, |
|
"kl": 0.040802001953125, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0065, |
|
"reward": -0.16116868564859033, |
|
"reward_std": 0.40593869611620903, |
|
"rewards/cosine_len_reward": -0.16116868564859033, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2942.5833740234375, |
|
"epoch": 0.03485714285714286, |
|
"grad_norm": 0.25859034061431885, |
|
"kl": 0.05389404296875, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": -0.0849, |
|
"reward": -0.3965909481048584, |
|
"reward_std": 0.4511411488056183, |
|
"rewards/cosine_len_reward": -0.3965909481048584, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2976.75, |
|
"epoch": 0.03542857142857143, |
|
"grad_norm": 0.21166636049747467, |
|
"kl": 0.073974609375, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.0526, |
|
"reward": -0.46443600207567215, |
|
"reward_std": 0.3938862681388855, |
|
"rewards/cosine_len_reward": -0.46443600207567215, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.036, |
|
"grad_norm": 0.15653888881206512, |
|
"kl": 0.04681396484375, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": 0.0002, |
|
"reward": 0.19495274312794209, |
|
"reward_std": 0.27957216277718544, |
|
"rewards/cosine_len_reward": 0.19495274312794209, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.036571428571428574, |
|
"grad_norm": 0.14757102727890015, |
|
"kl": 0.045074462890625, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.0002, |
|
"reward": -0.21563971042633057, |
|
"reward_std": 0.33880844712257385, |
|
"rewards/cosine_len_reward": -0.21563971042633057, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.037142857142857144, |
|
"grad_norm": 0.16783742606639862, |
|
"kl": 0.04986572265625, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1148218410089612, |
|
"reward_std": 0.48075347393751144, |
|
"rewards/cosine_len_reward": 0.1148218410089612, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3331.2083740234375, |
|
"epoch": 0.037714285714285714, |
|
"grad_norm": 0.13948343694210052, |
|
"kl": 0.04083251953125, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": -0.0053, |
|
"reward": -0.0602837149053812, |
|
"reward_std": 0.5337617173790932, |
|
"rewards/cosine_len_reward": -0.0602837149053812, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2811.375, |
|
"epoch": 0.038285714285714284, |
|
"grad_norm": 0.2400464117527008, |
|
"kl": 0.05047607421875, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0394, |
|
"reward": -0.29330265894532204, |
|
"reward_std": 0.3466031104326248, |
|
"rewards/cosine_len_reward": -0.29330265894532204, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.038857142857142854, |
|
"grad_norm": 0.14053481817245483, |
|
"kl": 0.03558349609375, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0001, |
|
"reward": -0.15899624675512314, |
|
"reward_std": 0.4388071522116661, |
|
"rewards/cosine_len_reward": -0.15899624675512314, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3306.6666870117188, |
|
"epoch": 0.03942857142857143, |
|
"grad_norm": 0.1672668159008026, |
|
"kl": 0.0478515625, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": -0.0384, |
|
"reward": -0.5151599273085594, |
|
"reward_std": 0.3366116564720869, |
|
"rewards/cosine_len_reward": -0.5151599273085594, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3380.9583740234375, |
|
"epoch": 0.04, |
|
"grad_norm": 0.20518527925014496, |
|
"kl": 0.0628662109375, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": 0.0704, |
|
"reward": -0.2993215322494507, |
|
"reward_std": 0.4852989763021469, |
|
"rewards/cosine_len_reward": -0.2993215322494507, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2935.3750915527344, |
|
"epoch": 0.04057142857142857, |
|
"grad_norm": 0.21485304832458496, |
|
"kl": 0.050140380859375, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.1084, |
|
"reward": -0.3775600343942642, |
|
"reward_std": 0.2518395222723484, |
|
"rewards/cosine_len_reward": -0.3775600343942642, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3209.5000610351562, |
|
"epoch": 0.04114285714285714, |
|
"grad_norm": 0.18990004062652588, |
|
"kl": 0.05255126953125, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": 0.0459, |
|
"reward": -0.15486798901110888, |
|
"reward_std": 0.564708050340414, |
|
"rewards/cosine_len_reward": -0.15486798901110888, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2904.9583435058594, |
|
"epoch": 0.04171428571428572, |
|
"grad_norm": 0.24567438662052155, |
|
"kl": 0.0462646484375, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.061, |
|
"reward": -0.44290701299905777, |
|
"reward_std": 0.3936958834528923, |
|
"rewards/cosine_len_reward": -0.44290701299905777, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3459.375, |
|
"epoch": 0.04228571428571429, |
|
"grad_norm": 0.13506685197353363, |
|
"kl": 0.04144287109375, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": -0.031, |
|
"reward": -0.12243828363716602, |
|
"reward_std": 0.40473101660609245, |
|
"rewards/cosine_len_reward": -0.12243828363716602, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3118.5000610351562, |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.15806534886360168, |
|
"kl": 0.045806884765625, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.034, |
|
"reward": -0.3190483935177326, |
|
"reward_std": 0.5546181797981262, |
|
"rewards/cosine_len_reward": -0.3190483935177326, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3512.0416870117188, |
|
"epoch": 0.04342857142857143, |
|
"grad_norm": 0.15939749777317047, |
|
"kl": 0.0576171875, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": -0.043, |
|
"reward": -0.38086188584566116, |
|
"reward_std": 0.37592547200620174, |
|
"rewards/cosine_len_reward": -0.38086188584566116, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3427.5416870117188, |
|
"epoch": 0.044, |
|
"grad_norm": 0.15605805814266205, |
|
"kl": 0.0501708984375, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0109, |
|
"reward": -0.1746644452214241, |
|
"reward_std": 0.49196697771549225, |
|
"rewards/cosine_len_reward": -0.1746644452214241, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3170.1250610351562, |
|
"epoch": 0.044571428571428574, |
|
"grad_norm": 0.15445668995380402, |
|
"kl": 0.050537109375, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": 0.0035, |
|
"reward": -0.20145024731755257, |
|
"reward_std": 0.47183725237846375, |
|
"rewards/cosine_len_reward": -0.20145024731755257, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2967.25, |
|
"epoch": 0.045142857142857144, |
|
"grad_norm": 0.17259816825389862, |
|
"kl": 0.0391845703125, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0579, |
|
"reward": -0.2730537634342909, |
|
"reward_std": 0.35801637917757034, |
|
"rewards/cosine_len_reward": -0.2730537634342909, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.1690661758184433, |
|
"kl": 0.04119873046875, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": 0.0002, |
|
"reward": -0.12683004513382912, |
|
"reward_std": 0.4240909740328789, |
|
"rewards/cosine_len_reward": -0.12683004513382912, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3565.375, |
|
"epoch": 0.046285714285714284, |
|
"grad_norm": 0.1551636904478073, |
|
"kl": 0.0504150390625, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": -0.0052, |
|
"reward": 0.41808657720685005, |
|
"reward_std": 0.4268548539839685, |
|
"rewards/cosine_len_reward": 0.41808657720685005, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3496.7083740234375, |
|
"epoch": 0.046857142857142854, |
|
"grad_norm": 0.16650335490703583, |
|
"kl": 0.04388427734375, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": 0.0324, |
|
"reward": -0.5734090358018875, |
|
"reward_std": 0.5235396586358547, |
|
"rewards/cosine_len_reward": -0.5734090358018875, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2126.4166870117188, |
|
"epoch": 0.04742857142857143, |
|
"grad_norm": 0.2955755293369293, |
|
"kl": 0.045166015625, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": -0.0123, |
|
"reward": -0.059800997376441956, |
|
"reward_std": 0.57283616065979, |
|
"rewards/cosine_len_reward": -0.059800997376441956, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3072.9166870117188, |
|
"epoch": 0.048, |
|
"grad_norm": 0.18237070739269257, |
|
"kl": 0.047119140625, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.032, |
|
"reward": 0.17192097008228302, |
|
"reward_std": 0.2965017929673195, |
|
"rewards/cosine_len_reward": 0.17192097008228302, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3147.625, |
|
"epoch": 0.04857142857142857, |
|
"grad_norm": 0.2028658390045166, |
|
"kl": 0.0504150390625, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": -0.1166, |
|
"reward": -0.07770865596830845, |
|
"reward_std": 0.5760391876101494, |
|
"rewards/cosine_len_reward": -0.07770865596830845, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3556.5833740234375, |
|
"epoch": 0.04914285714285714, |
|
"grad_norm": 0.1445421427488327, |
|
"kl": 0.04150390625, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0109, |
|
"reward": -0.3070959039032459, |
|
"reward_std": 0.37722079269587994, |
|
"rewards/cosine_len_reward": -0.3070959039032459, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3544.0416870117188, |
|
"epoch": 0.04971428571428571, |
|
"grad_norm": 0.15989543497562408, |
|
"kl": 0.052490234375, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": 0.0039, |
|
"reward": 0.007470171898603439, |
|
"reward_std": 0.6530721038579941, |
|
"rewards/cosine_len_reward": 0.007470171898603439, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3037.25, |
|
"epoch": 0.05028571428571429, |
|
"grad_norm": 0.19774754345417023, |
|
"kl": 0.050048828125, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": 0.0108, |
|
"reward": -0.5706376060843468, |
|
"reward_std": 0.3964765667915344, |
|
"rewards/cosine_len_reward": -0.5706376060843468, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3323.0416870117188, |
|
"epoch": 0.05085714285714286, |
|
"grad_norm": 0.16871894896030426, |
|
"kl": 0.04931640625, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": -0.0064, |
|
"reward": -0.3148947209119797, |
|
"reward_std": 0.49598074331879616, |
|
"rewards/cosine_len_reward": -0.3148947209119797, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3020.5833740234375, |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.15906846523284912, |
|
"kl": 0.035430908203125, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": -0.1082, |
|
"reward": -0.3871347066015005, |
|
"reward_std": 0.48989518731832504, |
|
"rewards/cosine_len_reward": -0.3871347066015005, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3493.125, |
|
"epoch": 0.052, |
|
"grad_norm": 0.158765509724617, |
|
"kl": 0.047607421875, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0357, |
|
"reward": -0.019672296941280365, |
|
"reward_std": 0.5390463247895241, |
|
"rewards/cosine_len_reward": -0.019672296941280365, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2866.375, |
|
"epoch": 0.052571428571428575, |
|
"grad_norm": 0.3300051689147949, |
|
"kl": 0.05999755859375, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0759, |
|
"reward": -0.32447104528546333, |
|
"reward_std": 0.4015946201980114, |
|
"rewards/cosine_len_reward": -0.32447104528546333, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2927.4166870117188, |
|
"epoch": 0.053142857142857144, |
|
"grad_norm": 0.2505541741847992, |
|
"kl": 0.0513916015625, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": -0.0618, |
|
"reward": -0.26003802567720413, |
|
"reward_std": 0.44223659485578537, |
|
"rewards/cosine_len_reward": -0.26003802567720413, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3462.6250610351562, |
|
"epoch": 0.053714285714285714, |
|
"grad_norm": 0.13466289639472961, |
|
"kl": 0.037841796875, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": -0.0284, |
|
"reward": -0.3509189058095217, |
|
"reward_std": 0.46187154948711395, |
|
"rewards/cosine_len_reward": -0.3509189058095217, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2668.5834045410156, |
|
"epoch": 0.054285714285714284, |
|
"grad_norm": 0.40417179465293884, |
|
"kl": 0.057861328125, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.1304, |
|
"reward": -0.12394729629158974, |
|
"reward_std": 0.36398765817284584, |
|
"rewards/cosine_len_reward": -0.12394729629158974, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.054857142857142854, |
|
"grad_norm": 0.15475483238697052, |
|
"kl": 0.04119873046875, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05401550233364105, |
|
"reward_std": 0.4927036911249161, |
|
"rewards/cosine_len_reward": 0.05401550233364105, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3166.9166870117188, |
|
"epoch": 0.05542857142857143, |
|
"grad_norm": 0.18519708514213562, |
|
"kl": 0.05224609375, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0336, |
|
"reward": -0.1515663117170334, |
|
"reward_std": 0.4508054330945015, |
|
"rewards/cosine_len_reward": -0.1515663117170334, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2021.6667175292969, |
|
"epoch": 0.056, |
|
"grad_norm": 0.5753190517425537, |
|
"kl": 0.047119140625, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0775, |
|
"reward": -0.10128153627738357, |
|
"reward_std": 0.5495500713586807, |
|
"rewards/cosine_len_reward": -0.10128153627738357, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2799.875, |
|
"epoch": 0.05657142857142857, |
|
"grad_norm": 0.25073686242103577, |
|
"kl": 0.04339599609375, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0462, |
|
"reward": 0.16878609731793404, |
|
"reward_std": 0.4969979338347912, |
|
"rewards/cosine_len_reward": 0.16878609731793404, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2547.6666717529297, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.2086445391178131, |
|
"kl": 0.0418701171875, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": 0.0808, |
|
"reward": 0.23498320078942925, |
|
"reward_std": 0.4619377665221691, |
|
"rewards/cosine_len_reward": 0.23498320078942925, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3272.291748046875, |
|
"epoch": 0.05771428571428571, |
|
"grad_norm": 0.16273322701454163, |
|
"kl": 0.04364013671875, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0951, |
|
"reward": -5.926936864852905e-06, |
|
"reward_std": 0.4691709503531456, |
|
"rewards/cosine_len_reward": -5.926936864852905e-06, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3564.25, |
|
"epoch": 0.05828571428571429, |
|
"grad_norm": 0.16961455345153809, |
|
"kl": 0.05303955078125, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.007, |
|
"reward": -0.39636145159602165, |
|
"reward_std": 0.3456159494817257, |
|
"rewards/cosine_len_reward": -0.39636145159602165, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.05885714285714286, |
|
"grad_norm": 0.17157597839832306, |
|
"kl": 0.0433349609375, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.0002, |
|
"reward": -0.5884420499205589, |
|
"reward_std": 0.35518577694892883, |
|
"rewards/cosine_len_reward": -0.5884420499205589, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.05942857142857143, |
|
"grad_norm": 0.14763562381267548, |
|
"kl": 0.05126953125, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": 0.0002, |
|
"reward": -0.1686297096312046, |
|
"reward_std": 0.43287331238389015, |
|
"rewards/cosine_len_reward": -0.1686297096312046, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3580.0416870117188, |
|
"epoch": 0.06, |
|
"grad_norm": 0.14612889289855957, |
|
"kl": 0.04803466796875, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": -0.0009, |
|
"reward": -0.3449864834547043, |
|
"reward_std": 0.30942362174391747, |
|
"rewards/cosine_len_reward": -0.3449864834547043, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3102.7083435058594, |
|
"epoch": 0.060571428571428575, |
|
"grad_norm": 0.19139307737350464, |
|
"kl": 0.04766845703125, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": -0.0008, |
|
"reward": -0.35323648154735565, |
|
"reward_std": 0.4123990163207054, |
|
"rewards/cosine_len_reward": -0.35323648154735565, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2927.375, |
|
"epoch": 0.061142857142857145, |
|
"grad_norm": 0.19999508559703827, |
|
"kl": 0.0523681640625, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0263, |
|
"reward": 0.09981860686093569, |
|
"reward_std": 0.3403412587940693, |
|
"rewards/cosine_len_reward": 0.09981860686093569, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2892.0833740234375, |
|
"epoch": 0.061714285714285715, |
|
"grad_norm": 0.20375655591487885, |
|
"kl": 0.04254150390625, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.0582, |
|
"reward": -0.0642486959695816, |
|
"reward_std": 0.5786372274160385, |
|
"rewards/cosine_len_reward": -0.0642486959695816, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3558.125, |
|
"epoch": 0.062285714285714285, |
|
"grad_norm": 0.15504595637321472, |
|
"kl": 0.051025390625, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.014, |
|
"reward": -0.164622500538826, |
|
"reward_std": 0.3372262194752693, |
|
"rewards/cosine_len_reward": -0.164622500538826, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3348.0833740234375, |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 0.14603567123413086, |
|
"kl": 0.041534423828125, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": -0.0193, |
|
"reward": -0.2948532775044441, |
|
"reward_std": 0.39745140075683594, |
|
"rewards/cosine_len_reward": -0.2948532775044441, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2819.0833435058594, |
|
"epoch": 0.06342857142857143, |
|
"grad_norm": 0.24989280104637146, |
|
"kl": 0.0501708984375, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0304, |
|
"reward": 0.09465552493929863, |
|
"reward_std": 0.4290134608745575, |
|
"rewards/cosine_len_reward": 0.09465552493929863, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.064, |
|
"grad_norm": 0.1428801417350769, |
|
"kl": 0.04595947265625, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0002, |
|
"reward": -0.19034046679735184, |
|
"reward_std": 0.2815712224692106, |
|
"rewards/cosine_len_reward": -0.19034046679735184, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3333.75, |
|
"epoch": 0.06457142857142857, |
|
"grad_norm": 0.17772957682609558, |
|
"kl": 0.0450439453125, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": -0.0675, |
|
"reward": 0.15639985352754593, |
|
"reward_std": 0.49782028794288635, |
|
"rewards/cosine_len_reward": 0.15639985352754593, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3200.2916870117188, |
|
"epoch": 0.06514285714285714, |
|
"grad_norm": 0.18176570534706116, |
|
"kl": 0.041351318359375, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": 0.0359, |
|
"reward": -0.23854705691337585, |
|
"reward_std": 0.3703417256474495, |
|
"rewards/cosine_len_reward": -0.23854705691337585, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3426.2083740234375, |
|
"epoch": 0.06571428571428571, |
|
"grad_norm": 0.18212103843688965, |
|
"kl": 0.05426025390625, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": -0.0068, |
|
"reward": 0.19186732172966003, |
|
"reward_std": 0.5083862394094467, |
|
"rewards/cosine_len_reward": 0.19186732172966003, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.06628571428571428, |
|
"grad_norm": 0.1472100466489792, |
|
"kl": 0.04229736328125, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0002, |
|
"reward": -0.6675661206245422, |
|
"reward_std": 0.25827275589108467, |
|
"rewards/cosine_len_reward": -0.6675661206245422, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3405.0, |
|
"epoch": 0.06685714285714285, |
|
"grad_norm": 0.1627156138420105, |
|
"kl": 0.04766845703125, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.0332, |
|
"reward": -0.04678164981305599, |
|
"reward_std": 0.4391244202852249, |
|
"rewards/cosine_len_reward": -0.04678164981305599, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3078.125, |
|
"epoch": 0.06742857142857143, |
|
"grad_norm": 0.23983289301395416, |
|
"kl": 0.0458984375, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0603, |
|
"reward": -0.4750605896115303, |
|
"reward_std": 0.4151216857135296, |
|
"rewards/cosine_len_reward": -0.4750605896115303, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2418.000030517578, |
|
"epoch": 0.068, |
|
"grad_norm": 0.24952396750450134, |
|
"kl": 0.05657958984375, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": -0.0338, |
|
"reward": -0.32502793427556753, |
|
"reward_std": 0.4044921174645424, |
|
"rewards/cosine_len_reward": -0.32502793427556753, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3572.25, |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.15841135382652283, |
|
"kl": 0.04815673828125, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0057, |
|
"reward": -0.2742099305614829, |
|
"reward_std": 0.6057566404342651, |
|
"rewards/cosine_len_reward": -0.2742099305614829, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3224.6666870117188, |
|
"epoch": 0.06914285714285714, |
|
"grad_norm": 0.17755034565925598, |
|
"kl": 0.0517578125, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": -0.0588, |
|
"reward": 0.009542322251945734, |
|
"reward_std": 0.46895354986190796, |
|
"rewards/cosine_len_reward": 0.009542322251945734, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2758.4166870117188, |
|
"epoch": 0.06971428571428571, |
|
"grad_norm": 0.20300975441932678, |
|
"kl": 0.0401611328125, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.0569, |
|
"reward": -0.16051488742232323, |
|
"reward_std": 0.5097866654396057, |
|
"rewards/cosine_len_reward": -0.16051488742232323, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3386.9583740234375, |
|
"epoch": 0.07028571428571428, |
|
"grad_norm": 0.14953473210334778, |
|
"kl": 0.045745849609375, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": -0.0194, |
|
"reward": -0.28279081732034683, |
|
"reward_std": 0.46559938788414, |
|
"rewards/cosine_len_reward": -0.28279081732034683, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3543.1666870117188, |
|
"epoch": 0.07085714285714285, |
|
"grad_norm": 0.13742747902870178, |
|
"kl": 0.041259765625, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.0186, |
|
"reward": 0.1831405507400632, |
|
"reward_std": 0.4129592701792717, |
|
"rewards/cosine_len_reward": 0.1831405507400632, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2738.7916870117188, |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.22325678169727325, |
|
"kl": 0.0482177734375, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": -0.1429, |
|
"reward": -0.3428646810352802, |
|
"reward_std": 0.418590746819973, |
|
"rewards/cosine_len_reward": -0.3428646810352802, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2154.0, |
|
"epoch": 0.072, |
|
"grad_norm": 0.28196725249290466, |
|
"kl": 0.04034423828125, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.1723, |
|
"reward": -0.04585587605834007, |
|
"reward_std": 0.5331225916743279, |
|
"rewards/cosine_len_reward": -0.04585587605834007, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3005.375030517578, |
|
"epoch": 0.07257142857142856, |
|
"grad_norm": 0.23298974335193634, |
|
"kl": 0.043731689453125, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": -0.1325, |
|
"reward": -0.21542668342590332, |
|
"reward_std": 0.49729710817337036, |
|
"rewards/cosine_len_reward": -0.21542668342590332, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3290.0000610351562, |
|
"epoch": 0.07314285714285715, |
|
"grad_norm": 0.1574346274137497, |
|
"kl": 0.047027587890625, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": -0.0311, |
|
"reward": 0.13029874116182327, |
|
"reward_std": 0.4678277000784874, |
|
"rewards/cosine_len_reward": 0.13029874116182327, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2900.3750610351562, |
|
"epoch": 0.07371428571428572, |
|
"grad_norm": 0.1946721225976944, |
|
"kl": 0.03863525390625, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": -0.0883, |
|
"reward": 0.22734229266643524, |
|
"reward_std": 0.49264590442180634, |
|
"rewards/cosine_len_reward": 0.22734229266643524, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3336.4583740234375, |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 0.16337695717811584, |
|
"kl": 0.0472412109375, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": -0.1032, |
|
"reward": -0.0558868944644928, |
|
"reward_std": 0.22906366735696793, |
|
"rewards/cosine_len_reward": -0.0558868944644928, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3318.0416870117188, |
|
"epoch": 0.07485714285714286, |
|
"grad_norm": 0.14785107970237732, |
|
"kl": 0.04193115234375, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0209, |
|
"reward": -0.4462656928226352, |
|
"reward_std": 0.4424574077129364, |
|
"rewards/cosine_len_reward": -0.4462656928226352, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.75, |
|
"epoch": 0.07542857142857143, |
|
"grad_norm": 0.15938332676887512, |
|
"kl": 0.04559326171875, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": -0.0455, |
|
"reward": -0.19067499786615372, |
|
"reward_std": 0.2114762719720602, |
|
"rewards/cosine_len_reward": -0.19067499786615372, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3216.5416870117188, |
|
"epoch": 0.076, |
|
"grad_norm": 21.750301361083984, |
|
"kl": 7.40423583984375, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": -0.0148, |
|
"reward": 0.1352168396115303, |
|
"reward_std": 0.30277941189706326, |
|
"rewards/cosine_len_reward": 0.1352168396115303, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3362.2500610351562, |
|
"epoch": 0.07657142857142857, |
|
"grad_norm": 0.14318473637104034, |
|
"kl": 0.04864501953125, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": -0.0521, |
|
"reward": 0.48804809525609016, |
|
"reward_std": 0.4641268514096737, |
|
"rewards/cosine_len_reward": 0.48804809525609016, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.2083435058594, |
|
"epoch": 0.07714285714285714, |
|
"grad_norm": 0.17564912140369415, |
|
"kl": 0.05517578125, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": 0.0415, |
|
"reward": -0.44774627685546875, |
|
"reward_std": 0.5005357041954994, |
|
"rewards/cosine_len_reward": -0.44774627685546875, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2600.541748046875, |
|
"epoch": 0.07771428571428571, |
|
"grad_norm": 0.2430720180273056, |
|
"kl": 0.07049560546875, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0399, |
|
"reward": 0.02273743972182274, |
|
"reward_std": 0.6690217182040215, |
|
"rewards/cosine_len_reward": 0.02273743972182274, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3021.1666870117188, |
|
"epoch": 0.07828571428571429, |
|
"grad_norm": 0.1892845779657364, |
|
"kl": 0.0400390625, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": -0.0509, |
|
"reward": -0.007634974084794521, |
|
"reward_std": 0.4675633981823921, |
|
"rewards/cosine_len_reward": -0.007634974084794521, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2813.9166870117188, |
|
"epoch": 0.07885714285714286, |
|
"grad_norm": 0.17883385717868805, |
|
"kl": 0.04425048828125, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": -0.0573, |
|
"reward": -0.4329497180879116, |
|
"reward_std": 0.447217158973217, |
|
"rewards/cosine_len_reward": -0.4329497180879116, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3572.0, |
|
"epoch": 0.07942857142857143, |
|
"grad_norm": 0.14457197487354279, |
|
"kl": 0.04534912109375, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0069, |
|
"reward": -0.40829066932201385, |
|
"reward_std": 0.3393409103155136, |
|
"rewards/cosine_len_reward": -0.40829066932201385, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3333.2916870117188, |
|
"epoch": 0.08, |
|
"grad_norm": 0.14431318640708923, |
|
"kl": 0.04229736328125, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": -0.0213, |
|
"reward": -0.23191562667489052, |
|
"reward_std": 0.45371272414922714, |
|
"rewards/cosine_len_reward": -0.23191562667489052, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3125.2500610351562, |
|
"epoch": 0.08057142857142857, |
|
"grad_norm": 0.18807615339756012, |
|
"kl": 0.056884765625, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.0654, |
|
"reward": -0.58438640832901, |
|
"reward_std": 0.34158289059996605, |
|
"rewards/cosine_len_reward": -0.58438640832901, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3470.5416870117188, |
|
"epoch": 0.08114285714285714, |
|
"grad_norm": 0.16417664289474487, |
|
"kl": 0.054443359375, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": -0.0156, |
|
"reward": 0.4055279679596424, |
|
"reward_std": 0.5106773134320974, |
|
"rewards/cosine_len_reward": 0.4055279679596424, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2767.625, |
|
"epoch": 0.08171428571428571, |
|
"grad_norm": 0.23091278970241547, |
|
"kl": 0.046142578125, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.0155, |
|
"reward": -0.025373689830303192, |
|
"reward_std": 0.3311509620398283, |
|
"rewards/cosine_len_reward": -0.025373689830303192, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3129.4583740234375, |
|
"epoch": 0.08228571428571428, |
|
"grad_norm": 0.21718049049377441, |
|
"kl": 0.04827880859375, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": -0.0761, |
|
"reward": -0.16653983620926738, |
|
"reward_std": 0.6158961765468121, |
|
"rewards/cosine_len_reward": -0.16653983620926738, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3168.875, |
|
"epoch": 0.08285714285714285, |
|
"grad_norm": 0.17413252592086792, |
|
"kl": 0.06317138671875, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0265, |
|
"reward": 0.03458546567708254, |
|
"reward_std": 0.6630469337105751, |
|
"rewards/cosine_len_reward": 0.03458546567708254, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.125030517578, |
|
"epoch": 0.08342857142857144, |
|
"grad_norm": 0.34923824667930603, |
|
"kl": 0.0794677734375, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": -0.1138, |
|
"reward": -0.3882593959569931, |
|
"reward_std": 0.4065255671739578, |
|
"rewards/cosine_len_reward": -0.3882593959569931, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3010.625030517578, |
|
"epoch": 0.084, |
|
"grad_norm": 0.16546739637851715, |
|
"kl": 0.04425048828125, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": -0.0449, |
|
"reward": -0.2956245392560959, |
|
"reward_std": 0.2463199496269226, |
|
"rewards/cosine_len_reward": -0.2956245392560959, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3277.625, |
|
"epoch": 0.08457142857142858, |
|
"grad_norm": 0.1968696564435959, |
|
"kl": 0.04840087890625, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": 0.0347, |
|
"reward": -0.1628122478723526, |
|
"reward_std": 0.32364944741129875, |
|
"rewards/cosine_len_reward": -0.1628122478723526, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3504.0416870117188, |
|
"epoch": 0.08514285714285715, |
|
"grad_norm": 0.17119912803173065, |
|
"kl": 0.06097412109375, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.0393, |
|
"reward": -0.14941053837537766, |
|
"reward_std": 0.3042156994342804, |
|
"rewards/cosine_len_reward": -0.14941053837537766, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3513.0, |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.16584385931491852, |
|
"kl": 0.059814453125, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": -0.0111, |
|
"reward": 0.037046159617602825, |
|
"reward_std": 0.5206618458032608, |
|
"rewards/cosine_len_reward": 0.037046159617602825, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.7500610351562, |
|
"epoch": 0.08628571428571429, |
|
"grad_norm": 0.16107913851737976, |
|
"kl": 0.03912353515625, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": -0.0009, |
|
"reward": -0.03606244549155235, |
|
"reward_std": 0.24746908619999886, |
|
"rewards/cosine_len_reward": -0.03606244549155235, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3524.4166870117188, |
|
"epoch": 0.08685714285714285, |
|
"grad_norm": 0.16076341271400452, |
|
"kl": 0.0489501953125, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0361, |
|
"reward": -0.19618698582053185, |
|
"reward_std": 0.46029967814683914, |
|
"rewards/cosine_len_reward": -0.19618698582053185, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3083.5833435058594, |
|
"epoch": 0.08742857142857142, |
|
"grad_norm": 0.1417928785085678, |
|
"kl": 0.039093017578125, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.0074, |
|
"reward": -0.4691794868558645, |
|
"reward_std": 0.4594339244067669, |
|
"rewards/cosine_len_reward": -0.4691794868558645, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.088, |
|
"grad_norm": 0.16717113554477692, |
|
"kl": 0.05596923828125, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.0002, |
|
"reward": -0.15261722169816494, |
|
"reward_std": 0.3467218354344368, |
|
"rewards/cosine_len_reward": -0.15261722169816494, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.08857142857142856, |
|
"grad_norm": 0.149485781788826, |
|
"kl": 0.044677734375, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.0002, |
|
"reward": -0.22241253405809402, |
|
"reward_std": 0.3932184986770153, |
|
"rewards/cosine_len_reward": -0.22241253405809402, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2931.7084045410156, |
|
"epoch": 0.08914285714285715, |
|
"grad_norm": 0.17892643809318542, |
|
"kl": 0.0556640625, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": -0.0795, |
|
"reward": 0.15793364495038986, |
|
"reward_std": 0.44282783567905426, |
|
"rewards/cosine_len_reward": 0.15793364495038986, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2960.9166870117188, |
|
"epoch": 0.08971428571428572, |
|
"grad_norm": 0.24480818212032318, |
|
"kl": 0.0452880859375, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": -0.1566, |
|
"reward": -0.31062010303139687, |
|
"reward_std": 0.4297754764556885, |
|
"rewards/cosine_len_reward": -0.31062010303139687, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3432.125, |
|
"epoch": 0.09028571428571429, |
|
"grad_norm": 0.15342198312282562, |
|
"kl": 0.04534912109375, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.0076, |
|
"reward": -0.1336694210767746, |
|
"reward_std": 0.5960573703050613, |
|
"rewards/cosine_len_reward": -0.1336694210767746, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3393.541748046875, |
|
"epoch": 0.09085714285714286, |
|
"grad_norm": 0.289773166179657, |
|
"kl": 0.1083984375, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": -0.0277, |
|
"reward": -0.14455506764352322, |
|
"reward_std": 0.4936875104904175, |
|
"rewards/cosine_len_reward": -0.14455506764352322, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09142857142857143, |
|
"grad_norm": 0.14998309314250946, |
|
"kl": 0.04327392578125, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": 0.0002, |
|
"reward": 0.18973015248775482, |
|
"reward_std": 0.40343762934207916, |
|
"rewards/cosine_len_reward": 0.18973015248775482, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3510.75, |
|
"epoch": 0.092, |
|
"grad_norm": 0.15721499919891357, |
|
"kl": 0.052490234375, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": -0.0232, |
|
"reward": -0.1421234980225563, |
|
"reward_std": 0.47652409970760345, |
|
"rewards/cosine_len_reward": -0.1421234980225563, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3427.9583740234375, |
|
"epoch": 0.09257142857142857, |
|
"grad_norm": 0.14637959003448486, |
|
"kl": 0.037109375, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": -0.0183, |
|
"reward": -0.11180838942527771, |
|
"reward_std": 0.6244986057281494, |
|
"rewards/cosine_len_reward": -0.11180838942527771, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09314285714285714, |
|
"grad_norm": 0.1724317967891693, |
|
"kl": 0.0538330078125, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.0002, |
|
"reward": -0.024596035480499268, |
|
"reward_std": 0.39021917432546616, |
|
"rewards/cosine_len_reward": -0.024596035480499268, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3473.7083740234375, |
|
"epoch": 0.09371428571428571, |
|
"grad_norm": 0.13588137924671173, |
|
"kl": 0.04315185546875, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": -0.0452, |
|
"reward": -0.1590297818183899, |
|
"reward_std": 0.4862016811966896, |
|
"rewards/cosine_len_reward": -0.1590297818183899, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09428571428571429, |
|
"grad_norm": 0.14206375181674957, |
|
"kl": 0.0416259765625, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0002, |
|
"reward": -0.3889569491147995, |
|
"reward_std": 0.2624204456806183, |
|
"rewards/cosine_len_reward": -0.3889569491147995, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09485714285714286, |
|
"grad_norm": 0.14916691184043884, |
|
"kl": 0.04949951171875, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0002, |
|
"reward": -0.7786369696259499, |
|
"reward_std": 0.22933637350797653, |
|
"rewards/cosine_len_reward": -0.7786369696259499, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3278.1251220703125, |
|
"epoch": 0.09542857142857143, |
|
"grad_norm": 0.14883826673030853, |
|
"kl": 0.04718017578125, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0132, |
|
"reward": -0.41120439767837524, |
|
"reward_std": 0.5235109552741051, |
|
"rewards/cosine_len_reward": -0.41120439767837524, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3423.8333740234375, |
|
"epoch": 0.096, |
|
"grad_norm": 0.14527034759521484, |
|
"kl": 0.043701171875, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": -0.0132, |
|
"reward": -0.5333987874910235, |
|
"reward_std": 0.3139747306704521, |
|
"rewards/cosine_len_reward": -0.5333987874910235, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3220.625, |
|
"epoch": 0.09657142857142857, |
|
"grad_norm": 0.1992996782064438, |
|
"kl": 0.043212890625, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": -0.0149, |
|
"reward": -0.04628082364797592, |
|
"reward_std": 0.4247637018561363, |
|
"rewards/cosine_len_reward": -0.04628082364797592, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09714285714285714, |
|
"grad_norm": 0.15683424472808838, |
|
"kl": 0.0523681640625, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5332798510789871, |
|
"reward_std": 0.2842548470944166, |
|
"rewards/cosine_len_reward": 0.5332798510789871, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09771428571428571, |
|
"grad_norm": 0.14955052733421326, |
|
"kl": 0.03985595703125, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0002, |
|
"reward": 0.32699378207325935, |
|
"reward_std": 0.3331494480371475, |
|
"rewards/cosine_len_reward": 0.32699378207325935, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3279.125, |
|
"epoch": 0.09828571428571428, |
|
"grad_norm": 0.18859492242336273, |
|
"kl": 0.06658935546875, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": -0.0026, |
|
"reward": -0.15557575225830078, |
|
"reward_std": 0.26120322197675705, |
|
"rewards/cosine_len_reward": -0.15557575225830078, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3291.4583740234375, |
|
"epoch": 0.09885714285714285, |
|
"grad_norm": 0.18678006529808044, |
|
"kl": 0.05291748046875, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.0472, |
|
"reward": -0.348556749522686, |
|
"reward_std": 0.4245801493525505, |
|
"rewards/cosine_len_reward": -0.348556749522686, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.09942857142857142, |
|
"grad_norm": 0.1602144092321396, |
|
"kl": 0.0545654296875, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.0002, |
|
"reward": -0.13419683277606964, |
|
"reward_std": 0.42287082970142365, |
|
"rewards/cosine_len_reward": -0.13419683277606964, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3479.416748046875, |
|
"epoch": 0.1, |
|
"grad_norm": 0.13922035694122314, |
|
"kl": 0.03887939453125, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0199, |
|
"reward": -0.4109889939427376, |
|
"reward_std": 0.4904426783323288, |
|
"rewards/cosine_len_reward": -0.4109889939427376, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3444.8333740234375, |
|
"epoch": 0.10057142857142858, |
|
"grad_norm": 0.16923050582408905, |
|
"kl": 0.048248291015625, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0326, |
|
"reward": 0.014319989830255508, |
|
"reward_std": 0.4780692644417286, |
|
"rewards/cosine_len_reward": 0.014319989830255508, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3200.0416870117188, |
|
"epoch": 0.10114285714285715, |
|
"grad_norm": 0.1463550180196762, |
|
"kl": 0.04083251953125, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": -0.0459, |
|
"reward": -0.2681840620934963, |
|
"reward_std": 0.4939410388469696, |
|
"rewards/cosine_len_reward": -0.2681840620934963, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3515.1666870117188, |
|
"epoch": 0.10171428571428572, |
|
"grad_norm": 0.1453145146369934, |
|
"kl": 0.0435791015625, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0372, |
|
"reward": -0.3437151834368706, |
|
"reward_std": 0.5837994962930679, |
|
"rewards/cosine_len_reward": -0.3437151834368706, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3500.5833740234375, |
|
"epoch": 0.10228571428571429, |
|
"grad_norm": 0.1510867178440094, |
|
"kl": 0.0496826171875, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.0206, |
|
"reward": -0.25809749960899353, |
|
"reward_std": 0.5083489567041397, |
|
"rewards/cosine_len_reward": -0.25809749960899353, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.10285714285714286, |
|
"grad_norm": 0.15222983062267303, |
|
"kl": 0.04559326171875, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": 0.0002, |
|
"reward": 0.055209167301654816, |
|
"reward_std": 0.4925672523677349, |
|
"rewards/cosine_len_reward": 0.055209167301654816, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3583.5, |
|
"epoch": 0.10342857142857143, |
|
"grad_norm": 0.1334083080291748, |
|
"kl": 0.039703369140625, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0003, |
|
"reward": -0.10784891247749329, |
|
"reward_std": 0.3197548817843199, |
|
"rewards/cosine_len_reward": -0.10784891247749329, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.104, |
|
"grad_norm": 0.1473800241947174, |
|
"kl": 0.048828125, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.0002, |
|
"reward": -0.5589503794908524, |
|
"reward_std": 0.3745484910905361, |
|
"rewards/cosine_len_reward": -0.5589503794908524, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.6666870117188, |
|
"epoch": 0.10457142857142857, |
|
"grad_norm": 0.17142295837402344, |
|
"kl": 0.04559326171875, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0476, |
|
"reward": -0.03992478083819151, |
|
"reward_std": 0.6503849253058434, |
|
"rewards/cosine_len_reward": -0.03992478083819151, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3193.7083740234375, |
|
"epoch": 0.10514285714285715, |
|
"grad_norm": 0.18599683046340942, |
|
"kl": 0.05780029296875, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.0201, |
|
"reward": -0.1778981164097786, |
|
"reward_std": 0.38635776191949844, |
|
"rewards/cosine_len_reward": -0.1778981164097786, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2720.625, |
|
"epoch": 0.10571428571428572, |
|
"grad_norm": 0.390965074300766, |
|
"kl": 0.08233642578125, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.1901, |
|
"reward": -0.395973265171051, |
|
"reward_std": 0.42710288241505623, |
|
"rewards/cosine_len_reward": -0.395973265171051, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2766.4583587646484, |
|
"epoch": 0.10628571428571429, |
|
"grad_norm": 0.2267504781484604, |
|
"kl": 0.04254150390625, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0364, |
|
"reward": -0.3062758632004261, |
|
"reward_std": 0.33072756230831146, |
|
"rewards/cosine_len_reward": -0.3062758632004261, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3509.25, |
|
"epoch": 0.10685714285714286, |
|
"grad_norm": 0.16118666529655457, |
|
"kl": 0.05560302734375, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": -0.0053, |
|
"reward": -0.18290760926902294, |
|
"reward_std": 0.38534967601299286, |
|
"rewards/cosine_len_reward": -0.18290760926902294, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3558.6666870117188, |
|
"epoch": 0.10742857142857143, |
|
"grad_norm": 0.1462363302707672, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": 0.009, |
|
"reward": -0.3126313886605203, |
|
"reward_std": 0.3192543825134635, |
|
"rewards/cosine_len_reward": -0.3126313886605203, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.108, |
|
"grad_norm": 0.14533929526805878, |
|
"kl": 0.03973388671875, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.0002, |
|
"reward": -0.4464322179555893, |
|
"reward_std": 0.336882084608078, |
|
"rewards/cosine_len_reward": -0.4464322179555893, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3254.5416870117188, |
|
"epoch": 0.10857142857142857, |
|
"grad_norm": 0.16986821591854095, |
|
"kl": 0.0482177734375, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0441, |
|
"reward": -0.5396054945886135, |
|
"reward_std": 0.3040096387267113, |
|
"rewards/cosine_len_reward": -0.5396054945886135, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3005.4583435058594, |
|
"epoch": 0.10914285714285714, |
|
"grad_norm": 0.2879737317562103, |
|
"kl": 0.0799560546875, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.1028, |
|
"reward": -0.04980655759572983, |
|
"reward_std": 0.3840261846780777, |
|
"rewards/cosine_len_reward": -0.04980655759572983, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3162.1666870117188, |
|
"epoch": 0.10971428571428571, |
|
"grad_norm": 0.18578751385211945, |
|
"kl": 0.0562744140625, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0364, |
|
"reward": -0.09453568607568741, |
|
"reward_std": 0.5176538079977036, |
|
"rewards/cosine_len_reward": -0.09453568607568741, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.11028571428571429, |
|
"grad_norm": 0.14698241651058197, |
|
"kl": 0.03900146484375, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0002, |
|
"reward": -0.20598075166344643, |
|
"reward_std": 0.39230766519904137, |
|
"rewards/cosine_len_reward": -0.20598075166344643, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2715.1666870117188, |
|
"epoch": 0.11085714285714286, |
|
"grad_norm": 0.27284160256385803, |
|
"kl": 0.06475830078125, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0727, |
|
"reward": -0.30249132215976715, |
|
"reward_std": 0.6033280193805695, |
|
"rewards/cosine_len_reward": -0.30249132215976715, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3347.3333740234375, |
|
"epoch": 0.11142857142857143, |
|
"grad_norm": 0.17070569097995758, |
|
"kl": 0.059814453125, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": -0.028, |
|
"reward": -0.11190107837319374, |
|
"reward_std": 0.5215081870555878, |
|
"rewards/cosine_len_reward": -0.11190107837319374, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2826.0, |
|
"epoch": 0.112, |
|
"grad_norm": 0.20371770858764648, |
|
"kl": 0.040924072265625, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.0097, |
|
"reward": -0.24280931055545807, |
|
"reward_std": 0.24882662668824196, |
|
"rewards/cosine_len_reward": -0.24280931055545807, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3179.5416870117188, |
|
"epoch": 0.11257142857142857, |
|
"grad_norm": 0.1733676791191101, |
|
"kl": 0.04510498046875, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": -0.0353, |
|
"reward": -0.5386351570487022, |
|
"reward_std": 0.47516174614429474, |
|
"rewards/cosine_len_reward": -0.5386351570487022, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3113.3333435058594, |
|
"epoch": 0.11314285714285714, |
|
"grad_norm": 0.2758618891239166, |
|
"kl": 0.04486083984375, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": -0.0971, |
|
"reward": 0.38623735681176186, |
|
"reward_std": 0.37613956816494465, |
|
"rewards/cosine_len_reward": 0.38623735681176186, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3521.166748046875, |
|
"epoch": 0.11371428571428571, |
|
"grad_norm": 0.16294990479946136, |
|
"kl": 0.0465087890625, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": -0.0159, |
|
"reward": -0.03587418794631958, |
|
"reward_std": 0.39774488657712936, |
|
"rewards/cosine_len_reward": -0.03587418794631958, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3434.5833740234375, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.18072271347045898, |
|
"kl": 0.05572509765625, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.0029, |
|
"reward": -0.29647984355688095, |
|
"reward_std": 0.43220172449946404, |
|
"rewards/cosine_len_reward": -0.29647984355688095, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3440.9583740234375, |
|
"epoch": 0.11485714285714285, |
|
"grad_norm": 0.18761895596981049, |
|
"kl": 0.05572509765625, |
|
"learning_rate": 7.75e-07, |
|
"loss": -0.0399, |
|
"reward": 0.26733987778425217, |
|
"reward_std": 0.4860861897468567, |
|
"rewards/cosine_len_reward": 0.26733987778425217, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3017.9583435058594, |
|
"epoch": 0.11542857142857142, |
|
"grad_norm": 0.2005700170993805, |
|
"kl": 0.046875, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": 0.0871, |
|
"reward": -0.1812760317698121, |
|
"reward_std": 0.5165529623627663, |
|
"rewards/cosine_len_reward": -0.1812760317698121, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3487.3333740234375, |
|
"epoch": 0.116, |
|
"grad_norm": 0.15808533132076263, |
|
"kl": 0.08843994140625, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": -0.0573, |
|
"reward": 0.20985493808984756, |
|
"reward_std": 0.2973843924701214, |
|
"rewards/cosine_len_reward": 0.20985493808984756, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2917.250030517578, |
|
"epoch": 0.11657142857142858, |
|
"grad_norm": 0.15920044481754303, |
|
"kl": 0.040771484375, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.0565, |
|
"reward": -0.1725541353225708, |
|
"reward_std": 0.26636107824742794, |
|
"rewards/cosine_len_reward": -0.1725541353225708, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3479.0416870117188, |
|
"epoch": 0.11714285714285715, |
|
"grad_norm": 0.16037701070308685, |
|
"kl": 0.04156494140625, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": -0.0214, |
|
"reward": 0.1938185803592205, |
|
"reward_std": 0.42734283953905106, |
|
"rewards/cosine_len_reward": 0.1938185803592205, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3439.6250610351562, |
|
"epoch": 0.11771428571428572, |
|
"grad_norm": 0.15823422372341156, |
|
"kl": 0.0565185546875, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": -0.0167, |
|
"reward": 0.1364712193608284, |
|
"reward_std": 0.47612153738737106, |
|
"rewards/cosine_len_reward": 0.1364712193608284, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3275.8333740234375, |
|
"epoch": 0.11828571428571429, |
|
"grad_norm": 0.16698057949543, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": -0.0623, |
|
"reward": -0.13615961745381355, |
|
"reward_std": 0.25652020424604416, |
|
"rewards/cosine_len_reward": -0.13615961745381355, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2980.6250610351562, |
|
"epoch": 0.11885714285714286, |
|
"grad_norm": 0.1898704171180725, |
|
"kl": 0.0433349609375, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.1447, |
|
"reward": -0.31068204157054424, |
|
"reward_std": 0.48038899153470993, |
|
"rewards/cosine_len_reward": -0.31068204157054424, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2793.1666717529297, |
|
"epoch": 0.11942857142857143, |
|
"grad_norm": 0.34890925884246826, |
|
"kl": 0.043212890625, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.1303, |
|
"reward": -0.18676769733428955, |
|
"reward_std": 0.4481005147099495, |
|
"rewards/cosine_len_reward": -0.18676769733428955, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.12, |
|
"grad_norm": 0.15249201655387878, |
|
"kl": 0.05181884765625, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.0002, |
|
"reward": -0.07446567714214325, |
|
"reward_std": 0.18282443098723888, |
|
"rewards/cosine_len_reward": -0.07446567714214325, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2958.8333740234375, |
|
"epoch": 0.12057142857142857, |
|
"grad_norm": 0.17982329428195953, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": -0.1281, |
|
"reward": 0.04834838956594467, |
|
"reward_std": 0.4822767600417137, |
|
"rewards/cosine_len_reward": 0.04834838956594467, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3580.625, |
|
"epoch": 0.12114285714285715, |
|
"grad_norm": 0.17494414746761322, |
|
"kl": 0.05645751953125, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.0015, |
|
"reward": 0.2551577538251877, |
|
"reward_std": 0.322468139231205, |
|
"rewards/cosine_len_reward": 0.2551577538251877, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3130.0416870117188, |
|
"epoch": 0.12171428571428572, |
|
"grad_norm": 0.22192618250846863, |
|
"kl": 0.0361328125, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": -0.0966, |
|
"reward": 0.09590141475200653, |
|
"reward_std": 0.3808777518570423, |
|
"rewards/cosine_len_reward": 0.09590141475200653, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3035.8333435058594, |
|
"epoch": 0.12228571428571429, |
|
"grad_norm": 0.23687028884887695, |
|
"kl": 0.04388427734375, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": -0.0639, |
|
"reward": -0.2096049189567566, |
|
"reward_std": 0.36091630533337593, |
|
"rewards/cosine_len_reward": -0.2096049189567566, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3564.6666870117188, |
|
"epoch": 0.12285714285714286, |
|
"grad_norm": 0.16293585300445557, |
|
"kl": 0.0419921875, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.0036, |
|
"reward": -0.3932174853980541, |
|
"reward_std": 0.4780453070998192, |
|
"rewards/cosine_len_reward": -0.3932174853980541, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2151.041748046875, |
|
"epoch": 0.12342857142857143, |
|
"grad_norm": 0.18021531403064728, |
|
"kl": 0.046142578125, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0426, |
|
"reward": -0.43433932960033417, |
|
"reward_std": 0.39597445726394653, |
|
"rewards/cosine_len_reward": -0.43433932960033417, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.124, |
|
"grad_norm": 0.1701132208108902, |
|
"kl": 0.0526123046875, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.0002, |
|
"reward": 0.27941954880952835, |
|
"reward_std": 0.33769937977194786, |
|
"rewards/cosine_len_reward": 0.27941954880952835, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3329.75, |
|
"epoch": 0.12457142857142857, |
|
"grad_norm": 0.164340078830719, |
|
"kl": 0.0550537109375, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": -0.05, |
|
"reward": -0.12782394886016846, |
|
"reward_std": 0.5567526817321777, |
|
"rewards/cosine_len_reward": -0.12782394886016846, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2744.3750610351562, |
|
"epoch": 0.12514285714285714, |
|
"grad_norm": 0.22663144767284393, |
|
"kl": 0.05523681640625, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": -0.0076, |
|
"reward": -0.43699468672275543, |
|
"reward_std": 0.4379548355937004, |
|
"rewards/cosine_len_reward": -0.43699468672275543, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3289.916748046875, |
|
"epoch": 0.12571428571428572, |
|
"grad_norm": 0.15933476388454437, |
|
"kl": 0.04522705078125, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.0215, |
|
"reward": 0.18807393498718739, |
|
"reward_std": 0.5197709389030933, |
|
"rewards/cosine_len_reward": 0.18807393498718739, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2958.1666870117188, |
|
"epoch": 0.12628571428571428, |
|
"grad_norm": 0.21169209480285645, |
|
"kl": 0.0587158203125, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": -0.0842, |
|
"reward": -0.10242819041013718, |
|
"reward_std": 0.5134933441877365, |
|
"rewards/cosine_len_reward": -0.10242819041013718, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2638.3750610351562, |
|
"epoch": 0.12685714285714286, |
|
"grad_norm": 0.3018430769443512, |
|
"kl": 0.04803466796875, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": -0.1641, |
|
"reward": -0.17160223424434662, |
|
"reward_std": 0.40449361503124237, |
|
"rewards/cosine_len_reward": -0.17160223424434662, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2908.2916870117188, |
|
"epoch": 0.12742857142857142, |
|
"grad_norm": 0.4184667766094208, |
|
"kl": 0.0513916015625, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": -0.1032, |
|
"reward": -0.2708446606993675, |
|
"reward_std": 0.4833526462316513, |
|
"rewards/cosine_len_reward": -0.2708446606993675, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.128, |
|
"grad_norm": 0.14835132658481598, |
|
"kl": 0.0413818359375, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0002, |
|
"reward": -0.06757992756320164, |
|
"reward_std": 0.6502542048692703, |
|
"rewards/cosine_len_reward": -0.06757992756320164, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3400.1250610351562, |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.15779976546764374, |
|
"kl": 0.04864501953125, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": -0.0015, |
|
"reward": -0.3096113298088312, |
|
"reward_std": 0.5966300740838051, |
|
"rewards/cosine_len_reward": -0.3096113298088312, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2957.0000610351562, |
|
"epoch": 0.12914285714285714, |
|
"grad_norm": 0.2116604745388031, |
|
"kl": 0.04412841796875, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0589, |
|
"reward": 0.3183956618886441, |
|
"reward_std": 0.5148555189371109, |
|
"rewards/cosine_len_reward": 0.3183956618886441, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2699.5416870117188, |
|
"epoch": 0.12971428571428573, |
|
"grad_norm": 0.2158554196357727, |
|
"kl": 0.036590576171875, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0252, |
|
"reward": -0.019072268158197403, |
|
"reward_std": 0.51263727247715, |
|
"rewards/cosine_len_reward": -0.019072268158197403, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2288.250030517578, |
|
"epoch": 0.13028571428571428, |
|
"grad_norm": 0.4335751235485077, |
|
"kl": 0.04742431640625, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.1996, |
|
"reward": -0.08665098808705807, |
|
"reward_std": 0.4403616450726986, |
|
"rewards/cosine_len_reward": -0.08665098808705807, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3032.75, |
|
"epoch": 0.13085714285714287, |
|
"grad_norm": 0.17546547949314117, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": -0.0213, |
|
"reward": -0.4365268647670746, |
|
"reward_std": 0.30412301421165466, |
|
"rewards/cosine_len_reward": -0.4365268647670746, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.13142857142857142, |
|
"grad_norm": 0.14412453770637512, |
|
"kl": 0.04443359375, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": 0.0002, |
|
"reward": 0.007472768425941467, |
|
"reward_std": 0.3222753778100014, |
|
"rewards/cosine_len_reward": 0.007472768425941467, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3375.6666870117188, |
|
"epoch": 0.132, |
|
"grad_norm": 0.21483556926250458, |
|
"kl": 0.04730224609375, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": -0.0604, |
|
"reward": 0.14519068226218224, |
|
"reward_std": 0.3973130788654089, |
|
"rewards/cosine_len_reward": 0.14519068226218224, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2880.75, |
|
"epoch": 0.13257142857142856, |
|
"grad_norm": 0.19802089035511017, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0559, |
|
"reward": -0.46153586730360985, |
|
"reward_std": 0.4743086025118828, |
|
"rewards/cosine_len_reward": -0.46153586730360985, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3466.0416870117188, |
|
"epoch": 0.13314285714285715, |
|
"grad_norm": 0.14607660472393036, |
|
"kl": 0.04266357421875, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": -0.0599, |
|
"reward": -0.0832620239816606, |
|
"reward_std": 0.37259791046380997, |
|
"rewards/cosine_len_reward": -0.0832620239816606, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1337142857142857, |
|
"grad_norm": 0.15555140376091003, |
|
"kl": 0.0460205078125, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.0002, |
|
"reward": -0.141191266477108, |
|
"reward_std": 0.2727678678929806, |
|
"rewards/cosine_len_reward": -0.141191266477108, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2840.375, |
|
"epoch": 0.13428571428571429, |
|
"grad_norm": 0.2309923619031906, |
|
"kl": 0.04388427734375, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0619, |
|
"reward": -0.030156303197145462, |
|
"reward_std": 0.4152325987815857, |
|
"rewards/cosine_len_reward": -0.030156303197145462, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2298.25, |
|
"epoch": 0.13485714285714287, |
|
"grad_norm": 0.43432554602622986, |
|
"kl": 0.0498046875, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": -0.1391, |
|
"reward": -0.2793803792446852, |
|
"reward_std": 0.5643313974142075, |
|
"rewards/cosine_len_reward": -0.2793803792446852, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2497.625, |
|
"epoch": 0.13542857142857143, |
|
"grad_norm": 0.29664841294288635, |
|
"kl": 0.06695556640625, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": -0.0854, |
|
"reward": -0.17586842365562916, |
|
"reward_std": 0.5680941194295883, |
|
"rewards/cosine_len_reward": -0.17586842365562916, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3128.7083740234375, |
|
"epoch": 0.136, |
|
"grad_norm": 0.21420085430145264, |
|
"kl": 0.0462646484375, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": -0.0854, |
|
"reward": -0.4128701612353325, |
|
"reward_std": 0.3054894767701626, |
|
"rewards/cosine_len_reward": -0.4128701612353325, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.13657142857142857, |
|
"grad_norm": 0.1544465720653534, |
|
"kl": 0.05194091796875, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.0002, |
|
"reward": 0.15613190457224846, |
|
"reward_std": 0.3620890751481056, |
|
"rewards/cosine_len_reward": 0.15613190457224846, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.13714285714285715, |
|
"grad_norm": 0.17545334994792938, |
|
"kl": 0.04376220703125, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.0002, |
|
"reward": -0.3966597355902195, |
|
"reward_std": 0.3349955417215824, |
|
"rewards/cosine_len_reward": -0.3966597355902195, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2736.4583740234375, |
|
"epoch": 0.1377142857142857, |
|
"grad_norm": 0.2892467975616455, |
|
"kl": 0.04608154296875, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0983, |
|
"reward": 0.0027789995074272156, |
|
"reward_std": 0.4166061468422413, |
|
"rewards/cosine_len_reward": 0.0027789995074272156, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1382857142857143, |
|
"grad_norm": 0.15071091055870056, |
|
"kl": 0.05181884765625, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.0002, |
|
"reward": 3.524124622344971e-05, |
|
"reward_std": 0.23610319755971432, |
|
"rewards/cosine_len_reward": 3.524124622344971e-05, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3580.7916870117188, |
|
"epoch": 0.13885714285714285, |
|
"grad_norm": 0.17169542610645294, |
|
"kl": 0.05706787109375, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": -0.0004, |
|
"reward": -0.4086134284734726, |
|
"reward_std": 0.34414974600076675, |
|
"rewards/cosine_len_reward": -0.4086134284734726, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3362.0416870117188, |
|
"epoch": 0.13942857142857143, |
|
"grad_norm": 0.28142961859703064, |
|
"kl": 0.0596923828125, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0377, |
|
"reward": -0.5328942588530481, |
|
"reward_std": 0.45538509637117386, |
|
"rewards/cosine_len_reward": -0.5328942588530481, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3371.9583740234375, |
|
"epoch": 0.14, |
|
"grad_norm": 0.16989277303218842, |
|
"kl": 0.0560302734375, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": 0.0021, |
|
"reward": -0.057370007038116455, |
|
"reward_std": 0.44972486793994904, |
|
"rewards/cosine_len_reward": -0.057370007038116455, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3018.541717529297, |
|
"epoch": 0.14057142857142857, |
|
"grad_norm": 0.14576825499534607, |
|
"kl": 0.03741455078125, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": -0.015, |
|
"reward": 0.28845351678319275, |
|
"reward_std": 0.40407272428274155, |
|
"rewards/cosine_len_reward": 0.28845351678319275, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.14114285714285715, |
|
"grad_norm": 0.1530689150094986, |
|
"kl": 0.0460205078125, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1336237620562315, |
|
"reward_std": 0.47701434791088104, |
|
"rewards/cosine_len_reward": 0.1336237620562315, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2176.7500228881836, |
|
"epoch": 0.1417142857142857, |
|
"grad_norm": 0.38253119587898254, |
|
"kl": 0.04736328125, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": 0.1374, |
|
"reward": -0.14296885952353477, |
|
"reward_std": 0.45761511474847794, |
|
"rewards/cosine_len_reward": -0.14296885952353477, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3580.25, |
|
"epoch": 0.1422857142857143, |
|
"grad_norm": 0.16433897614479065, |
|
"kl": 0.04766845703125, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0014, |
|
"reward": -0.33391520008444786, |
|
"reward_std": 0.3410606235265732, |
|
"rewards/cosine_len_reward": -0.33391520008444786, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3267.875, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.17220118641853333, |
|
"kl": 0.04205322265625, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": -0.0927, |
|
"reward": -0.2974054589867592, |
|
"reward_std": 0.5179209262132645, |
|
"rewards/cosine_len_reward": -0.2974054589867592, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2834.875030517578, |
|
"epoch": 0.14342857142857143, |
|
"grad_norm": 0.1579933613538742, |
|
"kl": 0.042724609375, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": -0.1072, |
|
"reward": -0.3689499036408961, |
|
"reward_std": 0.48299194872379303, |
|
"rewards/cosine_len_reward": -0.3689499036408961, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3071.5833740234375, |
|
"epoch": 0.144, |
|
"grad_norm": 0.265338659286499, |
|
"kl": 0.05572509765625, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": -0.0647, |
|
"reward": -0.3644823618233204, |
|
"reward_std": 0.4328817129135132, |
|
"rewards/cosine_len_reward": -0.3644823618233204, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3208.8333740234375, |
|
"epoch": 0.14457142857142857, |
|
"grad_norm": 0.17507553100585938, |
|
"kl": 0.045654296875, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": -0.0457, |
|
"reward": 0.07047359831631184, |
|
"reward_std": 0.47896186634898186, |
|
"rewards/cosine_len_reward": 0.07047359831631184, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3244.2916870117188, |
|
"epoch": 0.14514285714285713, |
|
"grad_norm": 0.13542579114437103, |
|
"kl": 0.03094482421875, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": -0.0562, |
|
"reward": 0.38170455396175385, |
|
"reward_std": 0.4014411121606827, |
|
"rewards/cosine_len_reward": 0.38170455396175385, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3375.375, |
|
"epoch": 0.1457142857142857, |
|
"grad_norm": 0.1907050907611847, |
|
"kl": 0.06121826171875, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0057, |
|
"reward": -0.2342095747590065, |
|
"reward_std": 0.5328280627727509, |
|
"rewards/cosine_len_reward": -0.2342095747590065, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1462857142857143, |
|
"grad_norm": 0.14469757676124573, |
|
"kl": 0.03936767578125, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0002, |
|
"reward": -0.2734826896339655, |
|
"reward_std": 0.2763890288770199, |
|
"rewards/cosine_len_reward": -0.2734826896339655, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3096.2916870117188, |
|
"epoch": 0.14685714285714285, |
|
"grad_norm": 1.1167607307434082, |
|
"kl": 0.1475830078125, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.1404, |
|
"reward": -0.2988658621907234, |
|
"reward_std": 0.4727318063378334, |
|
"rewards/cosine_len_reward": -0.2988658621907234, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2151.0, |
|
"epoch": 0.14742857142857144, |
|
"grad_norm": 0.2602831721305847, |
|
"kl": 0.03936767578125, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.142, |
|
"reward": 0.09373210370540619, |
|
"reward_std": 0.3661756291985512, |
|
"rewards/cosine_len_reward": 0.09373210370540619, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.148, |
|
"grad_norm": 0.15572789311408997, |
|
"kl": 0.04443359375, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.0002, |
|
"reward": -0.016586612910032272, |
|
"reward_std": 0.45464975386857986, |
|
"rewards/cosine_len_reward": -0.016586612910032272, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3412.0000610351562, |
|
"epoch": 0.14857142857142858, |
|
"grad_norm": 0.18364061415195465, |
|
"kl": 0.04327392578125, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": 0.0325, |
|
"reward": 0.19075123965740204, |
|
"reward_std": 0.40447286888957024, |
|
"rewards/cosine_len_reward": 0.19075123965740204, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.14914285714285713, |
|
"grad_norm": 0.13612648844718933, |
|
"kl": 0.041015625, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0002, |
|
"reward": -0.20534592866897583, |
|
"reward_std": 0.292750746011734, |
|
"rewards/cosine_len_reward": -0.20534592866897583, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3564.125, |
|
"epoch": 0.14971428571428572, |
|
"grad_norm": 0.14559589326381683, |
|
"kl": 0.0447998046875, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0092, |
|
"reward": -0.18081504851579666, |
|
"reward_std": 0.4650590941309929, |
|
"rewards/cosine_len_reward": -0.18081504851579666, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3468.375, |
|
"epoch": 0.15028571428571427, |
|
"grad_norm": 0.17928771674633026, |
|
"kl": 0.05291748046875, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.0121, |
|
"reward": 0.09083682298660278, |
|
"reward_std": 0.3736903816461563, |
|
"rewards/cosine_len_reward": 0.09083682298660278, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.15085714285714286, |
|
"grad_norm": 0.20904290676116943, |
|
"kl": 0.05096435546875, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.0002, |
|
"reward": -0.08812252432107925, |
|
"reward_std": 0.37937742099165916, |
|
"rewards/cosine_len_reward": -0.08812252432107925, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3428.0416870117188, |
|
"epoch": 0.15142857142857144, |
|
"grad_norm": 0.1781691014766693, |
|
"kl": 0.0589599609375, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": -0.0085, |
|
"reward": -0.18174926191568375, |
|
"reward_std": 0.5525826513767242, |
|
"rewards/cosine_len_reward": -0.18174926191568375, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3207.125, |
|
"epoch": 0.152, |
|
"grad_norm": 0.16391095519065857, |
|
"kl": 0.055908203125, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.01, |
|
"reward": -0.08231775928288698, |
|
"reward_std": 0.4217005968093872, |
|
"rewards/cosine_len_reward": -0.08231775928288698, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2443.0834045410156, |
|
"epoch": 0.15257142857142858, |
|
"grad_norm": 0.5960751175880432, |
|
"kl": 0.064697265625, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.3249, |
|
"reward": -0.4397448003292084, |
|
"reward_std": 0.4261315129697323, |
|
"rewards/cosine_len_reward": -0.4397448003292084, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2792.7916717529297, |
|
"epoch": 0.15314285714285714, |
|
"grad_norm": 0.37113648653030396, |
|
"kl": 0.04974365234375, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.1018, |
|
"reward": 0.23075676709413528, |
|
"reward_std": 0.3936547078192234, |
|
"rewards/cosine_len_reward": 0.23075676709413528, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3461.5833740234375, |
|
"epoch": 0.15371428571428572, |
|
"grad_norm": 0.15312907099723816, |
|
"kl": 0.0408935546875, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.0456, |
|
"reward": 0.018859659554436803, |
|
"reward_std": 0.36391543596982956, |
|
"rewards/cosine_len_reward": 0.018859659554436803, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2649.0833740234375, |
|
"epoch": 0.15428571428571428, |
|
"grad_norm": 0.2657376527786255, |
|
"kl": 0.05450439453125, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": -0.0693, |
|
"reward": -0.2678499221801758, |
|
"reward_std": 0.4750388078391552, |
|
"rewards/cosine_len_reward": -0.2678499221801758, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3246.2084350585938, |
|
"epoch": 0.15485714285714286, |
|
"grad_norm": 0.26408547163009644, |
|
"kl": 0.06634521484375, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.1012, |
|
"reward": -0.10909051727503538, |
|
"reward_std": 0.4644903093576431, |
|
"rewards/cosine_len_reward": -0.10909051727503538, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2596.5416870117188, |
|
"epoch": 0.15542857142857142, |
|
"grad_norm": 0.3016497492790222, |
|
"kl": 0.0518798828125, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": 0.2311, |
|
"reward": -0.2824300043284893, |
|
"reward_std": 0.4353151246905327, |
|
"rewards/cosine_len_reward": -0.2824300043284893, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3439.3333740234375, |
|
"epoch": 0.156, |
|
"grad_norm": 0.18352919816970825, |
|
"kl": 0.05572509765625, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": -0.0308, |
|
"reward": -0.41946647968143225, |
|
"reward_std": 0.5297495797276497, |
|
"rewards/cosine_len_reward": -0.41946647968143225, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.15657142857142858, |
|
"grad_norm": 0.14747214317321777, |
|
"kl": 0.0426025390625, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.0002, |
|
"reward": -0.2709335945546627, |
|
"reward_std": 0.3659312203526497, |
|
"rewards/cosine_len_reward": -0.2709335945546627, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3159.541748046875, |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.21950587630271912, |
|
"kl": 0.05682373046875, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": -0.0847, |
|
"reward": -0.36149609088897705, |
|
"reward_std": 0.4367978870868683, |
|
"rewards/cosine_len_reward": -0.36149609088897705, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2158.1666717529297, |
|
"epoch": 0.15771428571428572, |
|
"grad_norm": 0.2495608627796173, |
|
"kl": 0.0460205078125, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.07, |
|
"reward": -0.10891957813873887, |
|
"reward_std": 0.31222014874219894, |
|
"rewards/cosine_len_reward": -0.10891957813873887, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.7916870117188, |
|
"epoch": 0.15828571428571428, |
|
"grad_norm": 0.15767265856266022, |
|
"kl": 0.04681396484375, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.0018, |
|
"reward": -0.11694742739200592, |
|
"reward_std": 0.5289106294512749, |
|
"rewards/cosine_len_reward": -0.11694742739200592, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3408.4166870117188, |
|
"epoch": 0.15885714285714286, |
|
"grad_norm": 0.15594230592250824, |
|
"kl": 0.0472412109375, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0096, |
|
"reward": -0.6330656111240387, |
|
"reward_std": 0.4878097102046013, |
|
"rewards/cosine_len_reward": -0.6330656111240387, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3456.7916870117188, |
|
"epoch": 0.15942857142857142, |
|
"grad_norm": 0.17065085470676422, |
|
"kl": 0.0494384765625, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": -0.0232, |
|
"reward": -0.011678516864776611, |
|
"reward_std": 0.29044996201992035, |
|
"rewards/cosine_len_reward": -0.011678516864776611, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3121.3333740234375, |
|
"epoch": 0.16, |
|
"grad_norm": 0.18095840513706207, |
|
"kl": 0.04248046875, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.0204, |
|
"reward": 0.11361887771636248, |
|
"reward_std": 0.4011372458189726, |
|
"rewards/cosine_len_reward": 0.11361887771636248, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2895.125030517578, |
|
"epoch": 0.16057142857142856, |
|
"grad_norm": 0.2143956571817398, |
|
"kl": 0.059814453125, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0687, |
|
"reward": -0.08828364498913288, |
|
"reward_std": 0.6478094309568405, |
|
"rewards/cosine_len_reward": -0.08828364498913288, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3567.1666870117188, |
|
"epoch": 0.16114285714285714, |
|
"grad_norm": 0.188002809882164, |
|
"kl": 0.05224609375, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.0041, |
|
"reward": 0.028641201555728912, |
|
"reward_std": 0.5138514451682568, |
|
"rewards/cosine_len_reward": 0.028641201555728912, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2583.625030517578, |
|
"epoch": 0.16171428571428573, |
|
"grad_norm": 0.2364453375339508, |
|
"kl": 0.05224609375, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.0869, |
|
"reward": -0.26807255670428276, |
|
"reward_std": 0.41767074167728424, |
|
"rewards/cosine_len_reward": -0.26807255670428276, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2788.9166717529297, |
|
"epoch": 0.16228571428571428, |
|
"grad_norm": 0.3112829029560089, |
|
"kl": 0.04931640625, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.0558, |
|
"reward": 0.36127352714538574, |
|
"reward_std": 0.25031092017889023, |
|
"rewards/cosine_len_reward": 0.36127352714538574, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3566.25, |
|
"epoch": 0.16285714285714287, |
|
"grad_norm": 0.14432963728904724, |
|
"kl": 0.053955078125, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": -0.0087, |
|
"reward": -0.09366314113140106, |
|
"reward_std": 0.4134976416826248, |
|
"rewards/cosine_len_reward": -0.09366314113140106, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3367.0, |
|
"epoch": 0.16342857142857142, |
|
"grad_norm": 0.13774707913398743, |
|
"kl": 0.037109375, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": -0.0435, |
|
"reward": 0.15960774943232536, |
|
"reward_std": 0.44187677651643753, |
|
"rewards/cosine_len_reward": 0.15960774943232536, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3458.6250610351562, |
|
"epoch": 0.164, |
|
"grad_norm": 0.1523509919643402, |
|
"kl": 0.0465087890625, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.0327, |
|
"reward": -0.1596720740199089, |
|
"reward_std": 0.29411060363054276, |
|
"rewards/cosine_len_reward": -0.1596720740199089, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3469.75, |
|
"epoch": 0.16457142857142856, |
|
"grad_norm": 0.17270562052726746, |
|
"kl": 0.05035400390625, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.07, |
|
"reward": -0.7143011689186096, |
|
"reward_std": 0.25607092306017876, |
|
"rewards/cosine_len_reward": -0.7143011689186096, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3468.0416870117188, |
|
"epoch": 0.16514285714285715, |
|
"grad_norm": 0.19703444838523865, |
|
"kl": 0.0517578125, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": -0.0199, |
|
"reward": -0.009370148181915283, |
|
"reward_std": 0.26325560361146927, |
|
"rewards/cosine_len_reward": -0.009370148181915283, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3348.5000610351562, |
|
"epoch": 0.1657142857142857, |
|
"grad_norm": 0.26584550738334656, |
|
"kl": 0.106689453125, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": -0.1073, |
|
"reward": 0.4936791881918907, |
|
"reward_std": 0.5171967372298241, |
|
"rewards/cosine_len_reward": 0.4936791881918907, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3520.2916870117188, |
|
"epoch": 0.1662857142857143, |
|
"grad_norm": 0.160246804356575, |
|
"kl": 0.04925537109375, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.019, |
|
"reward": 0.06787654012441635, |
|
"reward_std": 0.4259056970477104, |
|
"rewards/cosine_len_reward": 0.06787654012441635, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3394.7083740234375, |
|
"epoch": 0.16685714285714287, |
|
"grad_norm": 0.16499929130077362, |
|
"kl": 0.0457763671875, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.0067, |
|
"reward": -0.37206324748694897, |
|
"reward_std": 0.6219586506485939, |
|
"rewards/cosine_len_reward": -0.37206324748694897, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3028.2500610351562, |
|
"epoch": 0.16742857142857143, |
|
"grad_norm": 0.1728057563304901, |
|
"kl": 0.05438232421875, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": -0.0489, |
|
"reward": -0.24254203587770462, |
|
"reward_std": 0.5258737653493881, |
|
"rewards/cosine_len_reward": -0.24254203587770462, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3453.7500610351562, |
|
"epoch": 0.168, |
|
"grad_norm": 0.16823387145996094, |
|
"kl": 0.05694580078125, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": -0.0096, |
|
"reward": -0.27283355966210365, |
|
"reward_std": 0.4063083231449127, |
|
"rewards/cosine_len_reward": -0.27283355966210365, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3561.1666870117188, |
|
"epoch": 0.16857142857142857, |
|
"grad_norm": 0.14513014256954193, |
|
"kl": 0.0462646484375, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": -0.0056, |
|
"reward": 0.28293178975582123, |
|
"reward_std": 0.30718712508678436, |
|
"rewards/cosine_len_reward": 0.28293178975582123, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3301.2916870117188, |
|
"epoch": 0.16914285714285715, |
|
"grad_norm": 0.18542294204235077, |
|
"kl": 0.07574462890625, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0151, |
|
"reward": -0.4336223527789116, |
|
"reward_std": 0.4073316380381584, |
|
"rewards/cosine_len_reward": -0.4336223527789116, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3456.4583740234375, |
|
"epoch": 0.1697142857142857, |
|
"grad_norm": 0.1608293056488037, |
|
"kl": 0.04888916015625, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0333, |
|
"reward": 0.384184792637825, |
|
"reward_std": 0.2665172927081585, |
|
"rewards/cosine_len_reward": 0.384184792637825, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3206.0416870117188, |
|
"epoch": 0.1702857142857143, |
|
"grad_norm": 0.19832450151443481, |
|
"kl": 0.07696533203125, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0752, |
|
"reward": -0.399520443752408, |
|
"reward_std": 0.32169621996581554, |
|
"rewards/cosine_len_reward": -0.399520443752408, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3512.625, |
|
"epoch": 0.17085714285714285, |
|
"grad_norm": 0.21216793358325958, |
|
"kl": 0.06488037109375, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.0376, |
|
"reward": 0.398956298828125, |
|
"reward_std": 0.3917335644364357, |
|
"rewards/cosine_len_reward": 0.398956298828125, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3413.25, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.16395436227321625, |
|
"kl": 0.053955078125, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": -0.015, |
|
"reward": -0.36241818219423294, |
|
"reward_std": 0.4449329450726509, |
|
"rewards/cosine_len_reward": -0.36241818219423294, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3202.125, |
|
"epoch": 0.172, |
|
"grad_norm": 0.15313901007175446, |
|
"kl": 0.04730224609375, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0129, |
|
"reward": -0.2640495439991355, |
|
"reward_std": 0.47315043210983276, |
|
"rewards/cosine_len_reward": -0.2640495439991355, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2909.0833435058594, |
|
"epoch": 0.17257142857142857, |
|
"grad_norm": 27.085668563842773, |
|
"kl": 21.65618896484375, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.1262, |
|
"reward": 0.044873252511024475, |
|
"reward_std": 0.23454123549163342, |
|
"rewards/cosine_len_reward": 0.044873252511024475, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3122.625, |
|
"epoch": 0.17314285714285715, |
|
"grad_norm": 0.1699696183204651, |
|
"kl": 0.043304443359375, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": -0.0568, |
|
"reward": -0.30118887685239315, |
|
"reward_std": 0.4637632668018341, |
|
"rewards/cosine_len_reward": -0.30118887685239315, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1737142857142857, |
|
"grad_norm": 0.19118285179138184, |
|
"kl": 0.04522705078125, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": 0.0002, |
|
"reward": -0.37123019248247147, |
|
"reward_std": 0.4171289950609207, |
|
"rewards/cosine_len_reward": -0.37123019248247147, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3526.9583740234375, |
|
"epoch": 0.1742857142857143, |
|
"grad_norm": 0.14197200536727905, |
|
"kl": 0.0484619140625, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0266, |
|
"reward": -0.04980198014527559, |
|
"reward_std": 0.5029722154140472, |
|
"rewards/cosine_len_reward": -0.04980198014527559, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2224.3750228881836, |
|
"epoch": 0.17485714285714285, |
|
"grad_norm": 0.5903677940368652, |
|
"kl": 0.15130615234375, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.2378, |
|
"reward": 0.26856766641139984, |
|
"reward_std": 0.42505303025245667, |
|
"rewards/cosine_len_reward": 0.26856766641139984, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.17542857142857143, |
|
"grad_norm": 0.1757889688014984, |
|
"kl": 0.0382080078125, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0002, |
|
"reward": -0.4401230961084366, |
|
"reward_std": 0.23055214434862137, |
|
"rewards/cosine_len_reward": -0.4401230961084366, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2553.7083740234375, |
|
"epoch": 0.176, |
|
"grad_norm": 0.2097814977169037, |
|
"kl": 0.0811767578125, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": -0.1351, |
|
"reward": -0.36412402987480164, |
|
"reward_std": 0.49435050785541534, |
|
"rewards/cosine_len_reward": -0.36412402987480164, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3548.8333740234375, |
|
"epoch": 0.17657142857142857, |
|
"grad_norm": 0.16092661023139954, |
|
"kl": 0.05072021484375, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": -0.0036, |
|
"reward": -0.14657190442085266, |
|
"reward_std": 0.38866011798381805, |
|
"rewards/cosine_len_reward": -0.14657190442085266, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.17714285714285713, |
|
"grad_norm": 0.13816198706626892, |
|
"kl": 0.03985595703125, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0002, |
|
"reward": -0.41758012771606445, |
|
"reward_std": 0.37766269221901894, |
|
"rewards/cosine_len_reward": -0.41758012771606445, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3376.0833740234375, |
|
"epoch": 0.1777142857142857, |
|
"grad_norm": 0.18006065487861633, |
|
"kl": 0.05718994140625, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": -0.0754, |
|
"reward": 0.1647863369435072, |
|
"reward_std": 0.48134757578372955, |
|
"rewards/cosine_len_reward": 0.1647863369435072, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3477.916748046875, |
|
"epoch": 0.1782857142857143, |
|
"grad_norm": 0.17141719162464142, |
|
"kl": 0.0614013671875, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": -0.0411, |
|
"reward": -0.01686130464076996, |
|
"reward_std": 0.40112806484103203, |
|
"rewards/cosine_len_reward": -0.01686130464076996, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2756.375, |
|
"epoch": 0.17885714285714285, |
|
"grad_norm": 0.31155240535736084, |
|
"kl": 0.050994873046875, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": 0.0399, |
|
"reward": 0.356310099363327, |
|
"reward_std": 0.22038070112466812, |
|
"rewards/cosine_len_reward": 0.356310099363327, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3327.0833740234375, |
|
"epoch": 0.17942857142857144, |
|
"grad_norm": 0.15625828504562378, |
|
"kl": 0.0460205078125, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": -0.0522, |
|
"reward": -0.1617246214300394, |
|
"reward_std": 0.2959202714264393, |
|
"rewards/cosine_len_reward": -0.1617246214300394, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3233.1666870117188, |
|
"epoch": 0.18, |
|
"grad_norm": 0.21146373450756073, |
|
"kl": 0.05609130859375, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": -0.0672, |
|
"reward": -0.5065985713154078, |
|
"reward_std": 0.29883245564997196, |
|
"rewards/cosine_len_reward": -0.5065985713154078, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.18057142857142858, |
|
"grad_norm": 0.16894738376140594, |
|
"kl": 0.05133056640625, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0002, |
|
"reward": -0.08826442807912827, |
|
"reward_std": 0.45449625700712204, |
|
"rewards/cosine_len_reward": -0.08826442807912827, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3471.666748046875, |
|
"epoch": 0.18114285714285713, |
|
"grad_norm": 0.16621477901935577, |
|
"kl": 0.05419921875, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": -0.0353, |
|
"reward": -0.05168744921684265, |
|
"reward_std": 0.4507448337972164, |
|
"rewards/cosine_len_reward": -0.05168744921684265, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3045.0416870117188, |
|
"epoch": 0.18171428571428572, |
|
"grad_norm": 0.3005415201187134, |
|
"kl": 0.05413818359375, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": 0.081, |
|
"reward": -0.2615435868501663, |
|
"reward_std": 0.42054056003689766, |
|
"rewards/cosine_len_reward": -0.2615435868501663, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3559.5416870117188, |
|
"epoch": 0.18228571428571427, |
|
"grad_norm": 0.15960033237934113, |
|
"kl": 0.04742431640625, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.0034, |
|
"reward": -0.23648555018007755, |
|
"reward_std": 0.5849247425794601, |
|
"rewards/cosine_len_reward": -0.23648555018007755, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3215.4583740234375, |
|
"epoch": 0.18285714285714286, |
|
"grad_norm": 0.20692546665668488, |
|
"kl": 0.044647216796875, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": -0.057, |
|
"reward": -0.3042381815612316, |
|
"reward_std": 0.6242840066552162, |
|
"rewards/cosine_len_reward": -0.3042381815612316, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.18342857142857144, |
|
"grad_norm": 0.16232453286647797, |
|
"kl": 0.05303955078125, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05929779075086117, |
|
"reward_std": 0.25769692938774824, |
|
"rewards/cosine_len_reward": 0.05929779075086117, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3570.0416870117188, |
|
"epoch": 0.184, |
|
"grad_norm": 0.14924933016300201, |
|
"kl": 0.043487548828125, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0055, |
|
"reward": -0.3419235274195671, |
|
"reward_std": 0.47414593398571014, |
|
"rewards/cosine_len_reward": -0.3419235274195671, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.18457142857142858, |
|
"grad_norm": 0.15801897644996643, |
|
"kl": 0.0462646484375, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0002, |
|
"reward": -0.17604749649763107, |
|
"reward_std": 0.455319419503212, |
|
"rewards/cosine_len_reward": -0.17604749649763107, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.18514285714285714, |
|
"grad_norm": 0.1491517424583435, |
|
"kl": 0.0474853515625, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.0002, |
|
"reward": -0.2877863794565201, |
|
"reward_std": 0.37373417615890503, |
|
"rewards/cosine_len_reward": -0.2877863794565201, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2878.7083435058594, |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.4391452670097351, |
|
"kl": 0.04974365234375, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": -0.1851, |
|
"reward": 0.12535587698221207, |
|
"reward_std": 0.30845265835523605, |
|
"rewards/cosine_len_reward": 0.12535587698221207, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2754.0416870117188, |
|
"epoch": 0.18628571428571428, |
|
"grad_norm": 0.35371315479278564, |
|
"kl": 0.05859375, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0695, |
|
"reward": 0.3398248925805092, |
|
"reward_std": 0.35160839185118675, |
|
"rewards/cosine_len_reward": 0.3398248925805092, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3322.4166870117188, |
|
"epoch": 0.18685714285714286, |
|
"grad_norm": 0.17805320024490356, |
|
"kl": 0.04644775390625, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.0978, |
|
"reward": -0.2754373177886009, |
|
"reward_std": 0.3834121897816658, |
|
"rewards/cosine_len_reward": -0.2754373177886009, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3382.7083740234375, |
|
"epoch": 0.18742857142857142, |
|
"grad_norm": 0.2152082622051239, |
|
"kl": 0.0640869140625, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.0142, |
|
"reward": -0.6704437732696533, |
|
"reward_std": 0.29631161503493786, |
|
"rewards/cosine_len_reward": -0.6704437732696533, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3507.0, |
|
"epoch": 0.188, |
|
"grad_norm": 0.15424416959285736, |
|
"kl": 0.0408935546875, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": -0.0165, |
|
"reward": -0.30218280851840973, |
|
"reward_std": 0.45681217312812805, |
|
"rewards/cosine_len_reward": -0.30218280851840973, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3086.7916870117188, |
|
"epoch": 0.18857142857142858, |
|
"grad_norm": 0.15681946277618408, |
|
"kl": 0.04803466796875, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": -0.0145, |
|
"reward": -0.3932885080575943, |
|
"reward_std": 0.38081324100494385, |
|
"rewards/cosine_len_reward": -0.3932885080575943, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2938.0833435058594, |
|
"epoch": 0.18914285714285714, |
|
"grad_norm": 0.20311184227466583, |
|
"kl": 0.0517578125, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0564, |
|
"reward": -0.21995433419942856, |
|
"reward_std": 0.4448351748287678, |
|
"rewards/cosine_len_reward": -0.21995433419942856, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3013.5833740234375, |
|
"epoch": 0.18971428571428572, |
|
"grad_norm": 0.19754794239997864, |
|
"kl": 0.042724609375, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": -0.1519, |
|
"reward": 0.14706944674253464, |
|
"reward_std": 0.4487891271710396, |
|
"rewards/cosine_len_reward": 0.14706944674253464, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3266.75, |
|
"epoch": 0.19028571428571428, |
|
"grad_norm": 0.17275254428386688, |
|
"kl": 0.05133056640625, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": 0.0115, |
|
"reward": -0.12366479635238647, |
|
"reward_std": 0.4474521279335022, |
|
"rewards/cosine_len_reward": -0.12366479635238647, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.19085714285714286, |
|
"grad_norm": 0.15105217695236206, |
|
"kl": 0.04327392578125, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.0002, |
|
"reward": -0.2032340094447136, |
|
"reward_std": 0.3833343982696533, |
|
"rewards/cosine_len_reward": -0.2032340094447136, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3215.0000610351562, |
|
"epoch": 0.19142857142857142, |
|
"grad_norm": 0.20102421939373016, |
|
"kl": 0.0518798828125, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0936, |
|
"reward": 0.031060203909873962, |
|
"reward_std": 0.5999341458082199, |
|
"rewards/cosine_len_reward": 0.031060203909873962, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3227.0000610351562, |
|
"epoch": 0.192, |
|
"grad_norm": 0.166560098528862, |
|
"kl": 0.04205322265625, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": -0.0589, |
|
"reward": -0.11390832741744816, |
|
"reward_std": 0.5906789004802704, |
|
"rewards/cosine_len_reward": -0.11390832741744816, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3075.7916870117188, |
|
"epoch": 0.19257142857142856, |
|
"grad_norm": 0.15513859689235687, |
|
"kl": 0.045654296875, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": -0.0208, |
|
"reward": -0.22384709864854813, |
|
"reward_std": 0.3124929741024971, |
|
"rewards/cosine_len_reward": -0.22384709864854813, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2809.666717529297, |
|
"epoch": 0.19314285714285714, |
|
"grad_norm": 0.2667958736419678, |
|
"kl": 0.0615234375, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0492, |
|
"reward": -0.3067406304180622, |
|
"reward_std": 0.43594905734062195, |
|
"rewards/cosine_len_reward": -0.3067406304180622, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.19371428571428573, |
|
"grad_norm": 0.14318394660949707, |
|
"kl": 0.0484619140625, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0002, |
|
"reward": -0.3350971192121506, |
|
"reward_std": 0.31668924540281296, |
|
"rewards/cosine_len_reward": -0.3350971192121506, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2548.875030517578, |
|
"epoch": 0.19428571428571428, |
|
"grad_norm": 0.26427459716796875, |
|
"kl": 0.04827880859375, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.0543, |
|
"reward": 0.14779387041926384, |
|
"reward_std": 0.6089051365852356, |
|
"rewards/cosine_len_reward": 0.14779387041926384, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2463.750030517578, |
|
"epoch": 0.19485714285714287, |
|
"grad_norm": 0.2228541374206543, |
|
"kl": 0.0516357421875, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": -0.0836, |
|
"reward": -0.48010372975841165, |
|
"reward_std": 0.2938494123518467, |
|
"rewards/cosine_len_reward": -0.48010372975841165, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3194.5000610351562, |
|
"epoch": 0.19542857142857142, |
|
"grad_norm": 0.1965860277414322, |
|
"kl": 0.05621337890625, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": -0.0477, |
|
"reward": -0.40905338898301125, |
|
"reward_std": 0.3798966519534588, |
|
"rewards/cosine_len_reward": -0.40905338898301125, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.6666870117188, |
|
"epoch": 0.196, |
|
"grad_norm": 0.16845592856407166, |
|
"kl": 0.0560302734375, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": -0.0108, |
|
"reward": -0.3629562482237816, |
|
"reward_std": 0.31892314925789833, |
|
"rewards/cosine_len_reward": -0.3629562482237816, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3099.8750610351562, |
|
"epoch": 0.19657142857142856, |
|
"grad_norm": 0.1927381455898285, |
|
"kl": 0.05059814453125, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": -0.016, |
|
"reward": -0.2542693614959717, |
|
"reward_std": 0.4295632019639015, |
|
"rewards/cosine_len_reward": -0.2542693614959717, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2577.9583740234375, |
|
"epoch": 0.19714285714285715, |
|
"grad_norm": 0.22457395493984222, |
|
"kl": 0.0718994140625, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": -0.1906, |
|
"reward": -0.30330940335989, |
|
"reward_std": 0.50710579007864, |
|
"rewards/cosine_len_reward": -0.30330940335989, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.1977142857142857, |
|
"grad_norm": 0.1638084501028061, |
|
"kl": 0.046417236328125, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0002, |
|
"reward": 0.060419052839279175, |
|
"reward_std": 0.3038778752088547, |
|
"rewards/cosine_len_reward": 0.060419052839279175, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3490.2916870117188, |
|
"epoch": 0.1982857142857143, |
|
"grad_norm": 0.14090070128440857, |
|
"kl": 0.0477294921875, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": -0.0544, |
|
"reward": -0.15175998210906982, |
|
"reward_std": 0.4236246980726719, |
|
"rewards/cosine_len_reward": -0.15175998210906982, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3236.5833740234375, |
|
"epoch": 0.19885714285714284, |
|
"grad_norm": 0.17946301400661469, |
|
"kl": 0.0538330078125, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": -0.0636, |
|
"reward": -0.06013108603656292, |
|
"reward_std": 0.5346257090568542, |
|
"rewards/cosine_len_reward": -0.06013108603656292, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3556.9583740234375, |
|
"epoch": 0.19942857142857143, |
|
"grad_norm": 0.1601073294878006, |
|
"kl": 0.04962158203125, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": -0.0029, |
|
"reward": -0.21468165516853333, |
|
"reward_std": 0.5103632658720016, |
|
"rewards/cosine_len_reward": -0.21468165516853333, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2724.125030517578, |
|
"epoch": 0.2, |
|
"grad_norm": 0.2437424212694168, |
|
"kl": 0.05059814453125, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": -0.03, |
|
"reward": 0.16293304320424795, |
|
"reward_std": 0.6034757569432259, |
|
"rewards/cosine_len_reward": 0.16293304320424795, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.20057142857142857, |
|
"grad_norm": 0.15608130395412445, |
|
"kl": 0.05377197265625, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0002, |
|
"reward": -0.49736592173576355, |
|
"reward_std": 0.3650210574269295, |
|
"rewards/cosine_len_reward": -0.49736592173576355, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2824.000030517578, |
|
"epoch": 0.20114285714285715, |
|
"grad_norm": 0.19428031146526337, |
|
"kl": 0.05194091796875, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.2626, |
|
"reward": -0.3008427929598838, |
|
"reward_std": 0.41808854788541794, |
|
"rewards/cosine_len_reward": -0.3008427929598838, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3445.9583740234375, |
|
"epoch": 0.2017142857142857, |
|
"grad_norm": 0.16469155251979828, |
|
"kl": 0.05267333984375, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": -0.0527, |
|
"reward": -0.13214807212352753, |
|
"reward_std": 0.49090687185525894, |
|
"rewards/cosine_len_reward": -0.13214807212352753, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3180.875, |
|
"epoch": 0.2022857142857143, |
|
"grad_norm": 0.1331019401550293, |
|
"kl": 0.0361328125, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": -0.0633, |
|
"reward": -0.04561649262905121, |
|
"reward_std": 0.42332185059785843, |
|
"rewards/cosine_len_reward": -0.04561649262905121, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3065.4583435058594, |
|
"epoch": 0.20285714285714285, |
|
"grad_norm": 0.20506833493709564, |
|
"kl": 0.06689453125, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": -0.0127, |
|
"reward": -0.10782808437943459, |
|
"reward_std": 0.36676693707704544, |
|
"rewards/cosine_len_reward": -0.10782808437943459, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3181.1666870117188, |
|
"epoch": 0.20342857142857143, |
|
"grad_norm": 0.14900951087474823, |
|
"kl": 0.03961181640625, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": -0.0347, |
|
"reward": -0.19400886073708534, |
|
"reward_std": 0.4912572205066681, |
|
"rewards/cosine_len_reward": -0.19400886073708534, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3115.7083740234375, |
|
"epoch": 0.204, |
|
"grad_norm": 0.5810701251029968, |
|
"kl": 0.333984375, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": -0.0253, |
|
"reward": -0.2779521383345127, |
|
"reward_std": 0.48514702171087265, |
|
"rewards/cosine_len_reward": -0.2779521383345127, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3074.9166870117188, |
|
"epoch": 0.20457142857142857, |
|
"grad_norm": 0.21780595183372498, |
|
"kl": 0.0684814453125, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": -0.0599, |
|
"reward": -0.5599739253520966, |
|
"reward_std": 0.34018975496292114, |
|
"rewards/cosine_len_reward": -0.5599739253520966, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3332.5833740234375, |
|
"epoch": 0.20514285714285715, |
|
"grad_norm": 0.17007119953632355, |
|
"kl": 0.05792236328125, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.0268, |
|
"reward": -0.22800956666469574, |
|
"reward_std": 0.48570629209280014, |
|
"rewards/cosine_len_reward": -0.22800956666469574, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2057142857142857, |
|
"grad_norm": 0.15876568853855133, |
|
"kl": 0.0408935546875, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.0002, |
|
"reward": 0.08246973995119333, |
|
"reward_std": 0.6036019250750542, |
|
"rewards/cosine_len_reward": 0.08246973995119333, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3412.2916870117188, |
|
"epoch": 0.2062857142857143, |
|
"grad_norm": 0.14641037583351135, |
|
"kl": 0.047119140625, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": -0.003, |
|
"reward": 0.05859617702662945, |
|
"reward_std": 0.4323427379131317, |
|
"rewards/cosine_len_reward": 0.05859617702662945, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3474.25, |
|
"epoch": 0.20685714285714285, |
|
"grad_norm": 0.1528189331293106, |
|
"kl": 0.04168701171875, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": -0.0358, |
|
"reward": -0.4896080791950226, |
|
"reward_std": 0.37986551597714424, |
|
"rewards/cosine_len_reward": -0.4896080791950226, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3300.0000610351562, |
|
"epoch": 0.20742857142857143, |
|
"grad_norm": 0.20170490443706512, |
|
"kl": 0.0927734375, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": -0.0207, |
|
"reward": 0.16002619452774525, |
|
"reward_std": 0.5198325589299202, |
|
"rewards/cosine_len_reward": 0.16002619452774525, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3578.4166870117188, |
|
"epoch": 0.208, |
|
"grad_norm": 0.17275533080101013, |
|
"kl": 0.04876708984375, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0011, |
|
"reward": -0.2858371250331402, |
|
"reward_std": 0.4636296257376671, |
|
"rewards/cosine_len_reward": -0.2858371250331402, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3389.6666870117188, |
|
"epoch": 0.20857142857142857, |
|
"grad_norm": 0.19140967726707458, |
|
"kl": 0.05511474609375, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": 0.0506, |
|
"reward": -0.24161814898252487, |
|
"reward_std": 0.5207706317305565, |
|
"rewards/cosine_len_reward": -0.24161814898252487, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3495.041748046875, |
|
"epoch": 0.20914285714285713, |
|
"grad_norm": 0.15582628548145294, |
|
"kl": 0.04608154296875, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.024, |
|
"reward": 0.0705061387270689, |
|
"reward_std": 0.29681421583518386, |
|
"rewards/cosine_len_reward": 0.0705061387270689, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2878.250030517578, |
|
"epoch": 0.20971428571428571, |
|
"grad_norm": 0.26657283306121826, |
|
"kl": 0.05078125, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.055, |
|
"reward": -0.19209666550159454, |
|
"reward_std": 0.5073798671364784, |
|
"rewards/cosine_len_reward": -0.19209666550159454, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2952.625, |
|
"epoch": 0.2102857142857143, |
|
"grad_norm": 0.2453869879245758, |
|
"kl": 0.04644775390625, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": -0.0595, |
|
"reward": -0.4701330562820658, |
|
"reward_std": 0.43810437619686127, |
|
"rewards/cosine_len_reward": -0.4701330562820658, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3518.25, |
|
"epoch": 0.21085714285714285, |
|
"grad_norm": 0.17311729490756989, |
|
"kl": 0.04888916015625, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0191, |
|
"reward": -0.45571963116526604, |
|
"reward_std": 0.40335123240947723, |
|
"rewards/cosine_len_reward": -0.45571963116526604, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3502.0, |
|
"epoch": 0.21142857142857144, |
|
"grad_norm": 0.1664201319217682, |
|
"kl": 0.04937744140625, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": 0.0327, |
|
"reward": -0.38726338744163513, |
|
"reward_std": 0.5652562528848648, |
|
"rewards/cosine_len_reward": -0.38726338744163513, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3288.4583740234375, |
|
"epoch": 0.212, |
|
"grad_norm": 0.17351403832435608, |
|
"kl": 0.042816162109375, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.045, |
|
"reward": 0.021196894347667694, |
|
"reward_std": 0.579230286180973, |
|
"rewards/cosine_len_reward": 0.021196894347667694, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.21257142857142858, |
|
"grad_norm": 1.5477689504623413, |
|
"kl": 0.093994140625, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0004, |
|
"reward": -0.0014846324920654297, |
|
"reward_std": 0.4312394857406616, |
|
"rewards/cosine_len_reward": -0.0014846324920654297, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3127.2916870117188, |
|
"epoch": 0.21314285714285713, |
|
"grad_norm": 0.1729487031698227, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": -0.0303, |
|
"reward": -0.3757053539156914, |
|
"reward_std": 0.5460054390132427, |
|
"rewards/cosine_len_reward": -0.3757053539156914, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.21371428571428572, |
|
"grad_norm": 0.18273600935935974, |
|
"kl": 0.0545654296875, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.0002, |
|
"reward": -0.4427555501461029, |
|
"reward_std": 0.4617629870772362, |
|
"rewards/cosine_len_reward": -0.4427555501461029, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3204.4166870117188, |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.26651179790496826, |
|
"kl": 0.09613037109375, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": -0.0441, |
|
"reward": 0.39611528790555894, |
|
"reward_std": 0.3069186918437481, |
|
"rewards/cosine_len_reward": 0.39611528790555894, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.21485714285714286, |
|
"grad_norm": 0.28572776913642883, |
|
"kl": 0.137451171875, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0006, |
|
"reward": -0.3743774890899658, |
|
"reward_std": 0.4345959648489952, |
|
"rewards/cosine_len_reward": -0.3743774890899658, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3067.9583740234375, |
|
"epoch": 0.21542857142857144, |
|
"grad_norm": 0.20133064687252045, |
|
"kl": 0.052734375, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": -0.0541, |
|
"reward": -0.36722417175769806, |
|
"reward_std": 0.46697434037923813, |
|
"rewards/cosine_len_reward": -0.36722417175769806, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3256.8750610351562, |
|
"epoch": 0.216, |
|
"grad_norm": 0.19015897810459137, |
|
"kl": 0.06170654296875, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.0618, |
|
"reward": -0.18510014936327934, |
|
"reward_std": 0.5434570163488388, |
|
"rewards/cosine_len_reward": -0.18510014936327934, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3358.0833740234375, |
|
"epoch": 0.21657142857142858, |
|
"grad_norm": 0.16775275766849518, |
|
"kl": 0.052734375, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": -0.0717, |
|
"reward": -0.20583120733499527, |
|
"reward_std": 0.1991448849439621, |
|
"rewards/cosine_len_reward": -0.20583120733499527, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3357.666748046875, |
|
"epoch": 0.21714285714285714, |
|
"grad_norm": 0.1613640934228897, |
|
"kl": 0.049560546875, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.0323, |
|
"reward": -0.1545943170785904, |
|
"reward_std": 0.5345202684402466, |
|
"rewards/cosine_len_reward": -0.1545943170785904, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3260.0000610351562, |
|
"epoch": 0.21771428571428572, |
|
"grad_norm": 0.1736854612827301, |
|
"kl": 0.0679931640625, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": -0.038, |
|
"reward": -0.4681055396795273, |
|
"reward_std": 0.4248203635215759, |
|
"rewards/cosine_len_reward": -0.4681055396795273, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.21828571428571428, |
|
"grad_norm": 0.18592111766338348, |
|
"kl": 0.0501708984375, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.0002, |
|
"reward": -0.39792611449956894, |
|
"reward_std": 0.32762938365340233, |
|
"rewards/cosine_len_reward": -0.39792611449956894, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3378.3750610351562, |
|
"epoch": 0.21885714285714286, |
|
"grad_norm": 0.22236335277557373, |
|
"kl": 0.070556640625, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": -0.0429, |
|
"reward": -0.10803265869617462, |
|
"reward_std": 0.3375392761081457, |
|
"rewards/cosine_len_reward": -0.10803265869617462, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.21942857142857142, |
|
"grad_norm": 0.15268638730049133, |
|
"kl": 0.0458984375, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": 0.0002, |
|
"reward": 0.11043217405676842, |
|
"reward_std": 0.46337635442614555, |
|
"rewards/cosine_len_reward": 0.11043217405676842, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2849.875045776367, |
|
"epoch": 0.22, |
|
"grad_norm": 0.29308760166168213, |
|
"kl": 0.0633544921875, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": 0.0655, |
|
"reward": -0.3381078392267227, |
|
"reward_std": 0.5556919574737549, |
|
"rewards/cosine_len_reward": -0.3381078392267227, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3408.9166870117188, |
|
"epoch": 0.22057142857142858, |
|
"grad_norm": 0.18074069917201996, |
|
"kl": 0.05487060546875, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": -0.0111, |
|
"reward": -0.31479221396148205, |
|
"reward_std": 0.43774885684251785, |
|
"rewards/cosine_len_reward": -0.31479221396148205, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2028.0000114440918, |
|
"epoch": 0.22114285714285714, |
|
"grad_norm": 0.345003217458725, |
|
"kl": 0.05218505859375, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.0685, |
|
"reward": 0.2636047229170799, |
|
"reward_std": 0.26575359515845776, |
|
"rewards/cosine_len_reward": 0.2636047229170799, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2304.9583740234375, |
|
"epoch": 0.22171428571428572, |
|
"grad_norm": 0.2163887768983841, |
|
"kl": 0.0439453125, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": -0.0088, |
|
"reward": -0.5689196065068245, |
|
"reward_std": 0.40281252190470695, |
|
"rewards/cosine_len_reward": -0.5689196065068245, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1966.8333435058594, |
|
"epoch": 0.22228571428571428, |
|
"grad_norm": 0.23669986426830292, |
|
"kl": 0.0467529296875, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": 0.1076, |
|
"reward": -0.19248197972774506, |
|
"reward_std": 0.2740873768925667, |
|
"rewards/cosine_len_reward": -0.19248197972774506, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3330.8333740234375, |
|
"epoch": 0.22285714285714286, |
|
"grad_norm": 0.19466285407543182, |
|
"kl": 0.05242919921875, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.0745, |
|
"reward": -0.25136643648147583, |
|
"reward_std": 0.5177683755755424, |
|
"rewards/cosine_len_reward": -0.25136643648147583, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.22342857142857142, |
|
"grad_norm": 0.18898797035217285, |
|
"kl": 0.05535888671875, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.0002, |
|
"reward": 0.15870603919029236, |
|
"reward_std": 0.42898403853178024, |
|
"rewards/cosine_len_reward": 0.15870603919029236, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.2916870117188, |
|
"epoch": 0.224, |
|
"grad_norm": 0.165949285030365, |
|
"kl": 0.0509033203125, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": 0.0141, |
|
"reward": -0.4548497349023819, |
|
"reward_std": 0.24412627145648003, |
|
"rewards/cosine_len_reward": -0.4548497349023819, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3505.5416870117188, |
|
"epoch": 0.22457142857142856, |
|
"grad_norm": 0.16940677165985107, |
|
"kl": 0.04827880859375, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": -0.0325, |
|
"reward": -0.019142277538776398, |
|
"reward_std": 0.49893130362033844, |
|
"rewards/cosine_len_reward": -0.019142277538776398, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2805.875, |
|
"epoch": 0.22514285714285714, |
|
"grad_norm": 0.2236856073141098, |
|
"kl": 0.04705810546875, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.0315, |
|
"reward": 0.17413194477558136, |
|
"reward_std": 0.37101365998387337, |
|
"rewards/cosine_len_reward": 0.17413194477558136, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2257142857142857, |
|
"grad_norm": 0.1488247662782669, |
|
"kl": 0.04559326171875, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.0002, |
|
"reward": -0.16203116870019585, |
|
"reward_std": 0.4705766849219799, |
|
"rewards/cosine_len_reward": -0.16203116870019585, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.22628571428571428, |
|
"grad_norm": 0.13519521057605743, |
|
"kl": 0.03631591796875, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0001, |
|
"reward": 0.14726969599723816, |
|
"reward_std": 0.3467975091189146, |
|
"rewards/cosine_len_reward": 0.14726969599723816, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3178.6666870117188, |
|
"epoch": 0.22685714285714287, |
|
"grad_norm": 0.26082149147987366, |
|
"kl": 0.05474853515625, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": -0.0699, |
|
"reward": 0.2510442901402712, |
|
"reward_std": 0.4601794481277466, |
|
"rewards/cosine_len_reward": 0.2510442901402712, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3374.0833740234375, |
|
"epoch": 0.22742857142857142, |
|
"grad_norm": 0.16580724716186523, |
|
"kl": 0.04669189453125, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": -0.0645, |
|
"reward": -0.13105885684490204, |
|
"reward_std": 0.4312235489487648, |
|
"rewards/cosine_len_reward": -0.13105885684490204, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3359.25, |
|
"epoch": 0.228, |
|
"grad_norm": 0.19662907719612122, |
|
"kl": 0.06268310546875, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0101, |
|
"reward": -0.20190414786338806, |
|
"reward_std": 0.47945015504956245, |
|
"rewards/cosine_len_reward": -0.20190414786338806, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2060.4583587646484, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.35796305537223816, |
|
"kl": 0.057525634765625, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": 0.0891, |
|
"reward": -0.038805801421403885, |
|
"reward_std": 0.3311319947242737, |
|
"rewards/cosine_len_reward": -0.038805801421403885, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3128.6666870117188, |
|
"epoch": 0.22914285714285715, |
|
"grad_norm": 0.25352492928504944, |
|
"kl": 0.0992431640625, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": -0.1331, |
|
"reward": 0.3173316791653633, |
|
"reward_std": 0.5900417268276215, |
|
"rewards/cosine_len_reward": 0.3173316791653633, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2864.75, |
|
"epoch": 0.2297142857142857, |
|
"grad_norm": 0.18959416449069977, |
|
"kl": 0.04730224609375, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": -0.011, |
|
"reward": 0.1726340614259243, |
|
"reward_std": 0.4275776147842407, |
|
"rewards/cosine_len_reward": 0.1726340614259243, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2573.000045776367, |
|
"epoch": 0.2302857142857143, |
|
"grad_norm": 0.22201809287071228, |
|
"kl": 0.05474853515625, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": -0.0644, |
|
"reward": 0.020148977637290955, |
|
"reward_std": 0.6248219758272171, |
|
"rewards/cosine_len_reward": 0.020148977637290955, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3304.7083740234375, |
|
"epoch": 0.23085714285714284, |
|
"grad_norm": 0.22055773437023163, |
|
"kl": 0.05517578125, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": 0.0566, |
|
"reward": -0.24377129971981049, |
|
"reward_std": 0.42593977600336075, |
|
"rewards/cosine_len_reward": -0.24377129971981049, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2901.875, |
|
"epoch": 0.23142857142857143, |
|
"grad_norm": 0.25622329115867615, |
|
"kl": 0.0494384765625, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": -0.1542, |
|
"reward": 0.20779071189463139, |
|
"reward_std": 0.4233979359269142, |
|
"rewards/cosine_len_reward": 0.20779071189463139, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3525.3333740234375, |
|
"epoch": 0.232, |
|
"grad_norm": 0.19775983691215515, |
|
"kl": 0.064697265625, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": -0.0078, |
|
"reward": -0.3867212012410164, |
|
"reward_std": 0.3607660289853811, |
|
"rewards/cosine_len_reward": -0.3867212012410164, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3248.125, |
|
"epoch": 0.23257142857142857, |
|
"grad_norm": 0.19077526032924652, |
|
"kl": 0.0391845703125, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0377, |
|
"reward": 0.05661339312791824, |
|
"reward_std": 0.5278025027364492, |
|
"rewards/cosine_len_reward": 0.05661339312791824, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3040.2500610351562, |
|
"epoch": 0.23314285714285715, |
|
"grad_norm": 0.2332611382007599, |
|
"kl": 0.0648193359375, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.0701, |
|
"reward": -0.5832930132746696, |
|
"reward_std": 0.33875271677970886, |
|
"rewards/cosine_len_reward": -0.5832930132746696, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3566.0833740234375, |
|
"epoch": 0.2337142857142857, |
|
"grad_norm": 0.18132203817367554, |
|
"kl": 0.0550537109375, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0092, |
|
"reward": -0.1184474304318428, |
|
"reward_std": 0.6181638091802597, |
|
"rewards/cosine_len_reward": -0.1184474304318428, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2342857142857143, |
|
"grad_norm": 0.15652252733707428, |
|
"kl": 0.0477294921875, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": 0.0002, |
|
"reward": -0.5223753824830055, |
|
"reward_std": 0.3389586843550205, |
|
"rewards/cosine_len_reward": -0.5223753824830055, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.23485714285714285, |
|
"grad_norm": 0.15396270155906677, |
|
"kl": 0.04498291015625, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0002, |
|
"reward": 0.004480212926864624, |
|
"reward_std": 0.32227758690714836, |
|
"rewards/cosine_len_reward": 0.004480212926864624, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.23542857142857143, |
|
"grad_norm": 0.1605028212070465, |
|
"kl": 0.0478515625, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": 0.0002, |
|
"reward": -0.28013300243765116, |
|
"reward_std": 0.4526229053735733, |
|
"rewards/cosine_len_reward": -0.28013300243765116, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3207.0833740234375, |
|
"epoch": 0.236, |
|
"grad_norm": 0.1472114473581314, |
|
"kl": 0.04119873046875, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": -0.022, |
|
"reward": -0.6025111824274063, |
|
"reward_std": 0.3982557747513056, |
|
"rewards/cosine_len_reward": -0.6025111824274063, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2968.875, |
|
"epoch": 0.23657142857142857, |
|
"grad_norm": 0.279211163520813, |
|
"kl": 0.0579833984375, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": -0.1383, |
|
"reward": 0.30589140206575394, |
|
"reward_std": 0.3166845068335533, |
|
"rewards/cosine_len_reward": 0.30589140206575394, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3109.1666870117188, |
|
"epoch": 0.23714285714285716, |
|
"grad_norm": 0.22959719598293304, |
|
"kl": 0.04766845703125, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": -0.1332, |
|
"reward": -0.19372307881712914, |
|
"reward_std": 0.3859623149037361, |
|
"rewards/cosine_len_reward": -0.19372307881712914, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3371.1666870117188, |
|
"epoch": 0.2377142857142857, |
|
"grad_norm": 0.1638830453157425, |
|
"kl": 0.04876708984375, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": -0.0842, |
|
"reward": 0.24745731800794601, |
|
"reward_std": 0.33830365166068077, |
|
"rewards/cosine_len_reward": 0.24745731800794601, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2687.7917098999023, |
|
"epoch": 0.2382857142857143, |
|
"grad_norm": 0.25798413157463074, |
|
"kl": 0.04632568359375, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0659, |
|
"reward": 0.22542408853769302, |
|
"reward_std": 0.3038127450272441, |
|
"rewards/cosine_len_reward": 0.22542408853769302, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2884.8333740234375, |
|
"epoch": 0.23885714285714285, |
|
"grad_norm": 0.4755244255065918, |
|
"kl": 0.090576171875, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": -0.1926, |
|
"reward": -0.580606535077095, |
|
"reward_std": 0.5118565671145916, |
|
"rewards/cosine_len_reward": -0.580606535077095, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3318.2916870117188, |
|
"epoch": 0.23942857142857144, |
|
"grad_norm": 0.202090322971344, |
|
"kl": 0.04998779296875, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": -0.0433, |
|
"reward": 0.20114058069884777, |
|
"reward_std": 0.3687387742102146, |
|
"rewards/cosine_len_reward": 0.20114058069884777, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2232.416717529297, |
|
"epoch": 0.24, |
|
"grad_norm": 0.2654793858528137, |
|
"kl": 0.0609130859375, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.0579, |
|
"reward": -0.08468323387205601, |
|
"reward_std": 0.60483318567276, |
|
"rewards/cosine_len_reward": -0.08468323387205601, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3315.125, |
|
"epoch": 0.24057142857142857, |
|
"grad_norm": 0.18679861724376678, |
|
"kl": 0.06512451171875, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0097, |
|
"reward": -0.22448206320405006, |
|
"reward_std": 0.5287001729011536, |
|
"rewards/cosine_len_reward": -0.22448206320405006, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24114285714285713, |
|
"grad_norm": 0.14611122012138367, |
|
"kl": 0.04083251953125, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0002, |
|
"reward": -0.1785850077867508, |
|
"reward_std": 0.42264287918806076, |
|
"rewards/cosine_len_reward": -0.1785850077867508, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2946.4583435058594, |
|
"epoch": 0.24171428571428571, |
|
"grad_norm": 0.202953040599823, |
|
"kl": 0.0562744140625, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0495, |
|
"reward": -0.3013099692761898, |
|
"reward_std": 0.47750524431467056, |
|
"rewards/cosine_len_reward": -0.3013099692761898, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3452.375, |
|
"epoch": 0.2422857142857143, |
|
"grad_norm": 0.2135123759508133, |
|
"kl": 0.04937744140625, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0447, |
|
"reward": -0.1179568525403738, |
|
"reward_std": 0.5295333191752434, |
|
"rewards/cosine_len_reward": -0.1179568525403738, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3552.375, |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 0.1585531085729599, |
|
"kl": 0.05169677734375, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": -0.0117, |
|
"reward": -0.48371345549821854, |
|
"reward_std": 0.2882954329252243, |
|
"rewards/cosine_len_reward": -0.48371345549821854, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3492.0416870117188, |
|
"epoch": 0.24342857142857144, |
|
"grad_norm": 0.16782714426517487, |
|
"kl": 0.04248046875, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0336, |
|
"reward": -0.3401487283408642, |
|
"reward_std": 0.548667848110199, |
|
"rewards/cosine_len_reward": -0.3401487283408642, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3474.9166870117188, |
|
"epoch": 0.244, |
|
"grad_norm": 0.17093884944915771, |
|
"kl": 0.04315185546875, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": -0.0182, |
|
"reward": 0.2449711412191391, |
|
"reward_std": 0.5114858150482178, |
|
"rewards/cosine_len_reward": 0.2449711412191391, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3381.041748046875, |
|
"epoch": 0.24457142857142858, |
|
"grad_norm": 0.18249650299549103, |
|
"kl": 0.052978515625, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": -0.0526, |
|
"reward": -0.26586161740124226, |
|
"reward_std": 0.5688638612627983, |
|
"rewards/cosine_len_reward": -0.26586161740124226, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3452.7083740234375, |
|
"epoch": 0.24514285714285713, |
|
"grad_norm": 0.1674465388059616, |
|
"kl": 0.05694580078125, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0578, |
|
"reward": -0.3403562903404236, |
|
"reward_std": 0.3575546946376562, |
|
"rewards/cosine_len_reward": -0.3403562903404236, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24571428571428572, |
|
"grad_norm": 0.2020343542098999, |
|
"kl": 0.05108642578125, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0002, |
|
"reward": -0.3109110891819, |
|
"reward_std": 0.4139351099729538, |
|
"rewards/cosine_len_reward": -0.3109110891819, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.24628571428571427, |
|
"grad_norm": 0.143864706158638, |
|
"kl": 0.05267333984375, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0002, |
|
"reward": -0.1861107312142849, |
|
"reward_std": 0.2637110576033592, |
|
"rewards/cosine_len_reward": -0.1861107312142849, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2981.25, |
|
"epoch": 0.24685714285714286, |
|
"grad_norm": 0.165711909532547, |
|
"kl": 0.04681396484375, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0404, |
|
"reward": 0.12767831981182098, |
|
"reward_std": 0.3957044407725334, |
|
"rewards/cosine_len_reward": 0.12767831981182098, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2924.9166870117188, |
|
"epoch": 0.24742857142857144, |
|
"grad_norm": 0.17230506241321564, |
|
"kl": 0.04913330078125, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": -0.0391, |
|
"reward": 0.0737846726551652, |
|
"reward_std": 0.7179519534111023, |
|
"rewards/cosine_len_reward": 0.0737846726551652, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.248, |
|
"grad_norm": 0.16101869940757751, |
|
"kl": 0.0555419921875, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1376335695385933, |
|
"reward_std": 0.39823680371046066, |
|
"rewards/cosine_len_reward": 0.1376335695385933, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3123.3333435058594, |
|
"epoch": 0.24857142857142858, |
|
"grad_norm": 0.15367208421230316, |
|
"kl": 0.041259765625, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": -0.0384, |
|
"reward": 0.09720689244568348, |
|
"reward_std": 0.44290298968553543, |
|
"rewards/cosine_len_reward": 0.09720689244568348, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3015.2916870117188, |
|
"epoch": 0.24914285714285714, |
|
"grad_norm": 0.2917621433734894, |
|
"kl": 0.0430908203125, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0654, |
|
"reward": -0.12698917463421822, |
|
"reward_std": 0.37568875774741173, |
|
"rewards/cosine_len_reward": -0.12698917463421822, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2433.7501220703125, |
|
"epoch": 0.24971428571428572, |
|
"grad_norm": 0.46212059259414673, |
|
"kl": 0.04766845703125, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.2189, |
|
"reward": -0.24191192165017128, |
|
"reward_std": 0.44904495403170586, |
|
"rewards/cosine_len_reward": -0.24191192165017128, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3474.6666870117188, |
|
"epoch": 0.2502857142857143, |
|
"grad_norm": 0.1538762003183365, |
|
"kl": 0.0439453125, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": -0.0377, |
|
"reward": -0.14186367020010948, |
|
"reward_std": 0.39842598885297775, |
|
"rewards/cosine_len_reward": -0.14186367020010948, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3523.875, |
|
"epoch": 0.25085714285714283, |
|
"grad_norm": 0.15350596606731415, |
|
"kl": 0.0535888671875, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.013, |
|
"reward": -0.34710339456796646, |
|
"reward_std": 0.28258360363543034, |
|
"rewards/cosine_len_reward": -0.34710339456796646, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3099.8334045410156, |
|
"epoch": 0.25142857142857145, |
|
"grad_norm": 0.1954686939716339, |
|
"kl": 0.06451416015625, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": 0.0499, |
|
"reward": -0.3662155866622925, |
|
"reward_std": 0.3874107152223587, |
|
"rewards/cosine_len_reward": -0.3662155866622925, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.252, |
|
"grad_norm": 0.15524186193943024, |
|
"kl": 0.0484619140625, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0002, |
|
"reward": -0.24688275158405304, |
|
"reward_std": 0.4402740001678467, |
|
"rewards/cosine_len_reward": -0.24688275158405304, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2872.0416870117188, |
|
"epoch": 0.25257142857142856, |
|
"grad_norm": 0.41410180926322937, |
|
"kl": 0.04583740234375, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": -0.1527, |
|
"reward": -0.30389176309108734, |
|
"reward_std": 0.35049962252378464, |
|
"rewards/cosine_len_reward": -0.30389176309108734, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.25314285714285717, |
|
"grad_norm": 0.13994863629341125, |
|
"kl": 0.0460205078125, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.0002, |
|
"reward": -0.45876485109329224, |
|
"reward_std": 0.27763616293668747, |
|
"rewards/cosine_len_reward": -0.45876485109329224, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2537142857142857, |
|
"grad_norm": 0.13927753269672394, |
|
"kl": 0.0404052734375, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.0002, |
|
"reward": -0.5448080375790596, |
|
"reward_std": 0.423710398375988, |
|
"rewards/cosine_len_reward": -0.5448080375790596, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3210.5833740234375, |
|
"epoch": 0.2542857142857143, |
|
"grad_norm": 0.16514094173908234, |
|
"kl": 0.050933837890625, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": -0.0006, |
|
"reward": 0.08599076699465513, |
|
"reward_std": 0.44797009229660034, |
|
"rewards/cosine_len_reward": 0.08599076699465513, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2722.500030517578, |
|
"epoch": 0.25485714285714284, |
|
"grad_norm": 0.18482115864753723, |
|
"kl": 0.0404052734375, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.1305, |
|
"reward": -0.2829571203328669, |
|
"reward_std": 0.4319111257791519, |
|
"rewards/cosine_len_reward": -0.2829571203328669, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2910.4583435058594, |
|
"epoch": 0.25542857142857145, |
|
"grad_norm": 0.2328081727027893, |
|
"kl": 0.0455322265625, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": -0.0287, |
|
"reward": -0.2549123764038086, |
|
"reward_std": 0.4166012778878212, |
|
"rewards/cosine_len_reward": -0.2549123764038086, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3550.5833740234375, |
|
"epoch": 0.256, |
|
"grad_norm": 0.15128394961357117, |
|
"kl": 0.04803466796875, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.0127, |
|
"reward": 0.059171320259338245, |
|
"reward_std": 0.4689246341586113, |
|
"rewards/cosine_len_reward": 0.059171320259338245, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3580.125, |
|
"epoch": 0.25657142857142856, |
|
"grad_norm": 0.13547244668006897, |
|
"kl": 0.03497314453125, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": -0.0015, |
|
"reward": 0.269029151648283, |
|
"reward_std": 0.42868663370609283, |
|
"rewards/cosine_len_reward": 0.269029151648283, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2534.791748046875, |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.21846917271614075, |
|
"kl": 0.042938232421875, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.0232, |
|
"reward": -0.10483485460281372, |
|
"reward_std": 0.653083398938179, |
|
"rewards/cosine_len_reward": -0.10483485460281372, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3452.7083740234375, |
|
"epoch": 0.25771428571428573, |
|
"grad_norm": 0.19205856323242188, |
|
"kl": 0.0594482421875, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0789, |
|
"reward": -0.721703439950943, |
|
"reward_std": 0.2940813582390547, |
|
"rewards/cosine_len_reward": -0.721703439950943, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2950.625030517578, |
|
"epoch": 0.2582857142857143, |
|
"grad_norm": 0.20902574062347412, |
|
"kl": 0.04718017578125, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.1, |
|
"reward": -0.05610589450225234, |
|
"reward_std": 0.5669073164463043, |
|
"rewards/cosine_len_reward": -0.05610589450225234, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2547.6666717529297, |
|
"epoch": 0.25885714285714284, |
|
"grad_norm": 0.31278106570243835, |
|
"kl": 0.0537109375, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.0898, |
|
"reward": 0.06469499785453081, |
|
"reward_std": 0.386950358748436, |
|
"rewards/cosine_len_reward": 0.06469499785453081, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3400.6666870117188, |
|
"epoch": 0.25942857142857145, |
|
"grad_norm": 0.26481106877326965, |
|
"kl": 0.12164306640625, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": -0.0344, |
|
"reward": -0.10013716202229261, |
|
"reward_std": 0.6173229813575745, |
|
"rewards/cosine_len_reward": -0.10013716202229261, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3558.375, |
|
"epoch": 0.26, |
|
"grad_norm": 0.16176006197929382, |
|
"kl": 0.041015625, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0146, |
|
"reward": -0.06885599717497826, |
|
"reward_std": 0.5056197047233582, |
|
"rewards/cosine_len_reward": -0.06885599717497826, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.26057142857142856, |
|
"grad_norm": 0.15790623426437378, |
|
"kl": 0.052734375, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1165997963398695, |
|
"reward_std": 0.4357607886195183, |
|
"rewards/cosine_len_reward": 0.1165997963398695, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2898.375, |
|
"epoch": 0.2611428571428571, |
|
"grad_norm": 0.2058410793542862, |
|
"kl": 0.06005859375, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.0056, |
|
"reward": -0.03989528864622116, |
|
"reward_std": 0.46769294142723083, |
|
"rewards/cosine_len_reward": -0.03989528864622116, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3425.5, |
|
"epoch": 0.26171428571428573, |
|
"grad_norm": 0.18089525401592255, |
|
"kl": 0.051483154296875, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": -0.0597, |
|
"reward": -0.03286702465265989, |
|
"reward_std": 0.5668174773454666, |
|
"rewards/cosine_len_reward": -0.03286702465265989, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2622857142857143, |
|
"grad_norm": 0.1464470624923706, |
|
"kl": 0.04827880859375, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.0002, |
|
"reward": -0.1251504085958004, |
|
"reward_std": 0.2749813590198755, |
|
"rewards/cosine_len_reward": -0.1251504085958004, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3265.3333740234375, |
|
"epoch": 0.26285714285714284, |
|
"grad_norm": 0.2260807752609253, |
|
"kl": 0.05755615234375, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.002, |
|
"reward": 0.15173575282096863, |
|
"reward_std": 0.4503812901675701, |
|
"rewards/cosine_len_reward": 0.15173575282096863, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2819.500030517578, |
|
"epoch": 0.2634285714285714, |
|
"grad_norm": 0.2082054316997528, |
|
"kl": 0.054931640625, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0659, |
|
"reward": 0.03663429245352745, |
|
"reward_std": 0.38886918127536774, |
|
"rewards/cosine_len_reward": 0.03663429245352745, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3554.375, |
|
"epoch": 0.264, |
|
"grad_norm": 0.13131263852119446, |
|
"kl": 0.03497314453125, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.0134, |
|
"reward": 0.02211976982653141, |
|
"reward_std": 0.3899141475558281, |
|
"rewards/cosine_len_reward": 0.02211976982653141, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.26457142857142857, |
|
"grad_norm": 0.17710186541080475, |
|
"kl": 0.04364013671875, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.0002, |
|
"reward": -0.16693101823329926, |
|
"reward_std": 0.42890677601099014, |
|
"rewards/cosine_len_reward": -0.16693101823329926, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3220.0833740234375, |
|
"epoch": 0.2651428571428571, |
|
"grad_norm": 0.22023098170757294, |
|
"kl": 0.0413818359375, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": -0.0812, |
|
"reward": -0.09500008448958397, |
|
"reward_std": 0.38959069550037384, |
|
"rewards/cosine_len_reward": -0.09500008448958397, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.26571428571428574, |
|
"grad_norm": 0.15240216255187988, |
|
"kl": 0.0389404296875, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.0002, |
|
"reward": 0.16528711840510368, |
|
"reward_std": 0.5445774495601654, |
|
"rewards/cosine_len_reward": 0.16528711840510368, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3150.625, |
|
"epoch": 0.2662857142857143, |
|
"grad_norm": 0.16584616899490356, |
|
"kl": 0.04217529296875, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": -0.0149, |
|
"reward": -0.257739894092083, |
|
"reward_std": 0.25873103737831116, |
|
"rewards/cosine_len_reward": -0.257739894092083, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3531.7916870117188, |
|
"epoch": 0.26685714285714285, |
|
"grad_norm": 0.15734802186489105, |
|
"kl": 0.043212890625, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.0251, |
|
"reward": 0.20713631808757782, |
|
"reward_std": 0.4699949249625206, |
|
"rewards/cosine_len_reward": 0.20713631808757782, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3358.5833740234375, |
|
"epoch": 0.2674285714285714, |
|
"grad_norm": 0.15957476198673248, |
|
"kl": 0.05712890625, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": -0.049, |
|
"reward": -0.12391296029090881, |
|
"reward_std": 0.5305506736040115, |
|
"rewards/cosine_len_reward": -0.12391296029090881, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3467.5, |
|
"epoch": 0.268, |
|
"grad_norm": 0.16685207188129425, |
|
"kl": 0.04998779296875, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": 0.0354, |
|
"reward": -0.40232421085238457, |
|
"reward_std": 0.4523722641170025, |
|
"rewards/cosine_len_reward": -0.40232421085238457, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2356.9166870117188, |
|
"epoch": 0.26857142857142857, |
|
"grad_norm": 0.2733165919780731, |
|
"kl": 0.0599365234375, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.1523, |
|
"reward": -0.20090949651785195, |
|
"reward_std": 0.5469123795628548, |
|
"rewards/cosine_len_reward": -0.20090949651785195, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3320.0416870117188, |
|
"epoch": 0.26914285714285713, |
|
"grad_norm": 0.1729491502046585, |
|
"kl": 0.04193115234375, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": -0.0024, |
|
"reward": 0.02039976231753826, |
|
"reward_std": 0.4906746745109558, |
|
"rewards/cosine_len_reward": 0.02039976231753826, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3436.1666870117188, |
|
"epoch": 0.26971428571428574, |
|
"grad_norm": 0.15716099739074707, |
|
"kl": 0.0533447265625, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": 0.0387, |
|
"reward": 0.10714660119265318, |
|
"reward_std": 0.4147900193929672, |
|
"rewards/cosine_len_reward": 0.10714660119265318, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2971.4583435058594, |
|
"epoch": 0.2702857142857143, |
|
"grad_norm": 0.20368468761444092, |
|
"kl": 0.0540771484375, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": -0.0079, |
|
"reward": -0.6853213012218475, |
|
"reward_std": 0.3286122828722, |
|
"rewards/cosine_len_reward": -0.6853213012218475, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2670.0416870117188, |
|
"epoch": 0.27085714285714285, |
|
"grad_norm": 0.17690664529800415, |
|
"kl": 0.03338623046875, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.0268, |
|
"reward": -0.29098084941506386, |
|
"reward_std": 0.49564552307128906, |
|
"rewards/cosine_len_reward": -0.29098084941506386, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3349.6250610351562, |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.19052566587924957, |
|
"kl": 0.05389404296875, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": -0.029, |
|
"reward": -0.1653405874967575, |
|
"reward_std": 0.5682762004435062, |
|
"rewards/cosine_len_reward": -0.1653405874967575, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.272, |
|
"grad_norm": 0.15981078147888184, |
|
"kl": 0.05010986328125, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0002, |
|
"reward": -0.6135335117578506, |
|
"reward_std": 0.3161969594657421, |
|
"rewards/cosine_len_reward": -0.6135335117578506, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3550.7916870117188, |
|
"epoch": 0.2725714285714286, |
|
"grad_norm": 0.16642750799655914, |
|
"kl": 0.04486083984375, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.0194, |
|
"reward": -0.25540533661842346, |
|
"reward_std": 0.48319850862026215, |
|
"rewards/cosine_len_reward": -0.25540533661842346, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.27314285714285713, |
|
"grad_norm": 0.16831888258457184, |
|
"kl": 0.04388427734375, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0002, |
|
"reward": 0.19778522849082947, |
|
"reward_std": 0.35869789123535156, |
|
"rewards/cosine_len_reward": 0.19778522849082947, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3169.6666870117188, |
|
"epoch": 0.2737142857142857, |
|
"grad_norm": 0.16316814720630646, |
|
"kl": 0.05181884765625, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.0321, |
|
"reward": -0.1417078822851181, |
|
"reward_std": 0.4882785305380821, |
|
"rewards/cosine_len_reward": -0.1417078822851181, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3529.0833740234375, |
|
"epoch": 0.2742857142857143, |
|
"grad_norm": 0.14413844048976898, |
|
"kl": 0.05267333984375, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": -0.0048, |
|
"reward": -0.21695643290877342, |
|
"reward_std": 0.43684104457497597, |
|
"rewards/cosine_len_reward": -0.21695643290877342, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2982.5416870117188, |
|
"epoch": 0.27485714285714286, |
|
"grad_norm": 0.7589179277420044, |
|
"kl": 0.05596923828125, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.1907, |
|
"reward": -0.02579532004892826, |
|
"reward_std": 0.24179279431700706, |
|
"rewards/cosine_len_reward": -0.02579532004892826, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3213.5833740234375, |
|
"epoch": 0.2754285714285714, |
|
"grad_norm": 0.1643751859664917, |
|
"kl": 0.048980712890625, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": -0.1096, |
|
"reward": 0.004770293831825256, |
|
"reward_std": 0.44362664967775345, |
|
"rewards/cosine_len_reward": 0.004770293831825256, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3459.0416870117188, |
|
"epoch": 0.276, |
|
"grad_norm": 0.1669885367155075, |
|
"kl": 0.05377197265625, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.0103, |
|
"reward": -0.5875770300626755, |
|
"reward_std": 0.31428810209035873, |
|
"rewards/cosine_len_reward": -0.5875770300626755, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2685.000030517578, |
|
"epoch": 0.2765714285714286, |
|
"grad_norm": 0.33783161640167236, |
|
"kl": 0.0472412109375, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": -0.0571, |
|
"reward": -0.06883000582456589, |
|
"reward_std": 0.30373527109622955, |
|
"rewards/cosine_len_reward": -0.06883000582456589, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3374.7916870117188, |
|
"epoch": 0.27714285714285714, |
|
"grad_norm": 0.15891632437705994, |
|
"kl": 0.047607421875, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.016, |
|
"reward": -0.2871231231838465, |
|
"reward_std": 0.5418589785695076, |
|
"rewards/cosine_len_reward": -0.2871231231838465, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2790.208335876465, |
|
"epoch": 0.2777142857142857, |
|
"grad_norm": 0.28455618023872375, |
|
"kl": 0.047607421875, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.0522, |
|
"reward": 0.011902973055839539, |
|
"reward_std": 0.3104940876364708, |
|
"rewards/cosine_len_reward": 0.011902973055839539, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2883.5, |
|
"epoch": 0.2782857142857143, |
|
"grad_norm": 0.20438960194587708, |
|
"kl": 0.041229248046875, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.0462, |
|
"reward": 0.34762556478381157, |
|
"reward_std": 0.29484141059219837, |
|
"rewards/cosine_len_reward": 0.34762556478381157, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3420.1666870117188, |
|
"epoch": 0.27885714285714286, |
|
"grad_norm": 0.13923093676567078, |
|
"kl": 0.0538330078125, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": -0.0637, |
|
"reward": -0.3226715254713781, |
|
"reward_std": 0.4678328037261963, |
|
"rewards/cosine_len_reward": -0.3226715254713781, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3584.0, |
|
"epoch": 0.2794285714285714, |
|
"grad_norm": 0.22048485279083252, |
|
"kl": 0.1324462890625, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.0005, |
|
"reward": -0.08934877812862396, |
|
"reward_std": 0.6139358580112457, |
|
"rewards/cosine_len_reward": -0.08934877812862396, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2766.4166870117188, |
|
"epoch": 0.28, |
|
"grad_norm": 0.21742548048496246, |
|
"kl": 0.06146240234375, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": -0.1495, |
|
"reward": -0.2719786809757352, |
|
"reward_std": 0.5813306570053101, |
|
"rewards/cosine_len_reward": -0.2719786809757352, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2932.4583435058594, |
|
"epoch": 0.2805714285714286, |
|
"grad_norm": 0.4229942262172699, |
|
"kl": 0.0546875, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.2389, |
|
"reward": -0.3160274773836136, |
|
"reward_std": 0.4782961308956146, |
|
"rewards/cosine_len_reward": -0.3160274773836136, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2788.375, |
|
"epoch": 0.28114285714285714, |
|
"grad_norm": 0.5186205506324768, |
|
"kl": 0.08941650390625, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.2067, |
|
"reward": 0.012615039944648743, |
|
"reward_std": 0.5839090943336487, |
|
"rewards/cosine_len_reward": 0.012615039944648743, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2743.9583587646484, |
|
"epoch": 0.2817142857142857, |
|
"grad_norm": 0.25046536326408386, |
|
"kl": 0.077392578125, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.0376, |
|
"reward": -0.17700890451669693, |
|
"reward_std": 0.6207068264484406, |
|
"rewards/cosine_len_reward": -0.17700890451669693, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2884.7083740234375, |
|
"epoch": 0.2822857142857143, |
|
"grad_norm": 0.27387452125549316, |
|
"kl": 0.05712890625, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": -0.1072, |
|
"reward": -0.5983569696545601, |
|
"reward_std": 0.48679885268211365, |
|
"rewards/cosine_len_reward": -0.5983569696545601, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3555.75, |
|
"epoch": 0.28285714285714286, |
|
"grad_norm": 0.14310041069984436, |
|
"kl": 0.04302978515625, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": 0.0063, |
|
"reward": -0.03963024541735649, |
|
"reward_std": 0.41995228826999664, |
|
"rewards/cosine_len_reward": -0.03963024541735649, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2138.916732788086, |
|
"epoch": 0.2834285714285714, |
|
"grad_norm": 0.4353350102901459, |
|
"kl": 0.059326171875, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": -0.0066, |
|
"reward": 0.10497748292982578, |
|
"reward_std": 0.4895341917872429, |
|
"rewards/cosine_len_reward": 0.10497748292982578, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3536.7916870117188, |
|
"epoch": 0.284, |
|
"grad_norm": 0.1279255449771881, |
|
"kl": 0.037567138671875, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.0205, |
|
"reward": 0.0027112215757369995, |
|
"reward_std": 0.41501184925436974, |
|
"rewards/cosine_len_reward": 0.0027112215757369995, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3007.2916870117188, |
|
"epoch": 0.2845714285714286, |
|
"grad_norm": 0.37080392241477966, |
|
"kl": 0.1373291015625, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0912, |
|
"reward": -0.23442494124174118, |
|
"reward_std": 0.4831595793366432, |
|
"rewards/cosine_len_reward": -0.23442494124174118, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3276.4166870117188, |
|
"epoch": 0.28514285714285714, |
|
"grad_norm": 0.21131351590156555, |
|
"kl": 0.0936279296875, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": -0.0781, |
|
"reward": -0.07970089465379715, |
|
"reward_std": 0.351084902882576, |
|
"rewards/cosine_len_reward": -0.07970089465379715, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3468.1666870117188, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.14872434735298157, |
|
"kl": 0.051025390625, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": -0.0709, |
|
"reward": 0.30408234894275665, |
|
"reward_std": 0.49906710535287857, |
|
"rewards/cosine_len_reward": 0.30408234894275665, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.002130194778990699, |
|
"train_runtime": 27178.7627, |
|
"train_samples_per_second": 0.442, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|