|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2822.7857666015625, |
|
"epoch": 0.004, |
|
"grad_norm": 0.12564538419246674, |
|
"kl": 0.0, |
|
"learning_rate": 2e-08, |
|
"loss": 0.0645, |
|
"reward": 0.09580668434500694, |
|
"reward_std": 0.5702872574329376, |
|
"rewards/cosine_scaled_reward": -0.14554904401302338, |
|
"rewards/format_reward": 0.3869047649204731, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2575.571533203125, |
|
"epoch": 0.008, |
|
"grad_norm": 0.15411853790283203, |
|
"kl": 0.0, |
|
"learning_rate": 4e-08, |
|
"loss": 0.0717, |
|
"reward": 0.5743008255958557, |
|
"reward_std": 0.7826777100563049, |
|
"rewards/cosine_scaled_reward": 0.03119804011657834, |
|
"rewards/format_reward": 0.5119047686457634, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2762.1190490722656, |
|
"epoch": 0.012, |
|
"grad_norm": 0.13477382063865662, |
|
"kl": 3.463029861450195e-05, |
|
"learning_rate": 6e-08, |
|
"loss": 0.0865, |
|
"reward": 0.21700193732976913, |
|
"reward_std": 0.6844624578952789, |
|
"rewards/cosine_scaled_reward": -0.10578475520014763, |
|
"rewards/format_reward": 0.4285714402794838, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2686.3214721679688, |
|
"epoch": 0.016, |
|
"grad_norm": 0.1282820850610733, |
|
"kl": 2.434849739074707e-05, |
|
"learning_rate": 8e-08, |
|
"loss": 0.0525, |
|
"reward": 0.4696298725903034, |
|
"reward_std": 0.7235232815146446, |
|
"rewards/cosine_scaled_reward": -0.0062565067782998085, |
|
"rewards/format_reward": 0.4821428582072258, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2917.5535888671875, |
|
"epoch": 0.02, |
|
"grad_norm": 0.14993517100811005, |
|
"kl": 3.725290298461914e-05, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0762, |
|
"reward": 0.15318153076805174, |
|
"reward_std": 0.7213103845715523, |
|
"rewards/cosine_scaled_reward": -0.08412353717721999, |
|
"rewards/format_reward": 0.3214285857975483, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2816.2559814453125, |
|
"epoch": 0.024, |
|
"grad_norm": 0.14960958063602448, |
|
"kl": 3.1054019927978516e-05, |
|
"learning_rate": 1.2e-07, |
|
"loss": 0.0537, |
|
"reward": 0.2950221598148346, |
|
"reward_std": 0.738863505423069, |
|
"rewards/cosine_scaled_reward": -0.057846077223075554, |
|
"rewards/format_reward": 0.410714291036129, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2870.3988647460938, |
|
"epoch": 0.028, |
|
"grad_norm": 0.10985030233860016, |
|
"kl": 2.8133392333984375e-05, |
|
"learning_rate": 1.4e-07, |
|
"loss": 0.0068, |
|
"reward": 0.27893248095642775, |
|
"reward_std": 0.7550084367394447, |
|
"rewards/cosine_scaled_reward": -0.05993851972743869, |
|
"rewards/format_reward": 0.3988095410168171, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3160.452392578125, |
|
"epoch": 0.032, |
|
"grad_norm": 0.10308283567428589, |
|
"kl": 3.8176774978637695e-05, |
|
"learning_rate": 1.6e-07, |
|
"loss": 0.0223, |
|
"reward": 0.07877065148204565, |
|
"reward_std": 0.6431715190410614, |
|
"rewards/cosine_scaled_reward": -0.08263849129434675, |
|
"rewards/format_reward": 0.2440476268529892, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3020.607177734375, |
|
"epoch": 0.036, |
|
"grad_norm": 0.15057384967803955, |
|
"kl": 3.37064266204834e-05, |
|
"learning_rate": 1.8e-07, |
|
"loss": 0.0733, |
|
"reward": 0.06793000735342503, |
|
"reward_std": 0.6978132948279381, |
|
"rewards/cosine_scaled_reward": -0.12079690210521221, |
|
"rewards/format_reward": 0.3095238171517849, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3089.84521484375, |
|
"epoch": 0.04, |
|
"grad_norm": 0.11256518214941025, |
|
"kl": 3.2395124435424805e-05, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0413, |
|
"reward": 0.032662300392985344, |
|
"reward_std": 0.6881319805979729, |
|
"rewards/cosine_scaled_reward": -0.13545456249266863, |
|
"rewards/format_reward": 0.3035714365541935, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2851.946533203125, |
|
"epoch": 0.044, |
|
"grad_norm": 0.17106953263282776, |
|
"kl": 3.784894943237305e-05, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 0.0636, |
|
"reward": 0.3718952457420528, |
|
"reward_std": 0.6902545392513275, |
|
"rewards/cosine_scaled_reward": -0.03131428617052734, |
|
"rewards/format_reward": 0.4345238134264946, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2798.5178833007812, |
|
"epoch": 0.048, |
|
"grad_norm": 0.1335103064775467, |
|
"kl": 2.9146671295166016e-05, |
|
"learning_rate": 2.4e-07, |
|
"loss": 0.0543, |
|
"reward": 0.40071453526616096, |
|
"reward_std": 0.7024472132325172, |
|
"rewards/cosine_scaled_reward": -0.02285701408982277, |
|
"rewards/format_reward": 0.4464285746216774, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2948.9464721679688, |
|
"epoch": 0.052, |
|
"grad_norm": 0.1271769255399704, |
|
"kl": 3.698468208312988e-05, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.0693, |
|
"reward": 0.47545497864484787, |
|
"reward_std": 0.7740402817726135, |
|
"rewards/cosine_scaled_reward": 0.002608438953757286, |
|
"rewards/format_reward": 0.470238097012043, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2679.3928833007812, |
|
"epoch": 0.056, |
|
"grad_norm": 0.12242422997951508, |
|
"kl": 2.8014183044433594e-05, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.0489, |
|
"reward": 0.40165250562131405, |
|
"reward_std": 0.7790777683258057, |
|
"rewards/cosine_scaled_reward": -0.028340420685708523, |
|
"rewards/format_reward": 0.4583333507180214, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2889.136962890625, |
|
"epoch": 0.06, |
|
"grad_norm": 0.19158992171287537, |
|
"kl": 3.224611282348633e-05, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0704, |
|
"reward": 0.15117042418569326, |
|
"reward_std": 0.6893174201250076, |
|
"rewards/cosine_scaled_reward": -0.10893859504722059, |
|
"rewards/format_reward": 0.3690476231276989, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2892.6488647460938, |
|
"epoch": 0.064, |
|
"grad_norm": 0.15633279085159302, |
|
"kl": 3.668665885925293e-05, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.0733, |
|
"reward": -0.09426919370889664, |
|
"reward_std": 0.5802397355437279, |
|
"rewards/cosine_scaled_reward": -0.18403935432434082, |
|
"rewards/format_reward": 0.2738095298409462, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2920.3511962890625, |
|
"epoch": 0.068, |
|
"grad_norm": 0.12191536277532578, |
|
"kl": 3.221631050109863e-05, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 0.0214, |
|
"reward": 0.13339833123609424, |
|
"reward_std": 0.6428257077932358, |
|
"rewards/cosine_scaled_reward": -0.11187227349728346, |
|
"rewards/format_reward": 0.3571428619325161, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2704.672607421875, |
|
"epoch": 0.072, |
|
"grad_norm": 0.2207571119070053, |
|
"kl": 2.4259090423583984e-05, |
|
"learning_rate": 3.6e-07, |
|
"loss": 0.0858, |
|
"reward": 0.4250662699341774, |
|
"reward_std": 0.7673918604850769, |
|
"rewards/cosine_scaled_reward": -0.019609727547504008, |
|
"rewards/format_reward": 0.4642857350409031, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2800.6429443359375, |
|
"epoch": 0.076, |
|
"grad_norm": 0.12734168767929077, |
|
"kl": 2.4378299713134766e-05, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 0.0471, |
|
"reward": 0.4042445756494999, |
|
"reward_std": 0.6929292231798172, |
|
"rewards/cosine_scaled_reward": -0.02704438249929808, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2697.8274536132812, |
|
"epoch": 0.08, |
|
"grad_norm": 0.15472018718719482, |
|
"kl": 2.35140323638916e-05, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0265, |
|
"reward": 0.3560524769127369, |
|
"reward_std": 0.6769110411405563, |
|
"rewards/cosine_scaled_reward": -0.05114044318906963, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2339.4405517578125, |
|
"epoch": 0.084, |
|
"grad_norm": 0.21466447412967682, |
|
"kl": 2.086162567138672e-05, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0806, |
|
"reward": 0.7416469305753708, |
|
"reward_std": 0.841043546795845, |
|
"rewards/cosine_scaled_reward": 0.06725202780216932, |
|
"rewards/format_reward": 0.6071428656578064, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2781.5238037109375, |
|
"epoch": 0.088, |
|
"grad_norm": 0.19139103591442108, |
|
"kl": 3.0338764190673828e-05, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 0.0773, |
|
"reward": 0.20257593411952257, |
|
"reward_std": 0.7891978472471237, |
|
"rewards/cosine_scaled_reward": -0.1129977386444807, |
|
"rewards/format_reward": 0.4285714365541935, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3036.761962890625, |
|
"epoch": 0.092, |
|
"grad_norm": 0.1108132153749466, |
|
"kl": 2.6345252990722656e-05, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0238, |
|
"reward": 0.20629926398396492, |
|
"reward_std": 0.7457813173532486, |
|
"rewards/cosine_scaled_reward": -0.07542179408483207, |
|
"rewards/format_reward": 0.3571428619325161, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3143.1131591796875, |
|
"epoch": 0.096, |
|
"grad_norm": 0.10591176152229309, |
|
"kl": 2.703070640563965e-05, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0637, |
|
"reward": 0.0749267227947712, |
|
"reward_std": 0.6808565855026245, |
|
"rewards/cosine_scaled_reward": -0.1292033027857542, |
|
"rewards/format_reward": 0.3333333395421505, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2934.4227294921875, |
|
"epoch": 0.1, |
|
"grad_norm": 0.12180113047361374, |
|
"kl": 1.4990568161010742e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0525, |
|
"reward": 0.3605663161724806, |
|
"reward_std": 0.7757866084575653, |
|
"rewards/cosine_scaled_reward": -0.028050171211361885, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3100.5357666015625, |
|
"epoch": 0.104, |
|
"grad_norm": 0.14736856520175934, |
|
"kl": 2.230703830718994e-05, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.087, |
|
"reward": 0.1489051878452301, |
|
"reward_std": 0.7608643025159836, |
|
"rewards/cosine_scaled_reward": -0.08923787740059197, |
|
"rewards/format_reward": 0.32738095708191395, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2978.452392578125, |
|
"epoch": 0.108, |
|
"grad_norm": 0.1490376740694046, |
|
"kl": 2.7447938919067383e-05, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0633, |
|
"reward": 0.16602796246297657, |
|
"reward_std": 0.762113556265831, |
|
"rewards/cosine_scaled_reward": -0.09555743727833033, |
|
"rewards/format_reward": 0.3571428656578064, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2923.1012573242188, |
|
"epoch": 0.112, |
|
"grad_norm": 0.11410558968782425, |
|
"kl": 3.108382225036621e-05, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0618, |
|
"reward": 0.058234728407114744, |
|
"reward_std": 0.5919530540704727, |
|
"rewards/cosine_scaled_reward": -0.14052549470216036, |
|
"rewards/format_reward": 0.3392857201397419, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2925.1011962890625, |
|
"epoch": 0.116, |
|
"grad_norm": 0.17734545469284058, |
|
"kl": 5.251169204711914e-05, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0463, |
|
"reward": 0.24072746047750115, |
|
"reward_std": 0.7061209976673126, |
|
"rewards/cosine_scaled_reward": -0.061183891259133816, |
|
"rewards/format_reward": 0.3630952425301075, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2882.39892578125, |
|
"epoch": 0.12, |
|
"grad_norm": 0.15557299554347992, |
|
"kl": 2.1502375602722168e-05, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0703, |
|
"reward": 0.22035705484449863, |
|
"reward_std": 0.5751676708459854, |
|
"rewards/cosine_scaled_reward": -0.07136909663677216, |
|
"rewards/format_reward": 0.3630952462553978, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.607177734375, |
|
"epoch": 0.124, |
|
"grad_norm": 0.16903533041477203, |
|
"kl": 6.181001663208008e-05, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.07, |
|
"reward": 0.3481953740119934, |
|
"reward_std": 0.7361179888248444, |
|
"rewards/cosine_scaled_reward": -0.025307081639766693, |
|
"rewards/format_reward": 0.3988095298409462, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2585.244140625, |
|
"epoch": 0.128, |
|
"grad_norm": 0.1481872797012329, |
|
"kl": 0.00023996829986572266, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0664, |
|
"reward": 0.5805501043796539, |
|
"reward_std": 0.805858314037323, |
|
"rewards/cosine_scaled_reward": 0.019441714510321617, |
|
"rewards/format_reward": 0.5416666865348816, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2608.7083740234375, |
|
"epoch": 0.132, |
|
"grad_norm": 0.10800693184137344, |
|
"kl": 0.0002808570861816406, |
|
"learning_rate": 6.6e-07, |
|
"loss": 0.0255, |
|
"reward": 0.5432634204626083, |
|
"reward_std": 0.7363616675138474, |
|
"rewards/cosine_scaled_reward": 0.02460789866745472, |
|
"rewards/format_reward": 0.4940476268529892, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2769.1964721679688, |
|
"epoch": 0.136, |
|
"grad_norm": 0.12105516344308853, |
|
"kl": 0.00020498037338256836, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0184, |
|
"reward": 0.18091929703950882, |
|
"reward_std": 0.6703035831451416, |
|
"rewards/cosine_scaled_reward": -0.10596893168985844, |
|
"rewards/format_reward": 0.3928571604192257, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3090.1607666015625, |
|
"epoch": 0.14, |
|
"grad_norm": 0.11914981156587601, |
|
"kl": 0.0002568960189819336, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0617, |
|
"reward": 0.08196480484912172, |
|
"reward_std": 0.7742973417043686, |
|
"rewards/cosine_scaled_reward": -0.10782713070511818, |
|
"rewards/format_reward": 0.2976190559566021, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2790.0596313476562, |
|
"epoch": 0.144, |
|
"grad_norm": 0.10883598774671555, |
|
"kl": 0.00027942657470703125, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0163, |
|
"reward": 0.31424143677577376, |
|
"reward_std": 0.669949933886528, |
|
"rewards/cosine_scaled_reward": -0.06311738677322865, |
|
"rewards/format_reward": 0.4404762014746666, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2916.3452758789062, |
|
"epoch": 0.148, |
|
"grad_norm": 0.1492447555065155, |
|
"kl": 0.00025272369384765625, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0592, |
|
"reward": 0.1357547640800476, |
|
"reward_std": 0.7365808188915253, |
|
"rewards/cosine_scaled_reward": -0.10771786456461996, |
|
"rewards/format_reward": 0.3511904813349247, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3342.6786499023438, |
|
"epoch": 0.152, |
|
"grad_norm": 0.08414288610219955, |
|
"kl": 0.00013870000839233398, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0239, |
|
"reward": -0.1723631415516138, |
|
"reward_std": 0.6109825298190117, |
|
"rewards/cosine_scaled_reward": -0.16951490193605423, |
|
"rewards/format_reward": 0.16666666977107525, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2762.5774536132812, |
|
"epoch": 0.156, |
|
"grad_norm": 0.14042888581752777, |
|
"kl": 0.0005602836608886719, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0588, |
|
"reward": 0.3224933594465256, |
|
"reward_std": 0.6976565718650818, |
|
"rewards/cosine_scaled_reward": -0.04708665423095226, |
|
"rewards/format_reward": 0.416666679084301, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2729.9644165039062, |
|
"epoch": 0.16, |
|
"grad_norm": 0.110390305519104, |
|
"kl": 0.00016605854034423828, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0549, |
|
"reward": 0.4423699714243412, |
|
"reward_std": 0.6286562532186508, |
|
"rewards/cosine_scaled_reward": -0.016910257749259472, |
|
"rewards/format_reward": 0.476190485060215, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2955.7084350585938, |
|
"epoch": 0.164, |
|
"grad_norm": 0.15253609418869019, |
|
"kl": 0.00040471553802490234, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0718, |
|
"reward": 0.44552009692415595, |
|
"reward_std": 0.7759689763188362, |
|
"rewards/cosine_scaled_reward": 0.0144266925053671, |
|
"rewards/format_reward": 0.4166666753590107, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2961.0596313476562, |
|
"epoch": 0.168, |
|
"grad_norm": 0.22110103070735931, |
|
"kl": 0.0009613037109375, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.1186, |
|
"reward": 0.2217194978147745, |
|
"reward_std": 0.6207270994782448, |
|
"rewards/cosine_scaled_reward": -0.07366406172513962, |
|
"rewards/format_reward": 0.3690476268529892, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3015.7738647460938, |
|
"epoch": 0.172, |
|
"grad_norm": 0.23720885813236237, |
|
"kl": 0.0005915164947509766, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.1245, |
|
"reward": -0.04521503113210201, |
|
"reward_std": 0.62105892598629, |
|
"rewards/cosine_scaled_reward": -0.16844085440970957, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2937.2381591796875, |
|
"epoch": 0.176, |
|
"grad_norm": 0.09096106886863708, |
|
"kl": 0.00051116943359375, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0249, |
|
"reward": 0.22813843563199043, |
|
"reward_std": 0.6727291792631149, |
|
"rewards/cosine_scaled_reward": -0.058549837151076645, |
|
"rewards/format_reward": 0.3452381007373333, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3149.511962890625, |
|
"epoch": 0.18, |
|
"grad_norm": 0.11164555698633194, |
|
"kl": 0.0004715919494628906, |
|
"learning_rate": 9e-07, |
|
"loss": 0.017, |
|
"reward": 0.05970348231494427, |
|
"reward_std": 0.7763290405273438, |
|
"rewards/cosine_scaled_reward": -0.11300540715456009, |
|
"rewards/format_reward": 0.285714291036129, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3184.6845703125, |
|
"epoch": 0.184, |
|
"grad_norm": 0.1711866706609726, |
|
"kl": 0.0007777214050292969, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0731, |
|
"reward": 0.1386737246066332, |
|
"reward_std": 0.7276585251092911, |
|
"rewards/cosine_scaled_reward": -0.06459171324968338, |
|
"rewards/format_reward": 0.26785714738070965, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3015.386962890625, |
|
"epoch": 0.188, |
|
"grad_norm": 0.12470238655805588, |
|
"kl": 0.0014100074768066406, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.0653, |
|
"reward": 0.16049158992245793, |
|
"reward_std": 0.7017006278038025, |
|
"rewards/cosine_scaled_reward": -0.08642087457701564, |
|
"rewards/format_reward": 0.3333333432674408, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2909.96435546875, |
|
"epoch": 0.192, |
|
"grad_norm": 0.28355535864830017, |
|
"kl": 0.009288787841796875, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0581, |
|
"reward": 0.0768200121819973, |
|
"reward_std": 0.7135801166296005, |
|
"rewards/cosine_scaled_reward": -0.14016142301261425, |
|
"rewards/format_reward": 0.3571428693830967, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2753.6845703125, |
|
"epoch": 0.196, |
|
"grad_norm": 0.6567728519439697, |
|
"kl": 0.023477554321289062, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0866, |
|
"reward": 0.4337980281561613, |
|
"reward_std": 0.8068065047264099, |
|
"rewards/cosine_scaled_reward": -0.006315283477306366, |
|
"rewards/format_reward": 0.4464285746216774, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2934.8274536132812, |
|
"epoch": 0.2, |
|
"grad_norm": 0.11382321268320084, |
|
"kl": 0.0025758743286132812, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0714, |
|
"reward": 0.2880665063858032, |
|
"reward_std": 0.6403830945491791, |
|
"rewards/cosine_scaled_reward": -0.049419129034504294, |
|
"rewards/format_reward": 0.3869047649204731, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2840.7678833007812, |
|
"epoch": 0.204, |
|
"grad_norm": 0.11641126126050949, |
|
"kl": 0.0050792694091796875, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.023, |
|
"reward": 0.2840890493243933, |
|
"reward_std": 0.692708894610405, |
|
"rewards/cosine_scaled_reward": -0.06628882512450218, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3119.9881591796875, |
|
"epoch": 0.208, |
|
"grad_norm": 0.14652805030345917, |
|
"kl": 0.0032052993774414062, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": 0.0641, |
|
"reward": 0.18620363296940923, |
|
"reward_std": 0.8490904271602631, |
|
"rewards/cosine_scaled_reward": -0.058683907613158226, |
|
"rewards/format_reward": 0.3035714365541935, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2926.9940795898438, |
|
"epoch": 0.212, |
|
"grad_norm": 0.19241634011268616, |
|
"kl": 0.00447845458984375, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": 0.0747, |
|
"reward": 0.36803684243932366, |
|
"reward_std": 0.823193870484829, |
|
"rewards/cosine_scaled_reward": -0.006457769020926207, |
|
"rewards/format_reward": 0.380952388048172, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3031.7203369140625, |
|
"epoch": 0.216, |
|
"grad_norm": 0.12843742966651917, |
|
"kl": 0.0029668807983398438, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": 0.0153, |
|
"reward": 0.2685772944241762, |
|
"reward_std": 0.6489126533269882, |
|
"rewards/cosine_scaled_reward": -0.04428278561681509, |
|
"rewards/format_reward": 0.3571428693830967, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3123.136962890625, |
|
"epoch": 0.22, |
|
"grad_norm": 0.1504901498556137, |
|
"kl": 0.006084442138671875, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0691, |
|
"reward": 0.03501664288341999, |
|
"reward_std": 0.6481388062238693, |
|
"rewards/cosine_scaled_reward": -0.11642025248147547, |
|
"rewards/format_reward": 0.2678571529686451, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2673.434539794922, |
|
"epoch": 0.224, |
|
"grad_norm": 0.1224113255739212, |
|
"kl": 0.0033931732177734375, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0269, |
|
"reward": 0.6296312126796693, |
|
"reward_std": 0.6751764714717865, |
|
"rewards/cosine_scaled_reward": 0.07374419644474983, |
|
"rewards/format_reward": 0.482142873108387, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3170.0535888671875, |
|
"epoch": 0.228, |
|
"grad_norm": 0.1044960618019104, |
|
"kl": 0.0024929046630859375, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0262, |
|
"reward": 0.21022793278098106, |
|
"reward_std": 0.7194458544254303, |
|
"rewards/cosine_scaled_reward": -0.05560031719505787, |
|
"rewards/format_reward": 0.3214285857975483, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2590.0179138183594, |
|
"epoch": 0.232, |
|
"grad_norm": 0.13768674433231354, |
|
"kl": 0.0045948028564453125, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0592, |
|
"reward": 0.4493846707046032, |
|
"reward_std": 0.7118680775165558, |
|
"rewards/cosine_scaled_reward": -0.02233148762024939, |
|
"rewards/format_reward": 0.4940476231276989, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3087.4702758789062, |
|
"epoch": 0.236, |
|
"grad_norm": 0.13870052993297577, |
|
"kl": 0.001789093017578125, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0691, |
|
"reward": 0.3166997814550996, |
|
"reward_std": 0.7532177269458771, |
|
"rewards/cosine_scaled_reward": -0.011292967945337296, |
|
"rewards/format_reward": 0.3392857201397419, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2717.3036499023438, |
|
"epoch": 0.24, |
|
"grad_norm": 0.14514827728271484, |
|
"kl": 0.0060253143310546875, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0622, |
|
"reward": 0.16810212982818484, |
|
"reward_std": 0.5123014599084854, |
|
"rewards/cosine_scaled_reward": -0.10642512841150165, |
|
"rewards/format_reward": 0.3809523843228817, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3105.7977294921875, |
|
"epoch": 0.244, |
|
"grad_norm": 0.10494968295097351, |
|
"kl": 0.0026693344116210938, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.0639, |
|
"reward": 0.08244643732905388, |
|
"reward_std": 0.7039294093847275, |
|
"rewards/cosine_scaled_reward": -0.10461012227460742, |
|
"rewards/format_reward": 0.29166666977107525, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3213.3214721679688, |
|
"epoch": 0.248, |
|
"grad_norm": 0.0999075248837471, |
|
"kl": 0.0023097991943359375, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": 0.053, |
|
"reward": 0.09679291397333145, |
|
"reward_std": 0.7138089835643768, |
|
"rewards/cosine_scaled_reward": -0.0914845080114901, |
|
"rewards/format_reward": 0.2797619104385376, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2783.9584350585938, |
|
"epoch": 0.252, |
|
"grad_norm": 0.19832438230514526, |
|
"kl": 0.0027294158935546875, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.0773, |
|
"reward": 0.2238014191389084, |
|
"reward_std": 0.6036202609539032, |
|
"rewards/cosine_scaled_reward": -0.06369452457875013, |
|
"rewards/format_reward": 0.351190485060215, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2976.6607666015625, |
|
"epoch": 0.256, |
|
"grad_norm": 0.12335456907749176, |
|
"kl": 0.0020885467529296875, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0857, |
|
"reward": 0.27347568422555923, |
|
"reward_std": 0.7463532835245132, |
|
"rewards/cosine_scaled_reward": -0.03885740428813733, |
|
"rewards/format_reward": 0.3511904887855053, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2825.6130981445312, |
|
"epoch": 0.26, |
|
"grad_norm": 0.13863790035247803, |
|
"kl": 0.002285003662109375, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": 0.0349, |
|
"reward": 0.3712610546499491, |
|
"reward_std": 0.7277249395847321, |
|
"rewards/cosine_scaled_reward": -0.037583764642477036, |
|
"rewards/format_reward": 0.4464285746216774, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2974.232177734375, |
|
"epoch": 0.264, |
|
"grad_norm": 0.11186616122722626, |
|
"kl": 0.002407073974609375, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0025, |
|
"reward": 0.07817286718636751, |
|
"reward_std": 0.640307292342186, |
|
"rewards/cosine_scaled_reward": -0.12460404448211193, |
|
"rewards/format_reward": 0.3273809589445591, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3152.702392578125, |
|
"epoch": 0.268, |
|
"grad_norm": 0.12694397568702698, |
|
"kl": 0.00250244140625, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.07, |
|
"reward": 0.1549822874367237, |
|
"reward_std": 0.6663320288062096, |
|
"rewards/cosine_scaled_reward": -0.08322314161341637, |
|
"rewards/format_reward": 0.3214285783469677, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2938.0179443359375, |
|
"epoch": 0.272, |
|
"grad_norm": 0.12655070424079895, |
|
"kl": 0.00341033935546875, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": 0.0545, |
|
"reward": 0.2076467089354992, |
|
"reward_std": 0.7705407291650772, |
|
"rewards/cosine_scaled_reward": -0.08070044964551926, |
|
"rewards/format_reward": 0.3690476268529892, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.8095703125, |
|
"epoch": 0.276, |
|
"grad_norm": 0.12574610114097595, |
|
"kl": 0.003986358642578125, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": 0.0227, |
|
"reward": 0.03160261735320091, |
|
"reward_std": 0.621779277920723, |
|
"rewards/cosine_scaled_reward": -0.13003203552216291, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2877.5536499023438, |
|
"epoch": 0.28, |
|
"grad_norm": 0.12181198596954346, |
|
"kl": 0.0038127899169921875, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0248, |
|
"reward": 0.2757916431874037, |
|
"reward_std": 0.7794490903615952, |
|
"rewards/cosine_scaled_reward": -0.06448512757197022, |
|
"rewards/format_reward": 0.4047619178891182, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2776.6131591796875, |
|
"epoch": 0.284, |
|
"grad_norm": 0.17934156954288483, |
|
"kl": 0.0039825439453125, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": 0.1072, |
|
"reward": 0.2793612889945507, |
|
"reward_std": 0.7607921361923218, |
|
"rewards/cosine_scaled_reward": -0.06865269318223, |
|
"rewards/format_reward": 0.4166666716337204, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3037.5178833007812, |
|
"epoch": 0.288, |
|
"grad_norm": 0.1311851143836975, |
|
"kl": 0.0038604736328125, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.0745, |
|
"reward": 0.34610075503587723, |
|
"reward_std": 0.8604296147823334, |
|
"rewards/cosine_scaled_reward": -0.01444962713867426, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2880.3036499023438, |
|
"epoch": 0.292, |
|
"grad_norm": 0.10903972387313843, |
|
"kl": 0.005123138427734375, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0247, |
|
"reward": 0.41264417115598917, |
|
"reward_std": 0.6988394409418106, |
|
"rewards/cosine_scaled_reward": -0.00201124744489789, |
|
"rewards/format_reward": 0.4166666753590107, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2689.5774536132812, |
|
"epoch": 0.296, |
|
"grad_norm": 0.1187940314412117, |
|
"kl": 0.00371551513671875, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.0343, |
|
"reward": 0.49459290131926537, |
|
"reward_std": 0.6582471132278442, |
|
"rewards/cosine_scaled_reward": -0.005679763096850365, |
|
"rewards/format_reward": 0.505952388048172, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3011.2440795898438, |
|
"epoch": 0.3, |
|
"grad_norm": 0.15027488768100739, |
|
"kl": 0.00612640380859375, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0922, |
|
"reward": -0.12369688227772713, |
|
"reward_std": 0.5938592255115509, |
|
"rewards/cosine_scaled_reward": -0.1957770138978958, |
|
"rewards/format_reward": 0.26785715110599995, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3087.9762573242188, |
|
"epoch": 0.304, |
|
"grad_norm": 0.1342087835073471, |
|
"kl": 0.004589080810546875, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.051, |
|
"reward": 0.029461721424013376, |
|
"reward_std": 0.5008194297552109, |
|
"rewards/cosine_scaled_reward": -0.12812629727704916, |
|
"rewards/format_reward": 0.2857142915017903, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2708.6845703125, |
|
"epoch": 0.308, |
|
"grad_norm": 0.11936229467391968, |
|
"kl": 0.00518035888671875, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": 0.0693, |
|
"reward": 0.38730931747704744, |
|
"reward_std": 0.6931557953357697, |
|
"rewards/cosine_scaled_reward": -0.0503929746337235, |
|
"rewards/format_reward": 0.4880952388048172, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3123.4345703125, |
|
"epoch": 0.312, |
|
"grad_norm": 0.10926749557256699, |
|
"kl": 0.006168365478515625, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0281, |
|
"reward": 0.14759791223332286, |
|
"reward_std": 0.6552696228027344, |
|
"rewards/cosine_scaled_reward": -0.08989150635898113, |
|
"rewards/format_reward": 0.3273809519596398, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3104.886962890625, |
|
"epoch": 0.316, |
|
"grad_norm": 0.17055809497833252, |
|
"kl": 0.005580902099609375, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": 0.122, |
|
"reward": 0.1187831275165081, |
|
"reward_std": 0.7589289993047714, |
|
"rewards/cosine_scaled_reward": -0.11917985696345568, |
|
"rewards/format_reward": 0.3571428656578064, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2892.2261962890625, |
|
"epoch": 0.32, |
|
"grad_norm": 0.12601953744888306, |
|
"kl": 0.004444122314453125, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0261, |
|
"reward": 0.2744547198526561, |
|
"reward_std": 0.686069905757904, |
|
"rewards/cosine_scaled_reward": -0.05920121353119612, |
|
"rewards/format_reward": 0.3928571492433548, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3041.1488647460938, |
|
"epoch": 0.324, |
|
"grad_norm": 0.11552488803863525, |
|
"kl": 0.00655364990234375, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": 0.0229, |
|
"reward": 0.12106413394212723, |
|
"reward_std": 0.675617903470993, |
|
"rewards/cosine_scaled_reward": -0.1061346041969955, |
|
"rewards/format_reward": 0.33333333767950535, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3129.6429443359375, |
|
"epoch": 0.328, |
|
"grad_norm": 0.09706410765647888, |
|
"kl": 0.0055084228515625, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": 0.0452, |
|
"reward": 0.12287123966962099, |
|
"reward_std": 0.7173575460910797, |
|
"rewards/cosine_scaled_reward": -0.09035009983927011, |
|
"rewards/format_reward": 0.3035714365541935, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2602.047637939453, |
|
"epoch": 0.332, |
|
"grad_norm": 0.11233574151992798, |
|
"kl": 0.01010894775390625, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.0258, |
|
"reward": 0.35400932375341654, |
|
"reward_std": 0.6462785750627518, |
|
"rewards/cosine_scaled_reward": -0.06109058950096369, |
|
"rewards/format_reward": 0.476190485060215, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2894.011962890625, |
|
"epoch": 0.336, |
|
"grad_norm": 0.12059750407934189, |
|
"kl": 0.00872802734375, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0806, |
|
"reward": 0.2941260803490877, |
|
"reward_std": 0.78522889316082, |
|
"rewards/cosine_scaled_reward": -0.06722268275916576, |
|
"rewards/format_reward": 0.4285714328289032, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2979.226318359375, |
|
"epoch": 0.34, |
|
"grad_norm": 0.15206296741962433, |
|
"kl": 0.00783538818359375, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0755, |
|
"reward": 0.21889091655611992, |
|
"reward_std": 0.6674999743700027, |
|
"rewards/cosine_scaled_reward": -0.06614979542791843, |
|
"rewards/format_reward": 0.3511904887855053, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2819.5892944335938, |
|
"epoch": 0.344, |
|
"grad_norm": 0.1093529462814331, |
|
"kl": 0.00661468505859375, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": 0.0353, |
|
"reward": 0.20389786185114644, |
|
"reward_std": 0.6942542195320129, |
|
"rewards/cosine_scaled_reward": -0.07959869271144271, |
|
"rewards/format_reward": 0.3630952425301075, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2616.452392578125, |
|
"epoch": 0.348, |
|
"grad_norm": 0.15942011773586273, |
|
"kl": 0.00847625732421875, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": 0.0773, |
|
"reward": 0.3760679364204407, |
|
"reward_std": 0.6467384025454521, |
|
"rewards/cosine_scaled_reward": -0.05898985452950001, |
|
"rewards/format_reward": 0.4940476194024086, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2923.7500610351562, |
|
"epoch": 0.352, |
|
"grad_norm": 0.10565865784883499, |
|
"kl": 0.0073089599609375, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": 0.0227, |
|
"reward": 0.3185804970562458, |
|
"reward_std": 0.6098055616021156, |
|
"rewards/cosine_scaled_reward": -0.010352615499868989, |
|
"rewards/format_reward": 0.3392857201397419, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2752.6309814453125, |
|
"epoch": 0.356, |
|
"grad_norm": 0.1690302938222885, |
|
"kl": 0.0132293701171875, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": 0.0312, |
|
"reward": 0.46914676763117313, |
|
"reward_std": 0.7854363918304443, |
|
"rewards/cosine_scaled_reward": 0.005406718701124191, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2769.1488647460938, |
|
"epoch": 0.36, |
|
"grad_norm": 0.17653611302375793, |
|
"kl": 0.011993408203125, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0815, |
|
"reward": 0.3604448903352022, |
|
"reward_std": 0.798264317214489, |
|
"rewards/cosine_scaled_reward": -0.04894421715289354, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3187.3631591796875, |
|
"epoch": 0.364, |
|
"grad_norm": 0.1068400964140892, |
|
"kl": 0.0078277587890625, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0211, |
|
"reward": -0.03608314320445061, |
|
"reward_std": 0.5826699808239937, |
|
"rewards/cosine_scaled_reward": -0.14006539154797792, |
|
"rewards/format_reward": 0.2440476305782795, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2924.916748046875, |
|
"epoch": 0.368, |
|
"grad_norm": 0.17665976285934448, |
|
"kl": 0.0091400146484375, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0852, |
|
"reward": 0.2780441716313362, |
|
"reward_std": 0.7524297386407852, |
|
"rewards/cosine_scaled_reward": -0.07526363711804152, |
|
"rewards/format_reward": 0.4285714291036129, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2939.3512573242188, |
|
"epoch": 0.372, |
|
"grad_norm": 0.13597099483013153, |
|
"kl": 0.0076446533203125, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": 0.0324, |
|
"reward": 0.33797190338373184, |
|
"reward_std": 0.5880008786916733, |
|
"rewards/cosine_scaled_reward": -0.01851405529305339, |
|
"rewards/format_reward": 0.3750000111758709, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2922.875, |
|
"epoch": 0.376, |
|
"grad_norm": 0.12246444076299667, |
|
"kl": 0.00815582275390625, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.0801, |
|
"reward": 0.14625070057809353, |
|
"reward_std": 0.690229170024395, |
|
"rewards/cosine_scaled_reward": -0.12330322340130806, |
|
"rewards/format_reward": 0.3928571492433548, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3222.3632202148438, |
|
"epoch": 0.38, |
|
"grad_norm": 0.08992121368646622, |
|
"kl": 0.00704193115234375, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": -0.001, |
|
"reward": 0.26800261437892914, |
|
"reward_std": 0.7103277295827866, |
|
"rewards/cosine_scaled_reward": -0.011832039803266525, |
|
"rewards/format_reward": 0.2916666716337204, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3215.2202758789062, |
|
"epoch": 0.384, |
|
"grad_norm": 0.12997964024543762, |
|
"kl": 0.00983428955078125, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0591, |
|
"reward": -0.03136127255856991, |
|
"reward_std": 0.6326467096805573, |
|
"rewards/cosine_scaled_reward": -0.18234730698168278, |
|
"rewards/format_reward": 0.33333333767950535, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2857.0416870117188, |
|
"epoch": 0.388, |
|
"grad_norm": 0.16558504104614258, |
|
"kl": 0.007293701171875, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0821, |
|
"reward": 0.19707820191979408, |
|
"reward_std": 0.6188783794641495, |
|
"rewards/cosine_scaled_reward": -0.10681804455816746, |
|
"rewards/format_reward": 0.410714291036129, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2983.9940795898438, |
|
"epoch": 0.392, |
|
"grad_norm": 0.1882523149251938, |
|
"kl": 0.00878143310546875, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.1027, |
|
"reward": 0.3959239423274994, |
|
"reward_std": 0.875861182808876, |
|
"rewards/cosine_scaled_reward": 0.004509590216912329, |
|
"rewards/format_reward": 0.3869047686457634, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3077.0952758789062, |
|
"epoch": 0.396, |
|
"grad_norm": 0.1565464437007904, |
|
"kl": 0.011505126953125, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": 0.0968, |
|
"reward": 0.027048692107200623, |
|
"reward_std": 0.7064545601606369, |
|
"rewards/cosine_scaled_reward": -0.1323089925572276, |
|
"rewards/format_reward": 0.2916666679084301, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2358.2381591796875, |
|
"epoch": 0.4, |
|
"grad_norm": 0.14640696346759796, |
|
"kl": 0.00804901123046875, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0774, |
|
"reward": 0.8142919540405273, |
|
"reward_std": 0.7067123055458069, |
|
"rewards/cosine_scaled_reward": 0.10357456840574741, |
|
"rewards/format_reward": 0.6071428507566452, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3195.6428833007812, |
|
"epoch": 0.404, |
|
"grad_norm": 0.14821472764015198, |
|
"kl": 0.0126495361328125, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.0897, |
|
"reward": 0.2797414679080248, |
|
"reward_std": 0.8859200328588486, |
|
"rewards/cosine_scaled_reward": -0.0535816540941596, |
|
"rewards/format_reward": 0.3869047649204731, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3046.7798461914062, |
|
"epoch": 0.408, |
|
"grad_norm": 0.13244982063770294, |
|
"kl": 0.0107879638671875, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.0638, |
|
"reward": 0.16126136109232903, |
|
"reward_std": 0.6933339387178421, |
|
"rewards/cosine_scaled_reward": -0.08305979892611504, |
|
"rewards/format_reward": 0.3273809552192688, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2718.8275146484375, |
|
"epoch": 0.412, |
|
"grad_norm": 0.20267100632190704, |
|
"kl": 0.01003265380859375, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": 0.1246, |
|
"reward": 0.4206714928150177, |
|
"reward_std": 0.6706456393003464, |
|
"rewards/cosine_scaled_reward": -0.0069261584430933, |
|
"rewards/format_reward": 0.43452382180839777, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2973.7559814453125, |
|
"epoch": 0.416, |
|
"grad_norm": 0.12351427227258682, |
|
"kl": 0.01038360595703125, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0716, |
|
"reward": 0.20578511937389976, |
|
"reward_std": 0.7138822227716446, |
|
"rewards/cosine_scaled_reward": -0.0846074327128008, |
|
"rewards/format_reward": 0.3750000037252903, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2596.1250915527344, |
|
"epoch": 0.42, |
|
"grad_norm": 0.13755492866039276, |
|
"kl": 0.0099945068359375, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0544, |
|
"reward": 0.6021162122488022, |
|
"reward_std": 0.8435305505990982, |
|
"rewards/cosine_scaled_reward": 0.012367631308734417, |
|
"rewards/format_reward": 0.5773809663951397, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2986.226318359375, |
|
"epoch": 0.424, |
|
"grad_norm": 0.10337146371603012, |
|
"kl": 0.014007568359375, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0441, |
|
"reward": 0.24333537928760052, |
|
"reward_std": 0.7411631494760513, |
|
"rewards/cosine_scaled_reward": -0.08071326930075884, |
|
"rewards/format_reward": 0.4047619141638279, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2804.7500610351562, |
|
"epoch": 0.428, |
|
"grad_norm": 0.1334601491689682, |
|
"kl": 0.010711669921875, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.0741, |
|
"reward": 0.21579574886709452, |
|
"reward_std": 0.559941440820694, |
|
"rewards/cosine_scaled_reward": -0.08257831074297428, |
|
"rewards/format_reward": 0.380952388048172, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2947.9583740234375, |
|
"epoch": 0.432, |
|
"grad_norm": 0.15339502692222595, |
|
"kl": 0.0123443603515625, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.0797, |
|
"reward": 0.2714387159794569, |
|
"reward_std": 0.5535019040107727, |
|
"rewards/cosine_scaled_reward": -0.060709220357239246, |
|
"rewards/format_reward": 0.3928571492433548, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3165.3631591796875, |
|
"epoch": 0.436, |
|
"grad_norm": 0.14555484056472778, |
|
"kl": 0.0123748779296875, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": 0.0689, |
|
"reward": 0.3741426505148411, |
|
"reward_std": 0.7712415158748627, |
|
"rewards/cosine_scaled_reward": 0.011476085986942053, |
|
"rewards/format_reward": 0.351190485060215, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2682.607177734375, |
|
"epoch": 0.44, |
|
"grad_norm": 3.0458719730377197, |
|
"kl": 0.1771697998046875, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0576, |
|
"reward": 0.43371669203042984, |
|
"reward_std": 0.6959643810987473, |
|
"rewards/cosine_scaled_reward": -0.02718928176909685, |
|
"rewards/format_reward": 0.4880952462553978, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2601.636993408203, |
|
"epoch": 0.444, |
|
"grad_norm": 0.16071970760822296, |
|
"kl": 0.0132904052734375, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0475, |
|
"reward": 0.3634342849254608, |
|
"reward_std": 0.5500286221504211, |
|
"rewards/cosine_scaled_reward": -0.06233047042042017, |
|
"rewards/format_reward": 0.4880952462553978, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2848.0, |
|
"epoch": 0.448, |
|
"grad_norm": 0.11652833968400955, |
|
"kl": 0.01318359375, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": 0.0216, |
|
"reward": 0.4104595482349396, |
|
"reward_std": 0.7775004655122757, |
|
"rewards/cosine_scaled_reward": -0.023936893790960312, |
|
"rewards/format_reward": 0.4583333358168602, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3060.696533203125, |
|
"epoch": 0.452, |
|
"grad_norm": 0.11911512911319733, |
|
"kl": 0.019012451171875, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": 0.0351, |
|
"reward": 0.14249714091420174, |
|
"reward_std": 0.5928184911608696, |
|
"rewards/cosine_scaled_reward": -0.08946572133572772, |
|
"rewards/format_reward": 0.32142857648432255, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 3025.3392944335938, |
|
"epoch": 0.456, |
|
"grad_norm": 0.11119002103805542, |
|
"kl": 0.013275146484375, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": 0.0157, |
|
"reward": 0.2583576124161482, |
|
"reward_std": 0.5952321216464043, |
|
"rewards/cosine_scaled_reward": -0.05236881226301193, |
|
"rewards/format_reward": 0.3630952462553978, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2642.4524536132812, |
|
"epoch": 0.46, |
|
"grad_norm": 0.13256801664829254, |
|
"kl": 0.0142669677734375, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.0464, |
|
"reward": 0.26410975866019726, |
|
"reward_std": 0.7131557315587997, |
|
"rewards/cosine_scaled_reward": -0.10604035668075085, |
|
"rewards/format_reward": 0.476190485060215, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2842.172607421875, |
|
"epoch": 0.464, |
|
"grad_norm": 0.14555224776268005, |
|
"kl": 0.0131683349609375, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.06, |
|
"reward": 0.7474905252456665, |
|
"reward_std": 0.9560296833515167, |
|
"rewards/cosine_scaled_reward": 0.10291192133445293, |
|
"rewards/format_reward": 0.5416666716337204, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2644.851318359375, |
|
"epoch": 0.468, |
|
"grad_norm": 0.1801517903804779, |
|
"kl": 0.0159149169921875, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0868, |
|
"reward": 0.3559920974075794, |
|
"reward_std": 0.6948887631297112, |
|
"rewards/cosine_scaled_reward": -0.05712300445884466, |
|
"rewards/format_reward": 0.4702381044626236, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2898.8692016601562, |
|
"epoch": 0.472, |
|
"grad_norm": 0.11308304965496063, |
|
"kl": 0.01580810546875, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.0347, |
|
"reward": -0.02927885064855218, |
|
"reward_std": 0.5874328389763832, |
|
"rewards/cosine_scaled_reward": -0.18725845962762833, |
|
"rewards/format_reward": 0.3452381044626236, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2942.3869018554688, |
|
"epoch": 0.476, |
|
"grad_norm": 0.11883487552404404, |
|
"kl": 0.0146484375, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": 0.0533, |
|
"reward": 0.2897670716047287, |
|
"reward_std": 0.624944195151329, |
|
"rewards/cosine_scaled_reward": -0.05452123726718128, |
|
"rewards/format_reward": 0.3988095298409462, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2888.5537109375, |
|
"epoch": 0.48, |
|
"grad_norm": 0.18806912004947662, |
|
"kl": 0.0160980224609375, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0988, |
|
"reward": 0.42369477450847626, |
|
"reward_std": 0.8195747882127762, |
|
"rewards/cosine_scaled_reward": -0.002438324736431241, |
|
"rewards/format_reward": 0.4285714365541935, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2582.732177734375, |
|
"epoch": 0.484, |
|
"grad_norm": 0.24437126517295837, |
|
"kl": 0.0138702392578125, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.1221, |
|
"reward": 0.5238880245015025, |
|
"reward_std": 0.7648549973964691, |
|
"rewards/cosine_scaled_reward": -0.005913139786571264, |
|
"rewards/format_reward": 0.535714291036129, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2693.202392578125, |
|
"epoch": 0.488, |
|
"grad_norm": 0.1520787924528122, |
|
"kl": 0.016265869140625, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": 0.075, |
|
"reward": 0.5096995830535889, |
|
"reward_std": 0.8008040487766266, |
|
"rewards/cosine_scaled_reward": 0.01675456203520298, |
|
"rewards/format_reward": 0.4761904776096344, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2794.4822387695312, |
|
"epoch": 0.492, |
|
"grad_norm": 0.276404470205307, |
|
"kl": 0.016845703125, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.1229, |
|
"reward": 0.1483494946733117, |
|
"reward_std": 0.7319334298372269, |
|
"rewards/cosine_scaled_reward": -0.11034906562417746, |
|
"rewards/format_reward": 0.3690476268529892, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2586.9285583496094, |
|
"epoch": 0.496, |
|
"grad_norm": 0.19590145349502563, |
|
"kl": 0.0192413330078125, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.0668, |
|
"reward": 0.42041725292801857, |
|
"reward_std": 0.7440174072980881, |
|
"rewards/cosine_scaled_reward": -0.05169615335762501, |
|
"rewards/format_reward": 0.5238095372915268, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2647.1309814453125, |
|
"epoch": 0.5, |
|
"grad_norm": 0.1762569099664688, |
|
"kl": 0.018951416015625, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.058, |
|
"reward": 0.26979109086096287, |
|
"reward_std": 0.7384046316146851, |
|
"rewards/cosine_scaled_reward": -0.10022350586950779, |
|
"rewards/format_reward": 0.4702380932867527, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2329.4642639160156, |
|
"epoch": 0.504, |
|
"grad_norm": 0.2497519552707672, |
|
"kl": 0.016754150390625, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.1113, |
|
"reward": 0.5470606535673141, |
|
"reward_std": 0.7599766105413437, |
|
"rewards/cosine_scaled_reward": -0.04194586584344506, |
|
"rewards/format_reward": 0.630952388048172, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2630.851318359375, |
|
"epoch": 0.508, |
|
"grad_norm": 0.24775269627571106, |
|
"kl": 0.018798828125, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": 0.0886, |
|
"reward": 0.5138388648629189, |
|
"reward_std": 0.8158636838197708, |
|
"rewards/cosine_scaled_reward": 0.00989562287577428, |
|
"rewards/format_reward": 0.4940476194024086, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2193.1250610351562, |
|
"epoch": 0.512, |
|
"grad_norm": 0.3206213712692261, |
|
"kl": 0.0168914794921875, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": 0.1596, |
|
"reward": 0.7910499274730682, |
|
"reward_std": 0.7149495184421539, |
|
"rewards/cosine_scaled_reward": 0.0502868490293622, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2532.4642944335938, |
|
"epoch": 0.516, |
|
"grad_norm": 0.28250807523727417, |
|
"kl": 0.021270751953125, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": 0.0845, |
|
"reward": 0.36558002047240734, |
|
"reward_std": 0.637114867568016, |
|
"rewards/cosine_scaled_reward": -0.05232903314754367, |
|
"rewards/format_reward": 0.470238097012043, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2768.4702758789062, |
|
"epoch": 0.52, |
|
"grad_norm": 0.26120948791503906, |
|
"kl": 0.026092529296875, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.1052, |
|
"reward": 0.41776999086141586, |
|
"reward_std": 0.9506262838840485, |
|
"rewards/cosine_scaled_reward": -0.029210255946964025, |
|
"rewards/format_reward": 0.4761904776096344, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2400.5000915527344, |
|
"epoch": 0.524, |
|
"grad_norm": 0.33211514353752136, |
|
"kl": 0.0215606689453125, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": 0.1452, |
|
"reward": 0.42596414871513844, |
|
"reward_std": 0.6515605002641678, |
|
"rewards/cosine_scaled_reward": -0.04892268590629101, |
|
"rewards/format_reward": 0.5238095298409462, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2368.9524536132812, |
|
"epoch": 0.528, |
|
"grad_norm": 0.41833382844924927, |
|
"kl": 0.020782470703125, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.1107, |
|
"reward": 0.5763177648186684, |
|
"reward_std": 0.7463207244873047, |
|
"rewards/cosine_scaled_reward": 0.005420786794275045, |
|
"rewards/format_reward": 0.5654762089252472, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2800.4940795898438, |
|
"epoch": 0.532, |
|
"grad_norm": 0.21257169544696808, |
|
"kl": 0.031768798828125, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0654, |
|
"reward": 0.27612858824431896, |
|
"reward_std": 0.6431511640548706, |
|
"rewards/cosine_scaled_reward": -0.07026905845850706, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2558.386962890625, |
|
"epoch": 0.536, |
|
"grad_norm": 0.42514950037002563, |
|
"kl": 0.02978515625, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": 0.1228, |
|
"reward": 0.4270520806312561, |
|
"reward_std": 0.7729989290237427, |
|
"rewards/cosine_scaled_reward": -0.02159299748018384, |
|
"rewards/format_reward": 0.4702381044626236, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2229.363067626953, |
|
"epoch": 0.54, |
|
"grad_norm": 0.33076903223991394, |
|
"kl": 0.03668212890625, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.1019, |
|
"reward": 0.28207028564065695, |
|
"reward_std": 0.6516975909471512, |
|
"rewards/cosine_scaled_reward": -0.10301248356699944, |
|
"rewards/format_reward": 0.4880952388048172, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2353.029815673828, |
|
"epoch": 0.544, |
|
"grad_norm": 0.3519177734851837, |
|
"kl": 0.035308837890625, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": 0.0997, |
|
"reward": 0.37077474407851696, |
|
"reward_std": 0.668467104434967, |
|
"rewards/cosine_scaled_reward": -0.0675888154655695, |
|
"rewards/format_reward": 0.505952388048172, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2052.1964721679688, |
|
"epoch": 0.548, |
|
"grad_norm": 0.23379026353359222, |
|
"kl": 0.035400390625, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": 0.058, |
|
"reward": 0.6550269052386284, |
|
"reward_std": 0.6502309143543243, |
|
"rewards/cosine_scaled_reward": 0.009061065968126059, |
|
"rewards/format_reward": 0.6369047611951828, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2325.732177734375, |
|
"epoch": 0.552, |
|
"grad_norm": 0.19041714072227478, |
|
"kl": 0.0423583984375, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0232, |
|
"reward": 0.561458358541131, |
|
"reward_std": 0.9615298509597778, |
|
"rewards/cosine_scaled_reward": 0.012872030027210712, |
|
"rewards/format_reward": 0.5357142947614193, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2440.327423095703, |
|
"epoch": 0.556, |
|
"grad_norm": 0.2846536934375763, |
|
"kl": 0.0457763671875, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0439, |
|
"reward": 0.44825945422053337, |
|
"reward_std": 0.7441610246896744, |
|
"rewards/cosine_scaled_reward": -0.022894082590937614, |
|
"rewards/format_reward": 0.4940476417541504, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2455.071533203125, |
|
"epoch": 0.56, |
|
"grad_norm": 0.5667356252670288, |
|
"kl": 0.05535888671875, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.1203, |
|
"reward": 0.42634591602836736, |
|
"reward_std": 0.6539553329348564, |
|
"rewards/cosine_scaled_reward": 0.007815815508365631, |
|
"rewards/format_reward": 0.4107142984867096, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2289.0416870117188, |
|
"epoch": 0.564, |
|
"grad_norm": 0.2632148861885071, |
|
"kl": 0.06378173828125, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": 0.0588, |
|
"reward": 0.25764250196516514, |
|
"reward_std": 0.5646726861596107, |
|
"rewards/cosine_scaled_reward": -0.07355970796197653, |
|
"rewards/format_reward": 0.4047619141638279, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2396.1011962890625, |
|
"epoch": 0.568, |
|
"grad_norm": 0.48258456587791443, |
|
"kl": 0.070556640625, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.1188, |
|
"reward": 0.3830295614898205, |
|
"reward_std": 0.7874267548322678, |
|
"rewards/cosine_scaled_reward": 0.0069909729063510895, |
|
"rewards/format_reward": 0.3690476268529892, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2465.184539794922, |
|
"epoch": 0.572, |
|
"grad_norm": 0.3696215748786926, |
|
"kl": 0.083984375, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.0378, |
|
"reward": 0.17246808065101504, |
|
"reward_std": 0.7914570420980453, |
|
"rewards/cosine_scaled_reward": -0.11614691279828548, |
|
"rewards/format_reward": 0.4047619104385376, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2460.184600830078, |
|
"epoch": 0.576, |
|
"grad_norm": 0.30795326828956604, |
|
"kl": 0.08349609375, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0254, |
|
"reward": 0.23997123539447784, |
|
"reward_std": 0.7133302837610245, |
|
"rewards/cosine_scaled_reward": -0.09430009685456753, |
|
"rewards/format_reward": 0.4285714365541935, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2376.089324951172, |
|
"epoch": 0.58, |
|
"grad_norm": 0.5491130352020264, |
|
"kl": 0.0899658203125, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.0823, |
|
"reward": 0.4339366629719734, |
|
"reward_std": 0.7774848788976669, |
|
"rewards/cosine_scaled_reward": 0.005658812588080764, |
|
"rewards/format_reward": 0.4226190559566021, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2340.136962890625, |
|
"epoch": 0.584, |
|
"grad_norm": 0.3470991551876068, |
|
"kl": 0.1185302734375, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": 0.0431, |
|
"reward": 0.31324461475014687, |
|
"reward_std": 0.7800580561161041, |
|
"rewards/cosine_scaled_reward": -0.10528245754539967, |
|
"rewards/format_reward": 0.5238095298409462, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2512.7262573242188, |
|
"epoch": 0.588, |
|
"grad_norm": 0.31661325693130493, |
|
"kl": 0.1114501953125, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": 0.0189, |
|
"reward": 0.2997382581233978, |
|
"reward_std": 0.8120257556438446, |
|
"rewards/cosine_scaled_reward": -0.06144038587808609, |
|
"rewards/format_reward": 0.4226190485060215, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2311.8035888671875, |
|
"epoch": 0.592, |
|
"grad_norm": 0.9110273718833923, |
|
"kl": 0.1287841796875, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.0875, |
|
"reward": 0.29848775546997786, |
|
"reward_std": 0.7376701682806015, |
|
"rewards/cosine_scaled_reward": -0.06206565350294113, |
|
"rewards/format_reward": 0.4226190596818924, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2368.2500915527344, |
|
"epoch": 0.596, |
|
"grad_norm": 0.5145498514175415, |
|
"kl": 0.142333984375, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": -0.0091, |
|
"reward": 0.1381237395107746, |
|
"reward_std": 0.7537627294659615, |
|
"rewards/cosine_scaled_reward": -0.12736669927835464, |
|
"rewards/format_reward": 0.3928571492433548, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2541.4226684570312, |
|
"epoch": 0.6, |
|
"grad_norm": 0.4558282792568207, |
|
"kl": 0.1424560546875, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0545, |
|
"reward": 0.517300067236647, |
|
"reward_std": 0.8328704386949539, |
|
"rewards/cosine_scaled_reward": 0.014602408395148814, |
|
"rewards/format_reward": 0.4880952388048172, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2635.4226989746094, |
|
"epoch": 0.604, |
|
"grad_norm": 0.3748377859592438, |
|
"kl": 0.18310546875, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0372, |
|
"reward": 0.09937155619263649, |
|
"reward_std": 0.7007840871810913, |
|
"rewards/cosine_scaled_reward": -0.1259094497654587, |
|
"rewards/format_reward": 0.3511904776096344, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2366.8036499023438, |
|
"epoch": 0.608, |
|
"grad_norm": 0.7343178391456604, |
|
"kl": 0.196533203125, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.1069, |
|
"reward": 0.6113147716969252, |
|
"reward_std": 0.8982192724943161, |
|
"rewards/cosine_scaled_reward": 0.028871658723801374, |
|
"rewards/format_reward": 0.5535714402794838, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2539.6726684570312, |
|
"epoch": 0.612, |
|
"grad_norm": 0.3661244213581085, |
|
"kl": 0.23095703125, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.0493, |
|
"reward": 0.3096798346377909, |
|
"reward_std": 0.6143878847360611, |
|
"rewards/cosine_scaled_reward": -0.07432675641030073, |
|
"rewards/format_reward": 0.4583333432674408, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2693.077392578125, |
|
"epoch": 0.616, |
|
"grad_norm": 0.39779341220855713, |
|
"kl": 0.26123046875, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.0484, |
|
"reward": 0.20376494899392128, |
|
"reward_std": 0.7454717755317688, |
|
"rewards/cosine_scaled_reward": -0.0856175352819264, |
|
"rewards/format_reward": 0.3750000074505806, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2522.8333740234375, |
|
"epoch": 0.62, |
|
"grad_norm": 0.7077339291572571, |
|
"kl": 0.275634765625, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.1048, |
|
"reward": 0.28493453562259674, |
|
"reward_std": 0.7751601040363312, |
|
"rewards/cosine_scaled_reward": -0.050985115580260754, |
|
"rewards/format_reward": 0.3869047649204731, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2791.2083740234375, |
|
"epoch": 0.624, |
|
"grad_norm": 0.6277625560760498, |
|
"kl": 0.3359375, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": 0.063, |
|
"reward": 0.15741928666830063, |
|
"reward_std": 0.7891719415783882, |
|
"rewards/cosine_scaled_reward": -0.1058141621761024, |
|
"rewards/format_reward": 0.3690476268529892, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2756.0596313476562, |
|
"epoch": 0.628, |
|
"grad_norm": 0.9464259147644043, |
|
"kl": 0.35107421875, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.1337, |
|
"reward": 0.20047340355813503, |
|
"reward_std": 0.7717511355876923, |
|
"rewards/cosine_scaled_reward": -0.10214426182210445, |
|
"rewards/format_reward": 0.4047619178891182, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2546.386993408203, |
|
"epoch": 0.632, |
|
"grad_norm": 0.9672547578811646, |
|
"kl": 0.3583984375, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.1309, |
|
"reward": 0.3896455895155668, |
|
"reward_std": 0.8362017869949341, |
|
"rewards/cosine_scaled_reward": -0.025415319949388504, |
|
"rewards/format_reward": 0.4404761902987957, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2490.3869018554688, |
|
"epoch": 0.636, |
|
"grad_norm": 0.5016924738883972, |
|
"kl": 0.37744140625, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": 0.0548, |
|
"reward": 0.3971053212881088, |
|
"reward_std": 0.6817308068275452, |
|
"rewards/cosine_scaled_reward": -0.07823306252248585, |
|
"rewards/format_reward": 0.5535714328289032, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2715.90478515625, |
|
"epoch": 0.64, |
|
"grad_norm": 0.776878833770752, |
|
"kl": 0.4169921875, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": 0.1414, |
|
"reward": 0.37737663462758064, |
|
"reward_std": 0.8348212540149689, |
|
"rewards/cosine_scaled_reward": -0.03452597954310477, |
|
"rewards/format_reward": 0.4464285746216774, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2636.0357666015625, |
|
"epoch": 0.644, |
|
"grad_norm": 1.2749825716018677, |
|
"kl": 0.48486328125, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": 0.1445, |
|
"reward": 0.24750607460737228, |
|
"reward_std": 0.7917188853025436, |
|
"rewards/cosine_scaled_reward": -0.10541364271193743, |
|
"rewards/format_reward": 0.4583333469927311, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2622.9405517578125, |
|
"epoch": 0.648, |
|
"grad_norm": 1.3737562894821167, |
|
"kl": 0.5888671875, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.1549, |
|
"reward": 0.10282446062774397, |
|
"reward_std": 0.6833581179380417, |
|
"rewards/cosine_scaled_reward": -0.18073063343763351, |
|
"rewards/format_reward": 0.4642857275903225, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2187.9286193847656, |
|
"epoch": 0.652, |
|
"grad_norm": 1.2476062774658203, |
|
"kl": 0.64453125, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": 0.1442, |
|
"reward": 0.5821249708533287, |
|
"reward_std": 0.8525291532278061, |
|
"rewards/cosine_scaled_reward": -0.0303660926874727, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2645.2202758789062, |
|
"epoch": 0.656, |
|
"grad_norm": 0.8759295344352722, |
|
"kl": 0.83203125, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.1369, |
|
"reward": 0.354750145226717, |
|
"reward_std": 0.6708278656005859, |
|
"rewards/cosine_scaled_reward": -0.09941063448786736, |
|
"rewards/format_reward": 0.5535714328289032, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2744.5952758789062, |
|
"epoch": 0.66, |
|
"grad_norm": 1.443908452987671, |
|
"kl": 0.9365234375, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.1514, |
|
"reward": 0.07671361323446035, |
|
"reward_std": 0.7401341199874878, |
|
"rewards/cosine_scaled_reward": -0.16997654270380735, |
|
"rewards/format_reward": 0.4166666679084301, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2762.2381591796875, |
|
"epoch": 0.664, |
|
"grad_norm": 2.171701192855835, |
|
"kl": 1.064453125, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0493, |
|
"reward": 0.3810354620218277, |
|
"reward_std": 0.6359066590666771, |
|
"rewards/cosine_scaled_reward": -0.05650608614087105, |
|
"rewards/format_reward": 0.4940476231276989, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2410.6726684570312, |
|
"epoch": 0.668, |
|
"grad_norm": 1.1915135383605957, |
|
"kl": 0.9716796875, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": 0.1178, |
|
"reward": 0.5956609398126602, |
|
"reward_std": 0.7429262697696686, |
|
"rewards/cosine_scaled_reward": -0.011693337932229042, |
|
"rewards/format_reward": 0.6190476268529892, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2624.7083740234375, |
|
"epoch": 0.672, |
|
"grad_norm": 1.2750567197799683, |
|
"kl": 1.111328125, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": 0.1676, |
|
"reward": 0.35937594436109066, |
|
"reward_std": 0.7485721707344055, |
|
"rewards/cosine_scaled_reward": -0.0613834522664547, |
|
"rewards/format_reward": 0.482142873108387, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2566.7857666015625, |
|
"epoch": 0.676, |
|
"grad_norm": 0.8985515832901001, |
|
"kl": 1.0439453125, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.1232, |
|
"reward": 0.23157138470560312, |
|
"reward_std": 0.6288014650344849, |
|
"rewards/cosine_scaled_reward": -0.14314288273453712, |
|
"rewards/format_reward": 0.5178571492433548, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2820.5059814453125, |
|
"epoch": 0.68, |
|
"grad_norm": 1.1454522609710693, |
|
"kl": 0.9677734375, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.1412, |
|
"reward": 0.08721911488100886, |
|
"reward_std": 0.6948041319847107, |
|
"rewards/cosine_scaled_reward": -0.16769996285438538, |
|
"rewards/format_reward": 0.4226190522313118, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2376.166748046875, |
|
"epoch": 0.684, |
|
"grad_norm": 0.9355194568634033, |
|
"kl": 0.9521484375, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": 0.128, |
|
"reward": 0.41750996466726065, |
|
"reward_std": 0.7085302621126175, |
|
"rewards/cosine_scaled_reward": -0.05910217575728893, |
|
"rewards/format_reward": 0.535714291036129, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2571.5000610351562, |
|
"epoch": 0.688, |
|
"grad_norm": 0.9496890902519226, |
|
"kl": 1.0341796875, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.147, |
|
"reward": 0.21527537889778614, |
|
"reward_std": 0.6487467139959335, |
|
"rewards/cosine_scaled_reward": -0.2048623152077198, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2577.839324951172, |
|
"epoch": 0.692, |
|
"grad_norm": 1.125106692314148, |
|
"kl": 1.005859375, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.1126, |
|
"reward": 0.29065654147416353, |
|
"reward_std": 0.5777322202920914, |
|
"rewards/cosine_scaled_reward": -0.11955267190933228, |
|
"rewards/format_reward": 0.5297619178891182, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2297.0416870117188, |
|
"epoch": 0.696, |
|
"grad_norm": 1.5477794408798218, |
|
"kl": 0.9482421875, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0866, |
|
"reward": 0.35003964975476265, |
|
"reward_std": 0.7120198756456375, |
|
"rewards/cosine_scaled_reward": -0.13152779638767242, |
|
"rewards/format_reward": 0.6130952388048172, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2239.952423095703, |
|
"epoch": 0.7, |
|
"grad_norm": 1.1404165029525757, |
|
"kl": 0.7734375, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.1326, |
|
"reward": 0.7693988904356956, |
|
"reward_std": 0.8029063045978546, |
|
"rewards/cosine_scaled_reward": 0.0543422931805253, |
|
"rewards/format_reward": 0.6607142984867096, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2214.148895263672, |
|
"epoch": 0.704, |
|
"grad_norm": 0.976016104221344, |
|
"kl": 0.8193359375, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": 0.1005, |
|
"reward": 0.5222894381731749, |
|
"reward_std": 0.6858630776405334, |
|
"rewards/cosine_scaled_reward": -0.07814099243842065, |
|
"rewards/format_reward": 0.6785714477300644, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2167.4345092773438, |
|
"epoch": 0.708, |
|
"grad_norm": 1.6724809408187866, |
|
"kl": 0.740234375, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.1424, |
|
"reward": 0.3468378521502018, |
|
"reward_std": 0.6407709717750549, |
|
"rewards/cosine_scaled_reward": -0.16289059445261955, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2593.5654907226562, |
|
"epoch": 0.712, |
|
"grad_norm": 1.3712421655654907, |
|
"kl": 0.9814453125, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.1121, |
|
"reward": 0.27433447539806366, |
|
"reward_std": 0.6857093423604965, |
|
"rewards/cosine_scaled_reward": -0.16342800296843052, |
|
"rewards/format_reward": 0.6011904925107956, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2240.9583129882812, |
|
"epoch": 0.716, |
|
"grad_norm": 2.2109479904174805, |
|
"kl": 0.7880859375, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": 0.0613, |
|
"reward": 0.3249462991952896, |
|
"reward_std": 0.7396285533905029, |
|
"rewards/cosine_scaled_reward": -0.12919352855533361, |
|
"rewards/format_reward": 0.5833333358168602, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2391.261962890625, |
|
"epoch": 0.72, |
|
"grad_norm": 0.9252892136573792, |
|
"kl": 0.8369140625, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0766, |
|
"reward": 0.37066294252872467, |
|
"reward_std": 0.5772489011287689, |
|
"rewards/cosine_scaled_reward": -0.15395426377654076, |
|
"rewards/format_reward": 0.6785714477300644, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2188.21435546875, |
|
"epoch": 0.724, |
|
"grad_norm": 1.4679890871047974, |
|
"kl": 0.7177734375, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.1271, |
|
"reward": 0.47163213789463043, |
|
"reward_std": 0.7110278159379959, |
|
"rewards/cosine_scaled_reward": -0.10942202992737293, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2330.3572387695312, |
|
"epoch": 0.728, |
|
"grad_norm": 0.8398174047470093, |
|
"kl": 0.71875, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0837, |
|
"reward": 0.5167603380978107, |
|
"reward_std": 0.704664558172226, |
|
"rewards/cosine_scaled_reward": -0.08090554922819138, |
|
"rewards/format_reward": 0.6785714402794838, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2319.154815673828, |
|
"epoch": 0.732, |
|
"grad_norm": 1.0028657913208008, |
|
"kl": 0.7421875, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.1069, |
|
"reward": 0.6520561873912811, |
|
"reward_std": 0.8034340292215347, |
|
"rewards/cosine_scaled_reward": -0.04301954247057438, |
|
"rewards/format_reward": 0.7380952537059784, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2419.6131591796875, |
|
"epoch": 0.736, |
|
"grad_norm": 0.9799902439117432, |
|
"kl": 0.794921875, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.1, |
|
"reward": 0.5134465768933296, |
|
"reward_std": 0.7416307479143143, |
|
"rewards/cosine_scaled_reward": -0.10637196339666843, |
|
"rewards/format_reward": 0.7261904776096344, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2279.3630981445312, |
|
"epoch": 0.74, |
|
"grad_norm": 1.1403197050094604, |
|
"kl": 0.75390625, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.08, |
|
"reward": 0.5693989507853985, |
|
"reward_std": 0.6981105357408524, |
|
"rewards/cosine_scaled_reward": -0.06946719996631145, |
|
"rewards/format_reward": 0.7083333507180214, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2087.2679443359375, |
|
"epoch": 0.744, |
|
"grad_norm": 0.8785580992698669, |
|
"kl": 0.6123046875, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0849, |
|
"reward": 0.4244233965873718, |
|
"reward_std": 0.718925341963768, |
|
"rewards/cosine_scaled_reward": -0.1449311599135399, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2415.8036499023438, |
|
"epoch": 0.748, |
|
"grad_norm": 1.325434684753418, |
|
"kl": 0.6298828125, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": 0.0477, |
|
"reward": 0.594460990279913, |
|
"reward_std": 0.7041322290897369, |
|
"rewards/cosine_scaled_reward": -0.021221883594989777, |
|
"rewards/format_reward": 0.636904776096344, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2371.0714721679688, |
|
"epoch": 0.752, |
|
"grad_norm": 1.3853912353515625, |
|
"kl": 0.638671875, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.1131, |
|
"reward": 0.5956445932388306, |
|
"reward_std": 0.7780069708824158, |
|
"rewards/cosine_scaled_reward": -0.062296761316247284, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2243.3333740234375, |
|
"epoch": 0.756, |
|
"grad_norm": 0.7066504955291748, |
|
"kl": 0.564453125, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.043, |
|
"reward": 0.7391829118132591, |
|
"reward_std": 0.6626263409852982, |
|
"rewards/cosine_scaled_reward": -0.014337139204144478, |
|
"rewards/format_reward": 0.767857164144516, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2111.875030517578, |
|
"epoch": 0.76, |
|
"grad_norm": 1.1808303594589233, |
|
"kl": 0.5361328125, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0212, |
|
"reward": 0.6303885579109192, |
|
"reward_std": 0.7266089022159576, |
|
"rewards/cosine_scaled_reward": -0.04790095146745443, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2512.1607666015625, |
|
"epoch": 0.764, |
|
"grad_norm": 1.169936180114746, |
|
"kl": 0.54736328125, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0239, |
|
"reward": 0.4208872392773628, |
|
"reward_std": 0.6789906620979309, |
|
"rewards/cosine_scaled_reward": -0.12288972595706582, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2421.7381591796875, |
|
"epoch": 0.768, |
|
"grad_norm": 1.9125944375991821, |
|
"kl": 0.44970703125, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.162, |
|
"reward": 0.6703099310398102, |
|
"reward_std": 0.7079124301671982, |
|
"rewards/cosine_scaled_reward": 0.022654948756098747, |
|
"rewards/format_reward": 0.625, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2342.3333435058594, |
|
"epoch": 0.772, |
|
"grad_norm": 1.1848394870758057, |
|
"kl": 0.4287109375, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.1217, |
|
"reward": 0.3946942985057831, |
|
"reward_std": 0.7293716818094254, |
|
"rewards/cosine_scaled_reward": -0.14789094775915146, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2488.1786193847656, |
|
"epoch": 0.776, |
|
"grad_norm": 0.8427687883377075, |
|
"kl": 0.40673828125, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": 0.0681, |
|
"reward": 0.33857931289821863, |
|
"reward_std": 0.7693478316068649, |
|
"rewards/cosine_scaled_reward": -0.11642462853342295, |
|
"rewards/format_reward": 0.5714285671710968, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2235.4762573242188, |
|
"epoch": 0.78, |
|
"grad_norm": 1.9778449535369873, |
|
"kl": 0.4599609375, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.1203, |
|
"reward": 0.7276730462908745, |
|
"reward_std": 0.8504652380943298, |
|
"rewards/cosine_scaled_reward": 0.02455079648643732, |
|
"rewards/format_reward": 0.6785714328289032, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2252.7262268066406, |
|
"epoch": 0.784, |
|
"grad_norm": 1.251224398612976, |
|
"kl": 0.49169921875, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": 0.0753, |
|
"reward": 0.6360676661133766, |
|
"reward_std": 0.8185366541147232, |
|
"rewards/cosine_scaled_reward": -0.01827568793669343, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2399.7500610351562, |
|
"epoch": 0.788, |
|
"grad_norm": 0.9470409154891968, |
|
"kl": 0.517578125, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.1036, |
|
"reward": 0.550631508231163, |
|
"reward_std": 0.7208298593759537, |
|
"rewards/cosine_scaled_reward": -0.037184251472353935, |
|
"rewards/format_reward": 0.625, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2269.5000610351562, |
|
"epoch": 0.792, |
|
"grad_norm": 2.047698974609375, |
|
"kl": 0.595703125, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": 0.1831, |
|
"reward": 0.29151881486177444, |
|
"reward_std": 0.666583925485611, |
|
"rewards/cosine_scaled_reward": -0.15483582392334938, |
|
"rewards/format_reward": 0.601190485060215, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2316.7857666015625, |
|
"epoch": 0.796, |
|
"grad_norm": 1.6713296175003052, |
|
"kl": 0.60205078125, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.1356, |
|
"reward": 0.5018086154013872, |
|
"reward_std": 0.8012387007474899, |
|
"rewards/cosine_scaled_reward": -0.0615957040572539, |
|
"rewards/format_reward": 0.6250000223517418, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2471.1964721679688, |
|
"epoch": 0.8, |
|
"grad_norm": 0.9633775949478149, |
|
"kl": 0.740234375, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.1277, |
|
"reward": 0.3792301341891289, |
|
"reward_std": 0.76199010014534, |
|
"rewards/cosine_scaled_reward": -0.11693255044519901, |
|
"rewards/format_reward": 0.6130952537059784, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2280.6607971191406, |
|
"epoch": 0.804, |
|
"grad_norm": 1.1369765996932983, |
|
"kl": 0.7587890625, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": 0.1174, |
|
"reward": 0.4361310079693794, |
|
"reward_std": 0.7977508455514908, |
|
"rewards/cosine_scaled_reward": -0.0736011671833694, |
|
"rewards/format_reward": 0.5833333432674408, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2239.3809814453125, |
|
"epoch": 0.808, |
|
"grad_norm": 1.1852681636810303, |
|
"kl": 0.80078125, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.1861, |
|
"reward": 0.4273875653743744, |
|
"reward_std": 0.7939650565385818, |
|
"rewards/cosine_scaled_reward": -0.08987765479832888, |
|
"rewards/format_reward": 0.6071428656578064, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2270.9524536132812, |
|
"epoch": 0.812, |
|
"grad_norm": 2.2510244846343994, |
|
"kl": 1.05859375, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.1778, |
|
"reward": 0.5268369093537331, |
|
"reward_std": 0.7606751769781113, |
|
"rewards/cosine_scaled_reward": -0.05205773119814694, |
|
"rewards/format_reward": 0.630952388048172, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2307.83935546875, |
|
"epoch": 0.816, |
|
"grad_norm": 3.0754034519195557, |
|
"kl": 1.107421875, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.1251, |
|
"reward": 0.046380717772990465, |
|
"reward_std": 0.6517826318740845, |
|
"rewards/cosine_scaled_reward": -0.23573821783065796, |
|
"rewards/format_reward": 0.5178571566939354, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2065.482177734375, |
|
"epoch": 0.82, |
|
"grad_norm": 3.317054033279419, |
|
"kl": 0.8037109375, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.1229, |
|
"reward": 0.629617914557457, |
|
"reward_std": 0.7360707223415375, |
|
"rewards/cosine_scaled_reward": -0.012572012841701508, |
|
"rewards/format_reward": 0.654761902987957, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2119.2679443359375, |
|
"epoch": 0.824, |
|
"grad_norm": 1.985148310661316, |
|
"kl": 0.70849609375, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": 0.1697, |
|
"reward": 0.4503296762704849, |
|
"reward_std": 0.7717154771089554, |
|
"rewards/cosine_scaled_reward": -0.07840658072382212, |
|
"rewards/format_reward": 0.6071428656578064, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2085.839324951172, |
|
"epoch": 0.828, |
|
"grad_norm": 2.4033172130584717, |
|
"kl": 0.6025390625, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.1578, |
|
"reward": 0.4145805863663554, |
|
"reward_std": 0.7361099421977997, |
|
"rewards/cosine_scaled_reward": -0.11116209626197815, |
|
"rewards/format_reward": 0.6369047611951828, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1872.482177734375, |
|
"epoch": 0.832, |
|
"grad_norm": 2.11576247215271, |
|
"kl": 0.408203125, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.0839, |
|
"reward": 0.4670650511980057, |
|
"reward_std": 0.7250475585460663, |
|
"rewards/cosine_scaled_reward": -0.10872937482781708, |
|
"rewards/format_reward": 0.6845238208770752, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2085.6786499023438, |
|
"epoch": 0.836, |
|
"grad_norm": 0.8786793351173401, |
|
"kl": 0.51806640625, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.0649, |
|
"reward": 0.46545055881142616, |
|
"reward_std": 0.6805593073368073, |
|
"rewards/cosine_scaled_reward": -0.09465568419545889, |
|
"rewards/format_reward": 0.6547619104385376, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2425.7916870117188, |
|
"epoch": 0.84, |
|
"grad_norm": 1.3337445259094238, |
|
"kl": 0.58837890625, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.1561, |
|
"reward": 0.4684627018868923, |
|
"reward_std": 0.824245348572731, |
|
"rewards/cosine_scaled_reward": -0.04553056287113577, |
|
"rewards/format_reward": 0.5595238208770752, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2630.5655517578125, |
|
"epoch": 0.844, |
|
"grad_norm": 1.3039979934692383, |
|
"kl": 0.732421875, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.1559, |
|
"reward": 0.1796425711363554, |
|
"reward_std": 0.6979469060897827, |
|
"rewards/cosine_scaled_reward": -0.17803586274385452, |
|
"rewards/format_reward": 0.5357143059372902, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1983.9762268066406, |
|
"epoch": 0.848, |
|
"grad_norm": 0.9129418134689331, |
|
"kl": 0.546875, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": 0.1352, |
|
"reward": 0.4564796891063452, |
|
"reward_std": 0.6133182421326637, |
|
"rewards/cosine_scaled_reward": -0.11402205377817154, |
|
"rewards/format_reward": 0.6845238208770752, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2315.5119018554688, |
|
"epoch": 0.852, |
|
"grad_norm": 1.2220977544784546, |
|
"kl": 0.5751953125, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": 0.1683, |
|
"reward": 0.6708191484212875, |
|
"reward_std": 0.9547160714864731, |
|
"rewards/cosine_scaled_reward": 0.016957183834165335, |
|
"rewards/format_reward": 0.636904776096344, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2233.0655212402344, |
|
"epoch": 0.856, |
|
"grad_norm": 0.7978451251983643, |
|
"kl": 0.61474609375, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.1304, |
|
"reward": 0.40765415877103806, |
|
"reward_std": 0.7158278822898865, |
|
"rewards/cosine_scaled_reward": -0.12057768838712946, |
|
"rewards/format_reward": 0.6488095372915268, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2194.791748046875, |
|
"epoch": 0.86, |
|
"grad_norm": 1.0176509618759155, |
|
"kl": 0.61181640625, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0991, |
|
"reward": 0.5912733934819698, |
|
"reward_std": 0.6540912538766861, |
|
"rewards/cosine_scaled_reward": -0.013887112960219383, |
|
"rewards/format_reward": 0.6190476194024086, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2189.5535888671875, |
|
"epoch": 0.864, |
|
"grad_norm": 0.7862021923065186, |
|
"kl": 0.6572265625, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.1369, |
|
"reward": 0.4810000769793987, |
|
"reward_std": 0.6697472035884857, |
|
"rewards/cosine_scaled_reward": -0.09283328615128994, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2238.6487731933594, |
|
"epoch": 0.868, |
|
"grad_norm": 0.675116240978241, |
|
"kl": 0.662109375, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": 0.1234, |
|
"reward": 0.3597661480307579, |
|
"reward_std": 0.6638298779726028, |
|
"rewards/cosine_scaled_reward": -0.13559313118457794, |
|
"rewards/format_reward": 0.6309523731470108, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2330.7560424804688, |
|
"epoch": 0.872, |
|
"grad_norm": 0.7294526696205139, |
|
"kl": 0.6875, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.1182, |
|
"reward": 0.5070892386138439, |
|
"reward_std": 0.770987793803215, |
|
"rewards/cosine_scaled_reward": -0.029193488880991936, |
|
"rewards/format_reward": 0.565476194024086, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2425.4048461914062, |
|
"epoch": 0.876, |
|
"grad_norm": 0.9955194592475891, |
|
"kl": 0.76171875, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.145, |
|
"reward": 0.3850390911102295, |
|
"reward_std": 0.72886823117733, |
|
"rewards/cosine_scaled_reward": -0.11700426135212183, |
|
"rewards/format_reward": 0.6190476417541504, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2443.869110107422, |
|
"epoch": 0.88, |
|
"grad_norm": 0.8245673179626465, |
|
"kl": 0.7412109375, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.1517, |
|
"reward": 0.3367026010528207, |
|
"reward_std": 0.6719767898321152, |
|
"rewards/cosine_scaled_reward": -0.1471248921006918, |
|
"rewards/format_reward": 0.630952388048172, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2401.65478515625, |
|
"epoch": 0.884, |
|
"grad_norm": 0.6434879302978516, |
|
"kl": 0.552734375, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": 0.0964, |
|
"reward": 0.5589644331485033, |
|
"reward_std": 0.6387112140655518, |
|
"rewards/cosine_scaled_reward": -0.030041599762625992, |
|
"rewards/format_reward": 0.619047611951828, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2159.9107666015625, |
|
"epoch": 0.888, |
|
"grad_norm": 0.8747764229774475, |
|
"kl": 0.48974609375, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.0648, |
|
"reward": 1.0575831979513168, |
|
"reward_std": 0.8345089554786682, |
|
"rewards/cosine_scaled_reward": 0.15379157848656178, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2736.8095703125, |
|
"epoch": 0.892, |
|
"grad_norm": 1.644534707069397, |
|
"kl": 0.650390625, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0898, |
|
"reward": 0.28782752249389887, |
|
"reward_std": 0.6842672526836395, |
|
"rewards/cosine_scaled_reward": -0.11799101112410426, |
|
"rewards/format_reward": 0.5238095298409462, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2585.4464721679688, |
|
"epoch": 0.896, |
|
"grad_norm": 0.5411848425865173, |
|
"kl": 0.603515625, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.1243, |
|
"reward": 0.49723897874355316, |
|
"reward_std": 0.810086615383625, |
|
"rewards/cosine_scaled_reward": -0.07280909270048141, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2394.8988647460938, |
|
"epoch": 0.9, |
|
"grad_norm": 0.7165555357933044, |
|
"kl": 0.52783203125, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0888, |
|
"reward": 0.5129196643829346, |
|
"reward_std": 0.787805512547493, |
|
"rewards/cosine_scaled_reward": -0.056040180614218116, |
|
"rewards/format_reward": 0.6250000298023224, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2482.7678833007812, |
|
"epoch": 0.904, |
|
"grad_norm": 0.5211958289146423, |
|
"kl": 0.51416015625, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0812, |
|
"reward": 0.4906727410852909, |
|
"reward_std": 0.7880082875490189, |
|
"rewards/cosine_scaled_reward": -0.0641874436987564, |
|
"rewards/format_reward": 0.6190476417541504, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2290.0000610351562, |
|
"epoch": 0.908, |
|
"grad_norm": 0.5630519986152649, |
|
"kl": 0.382568359375, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.1034, |
|
"reward": 0.6861637309193611, |
|
"reward_std": 0.7359699308872223, |
|
"rewards/cosine_scaled_reward": -0.049775293562561274, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2686.7440795898438, |
|
"epoch": 0.912, |
|
"grad_norm": 0.6647688746452332, |
|
"kl": 0.4326171875, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.0594, |
|
"reward": 0.5352285588160157, |
|
"reward_std": 0.7634364515542984, |
|
"rewards/cosine_scaled_reward": -0.047861908678896725, |
|
"rewards/format_reward": 0.6309524029493332, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2687.5416564941406, |
|
"epoch": 0.916, |
|
"grad_norm": 0.37424567341804504, |
|
"kl": 0.39208984375, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": 0.0465, |
|
"reward": 0.43462158273905516, |
|
"reward_std": 0.6648337990045547, |
|
"rewards/cosine_scaled_reward": -0.1070939814671874, |
|
"rewards/format_reward": 0.6488095298409462, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2462.8452758789062, |
|
"epoch": 0.92, |
|
"grad_norm": 0.52641361951828, |
|
"kl": 0.37158203125, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.1061, |
|
"reward": 0.5536616146564484, |
|
"reward_std": 0.6706894189119339, |
|
"rewards/cosine_scaled_reward": -0.06840727850794792, |
|
"rewards/format_reward": 0.690476194024086, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2395.279754638672, |
|
"epoch": 0.924, |
|
"grad_norm": 0.5165700912475586, |
|
"kl": 0.369140625, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0951, |
|
"reward": 0.4786584824323654, |
|
"reward_std": 0.774825245141983, |
|
"rewards/cosine_scaled_reward": -0.1267421804368496, |
|
"rewards/format_reward": 0.7321428656578064, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2468.2857666015625, |
|
"epoch": 0.928, |
|
"grad_norm": 0.4581441879272461, |
|
"kl": 0.31591796875, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": 0.0555, |
|
"reward": 0.6299031171947718, |
|
"reward_std": 0.7808382511138916, |
|
"rewards/cosine_scaled_reward": -0.045167478267103434, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2712.58935546875, |
|
"epoch": 0.932, |
|
"grad_norm": 0.7744795083999634, |
|
"kl": 0.333984375, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.0623, |
|
"reward": 0.6912369206547737, |
|
"reward_std": 0.7789230197668076, |
|
"rewards/cosine_scaled_reward": 0.04204704426229, |
|
"rewards/format_reward": 0.607142873108387, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2556.619110107422, |
|
"epoch": 0.936, |
|
"grad_norm": 0.8385416865348816, |
|
"kl": 0.38427734375, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0662, |
|
"reward": 0.4830031730234623, |
|
"reward_std": 0.7291474640369415, |
|
"rewards/cosine_scaled_reward": -0.07397460378706455, |
|
"rewards/format_reward": 0.6309523731470108, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2724.8809814453125, |
|
"epoch": 0.94, |
|
"grad_norm": 0.6943939328193665, |
|
"kl": 0.330078125, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.1008, |
|
"reward": 0.38701344281435013, |
|
"reward_std": 0.7834271490573883, |
|
"rewards/cosine_scaled_reward": -0.11304090730845928, |
|
"rewards/format_reward": 0.6130952388048172, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2819.7738647460938, |
|
"epoch": 0.944, |
|
"grad_norm": 0.3916683495044708, |
|
"kl": 0.32177734375, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.0365, |
|
"reward": 0.5249419808387756, |
|
"reward_std": 0.8138006925582886, |
|
"rewards/cosine_scaled_reward": -0.023243289440870285, |
|
"rewards/format_reward": 0.571428582072258, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2499.0536499023438, |
|
"epoch": 0.948, |
|
"grad_norm": 0.9175835847854614, |
|
"kl": 0.321533203125, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": 0.1055, |
|
"reward": 0.6389507204294205, |
|
"reward_std": 0.8023868650197983, |
|
"rewards/cosine_scaled_reward": -0.04064369201660156, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2557.0655517578125, |
|
"epoch": 0.952, |
|
"grad_norm": 0.4397272765636444, |
|
"kl": 0.30810546875, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.0869, |
|
"reward": 0.4888541977852583, |
|
"reward_std": 0.7550098150968552, |
|
"rewards/cosine_scaled_reward": -0.09783481806516647, |
|
"rewards/format_reward": 0.684523805975914, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2600.422637939453, |
|
"epoch": 0.956, |
|
"grad_norm": 0.9344379305839539, |
|
"kl": 0.345703125, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.1329, |
|
"reward": 0.27865387313067913, |
|
"reward_std": 0.6713129729032516, |
|
"rewards/cosine_scaled_reward": -0.18210165202617645, |
|
"rewards/format_reward": 0.6428571417927742, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2331.8928833007812, |
|
"epoch": 0.96, |
|
"grad_norm": 0.5995355248451233, |
|
"kl": 0.326904296875, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0705, |
|
"reward": 0.7613647617399693, |
|
"reward_std": 0.8133140057325363, |
|
"rewards/cosine_scaled_reward": 0.023539513116702437, |
|
"rewards/format_reward": 0.7142857164144516, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2355.52978515625, |
|
"epoch": 0.964, |
|
"grad_norm": 0.3258729875087738, |
|
"kl": 0.307861328125, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.0608, |
|
"reward": 0.6096780672669411, |
|
"reward_std": 0.7518916502594948, |
|
"rewards/cosine_scaled_reward": -0.05230383496382274, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2414.0834350585938, |
|
"epoch": 0.968, |
|
"grad_norm": 0.3521139919757843, |
|
"kl": 0.3212890625, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.0742, |
|
"reward": 0.6254040375351906, |
|
"reward_std": 0.8331593424081802, |
|
"rewards/cosine_scaled_reward": -0.03551226551644504, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2138.107177734375, |
|
"epoch": 0.972, |
|
"grad_norm": 0.5599615573883057, |
|
"kl": 0.33251953125, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0315, |
|
"reward": 0.8373362571001053, |
|
"reward_std": 0.6551230400800705, |
|
"rewards/cosine_scaled_reward": 0.04366813227534294, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2277.714324951172, |
|
"epoch": 0.976, |
|
"grad_norm": 0.6147165298461914, |
|
"kl": 0.336181640625, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": 0.1075, |
|
"reward": 0.46155789494514465, |
|
"reward_std": 0.6391154229640961, |
|
"rewards/cosine_scaled_reward": -0.14124487387016416, |
|
"rewards/format_reward": 0.7440476417541504, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2401.232208251953, |
|
"epoch": 0.98, |
|
"grad_norm": 0.8454631567001343, |
|
"kl": 0.4814453125, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0417, |
|
"reward": 0.5565547049045563, |
|
"reward_std": 0.6768698394298553, |
|
"rewards/cosine_scaled_reward": -0.04315121428226121, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2027.3453369140625, |
|
"epoch": 0.984, |
|
"grad_norm": 0.38341155648231506, |
|
"kl": 0.289794921875, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0993, |
|
"reward": 0.7784423977136612, |
|
"reward_std": 0.6467820554971695, |
|
"rewards/cosine_scaled_reward": -0.009588314220309258, |
|
"rewards/format_reward": 0.7976190596818924, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2170.5416564941406, |
|
"epoch": 0.988, |
|
"grad_norm": 0.445024311542511, |
|
"kl": 0.4326171875, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": 0.1244, |
|
"reward": 0.6971250772476196, |
|
"reward_std": 0.7919557690620422, |
|
"rewards/cosine_scaled_reward": -0.026437478853040375, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2060.2559814453125, |
|
"epoch": 0.992, |
|
"grad_norm": 0.49659866094589233, |
|
"kl": 0.36865234375, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0959, |
|
"reward": 0.6287773251533508, |
|
"reward_std": 0.7386345416307449, |
|
"rewards/cosine_scaled_reward": -0.060611339285969734, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2148.869140625, |
|
"epoch": 0.996, |
|
"grad_norm": 0.4539166986942291, |
|
"kl": 0.3701171875, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.1019, |
|
"reward": 0.4996798560023308, |
|
"reward_std": 0.6163481399416924, |
|
"rewards/cosine_scaled_reward": -0.11027912324061617, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2330.3482971191406, |
|
"epoch": 1.0, |
|
"grad_norm": 0.5344291925430298, |
|
"kl": 0.5205078125, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.0866, |
|
"reward": 0.42578159645199776, |
|
"reward_std": 0.7348527163267136, |
|
"rewards/cosine_scaled_reward": -0.08472825400531292, |
|
"rewards/format_reward": 0.5952381044626236, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2106.059539794922, |
|
"epoch": 1.004, |
|
"grad_norm": 0.5930522680282593, |
|
"kl": 0.38232421875, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": 0.1423, |
|
"reward": 0.5456876549869776, |
|
"reward_std": 0.6847013607621193, |
|
"rewards/cosine_scaled_reward": -0.0842990386299789, |
|
"rewards/format_reward": 0.7142857164144516, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2112.4762573242188, |
|
"epoch": 1.008, |
|
"grad_norm": 0.4610899090766907, |
|
"kl": 0.39111328125, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": 0.0641, |
|
"reward": 0.6148004308342934, |
|
"reward_std": 0.6790047585964203, |
|
"rewards/cosine_scaled_reward": -0.05867121648043394, |
|
"rewards/format_reward": 0.7321428805589676, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2289.309600830078, |
|
"epoch": 1.012, |
|
"grad_norm": 0.3950199782848358, |
|
"kl": 0.387451171875, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": 0.1336, |
|
"reward": 0.626802071928978, |
|
"reward_std": 0.6337872818112373, |
|
"rewards/cosine_scaled_reward": -0.028860883321613073, |
|
"rewards/format_reward": 0.6845238283276558, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2465.6130981445312, |
|
"epoch": 1.016, |
|
"grad_norm": 0.6084108352661133, |
|
"kl": 0.45947265625, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0932, |
|
"reward": 0.6250473670661449, |
|
"reward_std": 0.7445118278264999, |
|
"rewards/cosine_scaled_reward": 2.3671891540288925e-05, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2127.8988647460938, |
|
"epoch": 1.02, |
|
"grad_norm": 0.8596522212028503, |
|
"kl": 0.368896484375, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0589, |
|
"reward": 0.4597589522600174, |
|
"reward_std": 0.710930123925209, |
|
"rewards/cosine_scaled_reward": -0.1361919562332332, |
|
"rewards/format_reward": 0.7321428656578064, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2113.029815673828, |
|
"epoch": 1.024, |
|
"grad_norm": 0.6557802557945251, |
|
"kl": 0.39306640625, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.0808, |
|
"reward": 0.7969172149896622, |
|
"reward_std": 0.7165066450834274, |
|
"rewards/cosine_scaled_reward": 0.02643477637320757, |
|
"rewards/format_reward": 0.7440476268529892, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2350.4881591796875, |
|
"epoch": 1.028, |
|
"grad_norm": 0.7259902954101562, |
|
"kl": 0.37548828125, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.0556, |
|
"reward": 0.6144686937332153, |
|
"reward_std": 0.7161982655525208, |
|
"rewards/cosine_scaled_reward": -0.0052656568586826324, |
|
"rewards/format_reward": 0.6250000149011612, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2659.2500610351562, |
|
"epoch": 1.032, |
|
"grad_norm": 0.6974296569824219, |
|
"kl": 0.4482421875, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.1425, |
|
"reward": 0.38613639771938324, |
|
"reward_std": 0.7526693046092987, |
|
"rewards/cosine_scaled_reward": -0.10455084778368473, |
|
"rewards/format_reward": 0.595238097012043, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2219.6964721679688, |
|
"epoch": 1.036, |
|
"grad_norm": 0.5528798699378967, |
|
"kl": 0.33984375, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": 0.1191, |
|
"reward": 0.6971464306116104, |
|
"reward_std": 0.7383679300546646, |
|
"rewards/cosine_scaled_reward": -0.023450596883776598, |
|
"rewards/format_reward": 0.744047611951828, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2010.1726989746094, |
|
"epoch": 1.04, |
|
"grad_norm": 0.36631372570991516, |
|
"kl": 0.30126953125, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0699, |
|
"reward": 0.771461233496666, |
|
"reward_std": 0.5148339942097664, |
|
"rewards/cosine_scaled_reward": -0.01307891309261322, |
|
"rewards/format_reward": 0.7976190596818924, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2153.559539794922, |
|
"epoch": 1.044, |
|
"grad_norm": 0.48435378074645996, |
|
"kl": 0.3251953125, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0931, |
|
"reward": 0.5015835016965866, |
|
"reward_std": 0.69777412712574, |
|
"rewards/cosine_scaled_reward": -0.121232058852911, |
|
"rewards/format_reward": 0.7440476268529892, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2309.1131591796875, |
|
"epoch": 1.048, |
|
"grad_norm": 0.6150787472724915, |
|
"kl": 0.37060546875, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.1074, |
|
"reward": 0.656824603676796, |
|
"reward_std": 0.7539815902709961, |
|
"rewards/cosine_scaled_reward": -0.025754368398338556, |
|
"rewards/format_reward": 0.7083333283662796, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2073.1488647460938, |
|
"epoch": 1.052, |
|
"grad_norm": 0.5915967226028442, |
|
"kl": 0.32958984375, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.1016, |
|
"reward": 0.5839189141988754, |
|
"reward_std": 0.6906930133700371, |
|
"rewards/cosine_scaled_reward": -0.10089768993202597, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2258.6607971191406, |
|
"epoch": 1.056, |
|
"grad_norm": 0.5032393932342529, |
|
"kl": 0.421875, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0833, |
|
"reward": 0.7445018216967583, |
|
"reward_std": 0.7239043861627579, |
|
"rewards/cosine_scaled_reward": 0.0002271006815135479, |
|
"rewards/format_reward": 0.7440476417541504, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2421.386962890625, |
|
"epoch": 1.06, |
|
"grad_norm": 0.5948444604873657, |
|
"kl": 0.46826171875, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.1342, |
|
"reward": 0.3432777523994446, |
|
"reward_std": 0.7306928038597107, |
|
"rewards/cosine_scaled_reward": -0.1527658887207508, |
|
"rewards/format_reward": 0.6488095223903656, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1943.8214416503906, |
|
"epoch": 1.064, |
|
"grad_norm": 0.672618567943573, |
|
"kl": 0.33251953125, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.074, |
|
"reward": 0.5523176118731499, |
|
"reward_std": 0.6472664028406143, |
|
"rewards/cosine_scaled_reward": -0.08693643007427454, |
|
"rewards/format_reward": 0.7261904776096344, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2289.4107666015625, |
|
"epoch": 1.068, |
|
"grad_norm": 0.43480613827705383, |
|
"kl": 0.41064453125, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.116, |
|
"reward": 0.6816908866167068, |
|
"reward_std": 0.7700821459293365, |
|
"rewards/cosine_scaled_reward": -0.01034504920244217, |
|
"rewards/format_reward": 0.70238097012043, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2265.166748046875, |
|
"epoch": 1.072, |
|
"grad_norm": 0.8894410729408264, |
|
"kl": 0.37890625, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": 0.1102, |
|
"reward": 0.5768959820270538, |
|
"reward_std": 0.7392304837703705, |
|
"rewards/cosine_scaled_reward": -0.04786152858287096, |
|
"rewards/format_reward": 0.6726190447807312, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2102.4345092773438, |
|
"epoch": 1.076, |
|
"grad_norm": 1.40628182888031, |
|
"kl": 0.34814453125, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": 0.1523, |
|
"reward": 0.6540864631533623, |
|
"reward_std": 0.7483679950237274, |
|
"rewards/cosine_scaled_reward": -0.030099631054326892, |
|
"rewards/format_reward": 0.7142857164144516, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1760.5893249511719, |
|
"epoch": 1.08, |
|
"grad_norm": 0.655262291431427, |
|
"kl": 0.34228515625, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0938, |
|
"reward": 0.7075737789273262, |
|
"reward_std": 0.712226152420044, |
|
"rewards/cosine_scaled_reward": -0.045022654812783, |
|
"rewards/format_reward": 0.7976190596818924, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1989.1488037109375, |
|
"epoch": 1.084, |
|
"grad_norm": 0.5984042286872864, |
|
"kl": 0.3974609375, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": 0.0893, |
|
"reward": 0.5623346008360386, |
|
"reward_std": 0.7052316814661026, |
|
"rewards/cosine_scaled_reward": -0.08192794572096318, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1998.327392578125, |
|
"epoch": 1.088, |
|
"grad_norm": 0.41462650895118713, |
|
"kl": 0.37939453125, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.1384, |
|
"reward": 0.6586858294904232, |
|
"reward_std": 0.8071554154157639, |
|
"rewards/cosine_scaled_reward": -0.018871376756578684, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1905.9880981445312, |
|
"epoch": 1.092, |
|
"grad_norm": 1.1817877292633057, |
|
"kl": 0.4287109375, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.108, |
|
"reward": 0.585694283246994, |
|
"reward_std": 0.6987177431583405, |
|
"rewards/cosine_scaled_reward": -0.08512906730175018, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1958.0892639160156, |
|
"epoch": 1.096, |
|
"grad_norm": 0.6756201982498169, |
|
"kl": 0.44580078125, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": 0.1298, |
|
"reward": 0.5423668641597033, |
|
"reward_std": 0.5766877979040146, |
|
"rewards/cosine_scaled_reward": -0.09786419570446014, |
|
"rewards/format_reward": 0.7380952537059784, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1423.4405212402344, |
|
"epoch": 1.1, |
|
"grad_norm": 0.9936150908470154, |
|
"kl": 0.283203125, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0068, |
|
"reward": 0.8336242958903313, |
|
"reward_std": 0.6556554213166237, |
|
"rewards/cosine_scaled_reward": -0.02366404954227619, |
|
"rewards/format_reward": 0.8809524178504944, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1477.011962890625, |
|
"epoch": 1.104, |
|
"grad_norm": 1.4654834270477295, |
|
"kl": 0.30419921875, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.1583, |
|
"reward": 0.9086148589849472, |
|
"reward_std": 0.7289283871650696, |
|
"rewards/cosine_scaled_reward": 0.0197836235165596, |
|
"rewards/format_reward": 0.8690476417541504, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1600.8333740234375, |
|
"epoch": 1.108, |
|
"grad_norm": 0.5991122126579285, |
|
"kl": 0.39697265625, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0615, |
|
"reward": 0.6998666599392891, |
|
"reward_std": 0.6800315380096436, |
|
"rewards/cosine_scaled_reward": -0.06375712971203029, |
|
"rewards/format_reward": 0.82738097012043, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2001.3035583496094, |
|
"epoch": 1.112, |
|
"grad_norm": 0.9033568501472473, |
|
"kl": 0.4404296875, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.0566, |
|
"reward": 0.5947119817137718, |
|
"reward_std": 0.6757695525884628, |
|
"rewards/cosine_scaled_reward": -0.0806201882660389, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1884.9286499023438, |
|
"epoch": 1.116, |
|
"grad_norm": 1.0505043268203735, |
|
"kl": 0.41162109375, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.0838, |
|
"reward": 0.546771340072155, |
|
"reward_std": 0.5643983408808708, |
|
"rewards/cosine_scaled_reward": -0.13137624226510525, |
|
"rewards/format_reward": 0.8095238208770752, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1721.6309814453125, |
|
"epoch": 1.12, |
|
"grad_norm": 2.6171982288360596, |
|
"kl": 0.35400390625, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.119, |
|
"reward": 0.7959851026535034, |
|
"reward_std": 0.6236628741025925, |
|
"rewards/cosine_scaled_reward": -0.03057891083881259, |
|
"rewards/format_reward": 0.8571428805589676, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1974.3809814453125, |
|
"epoch": 1.124, |
|
"grad_norm": 0.9569424390792847, |
|
"kl": 0.4814453125, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.1494, |
|
"reward": 0.573462575674057, |
|
"reward_std": 0.6640851646661758, |
|
"rewards/cosine_scaled_reward": -0.09422110859304667, |
|
"rewards/format_reward": 0.761904776096344, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1699.2142944335938, |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 0.5432654619216919, |
|
"kl": 0.33935546875, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.0877, |
|
"reward": 0.7524446099996567, |
|
"reward_std": 0.6557567343115807, |
|
"rewards/cosine_scaled_reward": -0.04342056508176029, |
|
"rewards/format_reward": 0.8392857164144516, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2023.1012573242188, |
|
"epoch": 1.1320000000000001, |
|
"grad_norm": 1.5788854360580444, |
|
"kl": 0.498046875, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.1449, |
|
"reward": 0.471544723957777, |
|
"reward_std": 0.7016247361898422, |
|
"rewards/cosine_scaled_reward": -0.14220385067164898, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1751.6190795898438, |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.8654693365097046, |
|
"kl": 0.45654296875, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": 0.096, |
|
"reward": 0.8401590138673782, |
|
"reward_std": 0.7027324140071869, |
|
"rewards/cosine_scaled_reward": -0.008491916581988335, |
|
"rewards/format_reward": 0.8571428805589676, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1953.1309814453125, |
|
"epoch": 1.1400000000000001, |
|
"grad_norm": 0.7724223732948303, |
|
"kl": 0.43017578125, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.1257, |
|
"reward": 0.5251086875796318, |
|
"reward_std": 0.75553198158741, |
|
"rewards/cosine_scaled_reward": -0.11244566680397838, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1932.9940795898438, |
|
"epoch": 1.144, |
|
"grad_norm": 0.6642920970916748, |
|
"kl": 0.470703125, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.096, |
|
"reward": 0.8715938031673431, |
|
"reward_std": 0.7678115516901016, |
|
"rewards/cosine_scaled_reward": 0.036987369414418936, |
|
"rewards/format_reward": 0.7976190745830536, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1784.1428527832031, |
|
"epoch": 1.148, |
|
"grad_norm": 0.9823849201202393, |
|
"kl": 0.38134765625, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0437, |
|
"reward": 0.7326274067163467, |
|
"reward_std": 0.6021066680550575, |
|
"rewards/cosine_scaled_reward": -0.044400574173778296, |
|
"rewards/format_reward": 0.8214285969734192, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1929.0536193847656, |
|
"epoch": 1.152, |
|
"grad_norm": 2.430745840072632, |
|
"kl": 0.45458984375, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.1275, |
|
"reward": 0.76754130423069, |
|
"reward_std": 0.6635829508304596, |
|
"rewards/cosine_scaled_reward": -0.0001579252420924604, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1567.4226684570312, |
|
"epoch": 1.156, |
|
"grad_norm": 1.8522855043411255, |
|
"kl": 0.35302734375, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": 0.0659, |
|
"reward": 0.8090793639421463, |
|
"reward_std": 0.6970714181661606, |
|
"rewards/cosine_scaled_reward": -0.02105556521564722, |
|
"rewards/format_reward": 0.8511905074119568, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1916.0298156738281, |
|
"epoch": 1.16, |
|
"grad_norm": 0.8320524096488953, |
|
"kl": 0.353515625, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0397, |
|
"reward": 0.8147249445319176, |
|
"reward_std": 0.7559010833501816, |
|
"rewards/cosine_scaled_reward": 0.014505308354273438, |
|
"rewards/format_reward": 0.7857143133878708, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1871.5595703125, |
|
"epoch": 1.164, |
|
"grad_norm": 1.639461636543274, |
|
"kl": 0.44482421875, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.02, |
|
"reward": 0.7966814041137695, |
|
"reward_std": 0.6868171393871307, |
|
"rewards/cosine_scaled_reward": -0.03320692107081413, |
|
"rewards/format_reward": 0.8630952686071396, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1849.0714721679688, |
|
"epoch": 1.168, |
|
"grad_norm": 0.9159106016159058, |
|
"kl": 0.41357421875, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": 0.1098, |
|
"reward": 0.8123535662889481, |
|
"reward_std": 0.7406510710716248, |
|
"rewards/cosine_scaled_reward": -0.0075137000530958176, |
|
"rewards/format_reward": 0.82738097012043, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1627.2262268066406, |
|
"epoch": 1.172, |
|
"grad_norm": 1.2907826900482178, |
|
"kl": 0.3271484375, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": 0.028, |
|
"reward": 0.7378726750612259, |
|
"reward_std": 0.6904594451189041, |
|
"rewards/cosine_scaled_reward": -0.07153987139463425, |
|
"rewards/format_reward": 0.8809524029493332, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2141.2202758789062, |
|
"epoch": 1.176, |
|
"grad_norm": 0.7737708687782288, |
|
"kl": 0.4482421875, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": 0.0644, |
|
"reward": 0.7625293210148811, |
|
"reward_std": 0.7152971476316452, |
|
"rewards/cosine_scaled_reward": 0.009240844286978245, |
|
"rewards/format_reward": 0.7440476417541504, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2187.3809814453125, |
|
"epoch": 1.18, |
|
"grad_norm": 1.1525542736053467, |
|
"kl": 0.51025390625, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0634, |
|
"reward": 0.5901899486780167, |
|
"reward_std": 0.6728092133998871, |
|
"rewards/cosine_scaled_reward": -0.0739526596153155, |
|
"rewards/format_reward": 0.7380952537059784, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2388.577392578125, |
|
"epoch": 1.184, |
|
"grad_norm": 0.9084761738777161, |
|
"kl": 0.52734375, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0587, |
|
"reward": 0.4302752036601305, |
|
"reward_std": 0.615352213382721, |
|
"rewards/cosine_scaled_reward": -0.12117192603182048, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1568.6786193847656, |
|
"epoch": 1.188, |
|
"grad_norm": 0.852024495601654, |
|
"kl": 0.1533203125, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0574, |
|
"reward": 0.7380149587988853, |
|
"reward_std": 0.7155523598194122, |
|
"rewards/cosine_scaled_reward": -0.029802043922245502, |
|
"rewards/format_reward": 0.7976190596818924, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1983.4524230957031, |
|
"epoch": 1.192, |
|
"grad_norm": 0.7617373466491699, |
|
"kl": 0.303955078125, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.068, |
|
"reward": 0.7127486318349838, |
|
"reward_std": 0.7076264545321465, |
|
"rewards/cosine_scaled_reward": -0.018625682685524225, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2038.3750305175781, |
|
"epoch": 1.196, |
|
"grad_norm": 0.8094474673271179, |
|
"kl": 0.25830078125, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.0566, |
|
"reward": 0.6301854252815247, |
|
"reward_std": 0.6336864829063416, |
|
"rewards/cosine_scaled_reward": -0.036097751930356026, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1825.9643249511719, |
|
"epoch": 1.2, |
|
"grad_norm": 1.8039993047714233, |
|
"kl": 0.225830078125, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0501, |
|
"reward": 0.9151953011751175, |
|
"reward_std": 0.6518659368157387, |
|
"rewards/cosine_scaled_reward": 0.03200240898877382, |
|
"rewards/format_reward": 0.8511905074119568, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1954.3809814453125, |
|
"epoch": 1.204, |
|
"grad_norm": 0.9098180532455444, |
|
"kl": 0.2685546875, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.08, |
|
"reward": 0.903901144862175, |
|
"reward_std": 0.7074443101882935, |
|
"rewards/cosine_scaled_reward": 0.059093400835990906, |
|
"rewards/format_reward": 0.7857142835855484, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2221.6012268066406, |
|
"epoch": 1.208, |
|
"grad_norm": 0.628447949886322, |
|
"kl": 0.297119140625, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0225, |
|
"reward": 0.7435066364705563, |
|
"reward_std": 0.7286128550767899, |
|
"rewards/cosine_scaled_reward": 0.002705696038901806, |
|
"rewards/format_reward": 0.7380952537059784, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1967.6607360839844, |
|
"epoch": 1.212, |
|
"grad_norm": 1.4870760440826416, |
|
"kl": 0.2193603515625, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": 0.0076, |
|
"reward": 0.6118638888001442, |
|
"reward_std": 0.6256552934646606, |
|
"rewards/cosine_scaled_reward": -0.08990138117223978, |
|
"rewards/format_reward": 0.7916666716337204, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2091.8810119628906, |
|
"epoch": 1.216, |
|
"grad_norm": 1.0213916301727295, |
|
"kl": 0.28857421875, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0819, |
|
"reward": 0.8858746439218521, |
|
"reward_std": 0.760543704032898, |
|
"rewards/cosine_scaled_reward": 0.04412779211997986, |
|
"rewards/format_reward": 0.7976190596818924, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2117.5357971191406, |
|
"epoch": 1.22, |
|
"grad_norm": 1.1696289777755737, |
|
"kl": 0.266845703125, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.0319, |
|
"reward": 0.4878672659397125, |
|
"reward_std": 0.5883132815361023, |
|
"rewards/cosine_scaled_reward": -0.13999494537711143, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1962.8512268066406, |
|
"epoch": 1.224, |
|
"grad_norm": 1.1181604862213135, |
|
"kl": 0.26171875, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0553, |
|
"reward": 0.8040256127715111, |
|
"reward_std": 0.7542890757322311, |
|
"rewards/cosine_scaled_reward": 0.00915566342882812, |
|
"rewards/format_reward": 0.7857143059372902, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2503.541717529297, |
|
"epoch": 1.228, |
|
"grad_norm": 1.0075181722640991, |
|
"kl": 0.274658203125, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.0639, |
|
"reward": 0.5502185635268688, |
|
"reward_std": 0.7036140263080597, |
|
"rewards/cosine_scaled_reward": -0.043343101628124714, |
|
"rewards/format_reward": 0.6369047686457634, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2027.6905212402344, |
|
"epoch": 1.232, |
|
"grad_norm": 2.7786951065063477, |
|
"kl": 0.265380859375, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.1529, |
|
"reward": 0.8017951250076294, |
|
"reward_std": 0.7912951856851578, |
|
"rewards/cosine_scaled_reward": 0.005064212018623948, |
|
"rewards/format_reward": 0.7916666865348816, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2560.6845703125, |
|
"epoch": 1.236, |
|
"grad_norm": 1.6693713665008545, |
|
"kl": 0.2939453125, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.1046, |
|
"reward": 0.6068699322640896, |
|
"reward_std": 0.7445466667413712, |
|
"rewards/cosine_scaled_reward": -0.00013647368177771568, |
|
"rewards/format_reward": 0.6071428656578064, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2221.3988037109375, |
|
"epoch": 1.24, |
|
"grad_norm": 0.7167072892189026, |
|
"kl": 0.253173828125, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": 0.046, |
|
"reward": 0.5108997635543346, |
|
"reward_std": 0.6983606815338135, |
|
"rewards/cosine_scaled_reward": -0.09276440553367138, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2308.1012573242188, |
|
"epoch": 1.244, |
|
"grad_norm": 1.289093255996704, |
|
"kl": 0.24072265625, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.074, |
|
"reward": 0.49418094009160995, |
|
"reward_std": 0.6803844273090363, |
|
"rewards/cosine_scaled_reward": -0.0862428704276681, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2221.500030517578, |
|
"epoch": 1.248, |
|
"grad_norm": 0.7747544646263123, |
|
"kl": 0.284423828125, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": 0.0127, |
|
"reward": 0.5928547494113445, |
|
"reward_std": 0.6995180547237396, |
|
"rewards/cosine_scaled_reward": -0.0875012082979083, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2528.3750610351562, |
|
"epoch": 1.252, |
|
"grad_norm": 0.9067274928092957, |
|
"kl": 0.261962890625, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": 0.0588, |
|
"reward": 0.580617468804121, |
|
"reward_std": 0.7565959244966507, |
|
"rewards/cosine_scaled_reward": -0.05195318069308996, |
|
"rewards/format_reward": 0.6845238208770752, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2290.434539794922, |
|
"epoch": 1.256, |
|
"grad_norm": 1.0397149324417114, |
|
"kl": 0.2939453125, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": 0.0843, |
|
"reward": 0.923637330532074, |
|
"reward_std": 0.8029050081968307, |
|
"rewards/cosine_scaled_reward": 0.07491390081122518, |
|
"rewards/format_reward": 0.7738095372915268, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2221.1785583496094, |
|
"epoch": 1.26, |
|
"grad_norm": 0.8451793789863586, |
|
"kl": 0.2978515625, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0794, |
|
"reward": 0.9175606220960617, |
|
"reward_std": 0.6950835883617401, |
|
"rewards/cosine_scaled_reward": 0.06592314876616001, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2737.1131591796875, |
|
"epoch": 1.264, |
|
"grad_norm": 0.8914613723754883, |
|
"kl": 0.4072265625, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": 0.0266, |
|
"reward": 0.39799112919718027, |
|
"reward_std": 0.5211281925439835, |
|
"rewards/cosine_scaled_reward": -0.0896949004381895, |
|
"rewards/format_reward": 0.5773809626698494, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2356.8155517578125, |
|
"epoch": 1.268, |
|
"grad_norm": 0.8658885955810547, |
|
"kl": 0.32080078125, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": 0.0558, |
|
"reward": 0.4528093598783016, |
|
"reward_std": 0.5718662440776825, |
|
"rewards/cosine_scaled_reward": -0.11585722491145134, |
|
"rewards/format_reward": 0.6845238283276558, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2067.0952758789062, |
|
"epoch": 1.272, |
|
"grad_norm": 0.6174459457397461, |
|
"kl": 0.2724609375, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.0532, |
|
"reward": 0.9527914822101593, |
|
"reward_std": 0.7573249191045761, |
|
"rewards/cosine_scaled_reward": 0.059729063883423805, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2158.3631286621094, |
|
"epoch": 1.276, |
|
"grad_norm": 0.5749858617782593, |
|
"kl": 0.2744140625, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": 0.0398, |
|
"reward": 0.7759583368897438, |
|
"reward_std": 0.7076128423213959, |
|
"rewards/cosine_scaled_reward": 0.00702677620574832, |
|
"rewards/format_reward": 0.761904776096344, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2376.7857666015625, |
|
"epoch": 1.28, |
|
"grad_norm": 0.4824450612068176, |
|
"kl": 0.358642578125, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0579, |
|
"reward": 0.5863704346120358, |
|
"reward_std": 0.69185970723629, |
|
"rewards/cosine_scaled_reward": -0.058005278930068016, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2094.6786193847656, |
|
"epoch": 1.284, |
|
"grad_norm": 1.153307318687439, |
|
"kl": 0.32373046875, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0147, |
|
"reward": 0.5667938031256199, |
|
"reward_std": 0.6206858605146408, |
|
"rewards/cosine_scaled_reward": -0.12434119766112417, |
|
"rewards/format_reward": 0.8154762089252472, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2360.970245361328, |
|
"epoch": 1.288, |
|
"grad_norm": 0.7556703090667725, |
|
"kl": 0.314697265625, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0413, |
|
"reward": 0.54334956407547, |
|
"reward_std": 0.7112371101975441, |
|
"rewards/cosine_scaled_reward": -0.10927761369384825, |
|
"rewards/format_reward": 0.761904776096344, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2466.9703369140625, |
|
"epoch": 1.292, |
|
"grad_norm": 0.6241899728775024, |
|
"kl": 0.345703125, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.0463, |
|
"reward": 0.5620089694857597, |
|
"reward_std": 0.6381285488605499, |
|
"rewards/cosine_scaled_reward": -0.0672098146751523, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2297.2916564941406, |
|
"epoch": 1.296, |
|
"grad_norm": 1.050784945487976, |
|
"kl": 0.289306640625, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": 0.095, |
|
"reward": 0.6569867879152298, |
|
"reward_std": 0.6581598520278931, |
|
"rewards/cosine_scaled_reward": -0.034601859748363495, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2391.8095703125, |
|
"epoch": 1.3, |
|
"grad_norm": 0.5910518169403076, |
|
"kl": 0.327392578125, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.065, |
|
"reward": 0.6689947620034218, |
|
"reward_std": 0.5862837731838226, |
|
"rewards/cosine_scaled_reward": -0.0434788279235363, |
|
"rewards/format_reward": 0.7559524029493332, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2161.34521484375, |
|
"epoch": 1.304, |
|
"grad_norm": 1.3934383392333984, |
|
"kl": 0.2412109375, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.0952, |
|
"reward": 0.7927189618349075, |
|
"reward_std": 0.8861154615879059, |
|
"rewards/cosine_scaled_reward": 0.03624042624142021, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2205.75, |
|
"epoch": 1.308, |
|
"grad_norm": 0.5909004211425781, |
|
"kl": 0.276611328125, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.0265, |
|
"reward": 0.7868844717741013, |
|
"reward_std": 0.6631656885147095, |
|
"rewards/cosine_scaled_reward": 0.024394613516051322, |
|
"rewards/format_reward": 0.7380952388048172, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2526.71435546875, |
|
"epoch": 1.312, |
|
"grad_norm": 0.37658610939979553, |
|
"kl": 0.30908203125, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0593, |
|
"reward": 0.3922804482281208, |
|
"reward_std": 0.7164648473262787, |
|
"rewards/cosine_scaled_reward": -0.11933596897870302, |
|
"rewards/format_reward": 0.6309523731470108, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2406.6726684570312, |
|
"epoch": 1.316, |
|
"grad_norm": 0.5439748764038086, |
|
"kl": 0.28759765625, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.0395, |
|
"reward": 0.457830130122602, |
|
"reward_std": 0.6897861212491989, |
|
"rewards/cosine_scaled_reward": -0.10739446245133877, |
|
"rewards/format_reward": 0.6726190447807312, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2233.6548461914062, |
|
"epoch": 1.32, |
|
"grad_norm": 1.2243571281433105, |
|
"kl": 0.289306640625, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.1087, |
|
"reward": 0.6516863703727722, |
|
"reward_std": 0.7036527991294861, |
|
"rewards/cosine_scaled_reward": -0.08189492486417294, |
|
"rewards/format_reward": 0.8154762089252472, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2212.7560119628906, |
|
"epoch": 1.324, |
|
"grad_norm": 0.8144615888595581, |
|
"kl": 0.28857421875, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.0775, |
|
"reward": 0.5815620422363281, |
|
"reward_std": 0.5177476480603218, |
|
"rewards/cosine_scaled_reward": -0.042552310740575194, |
|
"rewards/format_reward": 0.6666666865348816, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2412.0059814453125, |
|
"epoch": 1.328, |
|
"grad_norm": 0.42855292558670044, |
|
"kl": 0.3232421875, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": 0.0791, |
|
"reward": 0.642042949795723, |
|
"reward_std": 0.6289803832769394, |
|
"rewards/cosine_scaled_reward": -0.04207377042621374, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2163.9822387695312, |
|
"epoch": 1.332, |
|
"grad_norm": 1.0114275217056274, |
|
"kl": 0.255859375, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.088, |
|
"reward": 0.811268161451153, |
|
"reward_std": 0.6822613030672073, |
|
"rewards/cosine_scaled_reward": 0.042538831010460854, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2329.0178833007812, |
|
"epoch": 1.336, |
|
"grad_norm": 0.7170870900154114, |
|
"kl": 0.3486328125, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0455, |
|
"reward": 0.8848401606082916, |
|
"reward_std": 0.7328508943319321, |
|
"rewards/cosine_scaled_reward": 0.04361054569017142, |
|
"rewards/format_reward": 0.7976190745830536, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2709.2262573242188, |
|
"epoch": 1.34, |
|
"grad_norm": 0.47293010354042053, |
|
"kl": 0.38037109375, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.0416, |
|
"reward": 0.3898888286203146, |
|
"reward_std": 0.6401937156915665, |
|
"rewards/cosine_scaled_reward": -0.10862701199948788, |
|
"rewards/format_reward": 0.6071428582072258, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2530.416748046875, |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.4423607885837555, |
|
"kl": 0.282958984375, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0446, |
|
"reward": 0.6725399196147919, |
|
"reward_std": 0.7871751934289932, |
|
"rewards/cosine_scaled_reward": 0.0059128133580088615, |
|
"rewards/format_reward": 0.6607142984867096, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2311.5358276367188, |
|
"epoch": 1.3479999999999999, |
|
"grad_norm": 0.5007253885269165, |
|
"kl": 0.3203125, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0455, |
|
"reward": 0.8073793947696686, |
|
"reward_std": 0.7870100140571594, |
|
"rewards/cosine_scaled_reward": 0.010832530329935253, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2489.7500610351562, |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.36444640159606934, |
|
"kl": 0.305908203125, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0652, |
|
"reward": 0.6751855611801147, |
|
"reward_std": 0.6701688021421432, |
|
"rewards/cosine_scaled_reward": 0.001283254474401474, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2460.8154907226562, |
|
"epoch": 1.3559999999999999, |
|
"grad_norm": 0.43892228603363037, |
|
"kl": 0.3369140625, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.0519, |
|
"reward": 0.6638183146715164, |
|
"reward_std": 0.770327016711235, |
|
"rewards/cosine_scaled_reward": 0.010480590397492051, |
|
"rewards/format_reward": 0.6428571566939354, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2244.3631896972656, |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.6102768778800964, |
|
"kl": 0.31201171875, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": 0.0694, |
|
"reward": 0.8422182202339172, |
|
"reward_std": 0.6671302318572998, |
|
"rewards/cosine_scaled_reward": 0.04610910080373287, |
|
"rewards/format_reward": 0.7500000074505806, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2239.75, |
|
"epoch": 1.3639999999999999, |
|
"grad_norm": 0.6582260727882385, |
|
"kl": 0.3271484375, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.076, |
|
"reward": 0.6709855943918228, |
|
"reward_std": 0.7041856721043587, |
|
"rewards/cosine_scaled_reward": -0.03355482150800526, |
|
"rewards/format_reward": 0.7380952537059784, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2438.3214721679688, |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 0.5521511435508728, |
|
"kl": 0.320556640625, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.047, |
|
"reward": 0.6942454129457474, |
|
"reward_std": 0.6340186148881912, |
|
"rewards/cosine_scaled_reward": 0.010813180379045662, |
|
"rewards/format_reward": 0.6726190745830536, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2433.6488647460938, |
|
"epoch": 1.3719999999999999, |
|
"grad_norm": 0.928674042224884, |
|
"kl": 0.40478515625, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": 0.0328, |
|
"reward": 0.5231252759695053, |
|
"reward_std": 0.7485495656728745, |
|
"rewards/cosine_scaled_reward": -0.11641356535255909, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2388.2202758789062, |
|
"epoch": 1.376, |
|
"grad_norm": 0.43529966473579407, |
|
"kl": 0.32080078125, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": 0.0303, |
|
"reward": 0.7449862584471703, |
|
"reward_std": 0.6971839666366577, |
|
"rewards/cosine_scaled_reward": 0.024278827477246523, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2686.15478515625, |
|
"epoch": 1.38, |
|
"grad_norm": 0.5191164016723633, |
|
"kl": 0.36328125, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0602, |
|
"reward": 0.4467791821807623, |
|
"reward_std": 0.6689166128635406, |
|
"rewards/cosine_scaled_reward": -0.07125327130779624, |
|
"rewards/format_reward": 0.5892857238650322, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2138.119110107422, |
|
"epoch": 1.384, |
|
"grad_norm": 0.40859875082969666, |
|
"kl": 0.344970703125, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": 0.0894, |
|
"reward": 0.7263324186205864, |
|
"reward_std": 0.7082626074552536, |
|
"rewards/cosine_scaled_reward": -0.029690947383642197, |
|
"rewards/format_reward": 0.7857143133878708, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2158.6429138183594, |
|
"epoch": 1.388, |
|
"grad_norm": 0.35558465123176575, |
|
"kl": 0.29638671875, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": 0.0262, |
|
"reward": 0.6269577667117119, |
|
"reward_std": 0.5908889323472977, |
|
"rewards/cosine_scaled_reward": -0.04664018237963319, |
|
"rewards/format_reward": 0.7202381119132042, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2144.619110107422, |
|
"epoch": 1.392, |
|
"grad_norm": 1.211071491241455, |
|
"kl": 0.306640625, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": -0.0105, |
|
"reward": 0.6653935462236404, |
|
"reward_std": 0.6245283707976341, |
|
"rewards/cosine_scaled_reward": -0.04230323247611523, |
|
"rewards/format_reward": 0.75, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2366.4940795898438, |
|
"epoch": 1.396, |
|
"grad_norm": 0.5814414620399475, |
|
"kl": 0.33154296875, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.0485, |
|
"reward": 0.5602632537484169, |
|
"reward_std": 0.5761818215250969, |
|
"rewards/cosine_scaled_reward": -0.0978445541113615, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2348.6607666015625, |
|
"epoch": 1.4, |
|
"grad_norm": 0.675369918346405, |
|
"kl": 0.29931640625, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0825, |
|
"reward": 0.475093599408865, |
|
"reward_std": 0.604865163564682, |
|
"rewards/cosine_scaled_reward": -0.07792939431965351, |
|
"rewards/format_reward": 0.6309523731470108, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2099.279815673828, |
|
"epoch": 1.404, |
|
"grad_norm": 0.5227596163749695, |
|
"kl": 0.33447265625, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.0454, |
|
"reward": 0.6502892896533012, |
|
"reward_std": 0.676431730389595, |
|
"rewards/cosine_scaled_reward": -0.05878393305465579, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2465.202392578125, |
|
"epoch": 1.408, |
|
"grad_norm": 0.4936739206314087, |
|
"kl": 0.33154296875, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": 0.0349, |
|
"reward": 0.51472207903862, |
|
"reward_std": 0.6474315822124481, |
|
"rewards/cosine_scaled_reward": -0.05216278973966837, |
|
"rewards/format_reward": 0.6190476417541504, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2241.089324951172, |
|
"epoch": 1.412, |
|
"grad_norm": 0.4653976857662201, |
|
"kl": 0.3046875, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.0576, |
|
"reward": 0.7246856689453125, |
|
"reward_std": 0.7023278325796127, |
|
"rewards/cosine_scaled_reward": -0.02456192229874432, |
|
"rewards/format_reward": 0.7738095223903656, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2174.4107666015625, |
|
"epoch": 1.416, |
|
"grad_norm": 1.179158091545105, |
|
"kl": 0.31982421875, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": 0.1321, |
|
"reward": 0.4120100736618042, |
|
"reward_std": 0.5803252756595612, |
|
"rewards/cosine_scaled_reward": -0.19280448742210865, |
|
"rewards/format_reward": 0.7976190596818924, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2560.9405517578125, |
|
"epoch": 1.42, |
|
"grad_norm": 0.6409890651702881, |
|
"kl": 0.3291015625, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0637, |
|
"reward": 0.6557277590036392, |
|
"reward_std": 0.8805683702230453, |
|
"rewards/cosine_scaled_reward": -0.02332661801483482, |
|
"rewards/format_reward": 0.70238097012043, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1872.1012573242188, |
|
"epoch": 1.424, |
|
"grad_norm": 0.4570577144622803, |
|
"kl": 0.244873046875, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": 0.0537, |
|
"reward": 0.8301898017525673, |
|
"reward_std": 0.6987727582454681, |
|
"rewards/cosine_scaled_reward": -0.016452712705358863, |
|
"rewards/format_reward": 0.8630952537059784, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2199.4880981445312, |
|
"epoch": 1.428, |
|
"grad_norm": 0.5453688502311707, |
|
"kl": 0.3388671875, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": 0.1004, |
|
"reward": 0.5429714322090149, |
|
"reward_std": 0.757801964879036, |
|
"rewards/cosine_scaled_reward": -0.11244285944849253, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2392.5536193847656, |
|
"epoch": 1.432, |
|
"grad_norm": 0.4179025888442993, |
|
"kl": 0.36767578125, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.057, |
|
"reward": 0.6754717975854874, |
|
"reward_std": 0.8176562935113907, |
|
"rewards/cosine_scaled_reward": -0.004526023752987385, |
|
"rewards/format_reward": 0.6845238357782364, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2280.5000915527344, |
|
"epoch": 1.436, |
|
"grad_norm": 0.5272053480148315, |
|
"kl": 0.273681640625, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.0565, |
|
"reward": 0.644446611404419, |
|
"reward_std": 0.7567472010850906, |
|
"rewards/cosine_scaled_reward": -0.02896718680858612, |
|
"rewards/format_reward": 0.70238097012043, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2270.3928833007812, |
|
"epoch": 1.44, |
|
"grad_norm": 0.8152810335159302, |
|
"kl": 0.34619140625, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0306, |
|
"reward": 0.7786325067281723, |
|
"reward_std": 0.559767447412014, |
|
"rewards/cosine_scaled_reward": -0.012469482608139515, |
|
"rewards/format_reward": 0.8035714477300644, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2094.2083740234375, |
|
"epoch": 1.444, |
|
"grad_norm": 0.9731494188308716, |
|
"kl": 0.33203125, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": 0.0315, |
|
"reward": 0.7239094823598862, |
|
"reward_std": 0.6780030280351639, |
|
"rewards/cosine_scaled_reward": -0.057688117027282715, |
|
"rewards/format_reward": 0.839285746216774, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2515.3809814453125, |
|
"epoch": 1.448, |
|
"grad_norm": 0.5006127953529358, |
|
"kl": 0.3583984375, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": 0.0632, |
|
"reward": 0.5585716450586915, |
|
"reward_std": 0.6955743506550789, |
|
"rewards/cosine_scaled_reward": -0.08976180851459503, |
|
"rewards/format_reward": 0.7380952462553978, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2665.2560424804688, |
|
"epoch": 1.452, |
|
"grad_norm": 0.4868517220020294, |
|
"kl": 0.373046875, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0555, |
|
"reward": 0.5607914663851261, |
|
"reward_std": 0.6483574956655502, |
|
"rewards/cosine_scaled_reward": -0.07377092959359288, |
|
"rewards/format_reward": 0.7083333432674408, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2244.7262573242188, |
|
"epoch": 1.456, |
|
"grad_norm": 0.6844132542610168, |
|
"kl": 0.3173828125, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": 0.0109, |
|
"reward": 0.7073798812925816, |
|
"reward_std": 0.6621369272470474, |
|
"rewards/cosine_scaled_reward": -0.01833386719226837, |
|
"rewards/format_reward": 0.7440476417541504, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2576.1905517578125, |
|
"epoch": 1.46, |
|
"grad_norm": 0.5755227208137512, |
|
"kl": 0.35400390625, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0531, |
|
"reward": 0.6706622801721096, |
|
"reward_std": 0.8000525310635567, |
|
"rewards/cosine_scaled_reward": -0.03371649980545044, |
|
"rewards/format_reward": 0.7380952537059784, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2664.3928833007812, |
|
"epoch": 1.464, |
|
"grad_norm": 0.6695978045463562, |
|
"kl": 0.4052734375, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.1001, |
|
"reward": 0.6332942470908165, |
|
"reward_std": 0.9363250732421875, |
|
"rewards/cosine_scaled_reward": -0.04049574676901102, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2493.2857666015625, |
|
"epoch": 1.468, |
|
"grad_norm": 0.41825661063194275, |
|
"kl": 0.269775390625, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.0634, |
|
"reward": 0.6000736728310585, |
|
"reward_std": 0.6958686709403992, |
|
"rewards/cosine_scaled_reward": -0.04520127363502979, |
|
"rewards/format_reward": 0.690476194024086, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2441.184600830078, |
|
"epoch": 1.472, |
|
"grad_norm": 0.6742368936538696, |
|
"kl": 0.29248046875, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0205, |
|
"reward": 0.7077510952949524, |
|
"reward_std": 0.8173489719629288, |
|
"rewards/cosine_scaled_reward": -0.003267320804297924, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2645.202392578125, |
|
"epoch": 1.476, |
|
"grad_norm": 0.6914957761764526, |
|
"kl": 0.298095703125, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": 0.0967, |
|
"reward": 0.2303389220032841, |
|
"reward_std": 0.6355866640806198, |
|
"rewards/cosine_scaled_reward": -0.1616162583231926, |
|
"rewards/format_reward": 0.5535714328289032, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2256.9286193847656, |
|
"epoch": 1.48, |
|
"grad_norm": 0.9714637994766235, |
|
"kl": 0.255126953125, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0866, |
|
"reward": 0.7436040937900543, |
|
"reward_std": 0.6377575844526291, |
|
"rewards/cosine_scaled_reward": -0.012126525864005089, |
|
"rewards/format_reward": 0.767857164144516, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2519.7202758789062, |
|
"epoch": 1.484, |
|
"grad_norm": 0.6541756987571716, |
|
"kl": 0.32470703125, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0731, |
|
"reward": 0.8480066582560539, |
|
"reward_std": 0.7711106240749359, |
|
"rewards/cosine_scaled_reward": 0.031146179419010878, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2420.970245361328, |
|
"epoch": 1.488, |
|
"grad_norm": 0.5346278548240662, |
|
"kl": 0.2998046875, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": 0.0556, |
|
"reward": 0.6287192776799202, |
|
"reward_std": 0.6931318640708923, |
|
"rewards/cosine_scaled_reward": -0.03683085576631129, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2542.5654907226562, |
|
"epoch": 1.492, |
|
"grad_norm": 0.43199771642684937, |
|
"kl": 0.33544921875, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.0657, |
|
"reward": 0.4730634540319443, |
|
"reward_std": 0.5836888402700424, |
|
"rewards/cosine_scaled_reward": -0.1533492412418127, |
|
"rewards/format_reward": 0.7797619104385376, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2848.3572387695312, |
|
"epoch": 1.496, |
|
"grad_norm": 0.6630088686943054, |
|
"kl": 0.35009765625, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.0207, |
|
"reward": 0.2956889607012272, |
|
"reward_std": 0.614417277276516, |
|
"rewards/cosine_scaled_reward": -0.1319174226373434, |
|
"rewards/format_reward": 0.5595238283276558, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2754.0060424804688, |
|
"epoch": 1.5, |
|
"grad_norm": 0.4504316449165344, |
|
"kl": 0.302490234375, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0225, |
|
"reward": 0.42709287256002426, |
|
"reward_std": 0.6112170070409775, |
|
"rewards/cosine_scaled_reward": -0.07514405064284801, |
|
"rewards/format_reward": 0.5773809552192688, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2403.52978515625, |
|
"epoch": 1.504, |
|
"grad_norm": 0.4335888624191284, |
|
"kl": 0.266845703125, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": 0.0347, |
|
"reward": 0.37878482323139906, |
|
"reward_std": 0.5512942001223564, |
|
"rewards/cosine_scaled_reward": -0.14691711403429508, |
|
"rewards/format_reward": 0.6726190745830536, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2539.71435546875, |
|
"epoch": 1.508, |
|
"grad_norm": 0.6600142121315002, |
|
"kl": 0.35546875, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.0879, |
|
"reward": 0.6289402991533279, |
|
"reward_std": 0.7740087658166885, |
|
"rewards/cosine_scaled_reward": -0.04564890172332525, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2566.4107666015625, |
|
"epoch": 1.512, |
|
"grad_norm": 0.5574218034744263, |
|
"kl": 0.310791015625, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": 0.0794, |
|
"reward": 0.5734596885740757, |
|
"reward_std": 0.6776000708341599, |
|
"rewards/cosine_scaled_reward": -0.058508249232545495, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2749.4642944335938, |
|
"epoch": 1.516, |
|
"grad_norm": 0.4314301908016205, |
|
"kl": 0.330078125, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.0655, |
|
"reward": 0.42858002707362175, |
|
"reward_std": 0.7303398549556732, |
|
"rewards/cosine_scaled_reward": -0.10416238568723202, |
|
"rewards/format_reward": 0.6369047611951828, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2403.684539794922, |
|
"epoch": 1.52, |
|
"grad_norm": 0.42673397064208984, |
|
"kl": 0.299560546875, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0799, |
|
"reward": 0.7241241782903671, |
|
"reward_std": 0.7478837221860886, |
|
"rewards/cosine_scaled_reward": -0.006985542830079794, |
|
"rewards/format_reward": 0.7380952388048172, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2391.5179443359375, |
|
"epoch": 1.524, |
|
"grad_norm": 0.8130372762680054, |
|
"kl": 0.3203125, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.0286, |
|
"reward": 0.49668359011411667, |
|
"reward_std": 0.6931805461645126, |
|
"rewards/cosine_scaled_reward": -0.12665820121765137, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2430.5654907226562, |
|
"epoch": 1.528, |
|
"grad_norm": 0.4374740719795227, |
|
"kl": 0.310302734375, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": 0.0341, |
|
"reward": 0.6685771271586418, |
|
"reward_std": 0.8352404981851578, |
|
"rewards/cosine_scaled_reward": -0.016901913098990917, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2296.1607666015625, |
|
"epoch": 1.532, |
|
"grad_norm": 0.5494891405105591, |
|
"kl": 0.30224609375, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": 0.0859, |
|
"reward": 0.7063075229525566, |
|
"reward_std": 0.7431895136833191, |
|
"rewards/cosine_scaled_reward": -0.009941489901393652, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2517.827392578125, |
|
"epoch": 1.536, |
|
"grad_norm": 0.5410645604133606, |
|
"kl": 0.33203125, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": 0.0459, |
|
"reward": 0.42114658281207085, |
|
"reward_std": 0.6721706539392471, |
|
"rewards/cosine_scaled_reward": -0.12573623820208013, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2484.96435546875, |
|
"epoch": 1.54, |
|
"grad_norm": 0.4815540313720703, |
|
"kl": 0.3095703125, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0759, |
|
"reward": 0.7441006675362587, |
|
"reward_std": 0.8601991981267929, |
|
"rewards/cosine_scaled_reward": 0.029788417392410338, |
|
"rewards/format_reward": 0.6845238208770752, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2389.9703063964844, |
|
"epoch": 1.544, |
|
"grad_norm": 0.6783538460731506, |
|
"kl": 0.2802734375, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.0716, |
|
"reward": 0.479885321110487, |
|
"reward_std": 0.7240753322839737, |
|
"rewards/cosine_scaled_reward": -0.06958115100860596, |
|
"rewards/format_reward": 0.6190476417541504, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.964324951172, |
|
"epoch": 1.548, |
|
"grad_norm": 0.8481286764144897, |
|
"kl": 0.296630859375, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": 0.0148, |
|
"reward": 0.5802747337147593, |
|
"reward_std": 0.5601852983236313, |
|
"rewards/cosine_scaled_reward": -0.04617217415943742, |
|
"rewards/format_reward": 0.672619067132473, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2159.5655212402344, |
|
"epoch": 1.552, |
|
"grad_norm": 0.5140231251716614, |
|
"kl": 0.302001953125, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": 0.063, |
|
"reward": 0.5727366209030151, |
|
"reward_std": 0.6229267343878746, |
|
"rewards/cosine_scaled_reward": -0.11244121752679348, |
|
"rewards/format_reward": 0.7976190745830536, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2253.7559814453125, |
|
"epoch": 1.556, |
|
"grad_norm": 0.4566425681114197, |
|
"kl": 0.292724609375, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.0398, |
|
"reward": 0.6296885460615158, |
|
"reward_std": 0.7193648666143417, |
|
"rewards/cosine_scaled_reward": -0.04825095273554325, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2653.6250610351562, |
|
"epoch": 1.56, |
|
"grad_norm": 0.6326945424079895, |
|
"kl": 0.38818359375, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.1043, |
|
"reward": 0.531873881816864, |
|
"reward_std": 0.7026529461145401, |
|
"rewards/cosine_scaled_reward": -0.0822773426771164, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2613.7261962890625, |
|
"epoch": 1.564, |
|
"grad_norm": 0.398603618144989, |
|
"kl": 0.305908203125, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": 0.0464, |
|
"reward": 0.2553718090057373, |
|
"reward_std": 0.6311058104038239, |
|
"rewards/cosine_scaled_reward": -0.1699331346899271, |
|
"rewards/format_reward": 0.595238097012043, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2196.5833435058594, |
|
"epoch": 1.568, |
|
"grad_norm": 1.320838451385498, |
|
"kl": 0.2607421875, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.1165, |
|
"reward": 0.6960010007023811, |
|
"reward_std": 0.8236257880926132, |
|
"rewards/cosine_scaled_reward": -0.044856662629172206, |
|
"rewards/format_reward": 0.7857143133878708, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2663.9285888671875, |
|
"epoch": 1.572, |
|
"grad_norm": 0.5183250904083252, |
|
"kl": 0.328857421875, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.0708, |
|
"reward": 0.34957781434059143, |
|
"reward_std": 0.6104390919208527, |
|
"rewards/cosine_scaled_reward": -0.12878252286463976, |
|
"rewards/format_reward": 0.6071428805589676, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2630.2560424804688, |
|
"epoch": 1.576, |
|
"grad_norm": 0.2792785167694092, |
|
"kl": 0.34619140625, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.0772, |
|
"reward": 0.45473287999629974, |
|
"reward_std": 0.7525355666875839, |
|
"rewards/cosine_scaled_reward": -0.0940621355548501, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2484.2679138183594, |
|
"epoch": 1.58, |
|
"grad_norm": 0.47966378927230835, |
|
"kl": 0.3330078125, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0783, |
|
"reward": 0.5024484526365995, |
|
"reward_std": 0.6865183711051941, |
|
"rewards/cosine_scaled_reward": -0.07615673809777945, |
|
"rewards/format_reward": 0.6547619104385376, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2481.96435546875, |
|
"epoch": 1.584, |
|
"grad_norm": 0.4925695061683655, |
|
"kl": 0.34228515625, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": 0.1001, |
|
"reward": 0.6872217282652855, |
|
"reward_std": 0.7295544147491455, |
|
"rewards/cosine_scaled_reward": -0.037341527407988906, |
|
"rewards/format_reward": 0.761904776096344, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2154.494110107422, |
|
"epoch": 1.588, |
|
"grad_norm": 0.635874330997467, |
|
"kl": 0.281494140625, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": 0.0341, |
|
"reward": 0.6967096533626318, |
|
"reward_std": 0.7243114337325096, |
|
"rewards/cosine_scaled_reward": -0.04450232535600662, |
|
"rewards/format_reward": 0.7857143059372902, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2704.7322387695312, |
|
"epoch": 1.592, |
|
"grad_norm": 0.36841636896133423, |
|
"kl": 0.390625, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0683, |
|
"reward": 0.35482142120599747, |
|
"reward_std": 0.6981697529554367, |
|
"rewards/cosine_scaled_reward": -0.1410416765138507, |
|
"rewards/format_reward": 0.6369047611951828, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2155.1905212402344, |
|
"epoch": 1.596, |
|
"grad_norm": 0.6153831481933594, |
|
"kl": 0.2763671875, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": 0.075, |
|
"reward": 0.7129835858941078, |
|
"reward_std": 0.7049887701869011, |
|
"rewards/cosine_scaled_reward": -0.04827013239264488, |
|
"rewards/format_reward": 0.8095238208770752, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2736.9286499023438, |
|
"epoch": 1.6, |
|
"grad_norm": 0.4656315743923187, |
|
"kl": 0.38330078125, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0486, |
|
"reward": 0.3270074762403965, |
|
"reward_std": 0.6684512719511986, |
|
"rewards/cosine_scaled_reward": -0.17578197922557592, |
|
"rewards/format_reward": 0.6785714477300644, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2684.994140625, |
|
"epoch": 1.604, |
|
"grad_norm": 0.4505021274089813, |
|
"kl": 0.34423828125, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": 0.0772, |
|
"reward": 0.5096228048205376, |
|
"reward_std": 0.7098966240882874, |
|
"rewards/cosine_scaled_reward": -0.0814981039147824, |
|
"rewards/format_reward": 0.6726190522313118, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2508.327423095703, |
|
"epoch": 1.608, |
|
"grad_norm": 0.3696132302284241, |
|
"kl": 0.34033203125, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.0554, |
|
"reward": 0.6811544820666313, |
|
"reward_std": 0.7352585643529892, |
|
"rewards/cosine_scaled_reward": -0.03442276082932949, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2329.654815673828, |
|
"epoch": 1.612, |
|
"grad_norm": 0.5500597953796387, |
|
"kl": 0.310302734375, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": 0.0912, |
|
"reward": 0.5886539276689291, |
|
"reward_std": 0.6953590214252472, |
|
"rewards/cosine_scaled_reward": -0.05091113201342523, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2279.5238342285156, |
|
"epoch": 1.616, |
|
"grad_norm": 0.825627326965332, |
|
"kl": 0.314208984375, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.0303, |
|
"reward": 0.4903724156320095, |
|
"reward_std": 0.6759866625070572, |
|
"rewards/cosine_scaled_reward": -0.1268376000225544, |
|
"rewards/format_reward": 0.7440476417541504, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2298.607208251953, |
|
"epoch": 1.62, |
|
"grad_norm": 0.7521853446960449, |
|
"kl": 0.300048828125, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.0742, |
|
"reward": 0.6187992710620165, |
|
"reward_std": 0.6595779061317444, |
|
"rewards/cosine_scaled_reward": -0.050719428109005094, |
|
"rewards/format_reward": 0.7202381044626236, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2462.0416870117188, |
|
"epoch": 1.624, |
|
"grad_norm": 0.4565219581127167, |
|
"kl": 0.36181640625, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0655, |
|
"reward": 0.7676936537027359, |
|
"reward_std": 0.8463387489318848, |
|
"rewards/cosine_scaled_reward": 0.032656354829669, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2593.9285888671875, |
|
"epoch": 1.6280000000000001, |
|
"grad_norm": 0.7805240154266357, |
|
"kl": 0.37109375, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.0336, |
|
"reward": 0.4898635447025299, |
|
"reward_std": 0.6100385710597038, |
|
"rewards/cosine_scaled_reward": -0.0943539384752512, |
|
"rewards/format_reward": 0.6785714477300644, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2179.8810119628906, |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 0.7494162321090698, |
|
"kl": 0.3154296875, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0819, |
|
"reward": 0.5508405864238739, |
|
"reward_std": 0.6755202859640121, |
|
"rewards/cosine_scaled_reward": -0.12934163073077798, |
|
"rewards/format_reward": 0.8095238357782364, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2213.1131591796875, |
|
"epoch": 1.6360000000000001, |
|
"grad_norm": 0.7274454832077026, |
|
"kl": 0.3466796875, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": 0.0553, |
|
"reward": 0.6235681027173996, |
|
"reward_std": 0.6233258098363876, |
|
"rewards/cosine_scaled_reward": -0.10488261096179485, |
|
"rewards/format_reward": 0.833333358168602, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2276.5774536132812, |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 1.2181180715560913, |
|
"kl": 0.357421875, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.1457, |
|
"reward": 0.6739452332258224, |
|
"reward_std": 0.7620265781879425, |
|
"rewards/cosine_scaled_reward": -0.0350511996075511, |
|
"rewards/format_reward": 0.7440476268529892, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2513.0833740234375, |
|
"epoch": 1.6440000000000001, |
|
"grad_norm": 0.3816966414451599, |
|
"kl": 0.37060546875, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": 0.0631, |
|
"reward": 0.503595694899559, |
|
"reward_std": 0.6215758174657822, |
|
"rewards/cosine_scaled_reward": -0.08748787135118619, |
|
"rewards/format_reward": 0.6785714328289032, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2348.7916870117188, |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 0.647936224937439, |
|
"kl": 0.3623046875, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.1147, |
|
"reward": 0.7675136551260948, |
|
"reward_std": 0.7814928591251373, |
|
"rewards/cosine_scaled_reward": 0.020661589689552784, |
|
"rewards/format_reward": 0.7261904925107956, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2329.9464721679688, |
|
"epoch": 1.6520000000000001, |
|
"grad_norm": 0.7966573238372803, |
|
"kl": 0.3505859375, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0507, |
|
"reward": 0.7220299392938614, |
|
"reward_std": 0.7220810800790787, |
|
"rewards/cosine_scaled_reward": -0.01993740734178573, |
|
"rewards/format_reward": 0.761904776096344, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2510.83935546875, |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 0.3402910828590393, |
|
"kl": 0.36767578125, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.0717, |
|
"reward": 0.6566540375351906, |
|
"reward_std": 0.7324352562427521, |
|
"rewards/cosine_scaled_reward": -0.016911087092012167, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2362.952423095703, |
|
"epoch": 1.6600000000000001, |
|
"grad_norm": 0.5068221688270569, |
|
"kl": 0.41357421875, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0817, |
|
"reward": 0.5704538598656654, |
|
"reward_std": 0.83931764960289, |
|
"rewards/cosine_scaled_reward": -0.08977308124303818, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2197.886962890625, |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.5192682147026062, |
|
"kl": 0.349609375, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0456, |
|
"reward": 0.7760029062628746, |
|
"reward_std": 0.7372387051582336, |
|
"rewards/cosine_scaled_reward": -0.0048556849360466, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2365.0655517578125, |
|
"epoch": 1.6680000000000001, |
|
"grad_norm": 0.7471702098846436, |
|
"kl": 0.357421875, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.0258, |
|
"reward": 0.6658978462219238, |
|
"reward_std": 0.7144315093755722, |
|
"rewards/cosine_scaled_reward": -0.02419395267497748, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2210.7500610351562, |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 0.4305538833141327, |
|
"kl": 0.37060546875, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": 0.0715, |
|
"reward": 0.6991885676980019, |
|
"reward_std": 0.6301053613424301, |
|
"rewards/cosine_scaled_reward": -0.03433429542928934, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2631.9048461914062, |
|
"epoch": 1.6760000000000002, |
|
"grad_norm": 0.31225350499153137, |
|
"kl": 0.35888671875, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.0697, |
|
"reward": 0.6943976636976004, |
|
"reward_std": 0.7306639850139618, |
|
"rewards/cosine_scaled_reward": 0.02874644659459591, |
|
"rewards/format_reward": 0.636904776096344, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2425.6607055664062, |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.6987324953079224, |
|
"kl": 0.38330078125, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0585, |
|
"reward": 0.7563354596495628, |
|
"reward_std": 0.7114580571651459, |
|
"rewards/cosine_scaled_reward": -0.038498950423672795, |
|
"rewards/format_reward": 0.8333333432674408, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2277.4464721679688, |
|
"epoch": 1.6840000000000002, |
|
"grad_norm": 0.34894487261772156, |
|
"kl": 0.373046875, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0935, |
|
"reward": 0.6365625336766243, |
|
"reward_std": 0.7153737097978592, |
|
"rewards/cosine_scaled_reward": -0.05969492206349969, |
|
"rewards/format_reward": 0.7559524029493332, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2504.916748046875, |
|
"epoch": 1.688, |
|
"grad_norm": 0.6229146718978882, |
|
"kl": 0.35498046875, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0181, |
|
"reward": 0.5784893482923508, |
|
"reward_std": 0.6405449658632278, |
|
"rewards/cosine_scaled_reward": -0.06194583047181368, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2308.52978515625, |
|
"epoch": 1.692, |
|
"grad_norm": 0.4013311266899109, |
|
"kl": 0.328125, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0574, |
|
"reward": 0.6359338611364365, |
|
"reward_std": 0.7808969020843506, |
|
"rewards/cosine_scaled_reward": -0.03917593788355589, |
|
"rewards/format_reward": 0.714285746216774, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2253.6785888671875, |
|
"epoch": 1.696, |
|
"grad_norm": 0.7038490176200867, |
|
"kl": 0.288330078125, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": 0.0231, |
|
"reward": 0.6301399618387222, |
|
"reward_std": 0.7140125781297684, |
|
"rewards/cosine_scaled_reward": -0.07481098547577858, |
|
"rewards/format_reward": 0.7797619104385376, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2256.464385986328, |
|
"epoch": 1.7, |
|
"grad_norm": 0.4849310517311096, |
|
"kl": 0.34228515625, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0754, |
|
"reward": 0.6902762800455093, |
|
"reward_std": 0.7732263505458832, |
|
"rewards/cosine_scaled_reward": -0.029861881979741156, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2638.3750610351562, |
|
"epoch": 1.704, |
|
"grad_norm": 0.4661174416542053, |
|
"kl": 0.34716796875, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": 0.0631, |
|
"reward": 0.4628839958459139, |
|
"reward_std": 0.6522120535373688, |
|
"rewards/cosine_scaled_reward": -0.06915326602756977, |
|
"rewards/format_reward": 0.601190485060215, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2258.279815673828, |
|
"epoch": 1.708, |
|
"grad_norm": 0.5582512021064758, |
|
"kl": 0.325439453125, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": 0.0416, |
|
"reward": 0.6841344758868217, |
|
"reward_std": 0.716413825750351, |
|
"rewards/cosine_scaled_reward": -0.04483753815293312, |
|
"rewards/format_reward": 0.7738095298409462, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2152.732208251953, |
|
"epoch": 1.712, |
|
"grad_norm": 0.362613320350647, |
|
"kl": 0.35546875, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0663, |
|
"reward": 0.8665853589773178, |
|
"reward_std": 0.746289573609829, |
|
"rewards/cosine_scaled_reward": 0.013649825006723404, |
|
"rewards/format_reward": 0.8392857164144516, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2589.8690795898438, |
|
"epoch": 1.716, |
|
"grad_norm": 0.392411470413208, |
|
"kl": 0.35693359375, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0781, |
|
"reward": 0.6071142517030239, |
|
"reward_std": 0.7519797533750534, |
|
"rewards/cosine_scaled_reward": -0.044657152146101, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2220.1012573242188, |
|
"epoch": 1.72, |
|
"grad_norm": 0.5445396900177002, |
|
"kl": 0.341796875, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0534, |
|
"reward": 0.701055221259594, |
|
"reward_std": 0.6740739792585373, |
|
"rewards/cosine_scaled_reward": -0.04232952371239662, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1902.5595397949219, |
|
"epoch": 1.724, |
|
"grad_norm": 0.3450181186199188, |
|
"kl": 0.2607421875, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0472, |
|
"reward": 1.0963951796293259, |
|
"reward_std": 0.746511772274971, |
|
"rewards/cosine_scaled_reward": 0.11962614580988884, |
|
"rewards/format_reward": 0.8571428805589676, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2478.732177734375, |
|
"epoch": 1.728, |
|
"grad_norm": 0.48547589778900146, |
|
"kl": 0.373046875, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": 0.0409, |
|
"reward": 0.5677317231893539, |
|
"reward_std": 0.6051659360527992, |
|
"rewards/cosine_scaled_reward": -0.09113414993043989, |
|
"rewards/format_reward": 0.7500000149011612, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2518.7560424804688, |
|
"epoch": 1.732, |
|
"grad_norm": 0.9139849543571472, |
|
"kl": 0.3701171875, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.01, |
|
"reward": 0.6564267948269844, |
|
"reward_std": 0.6321954727172852, |
|
"rewards/cosine_scaled_reward": -0.04083424177952111, |
|
"rewards/format_reward": 0.7380952388048172, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2439.119110107422, |
|
"epoch": 1.736, |
|
"grad_norm": 0.4629580080509186, |
|
"kl": 0.3349609375, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": 0.0828, |
|
"reward": 0.6547855883836746, |
|
"reward_std": 0.6274382770061493, |
|
"rewards/cosine_scaled_reward": -0.05653578881174326, |
|
"rewards/format_reward": 0.767857164144516, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2385.047637939453, |
|
"epoch": 1.74, |
|
"grad_norm": 0.4398196041584015, |
|
"kl": 0.342529296875, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.034, |
|
"reward": 0.59782674908638, |
|
"reward_std": 0.7165493220090866, |
|
"rewards/cosine_scaled_reward": -0.09394377004355192, |
|
"rewards/format_reward": 0.7857142984867096, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2235.4405212402344, |
|
"epoch": 1.744, |
|
"grad_norm": 0.5894522070884705, |
|
"kl": 0.321044921875, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.0578, |
|
"reward": 0.7302387952804565, |
|
"reward_std": 0.7528126537799835, |
|
"rewards/cosine_scaled_reward": -0.018809196539223194, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2364.2083740234375, |
|
"epoch": 1.748, |
|
"grad_norm": 0.517291784286499, |
|
"kl": 0.262451171875, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": 0.0626, |
|
"reward": 0.5654323399066925, |
|
"reward_std": 0.6579017788171768, |
|
"rewards/cosine_scaled_reward": -0.03276003524661064, |
|
"rewards/format_reward": 0.630952388048172, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2464.0238647460938, |
|
"epoch": 1.752, |
|
"grad_norm": 0.719810426235199, |
|
"kl": 0.34619140625, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.0289, |
|
"reward": 0.5253645405173302, |
|
"reward_std": 0.593732014298439, |
|
"rewards/cosine_scaled_reward": -0.10934155760332942, |
|
"rewards/format_reward": 0.7440476268529892, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2688.2559814453125, |
|
"epoch": 1.756, |
|
"grad_norm": 0.47081246972084045, |
|
"kl": 0.320068359375, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": 0.0737, |
|
"reward": 0.5551765933632851, |
|
"reward_std": 0.750535324215889, |
|
"rewards/cosine_scaled_reward": -0.043840276543051004, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2691.5298461914062, |
|
"epoch": 1.76, |
|
"grad_norm": 0.3561669588088989, |
|
"kl": 0.30517578125, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0412, |
|
"reward": 0.6305891573429108, |
|
"reward_std": 0.7718498408794403, |
|
"rewards/cosine_scaled_reward": -0.02399112842977047, |
|
"rewards/format_reward": 0.6785714477300644, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2423.3274536132812, |
|
"epoch": 1.764, |
|
"grad_norm": 0.8448560237884521, |
|
"kl": 0.34130859375, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": 0.0974, |
|
"reward": 0.6258707121014595, |
|
"reward_std": 0.7022215574979782, |
|
"rewards/cosine_scaled_reward": -0.06504084914922714, |
|
"rewards/format_reward": 0.7559524029493332, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2509.6845703125, |
|
"epoch": 1.768, |
|
"grad_norm": 0.49845507740974426, |
|
"kl": 0.3271484375, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.0368, |
|
"reward": 0.6952026858925819, |
|
"reward_std": 0.7732700109481812, |
|
"rewards/cosine_scaled_reward": -0.03037486458197236, |
|
"rewards/format_reward": 0.7559524029493332, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2251.6370239257812, |
|
"epoch": 1.772, |
|
"grad_norm": 1.6570687294006348, |
|
"kl": 0.32861328125, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.1251, |
|
"reward": 0.5299716778099537, |
|
"reward_std": 0.6262076199054718, |
|
"rewards/cosine_scaled_reward": -0.10108558752108365, |
|
"rewards/format_reward": 0.7321428507566452, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2569.8392944335938, |
|
"epoch": 1.776, |
|
"grad_norm": 0.5071792602539062, |
|
"kl": 0.285400390625, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0461, |
|
"reward": 0.3558937795460224, |
|
"reward_std": 0.6386721879243851, |
|
"rewards/cosine_scaled_reward": -0.12860072287730873, |
|
"rewards/format_reward": 0.6130952537059784, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2434.52978515625, |
|
"epoch": 1.78, |
|
"grad_norm": 0.6472364068031311, |
|
"kl": 0.33935546875, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0324, |
|
"reward": 0.5342502817511559, |
|
"reward_std": 0.7027776390314102, |
|
"rewards/cosine_scaled_reward": -0.08108916692435741, |
|
"rewards/format_reward": 0.696428582072258, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2600.7678833007812, |
|
"epoch": 1.784, |
|
"grad_norm": 0.4613596796989441, |
|
"kl": 0.36767578125, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": 0.0366, |
|
"reward": 0.3209609054028988, |
|
"reward_std": 0.6079899072647095, |
|
"rewards/cosine_scaled_reward": -0.15499573945999146, |
|
"rewards/format_reward": 0.630952388048172, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2406.6131286621094, |
|
"epoch": 1.788, |
|
"grad_norm": 0.5609318017959595, |
|
"kl": 0.3349609375, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.1069, |
|
"reward": 0.8279012702405453, |
|
"reward_std": 0.6648521721363068, |
|
"rewards/cosine_scaled_reward": 0.035974426195025444, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2292.5238647460938, |
|
"epoch": 1.792, |
|
"grad_norm": 0.6498924493789673, |
|
"kl": 0.3251953125, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": 0.1002, |
|
"reward": 0.9468577206134796, |
|
"reward_std": 0.8284895867109299, |
|
"rewards/cosine_scaled_reward": 0.1073574130423367, |
|
"rewards/format_reward": 0.7321428656578064, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2359.7203063964844, |
|
"epoch": 1.796, |
|
"grad_norm": 0.8151586651802063, |
|
"kl": 0.29248046875, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.1025, |
|
"reward": 0.6154885776340961, |
|
"reward_std": 0.643234595656395, |
|
"rewards/cosine_scaled_reward": -0.028565243119373918, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2456.5952758789062, |
|
"epoch": 1.8, |
|
"grad_norm": 0.6727307438850403, |
|
"kl": 0.3369140625, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0518, |
|
"reward": 0.5276128388941288, |
|
"reward_std": 0.6850098147988319, |
|
"rewards/cosine_scaled_reward": -0.05166977294720709, |
|
"rewards/format_reward": 0.6309523805975914, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2180.250030517578, |
|
"epoch": 1.804, |
|
"grad_norm": 0.4327715039253235, |
|
"kl": 0.2841796875, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.0396, |
|
"reward": 0.9219238460063934, |
|
"reward_std": 0.8085188716650009, |
|
"rewards/cosine_scaled_reward": 0.029414291959255934, |
|
"rewards/format_reward": 0.8630952537059784, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2414.9584045410156, |
|
"epoch": 1.808, |
|
"grad_norm": 0.5890040993690491, |
|
"kl": 0.33642578125, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.0116, |
|
"reward": 0.45377534069120884, |
|
"reward_std": 0.6864534169435501, |
|
"rewards/cosine_scaled_reward": -0.07668375968933105, |
|
"rewards/format_reward": 0.607142873108387, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2325.2619018554688, |
|
"epoch": 1.812, |
|
"grad_norm": 0.8580995798110962, |
|
"kl": 0.38330078125, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.1211, |
|
"reward": 0.41875267028808594, |
|
"reward_std": 0.5978472009301186, |
|
"rewards/cosine_scaled_reward": -0.17455224692821503, |
|
"rewards/format_reward": 0.7678571492433548, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2524.8452758789062, |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 0.6263750195503235, |
|
"kl": 0.369140625, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0599, |
|
"reward": 0.5164637118577957, |
|
"reward_std": 0.7428598999977112, |
|
"rewards/cosine_scaled_reward": -0.0989109962247312, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2222.684600830078, |
|
"epoch": 1.8199999999999998, |
|
"grad_norm": 0.46227335929870605, |
|
"kl": 0.257568359375, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0452, |
|
"reward": 0.6773854792118073, |
|
"reward_std": 0.6582589149475098, |
|
"rewards/cosine_scaled_reward": -0.03333106730133295, |
|
"rewards/format_reward": 0.744047611951828, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2381.2916870117188, |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 1.395836591720581, |
|
"kl": 0.346923828125, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": -0.0036, |
|
"reward": 0.6790124624967575, |
|
"reward_std": 0.7677509784698486, |
|
"rewards/cosine_scaled_reward": -0.0354937631636858, |
|
"rewards/format_reward": 0.7500000298023224, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2236.1369018554688, |
|
"epoch": 1.8279999999999998, |
|
"grad_norm": 0.363459974527359, |
|
"kl": 0.359375, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": 0.0897, |
|
"reward": 0.8419362753629684, |
|
"reward_std": 0.8306869268417358, |
|
"rewards/cosine_scaled_reward": 0.034063366474583745, |
|
"rewards/format_reward": 0.7738095223903656, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2312.5059814453125, |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.5052822232246399, |
|
"kl": 0.35791015625, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.0619, |
|
"reward": 0.744497187435627, |
|
"reward_std": 0.6325250118970871, |
|
"rewards/cosine_scaled_reward": -0.02953713061287999, |
|
"rewards/format_reward": 0.8035714477300644, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2242.0357971191406, |
|
"epoch": 1.8359999999999999, |
|
"grad_norm": 0.4124799966812134, |
|
"kl": 0.30615234375, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.0801, |
|
"reward": 0.7117128595709801, |
|
"reward_std": 0.7263730615377426, |
|
"rewards/cosine_scaled_reward": -0.031048328906763345, |
|
"rewards/format_reward": 0.773809552192688, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2795.3512573242188, |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.5879099369049072, |
|
"kl": 0.341796875, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0299, |
|
"reward": 0.41843298077583313, |
|
"reward_std": 0.6329772919416428, |
|
"rewards/cosine_scaled_reward": -0.11518826894462109, |
|
"rewards/format_reward": 0.6488095223903656, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2640.6904907226562, |
|
"epoch": 1.8439999999999999, |
|
"grad_norm": 0.3294979929924011, |
|
"kl": 0.38818359375, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.0662, |
|
"reward": 0.6549192667007446, |
|
"reward_std": 0.8022814393043518, |
|
"rewards/cosine_scaled_reward": -0.02373085916042328, |
|
"rewards/format_reward": 0.7023809552192688, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2345.4285888671875, |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 0.3771924376487732, |
|
"kl": 0.35595703125, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.105, |
|
"reward": 0.6399585753679276, |
|
"reward_std": 0.7968022599816322, |
|
"rewards/cosine_scaled_reward": -0.08478261809796095, |
|
"rewards/format_reward": 0.8095238208770752, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2589.1666870117188, |
|
"epoch": 1.8519999999999999, |
|
"grad_norm": 0.3122951090335846, |
|
"kl": 0.3798828125, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": 0.0565, |
|
"reward": 0.43525535613298416, |
|
"reward_std": 0.7020779103040695, |
|
"rewards/cosine_scaled_reward": -0.09784852154552937, |
|
"rewards/format_reward": 0.6309523954987526, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2481.261962890625, |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.5730969905853271, |
|
"kl": 0.33984375, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.0699, |
|
"reward": 0.7782276198267937, |
|
"reward_std": 0.6072199195623398, |
|
"rewards/cosine_scaled_reward": 0.02304239757359028, |
|
"rewards/format_reward": 0.7321428656578064, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2644.619140625, |
|
"epoch": 1.8599999999999999, |
|
"grad_norm": 0.39649447798728943, |
|
"kl": 0.373046875, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0712, |
|
"reward": 0.6695687249302864, |
|
"reward_std": 0.8249562680721283, |
|
"rewards/cosine_scaled_reward": 0.007403409108519554, |
|
"rewards/format_reward": 0.6547619178891182, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2491.0119018554688, |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 0.4567016065120697, |
|
"kl": 0.36376953125, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.0702, |
|
"reward": 0.46852924674749374, |
|
"reward_std": 0.7603975385427475, |
|
"rewards/cosine_scaled_reward": -0.1199020454660058, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2532.6607666015625, |
|
"epoch": 1.8679999999999999, |
|
"grad_norm": 0.36717355251312256, |
|
"kl": 0.330078125, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.0491, |
|
"reward": 0.5859625339508057, |
|
"reward_std": 0.6559573635458946, |
|
"rewards/cosine_scaled_reward": -0.028447304794099182, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2622.2559814453125, |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 0.4103451669216156, |
|
"kl": 0.36279296875, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": 0.0606, |
|
"reward": 0.6576881408691406, |
|
"reward_std": 0.8001267910003662, |
|
"rewards/cosine_scaled_reward": -0.007465461269021034, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2134.279815673828, |
|
"epoch": 1.876, |
|
"grad_norm": 0.5138155817985535, |
|
"kl": 0.256103515625, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.0759, |
|
"reward": 0.6648233011364937, |
|
"reward_std": 0.7618712484836578, |
|
"rewards/cosine_scaled_reward": -0.06044549681246281, |
|
"rewards/format_reward": 0.7857143133878708, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2160.202392578125, |
|
"epoch": 1.88, |
|
"grad_norm": 0.3472672700881958, |
|
"kl": 0.30126953125, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0461, |
|
"reward": 0.7422109395265579, |
|
"reward_std": 0.6609758958220482, |
|
"rewards/cosine_scaled_reward": -0.012823125813156366, |
|
"rewards/format_reward": 0.767857164144516, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2532.3809814453125, |
|
"epoch": 1.884, |
|
"grad_norm": 0.2868484556674957, |
|
"kl": 0.2763671875, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": 0.0493, |
|
"reward": 0.48562416061758995, |
|
"reward_std": 0.7373960316181183, |
|
"rewards/cosine_scaled_reward": -0.08159269354655407, |
|
"rewards/format_reward": 0.648809552192688, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2409.3392944335938, |
|
"epoch": 1.888, |
|
"grad_norm": 0.4656960964202881, |
|
"kl": 0.390380859375, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": 0.0561, |
|
"reward": 0.6008469248190522, |
|
"reward_std": 0.6282935440540314, |
|
"rewards/cosine_scaled_reward": -0.05374322272837162, |
|
"rewards/format_reward": 0.708333358168602, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2294.0178833007812, |
|
"epoch": 1.892, |
|
"grad_norm": 0.5274000763893127, |
|
"kl": 0.3369140625, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.0877, |
|
"reward": 0.7744100838899612, |
|
"reward_std": 0.7935537397861481, |
|
"rewards/cosine_scaled_reward": 0.00030028633773326874, |
|
"rewards/format_reward": 0.7738095372915268, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2676.6727294921875, |
|
"epoch": 1.896, |
|
"grad_norm": 0.5600417256355286, |
|
"kl": 0.302978515625, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.0338, |
|
"reward": 0.47137061692774296, |
|
"reward_std": 0.7821067273616791, |
|
"rewards/cosine_scaled_reward": -0.07681469712406397, |
|
"rewards/format_reward": 0.6250000074505806, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2442.0952758789062, |
|
"epoch": 1.9, |
|
"grad_norm": 0.5208225846290588, |
|
"kl": 0.32470703125, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0837, |
|
"reward": 0.38532300293445587, |
|
"reward_std": 0.5505756810307503, |
|
"rewards/cosine_scaled_reward": -0.1287670750170946, |
|
"rewards/format_reward": 0.6428571492433548, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2030.7857360839844, |
|
"epoch": 1.904, |
|
"grad_norm": 0.6633386611938477, |
|
"kl": 0.269287109375, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.0183, |
|
"reward": 0.796258918941021, |
|
"reward_std": 0.8103819191455841, |
|
"rewards/cosine_scaled_reward": 0.017177060712128878, |
|
"rewards/format_reward": 0.761904776096344, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2734.2262573242188, |
|
"epoch": 1.908, |
|
"grad_norm": 0.5043067932128906, |
|
"kl": 0.35791015625, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0378, |
|
"reward": 0.2551159653812647, |
|
"reward_std": 0.5920611470937729, |
|
"rewards/cosine_scaled_reward": -0.155180131085217, |
|
"rewards/format_reward": 0.5654762089252472, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2301.0952758789062, |
|
"epoch": 1.912, |
|
"grad_norm": 0.7771977186203003, |
|
"kl": 0.28662109375, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.0228, |
|
"reward": 0.6790298409759998, |
|
"reward_std": 0.6661486774682999, |
|
"rewards/cosine_scaled_reward": -0.06524700409499928, |
|
"rewards/format_reward": 0.8095238357782364, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2247.6012573242188, |
|
"epoch": 1.916, |
|
"grad_norm": 0.599141001701355, |
|
"kl": 0.3212890625, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.0857, |
|
"reward": 0.8667033798992634, |
|
"reward_std": 0.8036679923534393, |
|
"rewards/cosine_scaled_reward": 0.06132788397371769, |
|
"rewards/format_reward": 0.7440476268529892, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2093.3333740234375, |
|
"epoch": 1.92, |
|
"grad_norm": 0.5312609076499939, |
|
"kl": 0.2958984375, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0735, |
|
"reward": 0.7348574697971344, |
|
"reward_std": 0.689183309674263, |
|
"rewards/cosine_scaled_reward": -0.034356983145698905, |
|
"rewards/format_reward": 0.8035714328289032, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2448.1726684570312, |
|
"epoch": 1.924, |
|
"grad_norm": 0.5402917861938477, |
|
"kl": 0.36669921875, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": 0.0831, |
|
"reward": 0.43995974212884903, |
|
"reward_std": 0.6862698197364807, |
|
"rewards/cosine_scaled_reward": -0.13121061958372593, |
|
"rewards/format_reward": 0.70238097012043, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2658.7977294921875, |
|
"epoch": 1.928, |
|
"grad_norm": 0.5909121632575989, |
|
"kl": 0.3623046875, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.0547, |
|
"reward": 0.5179771184921265, |
|
"reward_std": 0.7944772690534592, |
|
"rewards/cosine_scaled_reward": -0.0981543204979971, |
|
"rewards/format_reward": 0.7142857313156128, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2292.1428833007812, |
|
"epoch": 1.932, |
|
"grad_norm": 0.549201488494873, |
|
"kl": 0.30615234375, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.0906, |
|
"reward": 0.6546563804149628, |
|
"reward_std": 0.7558221146464348, |
|
"rewards/cosine_scaled_reward": -0.017909929156303406, |
|
"rewards/format_reward": 0.690476194024086, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2001.2381286621094, |
|
"epoch": 1.936, |
|
"grad_norm": 0.9190180897712708, |
|
"kl": 0.2490234375, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": -0.0131, |
|
"reward": 0.6368911117315292, |
|
"reward_std": 0.5770624950528145, |
|
"rewards/cosine_scaled_reward": -0.07441157009452581, |
|
"rewards/format_reward": 0.7857142835855484, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2369.482177734375, |
|
"epoch": 1.94, |
|
"grad_norm": 0.48303577303886414, |
|
"kl": 0.32861328125, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.0806, |
|
"reward": 0.5949805751442909, |
|
"reward_std": 0.7235869467258453, |
|
"rewards/cosine_scaled_reward": -0.04179543023929, |
|
"rewards/format_reward": 0.6785714328289032, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2607.375030517578, |
|
"epoch": 1.944, |
|
"grad_norm": 0.45922327041625977, |
|
"kl": 0.334228515625, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.0495, |
|
"reward": 0.4739008713513613, |
|
"reward_std": 0.7411531507968903, |
|
"rewards/cosine_scaled_reward": -0.10828767996281385, |
|
"rewards/format_reward": 0.690476194024086, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2427.3929443359375, |
|
"epoch": 1.948, |
|
"grad_norm": 0.42017099261283875, |
|
"kl": 0.281494140625, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.0322, |
|
"reward": 0.3904539607465267, |
|
"reward_std": 0.6817184686660767, |
|
"rewards/cosine_scaled_reward": -0.13810635451227427, |
|
"rewards/format_reward": 0.6666666716337204, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2579.0120239257812, |
|
"epoch": 1.952, |
|
"grad_norm": 0.7048377394676208, |
|
"kl": 0.322265625, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.028, |
|
"reward": 0.5259524993598461, |
|
"reward_std": 0.6612162664532661, |
|
"rewards/cosine_scaled_reward": -0.09714281000196934, |
|
"rewards/format_reward": 0.7202381193637848, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2246.9524536132812, |
|
"epoch": 1.956, |
|
"grad_norm": 0.4814748167991638, |
|
"kl": 0.313720703125, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.0807, |
|
"reward": 0.7873745709657669, |
|
"reward_std": 0.7711023241281509, |
|
"rewards/cosine_scaled_reward": -0.005122252739965916, |
|
"rewards/format_reward": 0.7976190745830536, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2849.761962890625, |
|
"epoch": 1.96, |
|
"grad_norm": 0.5227950215339661, |
|
"kl": 0.37890625, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.0171, |
|
"reward": 0.4944131616503, |
|
"reward_std": 0.6706523001194, |
|
"rewards/cosine_scaled_reward": -0.06826960667967796, |
|
"rewards/format_reward": 0.6309524029493332, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2594.3631591796875, |
|
"epoch": 1.964, |
|
"grad_norm": 0.5116230249404907, |
|
"kl": 0.40576171875, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.0511, |
|
"reward": 0.40869739279150963, |
|
"reward_std": 0.703234076499939, |
|
"rewards/cosine_scaled_reward": -0.12600845471024513, |
|
"rewards/format_reward": 0.6607142984867096, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2416.8632202148438, |
|
"epoch": 1.968, |
|
"grad_norm": 0.9567095637321472, |
|
"kl": 0.27783203125, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.1235, |
|
"reward": 0.5404094010591507, |
|
"reward_std": 0.6638472378253937, |
|
"rewards/cosine_scaled_reward": -0.054200079292058945, |
|
"rewards/format_reward": 0.6488095372915268, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2514.1666870117188, |
|
"epoch": 1.972, |
|
"grad_norm": 0.4846276044845581, |
|
"kl": 0.28466796875, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.0436, |
|
"reward": 0.5605661012232304, |
|
"reward_std": 0.6418938338756561, |
|
"rewards/cosine_scaled_reward": -0.03519314527511597, |
|
"rewards/format_reward": 0.630952388048172, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2289.71435546875, |
|
"epoch": 1.976, |
|
"grad_norm": 0.6296063661575317, |
|
"kl": 0.3212890625, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": 0.0439, |
|
"reward": 0.6024229377508163, |
|
"reward_std": 0.7128957360982895, |
|
"rewards/cosine_scaled_reward": -0.07081234554061666, |
|
"rewards/format_reward": 0.744047611951828, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2491.3334045410156, |
|
"epoch": 1.98, |
|
"grad_norm": 0.622008204460144, |
|
"kl": 0.283447265625, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0361, |
|
"reward": 0.594695046544075, |
|
"reward_std": 0.6841184943914413, |
|
"rewards/cosine_scaled_reward": -0.03896199120208621, |
|
"rewards/format_reward": 0.6726190745830536, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2395.636962890625, |
|
"epoch": 1.984, |
|
"grad_norm": 0.30918648838996887, |
|
"kl": 0.29931640625, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.0827, |
|
"reward": 0.5910248765721917, |
|
"reward_std": 0.6182541996240616, |
|
"rewards/cosine_scaled_reward": -0.05567805375903845, |
|
"rewards/format_reward": 0.70238097012043, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2368.8631591796875, |
|
"epoch": 1.988, |
|
"grad_norm": 1.1213865280151367, |
|
"kl": 0.3515625, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0439, |
|
"reward": 0.48846913129091263, |
|
"reward_std": 0.6297848075628281, |
|
"rewards/cosine_scaled_reward": -0.13374162535183132, |
|
"rewards/format_reward": 0.755952388048172, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2289.3690795898438, |
|
"epoch": 1.992, |
|
"grad_norm": 0.810573399066925, |
|
"kl": 0.302734375, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.0228, |
|
"reward": 0.6517780050635338, |
|
"reward_std": 0.7580654174089432, |
|
"rewards/cosine_scaled_reward": -0.046134804193570744, |
|
"rewards/format_reward": 0.7440476417541504, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2452.2738647460938, |
|
"epoch": 1.996, |
|
"grad_norm": 0.357543408870697, |
|
"kl": 0.32763671875, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": 0.0643, |
|
"reward": 0.3793360572308302, |
|
"reward_std": 0.7790006846189499, |
|
"rewards/cosine_scaled_reward": -0.15557007491588593, |
|
"rewards/format_reward": 0.6904762089252472, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2528.7500610351562, |
|
"epoch": 2.0, |
|
"grad_norm": 0.9307948350906372, |
|
"kl": 0.306640625, |
|
"learning_rate": 1e-07, |
|
"loss": 0.1364, |
|
"reward": 0.5999207645654678, |
|
"reward_std": 0.6981495916843414, |
|
"rewards/cosine_scaled_reward": -0.03634915268048644, |
|
"rewards/format_reward": 0.6726190596818924, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0725239302306436, |
|
"train_runtime": 62033.0192, |
|
"train_samples_per_second": 1.354, |
|
"train_steps_per_second": 0.008 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|