OpenRS-GRPO / trainer_state.json
advaithc's picture
Model save
99a8858 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 2822.7857666015625,
"epoch": 0.004,
"grad_norm": 0.12564538419246674,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0645,
"reward": 0.09580668434500694,
"reward_std": 0.5702872574329376,
"rewards/cosine_scaled_reward": -0.14554904401302338,
"rewards/format_reward": 0.3869047649204731,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2575.571533203125,
"epoch": 0.008,
"grad_norm": 0.15411853790283203,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0717,
"reward": 0.5743008255958557,
"reward_std": 0.7826777100563049,
"rewards/cosine_scaled_reward": 0.03119804011657834,
"rewards/format_reward": 0.5119047686457634,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2762.1190490722656,
"epoch": 0.012,
"grad_norm": 0.13477382063865662,
"kl": 3.463029861450195e-05,
"learning_rate": 6e-08,
"loss": 0.0865,
"reward": 0.21700193732976913,
"reward_std": 0.6844624578952789,
"rewards/cosine_scaled_reward": -0.10578475520014763,
"rewards/format_reward": 0.4285714402794838,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2686.3214721679688,
"epoch": 0.016,
"grad_norm": 0.1282820850610733,
"kl": 2.434849739074707e-05,
"learning_rate": 8e-08,
"loss": 0.0525,
"reward": 0.4696298725903034,
"reward_std": 0.7235232815146446,
"rewards/cosine_scaled_reward": -0.0062565067782998085,
"rewards/format_reward": 0.4821428582072258,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2917.5535888671875,
"epoch": 0.02,
"grad_norm": 0.14993517100811005,
"kl": 3.725290298461914e-05,
"learning_rate": 1e-07,
"loss": 0.0762,
"reward": 0.15318153076805174,
"reward_std": 0.7213103845715523,
"rewards/cosine_scaled_reward": -0.08412353717721999,
"rewards/format_reward": 0.3214285857975483,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2816.2559814453125,
"epoch": 0.024,
"grad_norm": 0.14960958063602448,
"kl": 3.1054019927978516e-05,
"learning_rate": 1.2e-07,
"loss": 0.0537,
"reward": 0.2950221598148346,
"reward_std": 0.738863505423069,
"rewards/cosine_scaled_reward": -0.057846077223075554,
"rewards/format_reward": 0.410714291036129,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 2870.3988647460938,
"epoch": 0.028,
"grad_norm": 0.10985030233860016,
"kl": 2.8133392333984375e-05,
"learning_rate": 1.4e-07,
"loss": 0.0068,
"reward": 0.27893248095642775,
"reward_std": 0.7550084367394447,
"rewards/cosine_scaled_reward": -0.05993851972743869,
"rewards/format_reward": 0.3988095410168171,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 3160.452392578125,
"epoch": 0.032,
"grad_norm": 0.10308283567428589,
"kl": 3.8176774978637695e-05,
"learning_rate": 1.6e-07,
"loss": 0.0223,
"reward": 0.07877065148204565,
"reward_std": 0.6431715190410614,
"rewards/cosine_scaled_reward": -0.08263849129434675,
"rewards/format_reward": 0.2440476268529892,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 3020.607177734375,
"epoch": 0.036,
"grad_norm": 0.15057384967803955,
"kl": 3.37064266204834e-05,
"learning_rate": 1.8e-07,
"loss": 0.0733,
"reward": 0.06793000735342503,
"reward_std": 0.6978132948279381,
"rewards/cosine_scaled_reward": -0.12079690210521221,
"rewards/format_reward": 0.3095238171517849,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 3089.84521484375,
"epoch": 0.04,
"grad_norm": 0.11256518214941025,
"kl": 3.2395124435424805e-05,
"learning_rate": 2e-07,
"loss": 0.0413,
"reward": 0.032662300392985344,
"reward_std": 0.6881319805979729,
"rewards/cosine_scaled_reward": -0.13545456249266863,
"rewards/format_reward": 0.3035714365541935,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2851.946533203125,
"epoch": 0.044,
"grad_norm": 0.17106953263282776,
"kl": 3.784894943237305e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0636,
"reward": 0.3718952457420528,
"reward_std": 0.6902545392513275,
"rewards/cosine_scaled_reward": -0.03131428617052734,
"rewards/format_reward": 0.4345238134264946,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2798.5178833007812,
"epoch": 0.048,
"grad_norm": 0.1335103064775467,
"kl": 2.9146671295166016e-05,
"learning_rate": 2.4e-07,
"loss": 0.0543,
"reward": 0.40071453526616096,
"reward_std": 0.7024472132325172,
"rewards/cosine_scaled_reward": -0.02285701408982277,
"rewards/format_reward": 0.4464285746216774,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2948.9464721679688,
"epoch": 0.052,
"grad_norm": 0.1271769255399704,
"kl": 3.698468208312988e-05,
"learning_rate": 2.6e-07,
"loss": 0.0693,
"reward": 0.47545497864484787,
"reward_std": 0.7740402817726135,
"rewards/cosine_scaled_reward": 0.002608438953757286,
"rewards/format_reward": 0.470238097012043,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2679.3928833007812,
"epoch": 0.056,
"grad_norm": 0.12242422997951508,
"kl": 2.8014183044433594e-05,
"learning_rate": 2.8e-07,
"loss": 0.0489,
"reward": 0.40165250562131405,
"reward_std": 0.7790777683258057,
"rewards/cosine_scaled_reward": -0.028340420685708523,
"rewards/format_reward": 0.4583333507180214,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2889.136962890625,
"epoch": 0.06,
"grad_norm": 0.19158992171287537,
"kl": 3.224611282348633e-05,
"learning_rate": 3e-07,
"loss": 0.0704,
"reward": 0.15117042418569326,
"reward_std": 0.6893174201250076,
"rewards/cosine_scaled_reward": -0.10893859504722059,
"rewards/format_reward": 0.3690476231276989,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 2892.6488647460938,
"epoch": 0.064,
"grad_norm": 0.15633279085159302,
"kl": 3.668665885925293e-05,
"learning_rate": 3.2e-07,
"loss": 0.0733,
"reward": -0.09426919370889664,
"reward_std": 0.5802397355437279,
"rewards/cosine_scaled_reward": -0.18403935432434082,
"rewards/format_reward": 0.2738095298409462,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2920.3511962890625,
"epoch": 0.068,
"grad_norm": 0.12191536277532578,
"kl": 3.221631050109863e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0214,
"reward": 0.13339833123609424,
"reward_std": 0.6428257077932358,
"rewards/cosine_scaled_reward": -0.11187227349728346,
"rewards/format_reward": 0.3571428619325161,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 2704.672607421875,
"epoch": 0.072,
"grad_norm": 0.2207571119070053,
"kl": 2.4259090423583984e-05,
"learning_rate": 3.6e-07,
"loss": 0.0858,
"reward": 0.4250662699341774,
"reward_std": 0.7673918604850769,
"rewards/cosine_scaled_reward": -0.019609727547504008,
"rewards/format_reward": 0.4642857350409031,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2800.6429443359375,
"epoch": 0.076,
"grad_norm": 0.12734168767929077,
"kl": 2.4378299713134766e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0471,
"reward": 0.4042445756494999,
"reward_std": 0.6929292231798172,
"rewards/cosine_scaled_reward": -0.02704438249929808,
"rewards/format_reward": 0.4583333432674408,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 2697.8274536132812,
"epoch": 0.08,
"grad_norm": 0.15472018718719482,
"kl": 2.35140323638916e-05,
"learning_rate": 4e-07,
"loss": 0.0265,
"reward": 0.3560524769127369,
"reward_std": 0.6769110411405563,
"rewards/cosine_scaled_reward": -0.05114044318906963,
"rewards/format_reward": 0.4583333432674408,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2339.4405517578125,
"epoch": 0.084,
"grad_norm": 0.21466447412967682,
"kl": 2.086162567138672e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0806,
"reward": 0.7416469305753708,
"reward_std": 0.841043546795845,
"rewards/cosine_scaled_reward": 0.06725202780216932,
"rewards/format_reward": 0.6071428656578064,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 2781.5238037109375,
"epoch": 0.088,
"grad_norm": 0.19139103591442108,
"kl": 3.0338764190673828e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0773,
"reward": 0.20257593411952257,
"reward_std": 0.7891978472471237,
"rewards/cosine_scaled_reward": -0.1129977386444807,
"rewards/format_reward": 0.4285714365541935,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 3036.761962890625,
"epoch": 0.092,
"grad_norm": 0.1108132153749466,
"kl": 2.6345252990722656e-05,
"learning_rate": 4.6e-07,
"loss": 0.0238,
"reward": 0.20629926398396492,
"reward_std": 0.7457813173532486,
"rewards/cosine_scaled_reward": -0.07542179408483207,
"rewards/format_reward": 0.3571428619325161,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 3143.1131591796875,
"epoch": 0.096,
"grad_norm": 0.10591176152229309,
"kl": 2.703070640563965e-05,
"learning_rate": 4.8e-07,
"loss": 0.0637,
"reward": 0.0749267227947712,
"reward_std": 0.6808565855026245,
"rewards/cosine_scaled_reward": -0.1292033027857542,
"rewards/format_reward": 0.3333333395421505,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 2934.4227294921875,
"epoch": 0.1,
"grad_norm": 0.12180113047361374,
"kl": 1.4990568161010742e-05,
"learning_rate": 5e-07,
"loss": 0.0525,
"reward": 0.3605663161724806,
"reward_std": 0.7757866084575653,
"rewards/cosine_scaled_reward": -0.028050171211361885,
"rewards/format_reward": 0.4166666716337204,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 3100.5357666015625,
"epoch": 0.104,
"grad_norm": 0.14736856520175934,
"kl": 2.230703830718994e-05,
"learning_rate": 5.2e-07,
"loss": 0.087,
"reward": 0.1489051878452301,
"reward_std": 0.7608643025159836,
"rewards/cosine_scaled_reward": -0.08923787740059197,
"rewards/format_reward": 0.32738095708191395,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 2978.452392578125,
"epoch": 0.108,
"grad_norm": 0.1490376740694046,
"kl": 2.7447938919067383e-05,
"learning_rate": 5.4e-07,
"loss": 0.0633,
"reward": 0.16602796246297657,
"reward_std": 0.762113556265831,
"rewards/cosine_scaled_reward": -0.09555743727833033,
"rewards/format_reward": 0.3571428656578064,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2923.1012573242188,
"epoch": 0.112,
"grad_norm": 0.11410558968782425,
"kl": 3.108382225036621e-05,
"learning_rate": 5.6e-07,
"loss": 0.0618,
"reward": 0.058234728407114744,
"reward_std": 0.5919530540704727,
"rewards/cosine_scaled_reward": -0.14052549470216036,
"rewards/format_reward": 0.3392857201397419,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2925.1011962890625,
"epoch": 0.116,
"grad_norm": 0.17734545469284058,
"kl": 5.251169204711914e-05,
"learning_rate": 5.8e-07,
"loss": 0.0463,
"reward": 0.24072746047750115,
"reward_std": 0.7061209976673126,
"rewards/cosine_scaled_reward": -0.061183891259133816,
"rewards/format_reward": 0.3630952425301075,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 2882.39892578125,
"epoch": 0.12,
"grad_norm": 0.15557299554347992,
"kl": 2.1502375602722168e-05,
"learning_rate": 6e-07,
"loss": 0.0703,
"reward": 0.22035705484449863,
"reward_std": 0.5751676708459854,
"rewards/cosine_scaled_reward": -0.07136909663677216,
"rewards/format_reward": 0.3630952462553978,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2717.607177734375,
"epoch": 0.124,
"grad_norm": 0.16903533041477203,
"kl": 6.181001663208008e-05,
"learning_rate": 6.2e-07,
"loss": 0.07,
"reward": 0.3481953740119934,
"reward_std": 0.7361179888248444,
"rewards/cosine_scaled_reward": -0.025307081639766693,
"rewards/format_reward": 0.3988095298409462,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2585.244140625,
"epoch": 0.128,
"grad_norm": 0.1481872797012329,
"kl": 0.00023996829986572266,
"learning_rate": 6.4e-07,
"loss": 0.0664,
"reward": 0.5805501043796539,
"reward_std": 0.805858314037323,
"rewards/cosine_scaled_reward": 0.019441714510321617,
"rewards/format_reward": 0.5416666865348816,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 2608.7083740234375,
"epoch": 0.132,
"grad_norm": 0.10800693184137344,
"kl": 0.0002808570861816406,
"learning_rate": 6.6e-07,
"loss": 0.0255,
"reward": 0.5432634204626083,
"reward_std": 0.7363616675138474,
"rewards/cosine_scaled_reward": 0.02460789866745472,
"rewards/format_reward": 0.4940476268529892,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2769.1964721679688,
"epoch": 0.136,
"grad_norm": 0.12105516344308853,
"kl": 0.00020498037338256836,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0184,
"reward": 0.18091929703950882,
"reward_std": 0.6703035831451416,
"rewards/cosine_scaled_reward": -0.10596893168985844,
"rewards/format_reward": 0.3928571604192257,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3090.1607666015625,
"epoch": 0.14,
"grad_norm": 0.11914981156587601,
"kl": 0.0002568960189819336,
"learning_rate": 7e-07,
"loss": 0.0617,
"reward": 0.08196480484912172,
"reward_std": 0.7742973417043686,
"rewards/cosine_scaled_reward": -0.10782713070511818,
"rewards/format_reward": 0.2976190559566021,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2790.0596313476562,
"epoch": 0.144,
"grad_norm": 0.10883598774671555,
"kl": 0.00027942657470703125,
"learning_rate": 7.2e-07,
"loss": 0.0163,
"reward": 0.31424143677577376,
"reward_std": 0.669949933886528,
"rewards/cosine_scaled_reward": -0.06311738677322865,
"rewards/format_reward": 0.4404762014746666,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 2916.3452758789062,
"epoch": 0.148,
"grad_norm": 0.1492447555065155,
"kl": 0.00025272369384765625,
"learning_rate": 7.4e-07,
"loss": 0.0592,
"reward": 0.1357547640800476,
"reward_std": 0.7365808188915253,
"rewards/cosine_scaled_reward": -0.10771786456461996,
"rewards/format_reward": 0.3511904813349247,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3342.6786499023438,
"epoch": 0.152,
"grad_norm": 0.08414288610219955,
"kl": 0.00013870000839233398,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0239,
"reward": -0.1723631415516138,
"reward_std": 0.6109825298190117,
"rewards/cosine_scaled_reward": -0.16951490193605423,
"rewards/format_reward": 0.16666666977107525,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 2762.5774536132812,
"epoch": 0.156,
"grad_norm": 0.14042888581752777,
"kl": 0.0005602836608886719,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0588,
"reward": 0.3224933594465256,
"reward_std": 0.6976565718650818,
"rewards/cosine_scaled_reward": -0.04708665423095226,
"rewards/format_reward": 0.416666679084301,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 2729.9644165039062,
"epoch": 0.16,
"grad_norm": 0.110390305519104,
"kl": 0.00016605854034423828,
"learning_rate": 8e-07,
"loss": 0.0549,
"reward": 0.4423699714243412,
"reward_std": 0.6286562532186508,
"rewards/cosine_scaled_reward": -0.016910257749259472,
"rewards/format_reward": 0.476190485060215,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2955.7084350585938,
"epoch": 0.164,
"grad_norm": 0.15253609418869019,
"kl": 0.00040471553802490234,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0718,
"reward": 0.44552009692415595,
"reward_std": 0.7759689763188362,
"rewards/cosine_scaled_reward": 0.0144266925053671,
"rewards/format_reward": 0.4166666753590107,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2961.0596313476562,
"epoch": 0.168,
"grad_norm": 0.22110103070735931,
"kl": 0.0009613037109375,
"learning_rate": 8.399999999999999e-07,
"loss": 0.1186,
"reward": 0.2217194978147745,
"reward_std": 0.6207270994782448,
"rewards/cosine_scaled_reward": -0.07366406172513962,
"rewards/format_reward": 0.3690476268529892,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3015.7738647460938,
"epoch": 0.172,
"grad_norm": 0.23720885813236237,
"kl": 0.0005915164947509766,
"learning_rate": 8.599999999999999e-07,
"loss": 0.1245,
"reward": -0.04521503113210201,
"reward_std": 0.62105892598629,
"rewards/cosine_scaled_reward": -0.16844085440970957,
"rewards/format_reward": 0.2916666716337204,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 2937.2381591796875,
"epoch": 0.176,
"grad_norm": 0.09096106886863708,
"kl": 0.00051116943359375,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0249,
"reward": 0.22813843563199043,
"reward_std": 0.6727291792631149,
"rewards/cosine_scaled_reward": -0.058549837151076645,
"rewards/format_reward": 0.3452381007373333,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3149.511962890625,
"epoch": 0.18,
"grad_norm": 0.11164555698633194,
"kl": 0.0004715919494628906,
"learning_rate": 9e-07,
"loss": 0.017,
"reward": 0.05970348231494427,
"reward_std": 0.7763290405273438,
"rewards/cosine_scaled_reward": -0.11300540715456009,
"rewards/format_reward": 0.285714291036129,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 3184.6845703125,
"epoch": 0.184,
"grad_norm": 0.1711866706609726,
"kl": 0.0007777214050292969,
"learning_rate": 9.2e-07,
"loss": 0.0731,
"reward": 0.1386737246066332,
"reward_std": 0.7276585251092911,
"rewards/cosine_scaled_reward": -0.06459171324968338,
"rewards/format_reward": 0.26785714738070965,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 3015.386962890625,
"epoch": 0.188,
"grad_norm": 0.12470238655805588,
"kl": 0.0014100074768066406,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0653,
"reward": 0.16049158992245793,
"reward_std": 0.7017006278038025,
"rewards/cosine_scaled_reward": -0.08642087457701564,
"rewards/format_reward": 0.3333333432674408,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2909.96435546875,
"epoch": 0.192,
"grad_norm": 0.28355535864830017,
"kl": 0.009288787841796875,
"learning_rate": 9.6e-07,
"loss": 0.0581,
"reward": 0.0768200121819973,
"reward_std": 0.7135801166296005,
"rewards/cosine_scaled_reward": -0.14016142301261425,
"rewards/format_reward": 0.3571428693830967,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2753.6845703125,
"epoch": 0.196,
"grad_norm": 0.6567728519439697,
"kl": 0.023477554321289062,
"learning_rate": 9.8e-07,
"loss": 0.0866,
"reward": 0.4337980281561613,
"reward_std": 0.8068065047264099,
"rewards/cosine_scaled_reward": -0.006315283477306366,
"rewards/format_reward": 0.4464285746216774,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 2934.8274536132812,
"epoch": 0.2,
"grad_norm": 0.11382321268320084,
"kl": 0.0025758743286132812,
"learning_rate": 1e-06,
"loss": 0.0714,
"reward": 0.2880665063858032,
"reward_std": 0.6403830945491791,
"rewards/cosine_scaled_reward": -0.049419129034504294,
"rewards/format_reward": 0.3869047649204731,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2840.7678833007812,
"epoch": 0.204,
"grad_norm": 0.11641126126050949,
"kl": 0.0050792694091796875,
"learning_rate": 9.999890338174275e-07,
"loss": 0.023,
"reward": 0.2840890493243933,
"reward_std": 0.692708894610405,
"rewards/cosine_scaled_reward": -0.06628882512450218,
"rewards/format_reward": 0.4166666716337204,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3119.9881591796875,
"epoch": 0.208,
"grad_norm": 0.14652805030345917,
"kl": 0.0032052993774414062,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0641,
"reward": 0.18620363296940923,
"reward_std": 0.8490904271602631,
"rewards/cosine_scaled_reward": -0.058683907613158226,
"rewards/format_reward": 0.3035714365541935,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2926.9940795898438,
"epoch": 0.212,
"grad_norm": 0.19241634011268616,
"kl": 0.00447845458984375,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0747,
"reward": 0.36803684243932366,
"reward_std": 0.823193870484829,
"rewards/cosine_scaled_reward": -0.006457769020926207,
"rewards/format_reward": 0.380952388048172,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3031.7203369140625,
"epoch": 0.216,
"grad_norm": 0.12843742966651917,
"kl": 0.0029668807983398438,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0153,
"reward": 0.2685772944241762,
"reward_std": 0.6489126533269882,
"rewards/cosine_scaled_reward": -0.04428278561681509,
"rewards/format_reward": 0.3571428693830967,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3123.136962890625,
"epoch": 0.22,
"grad_norm": 0.1504901498556137,
"kl": 0.006084442138671875,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0691,
"reward": 0.03501664288341999,
"reward_std": 0.6481388062238693,
"rewards/cosine_scaled_reward": -0.11642025248147547,
"rewards/format_reward": 0.2678571529686451,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 2673.434539794922,
"epoch": 0.224,
"grad_norm": 0.1224113255739212,
"kl": 0.0033931732177734375,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0269,
"reward": 0.6296312126796693,
"reward_std": 0.6751764714717865,
"rewards/cosine_scaled_reward": 0.07374419644474983,
"rewards/format_reward": 0.482142873108387,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 3170.0535888671875,
"epoch": 0.228,
"grad_norm": 0.1044960618019104,
"kl": 0.0024929046630859375,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0262,
"reward": 0.21022793278098106,
"reward_std": 0.7194458544254303,
"rewards/cosine_scaled_reward": -0.05560031719505787,
"rewards/format_reward": 0.3214285857975483,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 2590.0179138183594,
"epoch": 0.232,
"grad_norm": 0.13768674433231354,
"kl": 0.0045948028564453125,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0592,
"reward": 0.4493846707046032,
"reward_std": 0.7118680775165558,
"rewards/cosine_scaled_reward": -0.02233148762024939,
"rewards/format_reward": 0.4940476231276989,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 3087.4702758789062,
"epoch": 0.236,
"grad_norm": 0.13870052993297577,
"kl": 0.001789093017578125,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0691,
"reward": 0.3166997814550996,
"reward_std": 0.7532177269458771,
"rewards/cosine_scaled_reward": -0.011292967945337296,
"rewards/format_reward": 0.3392857201397419,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 2717.3036499023438,
"epoch": 0.24,
"grad_norm": 0.14514827728271484,
"kl": 0.0060253143310546875,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0622,
"reward": 0.16810212982818484,
"reward_std": 0.5123014599084854,
"rewards/cosine_scaled_reward": -0.10642512841150165,
"rewards/format_reward": 0.3809523843228817,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 3105.7977294921875,
"epoch": 0.244,
"grad_norm": 0.10494968295097351,
"kl": 0.0026693344116210938,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0639,
"reward": 0.08244643732905388,
"reward_std": 0.7039294093847275,
"rewards/cosine_scaled_reward": -0.10461012227460742,
"rewards/format_reward": 0.29166666977107525,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 3213.3214721679688,
"epoch": 0.248,
"grad_norm": 0.0999075248837471,
"kl": 0.0023097991943359375,
"learning_rate": 9.98421786662277e-07,
"loss": 0.053,
"reward": 0.09679291397333145,
"reward_std": 0.7138089835643768,
"rewards/cosine_scaled_reward": -0.0914845080114901,
"rewards/format_reward": 0.2797619104385376,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 2783.9584350585938,
"epoch": 0.252,
"grad_norm": 0.19832438230514526,
"kl": 0.0027294158935546875,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0773,
"reward": 0.2238014191389084,
"reward_std": 0.6036202609539032,
"rewards/cosine_scaled_reward": -0.06369452457875013,
"rewards/format_reward": 0.351190485060215,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 2976.6607666015625,
"epoch": 0.256,
"grad_norm": 0.12335456907749176,
"kl": 0.0020885467529296875,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0857,
"reward": 0.27347568422555923,
"reward_std": 0.7463532835245132,
"rewards/cosine_scaled_reward": -0.03885740428813733,
"rewards/format_reward": 0.3511904887855053,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 2825.6130981445312,
"epoch": 0.26,
"grad_norm": 0.13863790035247803,
"kl": 0.002285003662109375,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0349,
"reward": 0.3712610546499491,
"reward_std": 0.7277249395847321,
"rewards/cosine_scaled_reward": -0.037583764642477036,
"rewards/format_reward": 0.4464285746216774,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2974.232177734375,
"epoch": 0.264,
"grad_norm": 0.11186616122722626,
"kl": 0.002407073974609375,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0025,
"reward": 0.07817286718636751,
"reward_std": 0.640307292342186,
"rewards/cosine_scaled_reward": -0.12460404448211193,
"rewards/format_reward": 0.3273809589445591,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 3152.702392578125,
"epoch": 0.268,
"grad_norm": 0.12694397568702698,
"kl": 0.00250244140625,
"learning_rate": 9.968344786479415e-07,
"loss": 0.07,
"reward": 0.1549822874367237,
"reward_std": 0.6663320288062096,
"rewards/cosine_scaled_reward": -0.08322314161341637,
"rewards/format_reward": 0.3214285783469677,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 2938.0179443359375,
"epoch": 0.272,
"grad_norm": 0.12655070424079895,
"kl": 0.00341033935546875,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0545,
"reward": 0.2076467089354992,
"reward_std": 0.7705407291650772,
"rewards/cosine_scaled_reward": -0.08070044964551926,
"rewards/format_reward": 0.3690476268529892,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 3025.8095703125,
"epoch": 0.276,
"grad_norm": 0.12574610114097595,
"kl": 0.003986358642578125,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0227,
"reward": 0.03160261735320091,
"reward_std": 0.621779277920723,
"rewards/cosine_scaled_reward": -0.13003203552216291,
"rewards/format_reward": 0.2916666716337204,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2877.5536499023438,
"epoch": 0.28,
"grad_norm": 0.12181198596954346,
"kl": 0.0038127899169921875,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0248,
"reward": 0.2757916431874037,
"reward_std": 0.7794490903615952,
"rewards/cosine_scaled_reward": -0.06448512757197022,
"rewards/format_reward": 0.4047619178891182,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2776.6131591796875,
"epoch": 0.284,
"grad_norm": 0.17934156954288483,
"kl": 0.0039825439453125,
"learning_rate": 9.951725498333448e-07,
"loss": 0.1072,
"reward": 0.2793612889945507,
"reward_std": 0.7607921361923218,
"rewards/cosine_scaled_reward": -0.06865269318223,
"rewards/format_reward": 0.4166666716337204,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 3037.5178833007812,
"epoch": 0.288,
"grad_norm": 0.1311851143836975,
"kl": 0.0038604736328125,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0745,
"reward": 0.34610075503587723,
"reward_std": 0.8604296147823334,
"rewards/cosine_scaled_reward": -0.01444962713867426,
"rewards/format_reward": 0.3750000074505806,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2880.3036499023438,
"epoch": 0.292,
"grad_norm": 0.10903972387313843,
"kl": 0.005123138427734375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0247,
"reward": 0.41264417115598917,
"reward_std": 0.6988394409418106,
"rewards/cosine_scaled_reward": -0.00201124744489789,
"rewards/format_reward": 0.4166666753590107,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2689.5774536132812,
"epoch": 0.296,
"grad_norm": 0.1187940314412117,
"kl": 0.00371551513671875,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0343,
"reward": 0.49459290131926537,
"reward_std": 0.6582471132278442,
"rewards/cosine_scaled_reward": -0.005679763096850365,
"rewards/format_reward": 0.505952388048172,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 3011.2440795898438,
"epoch": 0.3,
"grad_norm": 0.15027488768100739,
"kl": 0.00612640380859375,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0922,
"reward": -0.12369688227772713,
"reward_std": 0.5938592255115509,
"rewards/cosine_scaled_reward": -0.1957770138978958,
"rewards/format_reward": 0.26785715110599995,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3087.9762573242188,
"epoch": 0.304,
"grad_norm": 0.1342087835073471,
"kl": 0.004589080810546875,
"learning_rate": 9.926071618660237e-07,
"loss": 0.051,
"reward": 0.029461721424013376,
"reward_std": 0.5008194297552109,
"rewards/cosine_scaled_reward": -0.12812629727704916,
"rewards/format_reward": 0.2857142915017903,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 2708.6845703125,
"epoch": 0.308,
"grad_norm": 0.11936229467391968,
"kl": 0.00518035888671875,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0693,
"reward": 0.38730931747704744,
"reward_std": 0.6931557953357697,
"rewards/cosine_scaled_reward": -0.0503929746337235,
"rewards/format_reward": 0.4880952388048172,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 3123.4345703125,
"epoch": 0.312,
"grad_norm": 0.10926749557256699,
"kl": 0.006168365478515625,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0281,
"reward": 0.14759791223332286,
"reward_std": 0.6552696228027344,
"rewards/cosine_scaled_reward": -0.08989150635898113,
"rewards/format_reward": 0.3273809519596398,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 3104.886962890625,
"epoch": 0.316,
"grad_norm": 0.17055809497833252,
"kl": 0.005580902099609375,
"learning_rate": 9.908088623197048e-07,
"loss": 0.122,
"reward": 0.1187831275165081,
"reward_std": 0.7589289993047714,
"rewards/cosine_scaled_reward": -0.11917985696345568,
"rewards/format_reward": 0.3571428656578064,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 2892.2261962890625,
"epoch": 0.32,
"grad_norm": 0.12601953744888306,
"kl": 0.004444122314453125,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0261,
"reward": 0.2744547198526561,
"reward_std": 0.686069905757904,
"rewards/cosine_scaled_reward": -0.05920121353119612,
"rewards/format_reward": 0.3928571492433548,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 3041.1488647460938,
"epoch": 0.324,
"grad_norm": 0.11552488803863525,
"kl": 0.00655364990234375,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0229,
"reward": 0.12106413394212723,
"reward_std": 0.675617903470993,
"rewards/cosine_scaled_reward": -0.1061346041969955,
"rewards/format_reward": 0.33333333767950535,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 3129.6429443359375,
"epoch": 0.328,
"grad_norm": 0.09706410765647888,
"kl": 0.0055084228515625,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0452,
"reward": 0.12287123966962099,
"reward_std": 0.7173575460910797,
"rewards/cosine_scaled_reward": -0.09035009983927011,
"rewards/format_reward": 0.3035714365541935,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 2602.047637939453,
"epoch": 0.332,
"grad_norm": 0.11233574151992798,
"kl": 0.01010894775390625,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0258,
"reward": 0.35400932375341654,
"reward_std": 0.6462785750627518,
"rewards/cosine_scaled_reward": -0.06109058950096369,
"rewards/format_reward": 0.476190485060215,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2894.011962890625,
"epoch": 0.336,
"grad_norm": 0.12059750407934189,
"kl": 0.00872802734375,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0806,
"reward": 0.2941260803490877,
"reward_std": 0.78522889316082,
"rewards/cosine_scaled_reward": -0.06722268275916576,
"rewards/format_reward": 0.4285714328289032,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2979.226318359375,
"epoch": 0.34,
"grad_norm": 0.15206296741962433,
"kl": 0.00783538818359375,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0755,
"reward": 0.21889091655611992,
"reward_std": 0.6674999743700027,
"rewards/cosine_scaled_reward": -0.06614979542791843,
"rewards/format_reward": 0.3511904887855053,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2819.5892944335938,
"epoch": 0.344,
"grad_norm": 0.1093529462814331,
"kl": 0.00661468505859375,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0353,
"reward": 0.20389786185114644,
"reward_std": 0.6942542195320129,
"rewards/cosine_scaled_reward": -0.07959869271144271,
"rewards/format_reward": 0.3630952425301075,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2616.452392578125,
"epoch": 0.348,
"grad_norm": 0.15942011773586273,
"kl": 0.00847625732421875,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0773,
"reward": 0.3760679364204407,
"reward_std": 0.6467384025454521,
"rewards/cosine_scaled_reward": -0.05898985452950001,
"rewards/format_reward": 0.4940476194024086,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2923.7500610351562,
"epoch": 0.352,
"grad_norm": 0.10565865784883499,
"kl": 0.0073089599609375,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0227,
"reward": 0.3185804970562458,
"reward_std": 0.6098055616021156,
"rewards/cosine_scaled_reward": -0.010352615499868989,
"rewards/format_reward": 0.3392857201397419,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2752.6309814453125,
"epoch": 0.356,
"grad_norm": 0.1690302938222885,
"kl": 0.0132293701171875,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0312,
"reward": 0.46914676763117313,
"reward_std": 0.7854363918304443,
"rewards/cosine_scaled_reward": 0.005406718701124191,
"rewards/format_reward": 0.4583333432674408,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2769.1488647460938,
"epoch": 0.36,
"grad_norm": 0.17653611302375793,
"kl": 0.011993408203125,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0815,
"reward": 0.3604448903352022,
"reward_std": 0.798264317214489,
"rewards/cosine_scaled_reward": -0.04894421715289354,
"rewards/format_reward": 0.4583333432674408,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3187.3631591796875,
"epoch": 0.364,
"grad_norm": 0.1068400964140892,
"kl": 0.0078277587890625,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0211,
"reward": -0.03608314320445061,
"reward_std": 0.5826699808239937,
"rewards/cosine_scaled_reward": -0.14006539154797792,
"rewards/format_reward": 0.2440476305782795,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2924.916748046875,
"epoch": 0.368,
"grad_norm": 0.17665976285934448,
"kl": 0.0091400146484375,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0852,
"reward": 0.2780441716313362,
"reward_std": 0.7524297386407852,
"rewards/cosine_scaled_reward": -0.07526363711804152,
"rewards/format_reward": 0.4285714291036129,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2939.3512573242188,
"epoch": 0.372,
"grad_norm": 0.13597099483013153,
"kl": 0.0076446533203125,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0324,
"reward": 0.33797190338373184,
"reward_std": 0.5880008786916733,
"rewards/cosine_scaled_reward": -0.01851405529305339,
"rewards/format_reward": 0.3750000111758709,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2922.875,
"epoch": 0.376,
"grad_norm": 0.12246444076299667,
"kl": 0.00815582275390625,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0801,
"reward": 0.14625070057809353,
"reward_std": 0.690229170024395,
"rewards/cosine_scaled_reward": -0.12330322340130806,
"rewards/format_reward": 0.3928571492433548,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 3222.3632202148438,
"epoch": 0.38,
"grad_norm": 0.08992121368646622,
"kl": 0.00704193115234375,
"learning_rate": 9.779754323328192e-07,
"loss": -0.001,
"reward": 0.26800261437892914,
"reward_std": 0.7103277295827866,
"rewards/cosine_scaled_reward": -0.011832039803266525,
"rewards/format_reward": 0.2916666716337204,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3215.2202758789062,
"epoch": 0.384,
"grad_norm": 0.12997964024543762,
"kl": 0.00983428955078125,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0591,
"reward": -0.03136127255856991,
"reward_std": 0.6326467096805573,
"rewards/cosine_scaled_reward": -0.18234730698168278,
"rewards/format_reward": 0.33333333767950535,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2857.0416870117188,
"epoch": 0.388,
"grad_norm": 0.16558504104614258,
"kl": 0.007293701171875,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0821,
"reward": 0.19707820191979408,
"reward_std": 0.6188783794641495,
"rewards/cosine_scaled_reward": -0.10681804455816746,
"rewards/format_reward": 0.410714291036129,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 2983.9940795898438,
"epoch": 0.392,
"grad_norm": 0.1882523149251938,
"kl": 0.00878143310546875,
"learning_rate": 9.749693666068663e-07,
"loss": 0.1027,
"reward": 0.3959239423274994,
"reward_std": 0.875861182808876,
"rewards/cosine_scaled_reward": 0.004509590216912329,
"rewards/format_reward": 0.3869047686457634,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 3077.0952758789062,
"epoch": 0.396,
"grad_norm": 0.1565464437007904,
"kl": 0.011505126953125,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0968,
"reward": 0.027048692107200623,
"reward_std": 0.7064545601606369,
"rewards/cosine_scaled_reward": -0.1323089925572276,
"rewards/format_reward": 0.2916666679084301,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2358.2381591796875,
"epoch": 0.4,
"grad_norm": 0.14640696346759796,
"kl": 0.00804901123046875,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0774,
"reward": 0.8142919540405273,
"reward_std": 0.7067123055458069,
"rewards/cosine_scaled_reward": 0.10357456840574741,
"rewards/format_reward": 0.6071428507566452,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 3195.6428833007812,
"epoch": 0.404,
"grad_norm": 0.14821472764015198,
"kl": 0.0126495361328125,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0897,
"reward": 0.2797414679080248,
"reward_std": 0.8859200328588486,
"rewards/cosine_scaled_reward": -0.0535816540941596,
"rewards/format_reward": 0.3869047649204731,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 3046.7798461914062,
"epoch": 0.408,
"grad_norm": 0.13244982063770294,
"kl": 0.0107879638671875,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0638,
"reward": 0.16126136109232903,
"reward_std": 0.6933339387178421,
"rewards/cosine_scaled_reward": -0.08305979892611504,
"rewards/format_reward": 0.3273809552192688,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2718.8275146484375,
"epoch": 0.412,
"grad_norm": 0.20267100632190704,
"kl": 0.01003265380859375,
"learning_rate": 9.695457105469804e-07,
"loss": 0.1246,
"reward": 0.4206714928150177,
"reward_std": 0.6706456393003464,
"rewards/cosine_scaled_reward": -0.0069261584430933,
"rewards/format_reward": 0.43452382180839777,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 2973.7559814453125,
"epoch": 0.416,
"grad_norm": 0.12351427227258682,
"kl": 0.01038360595703125,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0716,
"reward": 0.20578511937389976,
"reward_std": 0.7138822227716446,
"rewards/cosine_scaled_reward": -0.0846074327128008,
"rewards/format_reward": 0.3750000037252903,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2596.1250915527344,
"epoch": 0.42,
"grad_norm": 0.13755492866039276,
"kl": 0.0099945068359375,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0544,
"reward": 0.6021162122488022,
"reward_std": 0.8435305505990982,
"rewards/cosine_scaled_reward": 0.012367631308734417,
"rewards/format_reward": 0.5773809663951397,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2986.226318359375,
"epoch": 0.424,
"grad_norm": 0.10337146371603012,
"kl": 0.014007568359375,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0441,
"reward": 0.24333537928760052,
"reward_std": 0.7411631494760513,
"rewards/cosine_scaled_reward": -0.08071326930075884,
"rewards/format_reward": 0.4047619141638279,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2804.7500610351562,
"epoch": 0.428,
"grad_norm": 0.1334601491689682,
"kl": 0.010711669921875,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0741,
"reward": 0.21579574886709452,
"reward_std": 0.559941440820694,
"rewards/cosine_scaled_reward": -0.08257831074297428,
"rewards/format_reward": 0.380952388048172,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2947.9583740234375,
"epoch": 0.432,
"grad_norm": 0.15339502692222595,
"kl": 0.0123443603515625,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0797,
"reward": 0.2714387159794569,
"reward_std": 0.5535019040107727,
"rewards/cosine_scaled_reward": -0.060709220357239246,
"rewards/format_reward": 0.3928571492433548,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 3165.3631591796875,
"epoch": 0.436,
"grad_norm": 0.14555484056472778,
"kl": 0.0123748779296875,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0689,
"reward": 0.3741426505148411,
"reward_std": 0.7712415158748627,
"rewards/cosine_scaled_reward": 0.011476085986942053,
"rewards/format_reward": 0.351190485060215,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2682.607177734375,
"epoch": 0.44,
"grad_norm": 3.0458719730377197,
"kl": 0.1771697998046875,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0576,
"reward": 0.43371669203042984,
"reward_std": 0.6959643810987473,
"rewards/cosine_scaled_reward": -0.02718928176909685,
"rewards/format_reward": 0.4880952462553978,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2601.636993408203,
"epoch": 0.444,
"grad_norm": 0.16071970760822296,
"kl": 0.0132904052734375,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0475,
"reward": 0.3634342849254608,
"reward_std": 0.5500286221504211,
"rewards/cosine_scaled_reward": -0.06233047042042017,
"rewards/format_reward": 0.4880952462553978,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 2848.0,
"epoch": 0.448,
"grad_norm": 0.11652833968400955,
"kl": 0.01318359375,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0216,
"reward": 0.4104595482349396,
"reward_std": 0.7775004655122757,
"rewards/cosine_scaled_reward": -0.023936893790960312,
"rewards/format_reward": 0.4583333358168602,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 3060.696533203125,
"epoch": 0.452,
"grad_norm": 0.11911512911319733,
"kl": 0.019012451171875,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0351,
"reward": 0.14249714091420174,
"reward_std": 0.5928184911608696,
"rewards/cosine_scaled_reward": -0.08946572133572772,
"rewards/format_reward": 0.32142857648432255,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 3025.3392944335938,
"epoch": 0.456,
"grad_norm": 0.11119002103805542,
"kl": 0.013275146484375,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0157,
"reward": 0.2583576124161482,
"reward_std": 0.5952321216464043,
"rewards/cosine_scaled_reward": -0.05236881226301193,
"rewards/format_reward": 0.3630952462553978,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2642.4524536132812,
"epoch": 0.46,
"grad_norm": 0.13256801664829254,
"kl": 0.0142669677734375,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0464,
"reward": 0.26410975866019726,
"reward_std": 0.7131557315587997,
"rewards/cosine_scaled_reward": -0.10604035668075085,
"rewards/format_reward": 0.476190485060215,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 2842.172607421875,
"epoch": 0.464,
"grad_norm": 0.14555224776268005,
"kl": 0.0131683349609375,
"learning_rate": 9.530702921077358e-07,
"loss": 0.06,
"reward": 0.7474905252456665,
"reward_std": 0.9560296833515167,
"rewards/cosine_scaled_reward": 0.10291192133445293,
"rewards/format_reward": 0.5416666716337204,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 2644.851318359375,
"epoch": 0.468,
"grad_norm": 0.1801517903804779,
"kl": 0.0159149169921875,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0868,
"reward": 0.3559920974075794,
"reward_std": 0.6948887631297112,
"rewards/cosine_scaled_reward": -0.05712300445884466,
"rewards/format_reward": 0.4702381044626236,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2898.8692016601562,
"epoch": 0.472,
"grad_norm": 0.11308304965496063,
"kl": 0.01580810546875,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0347,
"reward": -0.02927885064855218,
"reward_std": 0.5874328389763832,
"rewards/cosine_scaled_reward": -0.18725845962762833,
"rewards/format_reward": 0.3452381044626236,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2942.3869018554688,
"epoch": 0.476,
"grad_norm": 0.11883487552404404,
"kl": 0.0146484375,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0533,
"reward": 0.2897670716047287,
"reward_std": 0.624944195151329,
"rewards/cosine_scaled_reward": -0.05452123726718128,
"rewards/format_reward": 0.3988095298409462,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2888.5537109375,
"epoch": 0.48,
"grad_norm": 0.18806912004947662,
"kl": 0.0160980224609375,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0988,
"reward": 0.42369477450847626,
"reward_std": 0.8195747882127762,
"rewards/cosine_scaled_reward": -0.002438324736431241,
"rewards/format_reward": 0.4285714365541935,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2582.732177734375,
"epoch": 0.484,
"grad_norm": 0.24437126517295837,
"kl": 0.0138702392578125,
"learning_rate": 9.458418577899774e-07,
"loss": 0.1221,
"reward": 0.5238880245015025,
"reward_std": 0.7648549973964691,
"rewards/cosine_scaled_reward": -0.005913139786571264,
"rewards/format_reward": 0.535714291036129,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 2693.202392578125,
"epoch": 0.488,
"grad_norm": 0.1520787924528122,
"kl": 0.016265869140625,
"learning_rate": 9.443380060197385e-07,
"loss": 0.075,
"reward": 0.5096995830535889,
"reward_std": 0.8008040487766266,
"rewards/cosine_scaled_reward": 0.01675456203520298,
"rewards/format_reward": 0.4761904776096344,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2794.4822387695312,
"epoch": 0.492,
"grad_norm": 0.276404470205307,
"kl": 0.016845703125,
"learning_rate": 9.428149347714143e-07,
"loss": 0.1229,
"reward": 0.1483494946733117,
"reward_std": 0.7319334298372269,
"rewards/cosine_scaled_reward": -0.11034906562417746,
"rewards/format_reward": 0.3690476268529892,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 2586.9285583496094,
"epoch": 0.496,
"grad_norm": 0.19590145349502563,
"kl": 0.0192413330078125,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0668,
"reward": 0.42041725292801857,
"reward_std": 0.7440174072980881,
"rewards/cosine_scaled_reward": -0.05169615335762501,
"rewards/format_reward": 0.5238095372915268,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2647.1309814453125,
"epoch": 0.5,
"grad_norm": 0.1762569099664688,
"kl": 0.018951416015625,
"learning_rate": 9.397114317029974e-07,
"loss": 0.058,
"reward": 0.26979109086096287,
"reward_std": 0.7384046316146851,
"rewards/cosine_scaled_reward": -0.10022350586950779,
"rewards/format_reward": 0.4702380932867527,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 2329.4642639160156,
"epoch": 0.504,
"grad_norm": 0.2497519552707672,
"kl": 0.016754150390625,
"learning_rate": 9.381311511432658e-07,
"loss": 0.1113,
"reward": 0.5470606535673141,
"reward_std": 0.7599766105413437,
"rewards/cosine_scaled_reward": -0.04194586584344506,
"rewards/format_reward": 0.630952388048172,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2630.851318359375,
"epoch": 0.508,
"grad_norm": 0.24775269627571106,
"kl": 0.018798828125,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0886,
"reward": 0.5138388648629189,
"reward_std": 0.8158636838197708,
"rewards/cosine_scaled_reward": 0.00989562287577428,
"rewards/format_reward": 0.4940476194024086,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 2193.1250610351562,
"epoch": 0.512,
"grad_norm": 0.3206213712692261,
"kl": 0.0168914794921875,
"learning_rate": 9.34913917072228e-07,
"loss": 0.1596,
"reward": 0.7910499274730682,
"reward_std": 0.7149495184421539,
"rewards/cosine_scaled_reward": 0.0502868490293622,
"rewards/format_reward": 0.6904762089252472,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2532.4642944335938,
"epoch": 0.516,
"grad_norm": 0.28250807523727417,
"kl": 0.021270751953125,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0845,
"reward": 0.36558002047240734,
"reward_std": 0.637114867568016,
"rewards/cosine_scaled_reward": -0.05232903314754367,
"rewards/format_reward": 0.470238097012043,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 2768.4702758789062,
"epoch": 0.52,
"grad_norm": 0.26120948791503906,
"kl": 0.026092529296875,
"learning_rate": 9.316216432703916e-07,
"loss": 0.1052,
"reward": 0.41776999086141586,
"reward_std": 0.9506262838840485,
"rewards/cosine_scaled_reward": -0.029210255946964025,
"rewards/format_reward": 0.4761904776096344,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 2400.5000915527344,
"epoch": 0.524,
"grad_norm": 0.33211514353752136,
"kl": 0.0215606689453125,
"learning_rate": 9.299475664759068e-07,
"loss": 0.1452,
"reward": 0.42596414871513844,
"reward_std": 0.6515605002641678,
"rewards/cosine_scaled_reward": -0.04892268590629101,
"rewards/format_reward": 0.5238095298409462,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 2368.9524536132812,
"epoch": 0.528,
"grad_norm": 0.41833382844924927,
"kl": 0.020782470703125,
"learning_rate": 9.282549715730579e-07,
"loss": 0.1107,
"reward": 0.5763177648186684,
"reward_std": 0.7463207244873047,
"rewards/cosine_scaled_reward": 0.005420786794275045,
"rewards/format_reward": 0.5654762089252472,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2800.4940795898438,
"epoch": 0.532,
"grad_norm": 0.21257169544696808,
"kl": 0.031768798828125,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0654,
"reward": 0.27612858824431896,
"reward_std": 0.6431511640548706,
"rewards/cosine_scaled_reward": -0.07026905845850706,
"rewards/format_reward": 0.4166666679084301,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2558.386962890625,
"epoch": 0.536,
"grad_norm": 0.42514950037002563,
"kl": 0.02978515625,
"learning_rate": 9.248145583195447e-07,
"loss": 0.1228,
"reward": 0.4270520806312561,
"reward_std": 0.7729989290237427,
"rewards/cosine_scaled_reward": -0.02159299748018384,
"rewards/format_reward": 0.4702381044626236,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2229.363067626953,
"epoch": 0.54,
"grad_norm": 0.33076903223991394,
"kl": 0.03668212890625,
"learning_rate": 9.230669076497687e-07,
"loss": 0.1019,
"reward": 0.28207028564065695,
"reward_std": 0.6516975909471512,
"rewards/cosine_scaled_reward": -0.10301248356699944,
"rewards/format_reward": 0.4880952388048172,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 2353.029815673828,
"epoch": 0.544,
"grad_norm": 0.3519177734851837,
"kl": 0.035308837890625,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0997,
"reward": 0.37077474407851696,
"reward_std": 0.668467104434967,
"rewards/cosine_scaled_reward": -0.0675888154655695,
"rewards/format_reward": 0.505952388048172,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2052.1964721679688,
"epoch": 0.548,
"grad_norm": 0.23379026353359222,
"kl": 0.035400390625,
"learning_rate": 9.195171441101668e-07,
"loss": 0.058,
"reward": 0.6550269052386284,
"reward_std": 0.6502309143543243,
"rewards/cosine_scaled_reward": 0.009061065968126059,
"rewards/format_reward": 0.6369047611951828,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2325.732177734375,
"epoch": 0.552,
"grad_norm": 0.19041714072227478,
"kl": 0.0423583984375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0232,
"reward": 0.561458358541131,
"reward_std": 0.9615298509597778,
"rewards/cosine_scaled_reward": 0.012872030027210712,
"rewards/format_reward": 0.5357142947614193,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2440.327423095703,
"epoch": 0.556,
"grad_norm": 0.2846536934375763,
"kl": 0.0457763671875,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0439,
"reward": 0.44825945422053337,
"reward_std": 0.7441610246896744,
"rewards/cosine_scaled_reward": -0.022894082590937614,
"rewards/format_reward": 0.4940476417541504,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 2455.071533203125,
"epoch": 0.56,
"grad_norm": 0.5667356252670288,
"kl": 0.05535888671875,
"learning_rate": 9.140576474687263e-07,
"loss": 0.1203,
"reward": 0.42634591602836736,
"reward_std": 0.6539553329348564,
"rewards/cosine_scaled_reward": 0.007815815508365631,
"rewards/format_reward": 0.4107142984867096,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 2289.0416870117188,
"epoch": 0.564,
"grad_norm": 0.2632148861885071,
"kl": 0.06378173828125,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0588,
"reward": 0.25764250196516514,
"reward_std": 0.5646726861596107,
"rewards/cosine_scaled_reward": -0.07355970796197653,
"rewards/format_reward": 0.4047619141638279,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 2396.1011962890625,
"epoch": 0.568,
"grad_norm": 0.48258456587791443,
"kl": 0.070556640625,
"learning_rate": 9.103291169269299e-07,
"loss": 0.1188,
"reward": 0.3830295614898205,
"reward_std": 0.7874267548322678,
"rewards/cosine_scaled_reward": 0.0069909729063510895,
"rewards/format_reward": 0.3690476268529892,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2465.184539794922,
"epoch": 0.572,
"grad_norm": 0.3696215748786926,
"kl": 0.083984375,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0378,
"reward": 0.17246808065101504,
"reward_std": 0.7914570420980453,
"rewards/cosine_scaled_reward": -0.11614691279828548,
"rewards/format_reward": 0.4047619104385376,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 2460.184600830078,
"epoch": 0.576,
"grad_norm": 0.30795326828956604,
"kl": 0.08349609375,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0254,
"reward": 0.23997123539447784,
"reward_std": 0.7133302837610245,
"rewards/cosine_scaled_reward": -0.09430009685456753,
"rewards/format_reward": 0.4285714365541935,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 2376.089324951172,
"epoch": 0.58,
"grad_norm": 0.5491130352020264,
"kl": 0.0899658203125,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0823,
"reward": 0.4339366629719734,
"reward_std": 0.7774848788976669,
"rewards/cosine_scaled_reward": 0.005658812588080764,
"rewards/format_reward": 0.4226190559566021,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2340.136962890625,
"epoch": 0.584,
"grad_norm": 0.3470991551876068,
"kl": 0.1185302734375,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0431,
"reward": 0.31324461475014687,
"reward_std": 0.7800580561161041,
"rewards/cosine_scaled_reward": -0.10528245754539967,
"rewards/format_reward": 0.5238095298409462,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 2512.7262573242188,
"epoch": 0.588,
"grad_norm": 0.31661325693130493,
"kl": 0.1114501953125,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0189,
"reward": 0.2997382581233978,
"reward_std": 0.8120257556438446,
"rewards/cosine_scaled_reward": -0.06144038587808609,
"rewards/format_reward": 0.4226190485060215,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 2311.8035888671875,
"epoch": 0.592,
"grad_norm": 0.9110273718833923,
"kl": 0.1287841796875,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0875,
"reward": 0.29848775546997786,
"reward_std": 0.7376701682806015,
"rewards/cosine_scaled_reward": -0.06206565350294113,
"rewards/format_reward": 0.4226190596818924,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 2368.2500915527344,
"epoch": 0.596,
"grad_norm": 0.5145498514175415,
"kl": 0.142333984375,
"learning_rate": 8.967309592491052e-07,
"loss": -0.0091,
"reward": 0.1381237395107746,
"reward_std": 0.7537627294659615,
"rewards/cosine_scaled_reward": -0.12736669927835464,
"rewards/format_reward": 0.3928571492433548,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2541.4226684570312,
"epoch": 0.6,
"grad_norm": 0.4558282792568207,
"kl": 0.1424560546875,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0545,
"reward": 0.517300067236647,
"reward_std": 0.8328704386949539,
"rewards/cosine_scaled_reward": 0.014602408395148814,
"rewards/format_reward": 0.4880952388048172,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2635.4226989746094,
"epoch": 0.604,
"grad_norm": 0.3748377859592438,
"kl": 0.18310546875,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0372,
"reward": 0.09937155619263649,
"reward_std": 0.7007840871810913,
"rewards/cosine_scaled_reward": -0.1259094497654587,
"rewards/format_reward": 0.3511904776096344,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2366.8036499023438,
"epoch": 0.608,
"grad_norm": 0.7343178391456604,
"kl": 0.196533203125,
"learning_rate": 8.906477750432903e-07,
"loss": 0.1069,
"reward": 0.6113147716969252,
"reward_std": 0.8982192724943161,
"rewards/cosine_scaled_reward": 0.028871658723801374,
"rewards/format_reward": 0.5535714402794838,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2539.6726684570312,
"epoch": 0.612,
"grad_norm": 0.3661244213581085,
"kl": 0.23095703125,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0493,
"reward": 0.3096798346377909,
"reward_std": 0.6143878847360611,
"rewards/cosine_scaled_reward": -0.07432675641030073,
"rewards/format_reward": 0.4583333432674408,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2693.077392578125,
"epoch": 0.616,
"grad_norm": 0.39779341220855713,
"kl": 0.26123046875,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0484,
"reward": 0.20376494899392128,
"reward_std": 0.7454717755317688,
"rewards/cosine_scaled_reward": -0.0856175352819264,
"rewards/format_reward": 0.3750000074505806,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 2522.8333740234375,
"epoch": 0.62,
"grad_norm": 0.7077339291572571,
"kl": 0.275634765625,
"learning_rate": 8.844151714648274e-07,
"loss": 0.1048,
"reward": 0.28493453562259674,
"reward_std": 0.7751601040363312,
"rewards/cosine_scaled_reward": -0.050985115580260754,
"rewards/format_reward": 0.3869047649204731,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 2791.2083740234375,
"epoch": 0.624,
"grad_norm": 0.6277625560760498,
"kl": 0.3359375,
"learning_rate": 8.823049032816478e-07,
"loss": 0.063,
"reward": 0.15741928666830063,
"reward_std": 0.7891719415783882,
"rewards/cosine_scaled_reward": -0.1058141621761024,
"rewards/format_reward": 0.3690476268529892,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2756.0596313476562,
"epoch": 0.628,
"grad_norm": 0.9464259147644043,
"kl": 0.35107421875,
"learning_rate": 8.801784390262943e-07,
"loss": 0.1337,
"reward": 0.20047340355813503,
"reward_std": 0.7717511355876923,
"rewards/cosine_scaled_reward": -0.10214426182210445,
"rewards/format_reward": 0.4047619178891182,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 2546.386993408203,
"epoch": 0.632,
"grad_norm": 0.9672547578811646,
"kl": 0.3583984375,
"learning_rate": 8.780358823396352e-07,
"loss": 0.1309,
"reward": 0.3896455895155668,
"reward_std": 0.8362017869949341,
"rewards/cosine_scaled_reward": -0.025415319949388504,
"rewards/format_reward": 0.4404761902987957,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 2490.3869018554688,
"epoch": 0.636,
"grad_norm": 0.5016924738883972,
"kl": 0.37744140625,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0548,
"reward": 0.3971053212881088,
"reward_std": 0.6817308068275452,
"rewards/cosine_scaled_reward": -0.07823306252248585,
"rewards/format_reward": 0.5535714328289032,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 2715.90478515625,
"epoch": 0.64,
"grad_norm": 0.776878833770752,
"kl": 0.4169921875,
"learning_rate": 8.737029101523929e-07,
"loss": 0.1414,
"reward": 0.37737663462758064,
"reward_std": 0.8348212540149689,
"rewards/cosine_scaled_reward": -0.03452597954310477,
"rewards/format_reward": 0.4464285746216774,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 2636.0357666015625,
"epoch": 0.644,
"grad_norm": 1.2749825716018677,
"kl": 0.48486328125,
"learning_rate": 8.715127058347614e-07,
"loss": 0.1445,
"reward": 0.24750607460737228,
"reward_std": 0.7917188853025436,
"rewards/cosine_scaled_reward": -0.10541364271193743,
"rewards/format_reward": 0.4583333469927311,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2622.9405517578125,
"epoch": 0.648,
"grad_norm": 1.3737562894821167,
"kl": 0.5888671875,
"learning_rate": 8.693068314414344e-07,
"loss": 0.1549,
"reward": 0.10282446062774397,
"reward_std": 0.6833581179380417,
"rewards/cosine_scaled_reward": -0.18073063343763351,
"rewards/format_reward": 0.4642857275903225,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2187.9286193847656,
"epoch": 0.652,
"grad_norm": 1.2476062774658203,
"kl": 0.64453125,
"learning_rate": 8.670853944836176e-07,
"loss": 0.1442,
"reward": 0.5821249708533287,
"reward_std": 0.8525291532278061,
"rewards/cosine_scaled_reward": -0.0303660926874727,
"rewards/format_reward": 0.6428571492433548,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2645.2202758789062,
"epoch": 0.656,
"grad_norm": 0.8759295344352722,
"kl": 0.83203125,
"learning_rate": 8.648485032310144e-07,
"loss": 0.1369,
"reward": 0.354750145226717,
"reward_std": 0.6708278656005859,
"rewards/cosine_scaled_reward": -0.09941063448786736,
"rewards/format_reward": 0.5535714328289032,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 2744.5952758789062,
"epoch": 0.66,
"grad_norm": 1.443908452987671,
"kl": 0.9365234375,
"learning_rate": 8.625962667065487e-07,
"loss": 0.1514,
"reward": 0.07671361323446035,
"reward_std": 0.7401341199874878,
"rewards/cosine_scaled_reward": -0.16997654270380735,
"rewards/format_reward": 0.4166666679084301,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 2762.2381591796875,
"epoch": 0.664,
"grad_norm": 2.171701192855835,
"kl": 1.064453125,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0493,
"reward": 0.3810354620218277,
"reward_std": 0.6359066590666771,
"rewards/cosine_scaled_reward": -0.05650608614087105,
"rewards/format_reward": 0.4940476231276989,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 2410.6726684570312,
"epoch": 0.668,
"grad_norm": 1.1915135383605957,
"kl": 0.9716796875,
"learning_rate": 8.580461976679099e-07,
"loss": 0.1178,
"reward": 0.5956609398126602,
"reward_std": 0.7429262697696686,
"rewards/cosine_scaled_reward": -0.011693337932229042,
"rewards/format_reward": 0.6190476268529892,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 2624.7083740234375,
"epoch": 0.672,
"grad_norm": 1.2750567197799683,
"kl": 1.111328125,
"learning_rate": 8.557485869176825e-07,
"loss": 0.1676,
"reward": 0.35937594436109066,
"reward_std": 0.7485721707344055,
"rewards/cosine_scaled_reward": -0.0613834522664547,
"rewards/format_reward": 0.482142873108387,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 2566.7857666015625,
"epoch": 0.676,
"grad_norm": 0.8985515832901001,
"kl": 1.0439453125,
"learning_rate": 8.534360744126753e-07,
"loss": 0.1232,
"reward": 0.23157138470560312,
"reward_std": 0.6288014650344849,
"rewards/cosine_scaled_reward": -0.14314288273453712,
"rewards/format_reward": 0.5178571492433548,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 2820.5059814453125,
"epoch": 0.68,
"grad_norm": 1.1454522609710693,
"kl": 0.9677734375,
"learning_rate": 8.511087728614862e-07,
"loss": 0.1412,
"reward": 0.08721911488100886,
"reward_std": 0.6948041319847107,
"rewards/cosine_scaled_reward": -0.16769996285438538,
"rewards/format_reward": 0.4226190522313118,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2376.166748046875,
"epoch": 0.684,
"grad_norm": 0.9355194568634033,
"kl": 0.9521484375,
"learning_rate": 8.487667956935087e-07,
"loss": 0.128,
"reward": 0.41750996466726065,
"reward_std": 0.7085302621126175,
"rewards/cosine_scaled_reward": -0.05910217575728893,
"rewards/format_reward": 0.535714291036129,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2571.5000610351562,
"epoch": 0.688,
"grad_norm": 0.9496890902519226,
"kl": 1.0341796875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.147,
"reward": 0.21527537889778614,
"reward_std": 0.6487467139959335,
"rewards/cosine_scaled_reward": -0.2048623152077198,
"rewards/format_reward": 0.6250000149011612,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 2577.839324951172,
"epoch": 0.692,
"grad_norm": 1.125106692314148,
"kl": 1.005859375,
"learning_rate": 8.440392717955475e-07,
"loss": 0.1126,
"reward": 0.29065654147416353,
"reward_std": 0.5777322202920914,
"rewards/cosine_scaled_reward": -0.11955267190933228,
"rewards/format_reward": 0.5297619178891182,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 2297.0416870117188,
"epoch": 0.696,
"grad_norm": 1.5477794408798218,
"kl": 0.9482421875,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0866,
"reward": 0.35003964975476265,
"reward_std": 0.7120198756456375,
"rewards/cosine_scaled_reward": -0.13152779638767242,
"rewards/format_reward": 0.6130952388048172,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2239.952423095703,
"epoch": 0.7,
"grad_norm": 1.1404165029525757,
"kl": 0.7734375,
"learning_rate": 8.392544243589427e-07,
"loss": 0.1326,
"reward": 0.7693988904356956,
"reward_std": 0.8029063045978546,
"rewards/cosine_scaled_reward": 0.0543422931805253,
"rewards/format_reward": 0.6607142984867096,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 2214.148895263672,
"epoch": 0.704,
"grad_norm": 0.976016104221344,
"kl": 0.8193359375,
"learning_rate": 8.368407953869103e-07,
"loss": 0.1005,
"reward": 0.5222894381731749,
"reward_std": 0.6858630776405334,
"rewards/cosine_scaled_reward": -0.07814099243842065,
"rewards/format_reward": 0.6785714477300644,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 2167.4345092773438,
"epoch": 0.708,
"grad_norm": 1.6724809408187866,
"kl": 0.740234375,
"learning_rate": 8.344131861991828e-07,
"loss": 0.1424,
"reward": 0.3468378521502018,
"reward_std": 0.6407709717750549,
"rewards/cosine_scaled_reward": -0.16289059445261955,
"rewards/format_reward": 0.6726190596818924,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 2593.5654907226562,
"epoch": 0.712,
"grad_norm": 1.3712421655654907,
"kl": 0.9814453125,
"learning_rate": 8.319717151140072e-07,
"loss": 0.1121,
"reward": 0.27433447539806366,
"reward_std": 0.6857093423604965,
"rewards/cosine_scaled_reward": -0.16342800296843052,
"rewards/format_reward": 0.6011904925107956,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 2240.9583129882812,
"epoch": 0.716,
"grad_norm": 2.2109479904174805,
"kl": 0.7880859375,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0613,
"reward": 0.3249462991952896,
"reward_std": 0.7396285533905029,
"rewards/cosine_scaled_reward": -0.12919352855533361,
"rewards/format_reward": 0.5833333358168602,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 2391.261962890625,
"epoch": 0.72,
"grad_norm": 0.9252892136573792,
"kl": 0.8369140625,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0766,
"reward": 0.37066294252872467,
"reward_std": 0.5772489011287689,
"rewards/cosine_scaled_reward": -0.15395426377654076,
"rewards/format_reward": 0.6785714477300644,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 2188.21435546875,
"epoch": 0.724,
"grad_norm": 1.4679890871047974,
"kl": 0.7177734375,
"learning_rate": 8.245653237555705e-07,
"loss": 0.1271,
"reward": 0.47163213789463043,
"reward_std": 0.7110278159379959,
"rewards/cosine_scaled_reward": -0.10942202992737293,
"rewards/format_reward": 0.6904762089252472,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 2330.3572387695312,
"epoch": 0.728,
"grad_norm": 0.8398174047470093,
"kl": 0.71875,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0837,
"reward": 0.5167603380978107,
"reward_std": 0.704664558172226,
"rewards/cosine_scaled_reward": -0.08090554922819138,
"rewards/format_reward": 0.6785714402794838,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2319.154815673828,
"epoch": 0.732,
"grad_norm": 1.0028657913208008,
"kl": 0.7421875,
"learning_rate": 8.195606193320136e-07,
"loss": 0.1069,
"reward": 0.6520561873912811,
"reward_std": 0.8034340292215347,
"rewards/cosine_scaled_reward": -0.04301954247057438,
"rewards/format_reward": 0.7380952537059784,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 2419.6131591796875,
"epoch": 0.736,
"grad_norm": 0.9799902439117432,
"kl": 0.794921875,
"learning_rate": 8.170384989716657e-07,
"loss": 0.1,
"reward": 0.5134465768933296,
"reward_std": 0.7416307479143143,
"rewards/cosine_scaled_reward": -0.10637196339666843,
"rewards/format_reward": 0.7261904776096344,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2279.3630981445312,
"epoch": 0.74,
"grad_norm": 1.1403197050094604,
"kl": 0.75390625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.08,
"reward": 0.5693989507853985,
"reward_std": 0.6981105357408524,
"rewards/cosine_scaled_reward": -0.06946719996631145,
"rewards/format_reward": 0.7083333507180214,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2087.2679443359375,
"epoch": 0.744,
"grad_norm": 0.8785580992698669,
"kl": 0.6123046875,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0849,
"reward": 0.4244233965873718,
"reward_std": 0.718925341963768,
"rewards/cosine_scaled_reward": -0.1449311599135399,
"rewards/format_reward": 0.7142857313156128,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 2415.8036499023438,
"epoch": 0.748,
"grad_norm": 1.325434684753418,
"kl": 0.6298828125,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0477,
"reward": 0.594460990279913,
"reward_std": 0.7041322290897369,
"rewards/cosine_scaled_reward": -0.021221883594989777,
"rewards/format_reward": 0.636904776096344,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 2371.0714721679688,
"epoch": 0.752,
"grad_norm": 1.3853912353515625,
"kl": 0.638671875,
"learning_rate": 8.068211054579943e-07,
"loss": 0.1131,
"reward": 0.5956445932388306,
"reward_std": 0.7780069708824158,
"rewards/cosine_scaled_reward": -0.062296761316247284,
"rewards/format_reward": 0.7202381044626236,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 2243.3333740234375,
"epoch": 0.756,
"grad_norm": 0.7066504955291748,
"kl": 0.564453125,
"learning_rate": 8.04235151541222e-07,
"loss": 0.043,
"reward": 0.7391829118132591,
"reward_std": 0.6626263409852982,
"rewards/cosine_scaled_reward": -0.014337139204144478,
"rewards/format_reward": 0.767857164144516,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2111.875030517578,
"epoch": 0.76,
"grad_norm": 1.1808303594589233,
"kl": 0.5361328125,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0212,
"reward": 0.6303885579109192,
"reward_std": 0.7266089022159576,
"rewards/cosine_scaled_reward": -0.04790095146745443,
"rewards/format_reward": 0.7261904925107956,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2512.1607666015625,
"epoch": 0.764,
"grad_norm": 1.169936180114746,
"kl": 0.54736328125,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0239,
"reward": 0.4208872392773628,
"reward_std": 0.6789906620979309,
"rewards/cosine_scaled_reward": -0.12288972595706582,
"rewards/format_reward": 0.6666666716337204,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2421.7381591796875,
"epoch": 0.768,
"grad_norm": 1.9125944375991821,
"kl": 0.44970703125,
"learning_rate": 7.964034505716476e-07,
"loss": 0.162,
"reward": 0.6703099310398102,
"reward_std": 0.7079124301671982,
"rewards/cosine_scaled_reward": 0.022654948756098747,
"rewards/format_reward": 0.625,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 2342.3333435058594,
"epoch": 0.772,
"grad_norm": 1.1848394870758057,
"kl": 0.4287109375,
"learning_rate": 7.93768694627233e-07,
"loss": 0.1217,
"reward": 0.3946942985057831,
"reward_std": 0.7293716818094254,
"rewards/cosine_scaled_reward": -0.14789094775915146,
"rewards/format_reward": 0.6904762089252472,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 2488.1786193847656,
"epoch": 0.776,
"grad_norm": 0.8427687883377075,
"kl": 0.40673828125,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0681,
"reward": 0.33857931289821863,
"reward_std": 0.7693478316068649,
"rewards/cosine_scaled_reward": -0.11642462853342295,
"rewards/format_reward": 0.5714285671710968,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 2235.4762573242188,
"epoch": 0.78,
"grad_norm": 1.9778449535369873,
"kl": 0.4599609375,
"learning_rate": 7.884636689049422e-07,
"loss": 0.1203,
"reward": 0.7276730462908745,
"reward_std": 0.8504652380943298,
"rewards/cosine_scaled_reward": 0.02455079648643732,
"rewards/format_reward": 0.6785714328289032,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2252.7262268066406,
"epoch": 0.784,
"grad_norm": 1.251224398612976,
"kl": 0.49169921875,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0753,
"reward": 0.6360676661133766,
"reward_std": 0.8185366541147232,
"rewards/cosine_scaled_reward": -0.01827568793669343,
"rewards/format_reward": 0.6726190596818924,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2399.7500610351562,
"epoch": 0.788,
"grad_norm": 0.9470409154891968,
"kl": 0.517578125,
"learning_rate": 7.831121542179086e-07,
"loss": 0.1036,
"reward": 0.550631508231163,
"reward_std": 0.7208298593759537,
"rewards/cosine_scaled_reward": -0.037184251472353935,
"rewards/format_reward": 0.625,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2269.5000610351562,
"epoch": 0.792,
"grad_norm": 2.047698974609375,
"kl": 0.595703125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.1831,
"reward": 0.29151881486177444,
"reward_std": 0.666583925485611,
"rewards/cosine_scaled_reward": -0.15483582392334938,
"rewards/format_reward": 0.601190485060215,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 2316.7857666015625,
"epoch": 0.796,
"grad_norm": 1.6713296175003052,
"kl": 0.60205078125,
"learning_rate": 7.777151938545235e-07,
"loss": 0.1356,
"reward": 0.5018086154013872,
"reward_std": 0.8012387007474899,
"rewards/cosine_scaled_reward": -0.0615957040572539,
"rewards/format_reward": 0.6250000223517418,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2471.1964721679688,
"epoch": 0.8,
"grad_norm": 0.9633775949478149,
"kl": 0.740234375,
"learning_rate": 7.75e-07,
"loss": 0.1277,
"reward": 0.3792301341891289,
"reward_std": 0.76199010014534,
"rewards/cosine_scaled_reward": -0.11693255044519901,
"rewards/format_reward": 0.6130952537059784,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2280.6607971191406,
"epoch": 0.804,
"grad_norm": 1.1369765996932983,
"kl": 0.7587890625,
"learning_rate": 7.72273839962904e-07,
"loss": 0.1174,
"reward": 0.4361310079693794,
"reward_std": 0.7977508455514908,
"rewards/cosine_scaled_reward": -0.0736011671833694,
"rewards/format_reward": 0.5833333432674408,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2239.3809814453125,
"epoch": 0.808,
"grad_norm": 1.1852681636810303,
"kl": 0.80078125,
"learning_rate": 7.695368466124296e-07,
"loss": 0.1861,
"reward": 0.4273875653743744,
"reward_std": 0.7939650565385818,
"rewards/cosine_scaled_reward": -0.08987765479832888,
"rewards/format_reward": 0.6071428656578064,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 2270.9524536132812,
"epoch": 0.812,
"grad_norm": 2.2510244846343994,
"kl": 1.05859375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.1778,
"reward": 0.5268369093537331,
"reward_std": 0.7606751769781113,
"rewards/cosine_scaled_reward": -0.05205773119814694,
"rewards/format_reward": 0.630952388048172,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2307.83935546875,
"epoch": 0.816,
"grad_norm": 3.0754034519195557,
"kl": 1.107421875,
"learning_rate": 7.640308940816239e-07,
"loss": 0.1251,
"reward": 0.046380717772990465,
"reward_std": 0.6517826318740845,
"rewards/cosine_scaled_reward": -0.23573821783065796,
"rewards/format_reward": 0.5178571566939354,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2065.482177734375,
"epoch": 0.82,
"grad_norm": 3.317054033279419,
"kl": 0.8037109375,
"learning_rate": 7.612622032536507e-07,
"loss": 0.1229,
"reward": 0.629617914557457,
"reward_std": 0.7360707223415375,
"rewards/cosine_scaled_reward": -0.012572012841701508,
"rewards/format_reward": 0.654761902987957,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2119.2679443359375,
"epoch": 0.824,
"grad_norm": 1.985148310661316,
"kl": 0.70849609375,
"learning_rate": 7.584832158039378e-07,
"loss": 0.1697,
"reward": 0.4503296762704849,
"reward_std": 0.7717154771089554,
"rewards/cosine_scaled_reward": -0.07840658072382212,
"rewards/format_reward": 0.6071428656578064,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2085.839324951172,
"epoch": 0.828,
"grad_norm": 2.4033172130584717,
"kl": 0.6025390625,
"learning_rate": 7.556940671764124e-07,
"loss": 0.1578,
"reward": 0.4145805863663554,
"reward_std": 0.7361099421977997,
"rewards/cosine_scaled_reward": -0.11116209626197815,
"rewards/format_reward": 0.6369047611951828,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1872.482177734375,
"epoch": 0.832,
"grad_norm": 2.11576247215271,
"kl": 0.408203125,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0839,
"reward": 0.4670650511980057,
"reward_std": 0.7250475585460663,
"rewards/cosine_scaled_reward": -0.10872937482781708,
"rewards/format_reward": 0.6845238208770752,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2085.6786499023438,
"epoch": 0.836,
"grad_norm": 0.8786793351173401,
"kl": 0.51806640625,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0649,
"reward": 0.46545055881142616,
"reward_std": 0.6805593073368073,
"rewards/cosine_scaled_reward": -0.09465568419545889,
"rewards/format_reward": 0.6547619104385376,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 2425.7916870117188,
"epoch": 0.84,
"grad_norm": 1.3337445259094238,
"kl": 0.58837890625,
"learning_rate": 7.472670160550848e-07,
"loss": 0.1561,
"reward": 0.4684627018868923,
"reward_std": 0.824245348572731,
"rewards/cosine_scaled_reward": -0.04553056287113577,
"rewards/format_reward": 0.5595238208770752,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 2630.5655517578125,
"epoch": 0.844,
"grad_norm": 1.3039979934692383,
"kl": 0.732421875,
"learning_rate": 7.444385869608921e-07,
"loss": 0.1559,
"reward": 0.1796425711363554,
"reward_std": 0.6979469060897827,
"rewards/cosine_scaled_reward": -0.17803586274385452,
"rewards/format_reward": 0.5357143059372902,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1983.9762268066406,
"epoch": 0.848,
"grad_norm": 0.9129418134689331,
"kl": 0.546875,
"learning_rate": 7.416006812042827e-07,
"loss": 0.1352,
"reward": 0.4564796891063452,
"reward_std": 0.6133182421326637,
"rewards/cosine_scaled_reward": -0.11402205377817154,
"rewards/format_reward": 0.6845238208770752,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2315.5119018554688,
"epoch": 0.852,
"grad_norm": 1.2220977544784546,
"kl": 0.5751953125,
"learning_rate": 7.387534371007797e-07,
"loss": 0.1683,
"reward": 0.6708191484212875,
"reward_std": 0.9547160714864731,
"rewards/cosine_scaled_reward": 0.016957183834165335,
"rewards/format_reward": 0.636904776096344,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2233.0655212402344,
"epoch": 0.856,
"grad_norm": 0.7978451251983643,
"kl": 0.61474609375,
"learning_rate": 7.358969934210438e-07,
"loss": 0.1304,
"reward": 0.40765415877103806,
"reward_std": 0.7158278822898865,
"rewards/cosine_scaled_reward": -0.12057768838712946,
"rewards/format_reward": 0.6488095372915268,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2194.791748046875,
"epoch": 0.86,
"grad_norm": 1.0176509618759155,
"kl": 0.61181640625,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0991,
"reward": 0.5912733934819698,
"reward_std": 0.6540912538766861,
"rewards/cosine_scaled_reward": -0.013887112960219383,
"rewards/format_reward": 0.6190476194024086,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 2189.5535888671875,
"epoch": 0.864,
"grad_norm": 0.7862021923065186,
"kl": 0.6572265625,
"learning_rate": 7.301570646506027e-07,
"loss": 0.1369,
"reward": 0.4810000769793987,
"reward_std": 0.6697472035884857,
"rewards/cosine_scaled_reward": -0.09283328615128994,
"rewards/format_reward": 0.6666666716337204,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 2238.6487731933594,
"epoch": 0.868,
"grad_norm": 0.675116240978241,
"kl": 0.662109375,
"learning_rate": 7.27273859315928e-07,
"loss": 0.1234,
"reward": 0.3597661480307579,
"reward_std": 0.6638298779726028,
"rewards/cosine_scaled_reward": -0.13559313118457794,
"rewards/format_reward": 0.6309523731470108,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2330.7560424804688,
"epoch": 0.872,
"grad_norm": 0.7294526696205139,
"kl": 0.6875,
"learning_rate": 7.243820139034464e-07,
"loss": 0.1182,
"reward": 0.5070892386138439,
"reward_std": 0.770987793803215,
"rewards/cosine_scaled_reward": -0.029193488880991936,
"rewards/format_reward": 0.565476194024086,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 2425.4048461914062,
"epoch": 0.876,
"grad_norm": 0.9955194592475891,
"kl": 0.76171875,
"learning_rate": 7.214816693576234e-07,
"loss": 0.145,
"reward": 0.3850390911102295,
"reward_std": 0.72886823117733,
"rewards/cosine_scaled_reward": -0.11700426135212183,
"rewards/format_reward": 0.6190476417541504,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2443.869110107422,
"epoch": 0.88,
"grad_norm": 0.8245673179626465,
"kl": 0.7412109375,
"learning_rate": 7.185729670371604e-07,
"loss": 0.1517,
"reward": 0.3367026010528207,
"reward_std": 0.6719767898321152,
"rewards/cosine_scaled_reward": -0.1471248921006918,
"rewards/format_reward": 0.630952388048172,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 2401.65478515625,
"epoch": 0.884,
"grad_norm": 0.6434879302978516,
"kl": 0.552734375,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0964,
"reward": 0.5589644331485033,
"reward_std": 0.6387112140655518,
"rewards/cosine_scaled_reward": -0.030041599762625992,
"rewards/format_reward": 0.619047611951828,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 2159.9107666015625,
"epoch": 0.888,
"grad_norm": 0.8747764229774475,
"kl": 0.48974609375,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0648,
"reward": 1.0575831979513168,
"reward_std": 0.8345089554786682,
"rewards/cosine_scaled_reward": 0.15379157848656178,
"rewards/format_reward": 0.7500000149011612,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2736.8095703125,
"epoch": 0.892,
"grad_norm": 1.644534707069397,
"kl": 0.650390625,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0898,
"reward": 0.28782752249389887,
"reward_std": 0.6842672526836395,
"rewards/cosine_scaled_reward": -0.11799101112410426,
"rewards/format_reward": 0.5238095298409462,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 2585.4464721679688,
"epoch": 0.896,
"grad_norm": 0.5411848425865173,
"kl": 0.603515625,
"learning_rate": 7.068574212948169e-07,
"loss": 0.1243,
"reward": 0.49723897874355316,
"reward_std": 0.810086615383625,
"rewards/cosine_scaled_reward": -0.07280909270048141,
"rewards/format_reward": 0.6428571492433548,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2394.8988647460938,
"epoch": 0.9,
"grad_norm": 0.7165555357933044,
"kl": 0.52783203125,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0888,
"reward": 0.5129196643829346,
"reward_std": 0.787805512547493,
"rewards/cosine_scaled_reward": -0.056040180614218116,
"rewards/format_reward": 0.6250000298023224,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2482.7678833007812,
"epoch": 0.904,
"grad_norm": 0.5211958289146423,
"kl": 0.51416015625,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0812,
"reward": 0.4906727410852909,
"reward_std": 0.7880082875490189,
"rewards/cosine_scaled_reward": -0.0641874436987564,
"rewards/format_reward": 0.6190476417541504,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2290.0000610351562,
"epoch": 0.908,
"grad_norm": 0.5630519986152649,
"kl": 0.382568359375,
"learning_rate": 6.979899910323624e-07,
"loss": 0.1034,
"reward": 0.6861637309193611,
"reward_std": 0.7359699308872223,
"rewards/cosine_scaled_reward": -0.049775293562561274,
"rewards/format_reward": 0.7857142984867096,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 2686.7440795898438,
"epoch": 0.912,
"grad_norm": 0.6647688746452332,
"kl": 0.4326171875,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0594,
"reward": 0.5352285588160157,
"reward_std": 0.7634364515542984,
"rewards/cosine_scaled_reward": -0.047861908678896725,
"rewards/format_reward": 0.6309524029493332,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2687.5416564941406,
"epoch": 0.916,
"grad_norm": 0.37424567341804504,
"kl": 0.39208984375,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0465,
"reward": 0.43462158273905516,
"reward_std": 0.6648337990045547,
"rewards/cosine_scaled_reward": -0.1070939814671874,
"rewards/format_reward": 0.6488095298409462,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 2462.8452758789062,
"epoch": 0.92,
"grad_norm": 0.52641361951828,
"kl": 0.37158203125,
"learning_rate": 6.890576474687263e-07,
"loss": 0.1061,
"reward": 0.5536616146564484,
"reward_std": 0.6706894189119339,
"rewards/cosine_scaled_reward": -0.06840727850794792,
"rewards/format_reward": 0.690476194024086,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 2395.279754638672,
"epoch": 0.924,
"grad_norm": 0.5165700912475586,
"kl": 0.369140625,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0951,
"reward": 0.4786584824323654,
"reward_std": 0.774825245141983,
"rewards/cosine_scaled_reward": -0.1267421804368496,
"rewards/format_reward": 0.7321428656578064,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2468.2857666015625,
"epoch": 0.928,
"grad_norm": 0.4581441879272461,
"kl": 0.31591796875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0555,
"reward": 0.6299031171947718,
"reward_std": 0.7808382511138916,
"rewards/cosine_scaled_reward": -0.045167478267103434,
"rewards/format_reward": 0.7202381044626236,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 2712.58935546875,
"epoch": 0.932,
"grad_norm": 0.7744795083999634,
"kl": 0.333984375,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0623,
"reward": 0.6912369206547737,
"reward_std": 0.7789230197668076,
"rewards/cosine_scaled_reward": 0.04204704426229,
"rewards/format_reward": 0.607142873108387,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 2556.619110107422,
"epoch": 0.936,
"grad_norm": 0.8385416865348816,
"kl": 0.38427734375,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0662,
"reward": 0.4830031730234623,
"reward_std": 0.7291474640369415,
"rewards/cosine_scaled_reward": -0.07397460378706455,
"rewards/format_reward": 0.6309523731470108,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2724.8809814453125,
"epoch": 0.94,
"grad_norm": 0.6943939328193665,
"kl": 0.330078125,
"learning_rate": 6.740368101176495e-07,
"loss": 0.1008,
"reward": 0.38701344281435013,
"reward_std": 0.7834271490573883,
"rewards/cosine_scaled_reward": -0.11304090730845928,
"rewards/format_reward": 0.6130952388048172,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 2819.7738647460938,
"epoch": 0.944,
"grad_norm": 0.3916683495044708,
"kl": 0.32177734375,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0365,
"reward": 0.5249419808387756,
"reward_std": 0.8138006925582886,
"rewards/cosine_scaled_reward": -0.023243289440870285,
"rewards/format_reward": 0.571428582072258,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2499.0536499023438,
"epoch": 0.948,
"grad_norm": 0.9175835847854614,
"kl": 0.321533203125,
"learning_rate": 6.679851303883891e-07,
"loss": 0.1055,
"reward": 0.6389507204294205,
"reward_std": 0.8023868650197983,
"rewards/cosine_scaled_reward": -0.04064369201660156,
"rewards/format_reward": 0.7202381044626236,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2557.0655517578125,
"epoch": 0.952,
"grad_norm": 0.4397272765636444,
"kl": 0.30810546875,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0869,
"reward": 0.4888541977852583,
"reward_std": 0.7550098150968552,
"rewards/cosine_scaled_reward": -0.09783481806516647,
"rewards/format_reward": 0.684523805975914,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 2600.422637939453,
"epoch": 0.956,
"grad_norm": 0.9344379305839539,
"kl": 0.345703125,
"learning_rate": 6.619104492241847e-07,
"loss": 0.1329,
"reward": 0.27865387313067913,
"reward_std": 0.6713129729032516,
"rewards/cosine_scaled_reward": -0.18210165202617645,
"rewards/format_reward": 0.6428571417927742,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 2331.8928833007812,
"epoch": 0.96,
"grad_norm": 0.5995355248451233,
"kl": 0.326904296875,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0705,
"reward": 0.7613647617399693,
"reward_std": 0.8133140057325363,
"rewards/cosine_scaled_reward": 0.023539513116702437,
"rewards/format_reward": 0.7142857164144516,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2355.52978515625,
"epoch": 0.964,
"grad_norm": 0.3258729875087738,
"kl": 0.307861328125,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0608,
"reward": 0.6096780672669411,
"reward_std": 0.7518916502594948,
"rewards/cosine_scaled_reward": -0.05230383496382274,
"rewards/format_reward": 0.7142857313156128,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 2414.0834350585938,
"epoch": 0.968,
"grad_norm": 0.3521139919757843,
"kl": 0.3212890625,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0742,
"reward": 0.6254040375351906,
"reward_std": 0.8331593424081802,
"rewards/cosine_scaled_reward": -0.03551226551644504,
"rewards/format_reward": 0.696428582072258,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 2138.107177734375,
"epoch": 0.972,
"grad_norm": 0.5599615573883057,
"kl": 0.33251953125,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0315,
"reward": 0.8373362571001053,
"reward_std": 0.6551230400800705,
"rewards/cosine_scaled_reward": 0.04366813227534294,
"rewards/format_reward": 0.7500000149011612,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 2277.714324951172,
"epoch": 0.976,
"grad_norm": 0.6147165298461914,
"kl": 0.336181640625,
"learning_rate": 6.466308972251785e-07,
"loss": 0.1075,
"reward": 0.46155789494514465,
"reward_std": 0.6391154229640961,
"rewards/cosine_scaled_reward": -0.14124487387016416,
"rewards/format_reward": 0.7440476417541504,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 2401.232208251953,
"epoch": 0.98,
"grad_norm": 0.8454631567001343,
"kl": 0.4814453125,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0417,
"reward": 0.5565547049045563,
"reward_std": 0.6768698394298553,
"rewards/cosine_scaled_reward": -0.04315121428226121,
"rewards/format_reward": 0.6428571492433548,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2027.3453369140625,
"epoch": 0.984,
"grad_norm": 0.38341155648231506,
"kl": 0.289794921875,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0993,
"reward": 0.7784423977136612,
"reward_std": 0.6467820554971695,
"rewards/cosine_scaled_reward": -0.009588314220309258,
"rewards/format_reward": 0.7976190596818924,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 2170.5416564941406,
"epoch": 0.988,
"grad_norm": 0.445024311542511,
"kl": 0.4326171875,
"learning_rate": 6.374054580489873e-07,
"loss": 0.1244,
"reward": 0.6971250772476196,
"reward_std": 0.7919557690620422,
"rewards/cosine_scaled_reward": -0.026437478853040375,
"rewards/format_reward": 0.7500000149011612,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 2060.2559814453125,
"epoch": 0.992,
"grad_norm": 0.49659866094589233,
"kl": 0.36865234375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0959,
"reward": 0.6287773251533508,
"reward_std": 0.7386345416307449,
"rewards/cosine_scaled_reward": -0.060611339285969734,
"rewards/format_reward": 0.7500000149011612,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 2148.869140625,
"epoch": 0.996,
"grad_norm": 0.4539166986942291,
"kl": 0.3701171875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.1019,
"reward": 0.4996798560023308,
"reward_std": 0.6163481399416924,
"rewards/cosine_scaled_reward": -0.11027912324061617,
"rewards/format_reward": 0.7202381044626236,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2330.3482971191406,
"epoch": 1.0,
"grad_norm": 0.5344291925430298,
"kl": 0.5205078125,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0866,
"reward": 0.42578159645199776,
"reward_std": 0.7348527163267136,
"rewards/cosine_scaled_reward": -0.08472825400531292,
"rewards/format_reward": 0.5952381044626236,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2106.059539794922,
"epoch": 1.004,
"grad_norm": 0.5930522680282593,
"kl": 0.38232421875,
"learning_rate": 6.25045936022246e-07,
"loss": 0.1423,
"reward": 0.5456876549869776,
"reward_std": 0.6847013607621193,
"rewards/cosine_scaled_reward": -0.0842990386299789,
"rewards/format_reward": 0.7142857164144516,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 2112.4762573242188,
"epoch": 1.008,
"grad_norm": 0.4610899090766907,
"kl": 0.39111328125,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0641,
"reward": 0.6148004308342934,
"reward_std": 0.6790047585964203,
"rewards/cosine_scaled_reward": -0.05867121648043394,
"rewards/format_reward": 0.7321428805589676,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2289.309600830078,
"epoch": 1.012,
"grad_norm": 0.3950199782848358,
"kl": 0.387451171875,
"learning_rate": 6.188436263278172e-07,
"loss": 0.1336,
"reward": 0.626802071928978,
"reward_std": 0.6337872818112373,
"rewards/cosine_scaled_reward": -0.028860883321613073,
"rewards/format_reward": 0.6845238283276558,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2465.6130981445312,
"epoch": 1.016,
"grad_norm": 0.6084108352661133,
"kl": 0.45947265625,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0932,
"reward": 0.6250473670661449,
"reward_std": 0.7445118278264999,
"rewards/cosine_scaled_reward": 2.3671891540288925e-05,
"rewards/format_reward": 0.6250000074505806,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2127.8988647460938,
"epoch": 1.02,
"grad_norm": 0.8596522212028503,
"kl": 0.368896484375,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0589,
"reward": 0.4597589522600174,
"reward_std": 0.710930123925209,
"rewards/cosine_scaled_reward": -0.1361919562332332,
"rewards/format_reward": 0.7321428656578064,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 2113.029815673828,
"epoch": 1.024,
"grad_norm": 0.6557802557945251,
"kl": 0.39306640625,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0808,
"reward": 0.7969172149896622,
"reward_std": 0.7165066450834274,
"rewards/cosine_scaled_reward": 0.02643477637320757,
"rewards/format_reward": 0.7440476268529892,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2350.4881591796875,
"epoch": 1.028,
"grad_norm": 0.7259902954101562,
"kl": 0.37548828125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0556,
"reward": 0.6144686937332153,
"reward_std": 0.7161982655525208,
"rewards/cosine_scaled_reward": -0.0052656568586826324,
"rewards/format_reward": 0.6250000149011612,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2659.2500610351562,
"epoch": 1.032,
"grad_norm": 0.6974296569824219,
"kl": 0.4482421875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.1425,
"reward": 0.38613639771938324,
"reward_std": 0.7526693046092987,
"rewards/cosine_scaled_reward": -0.10455084778368473,
"rewards/format_reward": 0.595238097012043,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 2219.6964721679688,
"epoch": 1.036,
"grad_norm": 0.5528798699378967,
"kl": 0.33984375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.1191,
"reward": 0.6971464306116104,
"reward_std": 0.7383679300546646,
"rewards/cosine_scaled_reward": -0.023450596883776598,
"rewards/format_reward": 0.744047611951828,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 2010.1726989746094,
"epoch": 1.04,
"grad_norm": 0.36631372570991516,
"kl": 0.30126953125,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0699,
"reward": 0.771461233496666,
"reward_std": 0.5148339942097664,
"rewards/cosine_scaled_reward": -0.01307891309261322,
"rewards/format_reward": 0.7976190596818924,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 2153.559539794922,
"epoch": 1.044,
"grad_norm": 0.48435378074645996,
"kl": 0.3251953125,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0931,
"reward": 0.5015835016965866,
"reward_std": 0.69777412712574,
"rewards/cosine_scaled_reward": -0.121232058852911,
"rewards/format_reward": 0.7440476268529892,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 2309.1131591796875,
"epoch": 1.048,
"grad_norm": 0.6150787472724915,
"kl": 0.37060546875,
"learning_rate": 5.907846610890011e-07,
"loss": 0.1074,
"reward": 0.656824603676796,
"reward_std": 0.7539815902709961,
"rewards/cosine_scaled_reward": -0.025754368398338556,
"rewards/format_reward": 0.7083333283662796,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2073.1488647460938,
"epoch": 1.052,
"grad_norm": 0.5915967226028442,
"kl": 0.32958984375,
"learning_rate": 5.87655029499542e-07,
"loss": 0.1016,
"reward": 0.5839189141988754,
"reward_std": 0.6906930133700371,
"rewards/cosine_scaled_reward": -0.10089768993202597,
"rewards/format_reward": 0.7857142984867096,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 2258.6607971191406,
"epoch": 1.056,
"grad_norm": 0.5032393932342529,
"kl": 0.421875,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0833,
"reward": 0.7445018216967583,
"reward_std": 0.7239043861627579,
"rewards/cosine_scaled_reward": 0.0002271006815135479,
"rewards/format_reward": 0.7440476417541504,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 2421.386962890625,
"epoch": 1.06,
"grad_norm": 0.5948444604873657,
"kl": 0.46826171875,
"learning_rate": 5.813904131848564e-07,
"loss": 0.1342,
"reward": 0.3432777523994446,
"reward_std": 0.7306928038597107,
"rewards/cosine_scaled_reward": -0.1527658887207508,
"rewards/format_reward": 0.6488095223903656,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 1943.8214416503906,
"epoch": 1.064,
"grad_norm": 0.672618567943573,
"kl": 0.33251953125,
"learning_rate": 5.78255733788191e-07,
"loss": 0.074,
"reward": 0.5523176118731499,
"reward_std": 0.6472664028406143,
"rewards/cosine_scaled_reward": -0.08693643007427454,
"rewards/format_reward": 0.7261904776096344,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2289.4107666015625,
"epoch": 1.068,
"grad_norm": 0.43480613827705383,
"kl": 0.41064453125,
"learning_rate": 5.751196772469237e-07,
"loss": 0.116,
"reward": 0.6816908866167068,
"reward_std": 0.7700821459293365,
"rewards/cosine_scaled_reward": -0.01034504920244217,
"rewards/format_reward": 0.70238097012043,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2265.166748046875,
"epoch": 1.072,
"grad_norm": 0.8894410729408264,
"kl": 0.37890625,
"learning_rate": 5.71982396408026e-07,
"loss": 0.1102,
"reward": 0.5768959820270538,
"reward_std": 0.7392304837703705,
"rewards/cosine_scaled_reward": -0.04786152858287096,
"rewards/format_reward": 0.6726190447807312,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 2102.4345092773438,
"epoch": 1.076,
"grad_norm": 1.40628182888031,
"kl": 0.34814453125,
"learning_rate": 5.688440441781398e-07,
"loss": 0.1523,
"reward": 0.6540864631533623,
"reward_std": 0.7483679950237274,
"rewards/cosine_scaled_reward": -0.030099631054326892,
"rewards/format_reward": 0.7142857164144516,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 1760.5893249511719,
"epoch": 1.08,
"grad_norm": 0.655262291431427,
"kl": 0.34228515625,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0938,
"reward": 0.7075737789273262,
"reward_std": 0.712226152420044,
"rewards/cosine_scaled_reward": -0.045022654812783,
"rewards/format_reward": 0.7976190596818924,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1989.1488037109375,
"epoch": 1.084,
"grad_norm": 0.5984042286872864,
"kl": 0.3974609375,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0893,
"reward": 0.5623346008360386,
"reward_std": 0.7052316814661026,
"rewards/cosine_scaled_reward": -0.08192794572096318,
"rewards/format_reward": 0.7261904925107956,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 1998.327392578125,
"epoch": 1.088,
"grad_norm": 0.41462650895118713,
"kl": 0.37939453125,
"learning_rate": 5.594240889475106e-07,
"loss": 0.1384,
"reward": 0.6586858294904232,
"reward_std": 0.8071554154157639,
"rewards/cosine_scaled_reward": -0.018871376756578684,
"rewards/format_reward": 0.696428582072258,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 1905.9880981445312,
"epoch": 1.092,
"grad_norm": 1.1817877292633057,
"kl": 0.4287109375,
"learning_rate": 5.562829811526154e-07,
"loss": 0.108,
"reward": 0.585694283246994,
"reward_std": 0.6987177431583405,
"rewards/cosine_scaled_reward": -0.08512906730175018,
"rewards/format_reward": 0.755952388048172,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1958.0892639160156,
"epoch": 1.096,
"grad_norm": 0.6756201982498169,
"kl": 0.44580078125,
"learning_rate": 5.531415671340826e-07,
"loss": 0.1298,
"reward": 0.5423668641597033,
"reward_std": 0.5766877979040146,
"rewards/cosine_scaled_reward": -0.09786419570446014,
"rewards/format_reward": 0.7380952537059784,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1423.4405212402344,
"epoch": 1.1,
"grad_norm": 0.9936150908470154,
"kl": 0.283203125,
"learning_rate": 5.5e-07,
"loss": 0.0068,
"reward": 0.8336242958903313,
"reward_std": 0.6556554213166237,
"rewards/cosine_scaled_reward": -0.02366404954227619,
"rewards/format_reward": 0.8809524178504944,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1477.011962890625,
"epoch": 1.104,
"grad_norm": 1.4654834270477295,
"kl": 0.30419921875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.1583,
"reward": 0.9086148589849472,
"reward_std": 0.7289283871650696,
"rewards/cosine_scaled_reward": 0.0197836235165596,
"rewards/format_reward": 0.8690476417541504,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 1600.8333740234375,
"epoch": 1.108,
"grad_norm": 0.5991122126579285,
"kl": 0.39697265625,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0615,
"reward": 0.6998666599392891,
"reward_std": 0.6800315380096436,
"rewards/cosine_scaled_reward": -0.06375712971203029,
"rewards/format_reward": 0.82738097012043,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2001.3035583496094,
"epoch": 1.112,
"grad_norm": 0.9033568501472473,
"kl": 0.4404296875,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0566,
"reward": 0.5947119817137718,
"reward_std": 0.6757695525884628,
"rewards/cosine_scaled_reward": -0.0806201882660389,
"rewards/format_reward": 0.755952388048172,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1884.9286499023438,
"epoch": 1.116,
"grad_norm": 1.0505043268203735,
"kl": 0.41162109375,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0838,
"reward": 0.546771340072155,
"reward_std": 0.5643983408808708,
"rewards/cosine_scaled_reward": -0.13137624226510525,
"rewards/format_reward": 0.8095238208770752,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 1721.6309814453125,
"epoch": 1.12,
"grad_norm": 2.6171982288360596,
"kl": 0.35400390625,
"learning_rate": 5.342952264838747e-07,
"loss": 0.119,
"reward": 0.7959851026535034,
"reward_std": 0.6236628741025925,
"rewards/cosine_scaled_reward": -0.03057891083881259,
"rewards/format_reward": 0.8571428805589676,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1974.3809814453125,
"epoch": 1.124,
"grad_norm": 0.9569424390792847,
"kl": 0.4814453125,
"learning_rate": 5.311559558218603e-07,
"loss": 0.1494,
"reward": 0.573462575674057,
"reward_std": 0.6640851646661758,
"rewards/cosine_scaled_reward": -0.09422110859304667,
"rewards/format_reward": 0.761904776096344,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 1699.2142944335938,
"epoch": 1.1280000000000001,
"grad_norm": 0.5432654619216919,
"kl": 0.33935546875,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0877,
"reward": 0.7524446099996567,
"reward_std": 0.6557567343115807,
"rewards/cosine_scaled_reward": -0.04342056508176029,
"rewards/format_reward": 0.8392857164144516,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2023.1012573242188,
"epoch": 1.1320000000000001,
"grad_norm": 1.5788854360580444,
"kl": 0.498046875,
"learning_rate": 5.248803227530763e-07,
"loss": 0.1449,
"reward": 0.471544723957777,
"reward_std": 0.7016247361898422,
"rewards/cosine_scaled_reward": -0.14220385067164898,
"rewards/format_reward": 0.755952388048172,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 1751.6190795898438,
"epoch": 1.1360000000000001,
"grad_norm": 0.8654693365097046,
"kl": 0.45654296875,
"learning_rate": 5.21744266211809e-07,
"loss": 0.096,
"reward": 0.8401590138673782,
"reward_std": 0.7027324140071869,
"rewards/cosine_scaled_reward": -0.008491916581988335,
"rewards/format_reward": 0.8571428805589676,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 1953.1309814453125,
"epoch": 1.1400000000000001,
"grad_norm": 0.7724223732948303,
"kl": 0.43017578125,
"learning_rate": 5.186095868151436e-07,
"loss": 0.1257,
"reward": 0.5251086875796318,
"reward_std": 0.75553198158741,
"rewards/cosine_scaled_reward": -0.11244566680397838,
"rewards/format_reward": 0.7500000298023224,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1932.9940795898438,
"epoch": 1.144,
"grad_norm": 0.6642920970916748,
"kl": 0.470703125,
"learning_rate": 5.154764373429315e-07,
"loss": 0.096,
"reward": 0.8715938031673431,
"reward_std": 0.7678115516901016,
"rewards/cosine_scaled_reward": 0.036987369414418936,
"rewards/format_reward": 0.7976190745830536,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1784.1428527832031,
"epoch": 1.148,
"grad_norm": 0.9823849201202393,
"kl": 0.38134765625,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0437,
"reward": 0.7326274067163467,
"reward_std": 0.6021066680550575,
"rewards/cosine_scaled_reward": -0.044400574173778296,
"rewards/format_reward": 0.8214285969734192,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 1929.0536193847656,
"epoch": 1.152,
"grad_norm": 2.430745840072632,
"kl": 0.45458984375,
"learning_rate": 5.09215338910999e-07,
"loss": 0.1275,
"reward": 0.76754130423069,
"reward_std": 0.6635829508304596,
"rewards/cosine_scaled_reward": -0.0001579252420924604,
"rewards/format_reward": 0.7678571492433548,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1567.4226684570312,
"epoch": 1.156,
"grad_norm": 1.8522855043411255,
"kl": 0.35302734375,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0659,
"reward": 0.8090793639421463,
"reward_std": 0.6970714181661606,
"rewards/cosine_scaled_reward": -0.02105556521564722,
"rewards/format_reward": 0.8511905074119568,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1916.0298156738281,
"epoch": 1.16,
"grad_norm": 0.8320524096488953,
"kl": 0.353515625,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0397,
"reward": 0.8147249445319176,
"reward_std": 0.7559010833501816,
"rewards/cosine_scaled_reward": 0.014505308354273438,
"rewards/format_reward": 0.7857143133878708,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 1871.5595703125,
"epoch": 1.164,
"grad_norm": 1.639461636543274,
"kl": 0.44482421875,
"learning_rate": 4.998389805071536e-07,
"loss": 0.02,
"reward": 0.7966814041137695,
"reward_std": 0.6868171393871307,
"rewards/cosine_scaled_reward": -0.03320692107081413,
"rewards/format_reward": 0.8630952686071396,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1849.0714721679688,
"epoch": 1.168,
"grad_norm": 0.9159106016159058,
"kl": 0.41357421875,
"learning_rate": 4.967182142620745e-07,
"loss": 0.1098,
"reward": 0.8123535662889481,
"reward_std": 0.7406510710716248,
"rewards/cosine_scaled_reward": -0.0075137000530958176,
"rewards/format_reward": 0.82738097012043,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1627.2262268066406,
"epoch": 1.172,
"grad_norm": 1.2907826900482178,
"kl": 0.3271484375,
"learning_rate": 4.93600044896063e-07,
"loss": 0.028,
"reward": 0.7378726750612259,
"reward_std": 0.6904594451189041,
"rewards/cosine_scaled_reward": -0.07153987139463425,
"rewards/format_reward": 0.8809524029493332,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 2141.2202758789062,
"epoch": 1.176,
"grad_norm": 0.7737708687782288,
"kl": 0.4482421875,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0644,
"reward": 0.7625293210148811,
"reward_std": 0.7152971476316452,
"rewards/cosine_scaled_reward": 0.009240844286978245,
"rewards/format_reward": 0.7440476417541504,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 2187.3809814453125,
"epoch": 1.18,
"grad_norm": 1.1525542736053467,
"kl": 0.51025390625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0634,
"reward": 0.5901899486780167,
"reward_std": 0.6728092133998871,
"rewards/cosine_scaled_reward": -0.0739526596153155,
"rewards/format_reward": 0.7380952537059784,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 2388.577392578125,
"epoch": 1.184,
"grad_norm": 0.9084761738777161,
"kl": 0.52734375,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0587,
"reward": 0.4302752036601305,
"reward_std": 0.615352213382721,
"rewards/cosine_scaled_reward": -0.12117192603182048,
"rewards/format_reward": 0.6726190596818924,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1568.6786193847656,
"epoch": 1.188,
"grad_norm": 0.852024495601654,
"kl": 0.1533203125,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0574,
"reward": 0.7380149587988853,
"reward_std": 0.7155523598194122,
"rewards/cosine_scaled_reward": -0.029802043922245502,
"rewards/format_reward": 0.7976190596818924,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1983.4524230957031,
"epoch": 1.192,
"grad_norm": 0.7617373466491699,
"kl": 0.303955078125,
"learning_rate": 4.780534655386743e-07,
"loss": 0.068,
"reward": 0.7127486318349838,
"reward_std": 0.7076264545321465,
"rewards/cosine_scaled_reward": -0.018625682685524225,
"rewards/format_reward": 0.7500000149011612,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 2038.3750305175781,
"epoch": 1.196,
"grad_norm": 0.8094474673271179,
"kl": 0.25830078125,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0566,
"reward": 0.6301854252815247,
"reward_std": 0.6336864829063416,
"rewards/cosine_scaled_reward": -0.036097751930356026,
"rewards/format_reward": 0.7023809552192688,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 1825.9643249511719,
"epoch": 1.2,
"grad_norm": 1.8039993047714233,
"kl": 0.225830078125,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0501,
"reward": 0.9151953011751175,
"reward_std": 0.6518659368157387,
"rewards/cosine_scaled_reward": 0.03200240898877382,
"rewards/format_reward": 0.8511905074119568,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1954.3809814453125,
"epoch": 1.204,
"grad_norm": 0.9098180532455444,
"kl": 0.2685546875,
"learning_rate": 4.68766384637248e-07,
"loss": 0.08,
"reward": 0.903901144862175,
"reward_std": 0.7074443101882935,
"rewards/cosine_scaled_reward": 0.059093400835990906,
"rewards/format_reward": 0.7857142835855484,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2221.6012268066406,
"epoch": 1.208,
"grad_norm": 0.628447949886322,
"kl": 0.297119140625,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0225,
"reward": 0.7435066364705563,
"reward_std": 0.7286128550767899,
"rewards/cosine_scaled_reward": 0.002705696038901806,
"rewards/format_reward": 0.7380952537059784,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1967.6607360839844,
"epoch": 1.212,
"grad_norm": 1.4870760440826416,
"kl": 0.2193603515625,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0076,
"reward": 0.6118638888001442,
"reward_std": 0.6256552934646606,
"rewards/cosine_scaled_reward": -0.08990138117223978,
"rewards/format_reward": 0.7916666716337204,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 2091.8810119628906,
"epoch": 1.216,
"grad_norm": 1.0213916301727295,
"kl": 0.28857421875,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0819,
"reward": 0.8858746439218521,
"reward_std": 0.760543704032898,
"rewards/cosine_scaled_reward": 0.04412779211997986,
"rewards/format_reward": 0.7976190596818924,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 2117.5357971191406,
"epoch": 1.22,
"grad_norm": 1.1696289777755737,
"kl": 0.266845703125,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0319,
"reward": 0.4878672659397125,
"reward_std": 0.5883132815361023,
"rewards/cosine_scaled_reward": -0.13999494537711143,
"rewards/format_reward": 0.7678571492433548,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1962.8512268066406,
"epoch": 1.224,
"grad_norm": 1.1181604862213135,
"kl": 0.26171875,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0553,
"reward": 0.8040256127715111,
"reward_std": 0.7542890757322311,
"rewards/cosine_scaled_reward": 0.00915566342882812,
"rewards/format_reward": 0.7857143059372902,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 2503.541717529297,
"epoch": 1.228,
"grad_norm": 1.0075181722640991,
"kl": 0.274658203125,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0639,
"reward": 0.5502185635268688,
"reward_std": 0.7036140263080597,
"rewards/cosine_scaled_reward": -0.043343101628124714,
"rewards/format_reward": 0.6369047686457634,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2027.6905212402344,
"epoch": 1.232,
"grad_norm": 2.7786951065063477,
"kl": 0.265380859375,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.1529,
"reward": 0.8017951250076294,
"reward_std": 0.7912951856851578,
"rewards/cosine_scaled_reward": 0.005064212018623948,
"rewards/format_reward": 0.7916666865348816,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 2560.6845703125,
"epoch": 1.236,
"grad_norm": 1.6693713665008545,
"kl": 0.2939453125,
"learning_rate": 4.441860491038345e-07,
"loss": 0.1046,
"reward": 0.6068699322640896,
"reward_std": 0.7445466667413712,
"rewards/cosine_scaled_reward": -0.00013647368177771568,
"rewards/format_reward": 0.6071428656578064,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 2221.3988037109375,
"epoch": 1.24,
"grad_norm": 0.7167072892189026,
"kl": 0.253173828125,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.046,
"reward": 0.5108997635543346,
"reward_std": 0.6983606815338135,
"rewards/cosine_scaled_reward": -0.09276440553367138,
"rewards/format_reward": 0.696428582072258,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 2308.1012573242188,
"epoch": 1.244,
"grad_norm": 1.289093255996704,
"kl": 0.24072265625,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.074,
"reward": 0.49418094009160995,
"reward_std": 0.6803844273090363,
"rewards/cosine_scaled_reward": -0.0862428704276681,
"rewards/format_reward": 0.6666666716337204,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 2221.500030517578,
"epoch": 1.248,
"grad_norm": 0.7747544646263123,
"kl": 0.284423828125,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0127,
"reward": 0.5928547494113445,
"reward_std": 0.6995180547237396,
"rewards/cosine_scaled_reward": -0.0875012082979083,
"rewards/format_reward": 0.7678571492433548,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2528.3750610351562,
"epoch": 1.252,
"grad_norm": 0.9067274928092957,
"kl": 0.261962890625,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0588,
"reward": 0.580617468804121,
"reward_std": 0.7565959244966507,
"rewards/cosine_scaled_reward": -0.05195318069308996,
"rewards/format_reward": 0.6845238208770752,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2290.434539794922,
"epoch": 1.256,
"grad_norm": 1.0397149324417114,
"kl": 0.2939453125,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0843,
"reward": 0.923637330532074,
"reward_std": 0.8029050081968307,
"rewards/cosine_scaled_reward": 0.07491390081122518,
"rewards/format_reward": 0.7738095372915268,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2221.1785583496094,
"epoch": 1.26,
"grad_norm": 0.8451793789863586,
"kl": 0.2978515625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0794,
"reward": 0.9175606220960617,
"reward_std": 0.6950835883617401,
"rewards/cosine_scaled_reward": 0.06592314876616001,
"rewards/format_reward": 0.7857142984867096,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 2737.1131591796875,
"epoch": 1.264,
"grad_norm": 0.8914613723754883,
"kl": 0.4072265625,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0266,
"reward": 0.39799112919718027,
"reward_std": 0.5211281925439835,
"rewards/cosine_scaled_reward": -0.0896949004381895,
"rewards/format_reward": 0.5773809626698494,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2356.8155517578125,
"epoch": 1.268,
"grad_norm": 0.8658885955810547,
"kl": 0.32080078125,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0558,
"reward": 0.4528093598783016,
"reward_std": 0.5718662440776825,
"rewards/cosine_scaled_reward": -0.11585722491145134,
"rewards/format_reward": 0.6845238283276558,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 2067.0952758789062,
"epoch": 1.272,
"grad_norm": 0.6174459457397461,
"kl": 0.2724609375,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0532,
"reward": 0.9527914822101593,
"reward_std": 0.7573249191045761,
"rewards/cosine_scaled_reward": 0.059729063883423805,
"rewards/format_reward": 0.833333358168602,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 2158.3631286621094,
"epoch": 1.276,
"grad_norm": 0.5749858617782593,
"kl": 0.2744140625,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0398,
"reward": 0.7759583368897438,
"reward_std": 0.7076128423213959,
"rewards/cosine_scaled_reward": 0.00702677620574832,
"rewards/format_reward": 0.761904776096344,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2376.7857666015625,
"epoch": 1.28,
"grad_norm": 0.4824450612068176,
"kl": 0.358642578125,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0579,
"reward": 0.5863704346120358,
"reward_std": 0.69185970723629,
"rewards/cosine_scaled_reward": -0.058005278930068016,
"rewards/format_reward": 0.7023809552192688,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 2094.6786193847656,
"epoch": 1.284,
"grad_norm": 1.153307318687439,
"kl": 0.32373046875,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0147,
"reward": 0.5667938031256199,
"reward_std": 0.6206858605146408,
"rewards/cosine_scaled_reward": -0.12434119766112417,
"rewards/format_reward": 0.8154762089252472,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 2360.970245361328,
"epoch": 1.288,
"grad_norm": 0.7556703090667725,
"kl": 0.314697265625,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0413,
"reward": 0.54334956407547,
"reward_std": 0.7112371101975441,
"rewards/cosine_scaled_reward": -0.10927761369384825,
"rewards/format_reward": 0.761904776096344,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 2466.9703369140625,
"epoch": 1.292,
"grad_norm": 0.6241899728775024,
"kl": 0.345703125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0463,
"reward": 0.5620089694857597,
"reward_std": 0.6381285488605499,
"rewards/cosine_scaled_reward": -0.0672098146751523,
"rewards/format_reward": 0.696428582072258,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 2297.2916564941406,
"epoch": 1.296,
"grad_norm": 1.050784945487976,
"kl": 0.289306640625,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.095,
"reward": 0.6569867879152298,
"reward_std": 0.6581598520278931,
"rewards/cosine_scaled_reward": -0.034601859748363495,
"rewards/format_reward": 0.7261904925107956,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2391.8095703125,
"epoch": 1.3,
"grad_norm": 0.5910518169403076,
"kl": 0.327392578125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.065,
"reward": 0.6689947620034218,
"reward_std": 0.5862837731838226,
"rewards/cosine_scaled_reward": -0.0434788279235363,
"rewards/format_reward": 0.7559524029493332,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2161.34521484375,
"epoch": 1.304,
"grad_norm": 1.3934383392333984,
"kl": 0.2412109375,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0952,
"reward": 0.7927189618349075,
"reward_std": 0.8861154615879059,
"rewards/cosine_scaled_reward": 0.03624042624142021,
"rewards/format_reward": 0.7202381044626236,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 2205.75,
"epoch": 1.308,
"grad_norm": 0.5909004211425781,
"kl": 0.276611328125,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0265,
"reward": 0.7868844717741013,
"reward_std": 0.6631656885147095,
"rewards/cosine_scaled_reward": 0.024394613516051322,
"rewards/format_reward": 0.7380952388048172,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2526.71435546875,
"epoch": 1.312,
"grad_norm": 0.37658610939979553,
"kl": 0.30908203125,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0593,
"reward": 0.3922804482281208,
"reward_std": 0.7164648473262787,
"rewards/cosine_scaled_reward": -0.11933596897870302,
"rewards/format_reward": 0.6309523731470108,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 2406.6726684570312,
"epoch": 1.316,
"grad_norm": 0.5439748764038086,
"kl": 0.28759765625,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0395,
"reward": 0.457830130122602,
"reward_std": 0.6897861212491989,
"rewards/cosine_scaled_reward": -0.10739446245133877,
"rewards/format_reward": 0.6726190447807312,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2233.6548461914062,
"epoch": 1.32,
"grad_norm": 1.2243571281433105,
"kl": 0.289306640625,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.1087,
"reward": 0.6516863703727722,
"reward_std": 0.7036527991294861,
"rewards/cosine_scaled_reward": -0.08189492486417294,
"rewards/format_reward": 0.8154762089252472,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 2212.7560119628906,
"epoch": 1.324,
"grad_norm": 0.8144615888595581,
"kl": 0.28857421875,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0775,
"reward": 0.5815620422363281,
"reward_std": 0.5177476480603218,
"rewards/cosine_scaled_reward": -0.042552310740575194,
"rewards/format_reward": 0.6666666865348816,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 2412.0059814453125,
"epoch": 1.328,
"grad_norm": 0.42855292558670044,
"kl": 0.3232421875,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0791,
"reward": 0.642042949795723,
"reward_std": 0.6289803832769394,
"rewards/cosine_scaled_reward": -0.04207377042621374,
"rewards/format_reward": 0.7261904925107956,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 2163.9822387695312,
"epoch": 1.332,
"grad_norm": 1.0114275217056274,
"kl": 0.255859375,
"learning_rate": 3.72726140684072e-07,
"loss": 0.088,
"reward": 0.811268161451153,
"reward_std": 0.6822613030672073,
"rewards/cosine_scaled_reward": 0.042538831010460854,
"rewards/format_reward": 0.7261904925107956,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 2329.0178833007812,
"epoch": 1.336,
"grad_norm": 0.7170870900154114,
"kl": 0.3486328125,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0455,
"reward": 0.8848401606082916,
"reward_std": 0.7328508943319321,
"rewards/cosine_scaled_reward": 0.04361054569017142,
"rewards/format_reward": 0.7976190745830536,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2709.2262573242188,
"epoch": 1.34,
"grad_norm": 0.47293010354042053,
"kl": 0.38037109375,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0416,
"reward": 0.3898888286203146,
"reward_std": 0.6401937156915665,
"rewards/cosine_scaled_reward": -0.10862701199948788,
"rewards/format_reward": 0.6071428582072258,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 2530.416748046875,
"epoch": 1.3439999999999999,
"grad_norm": 0.4423607885837555,
"kl": 0.282958984375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0446,
"reward": 0.6725399196147919,
"reward_std": 0.7871751934289932,
"rewards/cosine_scaled_reward": 0.0059128133580088615,
"rewards/format_reward": 0.6607142984867096,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2311.5358276367188,
"epoch": 1.3479999999999999,
"grad_norm": 0.5007253885269165,
"kl": 0.3203125,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0455,
"reward": 0.8073793947696686,
"reward_std": 0.7870100140571594,
"rewards/cosine_scaled_reward": 0.010832530329935253,
"rewards/format_reward": 0.7857142984867096,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2489.7500610351562,
"epoch": 1.3519999999999999,
"grad_norm": 0.36444640159606934,
"kl": 0.305908203125,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0652,
"reward": 0.6751855611801147,
"reward_std": 0.6701688021421432,
"rewards/cosine_scaled_reward": 0.001283254474401474,
"rewards/format_reward": 0.6726190596818924,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 2460.8154907226562,
"epoch": 1.3559999999999999,
"grad_norm": 0.43892228603363037,
"kl": 0.3369140625,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0519,
"reward": 0.6638183146715164,
"reward_std": 0.770327016711235,
"rewards/cosine_scaled_reward": 0.010480590397492051,
"rewards/format_reward": 0.6428571566939354,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2244.3631896972656,
"epoch": 1.3599999999999999,
"grad_norm": 0.6102768778800964,
"kl": 0.31201171875,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0694,
"reward": 0.8422182202339172,
"reward_std": 0.6671302318572998,
"rewards/cosine_scaled_reward": 0.04610910080373287,
"rewards/format_reward": 0.7500000074505806,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 2239.75,
"epoch": 1.3639999999999999,
"grad_norm": 0.6582260727882385,
"kl": 0.3271484375,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.076,
"reward": 0.6709855943918228,
"reward_std": 0.7041856721043587,
"rewards/cosine_scaled_reward": -0.03355482150800526,
"rewards/format_reward": 0.7380952537059784,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2438.3214721679688,
"epoch": 1.3679999999999999,
"grad_norm": 0.5521511435508728,
"kl": 0.320556640625,
"learning_rate": 3.471051066897562e-07,
"loss": 0.047,
"reward": 0.6942454129457474,
"reward_std": 0.6340186148881912,
"rewards/cosine_scaled_reward": 0.010813180379045662,
"rewards/format_reward": 0.6726190745830536,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 2433.6488647460938,
"epoch": 1.3719999999999999,
"grad_norm": 0.928674042224884,
"kl": 0.40478515625,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0328,
"reward": 0.5231252759695053,
"reward_std": 0.7485495656728745,
"rewards/cosine_scaled_reward": -0.11641356535255909,
"rewards/format_reward": 0.755952388048172,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2388.2202758789062,
"epoch": 1.376,
"grad_norm": 0.43529966473579407,
"kl": 0.32080078125,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0303,
"reward": 0.7449862584471703,
"reward_std": 0.6971839666366577,
"rewards/cosine_scaled_reward": 0.024278827477246523,
"rewards/format_reward": 0.696428582072258,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 2686.15478515625,
"epoch": 1.38,
"grad_norm": 0.5191164016723633,
"kl": 0.36328125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0602,
"reward": 0.4467791821807623,
"reward_std": 0.6689166128635406,
"rewards/cosine_scaled_reward": -0.07125327130779624,
"rewards/format_reward": 0.5892857238650322,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 2138.119110107422,
"epoch": 1.384,
"grad_norm": 0.40859875082969666,
"kl": 0.344970703125,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0894,
"reward": 0.7263324186205864,
"reward_std": 0.7082626074552536,
"rewards/cosine_scaled_reward": -0.029690947383642197,
"rewards/format_reward": 0.7857143133878708,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 2158.6429138183594,
"epoch": 1.388,
"grad_norm": 0.35558465123176575,
"kl": 0.29638671875,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0262,
"reward": 0.6269577667117119,
"reward_std": 0.5908889323472977,
"rewards/cosine_scaled_reward": -0.04664018237963319,
"rewards/format_reward": 0.7202381119132042,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 2144.619110107422,
"epoch": 1.392,
"grad_norm": 1.211071491241455,
"kl": 0.306640625,
"learning_rate": 3.3046315338757026e-07,
"loss": -0.0105,
"reward": 0.6653935462236404,
"reward_std": 0.6245283707976341,
"rewards/cosine_scaled_reward": -0.04230323247611523,
"rewards/format_reward": 0.75,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 2366.4940795898438,
"epoch": 1.396,
"grad_norm": 0.5814414620399475,
"kl": 0.33154296875,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0485,
"reward": 0.5602632537484169,
"reward_std": 0.5761818215250969,
"rewards/cosine_scaled_reward": -0.0978445541113615,
"rewards/format_reward": 0.755952388048172,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2348.6607666015625,
"epoch": 1.4,
"grad_norm": 0.675369918346405,
"kl": 0.29931640625,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0825,
"reward": 0.475093599408865,
"reward_std": 0.604865163564682,
"rewards/cosine_scaled_reward": -0.07792939431965351,
"rewards/format_reward": 0.6309523731470108,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 2099.279815673828,
"epoch": 1.404,
"grad_norm": 0.5227596163749695,
"kl": 0.33447265625,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0454,
"reward": 0.6502892896533012,
"reward_std": 0.676431730389595,
"rewards/cosine_scaled_reward": -0.05878393305465579,
"rewards/format_reward": 0.7678571492433548,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2465.202392578125,
"epoch": 1.408,
"grad_norm": 0.4936739206314087,
"kl": 0.33154296875,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0349,
"reward": 0.51472207903862,
"reward_std": 0.6474315822124481,
"rewards/cosine_scaled_reward": -0.05216278973966837,
"rewards/format_reward": 0.6190476417541504,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2241.089324951172,
"epoch": 1.412,
"grad_norm": 0.4653976857662201,
"kl": 0.3046875,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0576,
"reward": 0.7246856689453125,
"reward_std": 0.7023278325796127,
"rewards/cosine_scaled_reward": -0.02456192229874432,
"rewards/format_reward": 0.7738095223903656,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 2174.4107666015625,
"epoch": 1.416,
"grad_norm": 1.179158091545105,
"kl": 0.31982421875,
"learning_rate": 3.142063423134644e-07,
"loss": 0.1321,
"reward": 0.4120100736618042,
"reward_std": 0.5803252756595612,
"rewards/cosine_scaled_reward": -0.19280448742210865,
"rewards/format_reward": 0.7976190596818924,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2560.9405517578125,
"epoch": 1.42,
"grad_norm": 0.6409890651702881,
"kl": 0.3291015625,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0637,
"reward": 0.6557277590036392,
"reward_std": 0.8805683702230453,
"rewards/cosine_scaled_reward": -0.02332661801483482,
"rewards/format_reward": 0.70238097012043,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 1872.1012573242188,
"epoch": 1.424,
"grad_norm": 0.4570577144622803,
"kl": 0.244873046875,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0537,
"reward": 0.8301898017525673,
"reward_std": 0.6987727582454681,
"rewards/cosine_scaled_reward": -0.016452712705358863,
"rewards/format_reward": 0.8630952537059784,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 2199.4880981445312,
"epoch": 1.428,
"grad_norm": 0.5453688502311707,
"kl": 0.3388671875,
"learning_rate": 3.062313053727671e-07,
"loss": 0.1004,
"reward": 0.5429714322090149,
"reward_std": 0.757801964879036,
"rewards/cosine_scaled_reward": -0.11244285944849253,
"rewards/format_reward": 0.7678571492433548,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2392.5536193847656,
"epoch": 1.432,
"grad_norm": 0.4179025888442993,
"kl": 0.36767578125,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.057,
"reward": 0.6754717975854874,
"reward_std": 0.8176562935113907,
"rewards/cosine_scaled_reward": -0.004526023752987385,
"rewards/format_reward": 0.6845238357782364,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2280.5000915527344,
"epoch": 1.436,
"grad_norm": 0.5272053480148315,
"kl": 0.273681640625,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0565,
"reward": 0.644446611404419,
"reward_std": 0.7567472010850906,
"rewards/cosine_scaled_reward": -0.02896718680858612,
"rewards/format_reward": 0.70238097012043,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 2270.3928833007812,
"epoch": 1.44,
"grad_norm": 0.8152810335159302,
"kl": 0.34619140625,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0306,
"reward": 0.7786325067281723,
"reward_std": 0.559767447412014,
"rewards/cosine_scaled_reward": -0.012469482608139515,
"rewards/format_reward": 0.8035714477300644,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 2094.2083740234375,
"epoch": 1.444,
"grad_norm": 0.9731494188308716,
"kl": 0.33203125,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0315,
"reward": 0.7239094823598862,
"reward_std": 0.6780030280351639,
"rewards/cosine_scaled_reward": -0.057688117027282715,
"rewards/format_reward": 0.839285746216774,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 2515.3809814453125,
"epoch": 1.448,
"grad_norm": 0.5006127953529358,
"kl": 0.3583984375,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0632,
"reward": 0.5585716450586915,
"reward_std": 0.6955743506550789,
"rewards/cosine_scaled_reward": -0.08976180851459503,
"rewards/format_reward": 0.7380952462553978,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 2665.2560424804688,
"epoch": 1.452,
"grad_norm": 0.4868517220020294,
"kl": 0.373046875,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0555,
"reward": 0.5607914663851261,
"reward_std": 0.6483574956655502,
"rewards/cosine_scaled_reward": -0.07377092959359288,
"rewards/format_reward": 0.7083333432674408,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 2244.7262573242188,
"epoch": 1.456,
"grad_norm": 0.6844132542610168,
"kl": 0.3173828125,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0109,
"reward": 0.7073798812925816,
"reward_std": 0.6621369272470474,
"rewards/cosine_scaled_reward": -0.01833386719226837,
"rewards/format_reward": 0.7440476417541504,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 2576.1905517578125,
"epoch": 1.46,
"grad_norm": 0.5755227208137512,
"kl": 0.35400390625,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0531,
"reward": 0.6706622801721096,
"reward_std": 0.8000525310635567,
"rewards/cosine_scaled_reward": -0.03371649980545044,
"rewards/format_reward": 0.7380952537059784,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 2664.3928833007812,
"epoch": 1.464,
"grad_norm": 0.6695978045463562,
"kl": 0.4052734375,
"learning_rate": 2.829615010283344e-07,
"loss": 0.1001,
"reward": 0.6332942470908165,
"reward_std": 0.9363250732421875,
"rewards/cosine_scaled_reward": -0.04049574676901102,
"rewards/format_reward": 0.7142857313156128,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2493.2857666015625,
"epoch": 1.468,
"grad_norm": 0.41825661063194275,
"kl": 0.269775390625,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0634,
"reward": 0.6000736728310585,
"reward_std": 0.6958686709403992,
"rewards/cosine_scaled_reward": -0.04520127363502979,
"rewards/format_reward": 0.690476194024086,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 2441.184600830078,
"epoch": 1.472,
"grad_norm": 0.6742368936538696,
"kl": 0.29248046875,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0205,
"reward": 0.7077510952949524,
"reward_std": 0.8173489719629288,
"rewards/cosine_scaled_reward": -0.003267320804297924,
"rewards/format_reward": 0.7142857313156128,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 2645.202392578125,
"epoch": 1.476,
"grad_norm": 0.6914957761764526,
"kl": 0.298095703125,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0967,
"reward": 0.2303389220032841,
"reward_std": 0.6355866640806198,
"rewards/cosine_scaled_reward": -0.1616162583231926,
"rewards/format_reward": 0.5535714328289032,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 2256.9286193847656,
"epoch": 1.48,
"grad_norm": 0.9714637994766235,
"kl": 0.255126953125,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0866,
"reward": 0.7436040937900543,
"reward_std": 0.6377575844526291,
"rewards/cosine_scaled_reward": -0.012126525864005089,
"rewards/format_reward": 0.767857164144516,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 2519.7202758789062,
"epoch": 1.484,
"grad_norm": 0.6541756987571716,
"kl": 0.32470703125,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0731,
"reward": 0.8480066582560539,
"reward_std": 0.7711106240749359,
"rewards/cosine_scaled_reward": 0.031146179419010878,
"rewards/format_reward": 0.7857142984867096,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 2420.970245361328,
"epoch": 1.488,
"grad_norm": 0.5346278548240662,
"kl": 0.2998046875,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0556,
"reward": 0.6287192776799202,
"reward_std": 0.6931318640708923,
"rewards/cosine_scaled_reward": -0.03683085576631129,
"rewards/format_reward": 0.7023809552192688,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2542.5654907226562,
"epoch": 1.492,
"grad_norm": 0.43199771642684937,
"kl": 0.33544921875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0657,
"reward": 0.4730634540319443,
"reward_std": 0.5836888402700424,
"rewards/cosine_scaled_reward": -0.1533492412418127,
"rewards/format_reward": 0.7797619104385376,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 2848.3572387695312,
"epoch": 1.496,
"grad_norm": 0.6630088686943054,
"kl": 0.35009765625,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0207,
"reward": 0.2956889607012272,
"reward_std": 0.614417277276516,
"rewards/cosine_scaled_reward": -0.1319174226373434,
"rewards/format_reward": 0.5595238283276558,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 2754.0060424804688,
"epoch": 1.5,
"grad_norm": 0.4504316449165344,
"kl": 0.302490234375,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0225,
"reward": 0.42709287256002426,
"reward_std": 0.6112170070409775,
"rewards/cosine_scaled_reward": -0.07514405064284801,
"rewards/format_reward": 0.5773809552192688,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 2403.52978515625,
"epoch": 1.504,
"grad_norm": 0.4335888624191284,
"kl": 0.266845703125,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0347,
"reward": 0.37878482323139906,
"reward_std": 0.5512942001223564,
"rewards/cosine_scaled_reward": -0.14691711403429508,
"rewards/format_reward": 0.6726190745830536,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2539.71435546875,
"epoch": 1.508,
"grad_norm": 0.6600142121315002,
"kl": 0.35546875,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0879,
"reward": 0.6289402991533279,
"reward_std": 0.7740087658166885,
"rewards/cosine_scaled_reward": -0.04564890172332525,
"rewards/format_reward": 0.7202381044626236,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 2566.4107666015625,
"epoch": 1.512,
"grad_norm": 0.5574218034744263,
"kl": 0.310791015625,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0794,
"reward": 0.5734596885740757,
"reward_std": 0.6776000708341599,
"rewards/cosine_scaled_reward": -0.058508249232545495,
"rewards/format_reward": 0.6904762089252472,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 2749.4642944335938,
"epoch": 1.516,
"grad_norm": 0.4314301908016205,
"kl": 0.330078125,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0655,
"reward": 0.42858002707362175,
"reward_std": 0.7303398549556732,
"rewards/cosine_scaled_reward": -0.10416238568723202,
"rewards/format_reward": 0.6369047611951828,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 2403.684539794922,
"epoch": 1.52,
"grad_norm": 0.42673397064208984,
"kl": 0.299560546875,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0799,
"reward": 0.7241241782903671,
"reward_std": 0.7478837221860886,
"rewards/cosine_scaled_reward": -0.006985542830079794,
"rewards/format_reward": 0.7380952388048172,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 2391.5179443359375,
"epoch": 1.524,
"grad_norm": 0.8130372762680054,
"kl": 0.3203125,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0286,
"reward": 0.49668359011411667,
"reward_std": 0.6931805461645126,
"rewards/cosine_scaled_reward": -0.12665820121765137,
"rewards/format_reward": 0.7500000149011612,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 2430.5654907226562,
"epoch": 1.528,
"grad_norm": 0.4374740719795227,
"kl": 0.310302734375,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0341,
"reward": 0.6685771271586418,
"reward_std": 0.8352404981851578,
"rewards/cosine_scaled_reward": -0.016901913098990917,
"rewards/format_reward": 0.7023809552192688,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2296.1607666015625,
"epoch": 1.532,
"grad_norm": 0.5494891405105591,
"kl": 0.30224609375,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0859,
"reward": 0.7063075229525566,
"reward_std": 0.7431895136833191,
"rewards/cosine_scaled_reward": -0.009941489901393652,
"rewards/format_reward": 0.7261904925107956,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 2517.827392578125,
"epoch": 1.536,
"grad_norm": 0.5410645604133606,
"kl": 0.33203125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0459,
"reward": 0.42114658281207085,
"reward_std": 0.6721706539392471,
"rewards/cosine_scaled_reward": -0.12573623820208013,
"rewards/format_reward": 0.6726190596818924,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2484.96435546875,
"epoch": 1.54,
"grad_norm": 0.4815540313720703,
"kl": 0.3095703125,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0759,
"reward": 0.7441006675362587,
"reward_std": 0.8601991981267929,
"rewards/cosine_scaled_reward": 0.029788417392410338,
"rewards/format_reward": 0.6845238208770752,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 2389.9703063964844,
"epoch": 1.544,
"grad_norm": 0.6783538460731506,
"kl": 0.2802734375,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0716,
"reward": 0.479885321110487,
"reward_std": 0.7240753322839737,
"rewards/cosine_scaled_reward": -0.06958115100860596,
"rewards/format_reward": 0.6190476417541504,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 2359.964324951172,
"epoch": 1.548,
"grad_norm": 0.8481286764144897,
"kl": 0.296630859375,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0148,
"reward": 0.5802747337147593,
"reward_std": 0.5601852983236313,
"rewards/cosine_scaled_reward": -0.04617217415943742,
"rewards/format_reward": 0.672619067132473,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 2159.5655212402344,
"epoch": 1.552,
"grad_norm": 0.5140231251716614,
"kl": 0.302001953125,
"learning_rate": 2.306931685585657e-07,
"loss": 0.063,
"reward": 0.5727366209030151,
"reward_std": 0.6229267343878746,
"rewards/cosine_scaled_reward": -0.11244121752679348,
"rewards/format_reward": 0.7976190745830536,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 2253.7559814453125,
"epoch": 1.556,
"grad_norm": 0.4566425681114197,
"kl": 0.292724609375,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0398,
"reward": 0.6296885460615158,
"reward_std": 0.7193648666143417,
"rewards/cosine_scaled_reward": -0.04825095273554325,
"rewards/format_reward": 0.7261904925107956,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 2653.6250610351562,
"epoch": 1.56,
"grad_norm": 0.6326945424079895,
"kl": 0.38818359375,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.1043,
"reward": 0.531873881816864,
"reward_std": 0.7026529461145401,
"rewards/cosine_scaled_reward": -0.0822773426771164,
"rewards/format_reward": 0.696428582072258,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 2613.7261962890625,
"epoch": 1.564,
"grad_norm": 0.398603618144989,
"kl": 0.305908203125,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0464,
"reward": 0.2553718090057373,
"reward_std": 0.6311058104038239,
"rewards/cosine_scaled_reward": -0.1699331346899271,
"rewards/format_reward": 0.595238097012043,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 2196.5833435058594,
"epoch": 1.568,
"grad_norm": 1.320838451385498,
"kl": 0.2607421875,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.1165,
"reward": 0.6960010007023811,
"reward_std": 0.8236257880926132,
"rewards/cosine_scaled_reward": -0.044856662629172206,
"rewards/format_reward": 0.7857143133878708,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 2663.9285888671875,
"epoch": 1.572,
"grad_norm": 0.5183250904083252,
"kl": 0.328857421875,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0708,
"reward": 0.34957781434059143,
"reward_std": 0.6104390919208527,
"rewards/cosine_scaled_reward": -0.12878252286463976,
"rewards/format_reward": 0.6071428805589676,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 2630.2560424804688,
"epoch": 1.576,
"grad_norm": 0.2792785167694092,
"kl": 0.34619140625,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0772,
"reward": 0.45473287999629974,
"reward_std": 0.7525355666875839,
"rewards/cosine_scaled_reward": -0.0940621355548501,
"rewards/format_reward": 0.6428571492433548,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 2484.2679138183594,
"epoch": 1.58,
"grad_norm": 0.47966378927230835,
"kl": 0.3330078125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0783,
"reward": 0.5024484526365995,
"reward_std": 0.6865183711051941,
"rewards/cosine_scaled_reward": -0.07615673809777945,
"rewards/format_reward": 0.6547619104385376,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 2481.96435546875,
"epoch": 1.584,
"grad_norm": 0.4925695061683655,
"kl": 0.34228515625,
"learning_rate": 2.134908592756607e-07,
"loss": 0.1001,
"reward": 0.6872217282652855,
"reward_std": 0.7295544147491455,
"rewards/cosine_scaled_reward": -0.037341527407988906,
"rewards/format_reward": 0.761904776096344,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 2154.494110107422,
"epoch": 1.588,
"grad_norm": 0.635874330997467,
"kl": 0.281494140625,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0341,
"reward": 0.6967096533626318,
"reward_std": 0.7243114337325096,
"rewards/cosine_scaled_reward": -0.04450232535600662,
"rewards/format_reward": 0.7857143059372902,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 2704.7322387695312,
"epoch": 1.592,
"grad_norm": 0.36841636896133423,
"kl": 0.390625,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0683,
"reward": 0.35482142120599747,
"reward_std": 0.6981697529554367,
"rewards/cosine_scaled_reward": -0.1410416765138507,
"rewards/format_reward": 0.6369047611951828,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 2155.1905212402344,
"epoch": 1.596,
"grad_norm": 0.6153831481933594,
"kl": 0.2763671875,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.075,
"reward": 0.7129835858941078,
"reward_std": 0.7049887701869011,
"rewards/cosine_scaled_reward": -0.04827013239264488,
"rewards/format_reward": 0.8095238208770752,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 2736.9286499023438,
"epoch": 1.6,
"grad_norm": 0.4656315743923187,
"kl": 0.38330078125,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0486,
"reward": 0.3270074762403965,
"reward_std": 0.6684512719511986,
"rewards/cosine_scaled_reward": -0.17578197922557592,
"rewards/format_reward": 0.6785714477300644,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 2684.994140625,
"epoch": 1.604,
"grad_norm": 0.4505021274089813,
"kl": 0.34423828125,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0772,
"reward": 0.5096228048205376,
"reward_std": 0.7098966240882874,
"rewards/cosine_scaled_reward": -0.0814981039147824,
"rewards/format_reward": 0.6726190522313118,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2508.327423095703,
"epoch": 1.608,
"grad_norm": 0.3696132302284241,
"kl": 0.34033203125,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0554,
"reward": 0.6811544820666313,
"reward_std": 0.7352585643529892,
"rewards/cosine_scaled_reward": -0.03442276082932949,
"rewards/format_reward": 0.7500000149011612,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2329.654815673828,
"epoch": 1.612,
"grad_norm": 0.5500597953796387,
"kl": 0.310302734375,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0912,
"reward": 0.5886539276689291,
"reward_std": 0.6953590214252472,
"rewards/cosine_scaled_reward": -0.05091113201342523,
"rewards/format_reward": 0.6904762089252472,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 2279.5238342285156,
"epoch": 1.616,
"grad_norm": 0.825627326965332,
"kl": 0.314208984375,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0303,
"reward": 0.4903724156320095,
"reward_std": 0.6759866625070572,
"rewards/cosine_scaled_reward": -0.1268376000225544,
"rewards/format_reward": 0.7440476417541504,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2298.607208251953,
"epoch": 1.62,
"grad_norm": 0.7521853446960449,
"kl": 0.300048828125,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0742,
"reward": 0.6187992710620165,
"reward_std": 0.6595779061317444,
"rewards/cosine_scaled_reward": -0.050719428109005094,
"rewards/format_reward": 0.7202381044626236,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 2462.0416870117188,
"epoch": 1.624,
"grad_norm": 0.4565219581127167,
"kl": 0.36181640625,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0655,
"reward": 0.7676936537027359,
"reward_std": 0.8463387489318848,
"rewards/cosine_scaled_reward": 0.032656354829669,
"rewards/format_reward": 0.7023809552192688,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 2593.9285888671875,
"epoch": 1.6280000000000001,
"grad_norm": 0.7805240154266357,
"kl": 0.37109375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0336,
"reward": 0.4898635447025299,
"reward_std": 0.6100385710597038,
"rewards/cosine_scaled_reward": -0.0943539384752512,
"rewards/format_reward": 0.6785714477300644,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2179.8810119628906,
"epoch": 1.6320000000000001,
"grad_norm": 0.7494162321090698,
"kl": 0.3154296875,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0819,
"reward": 0.5508405864238739,
"reward_std": 0.6755202859640121,
"rewards/cosine_scaled_reward": -0.12934163073077798,
"rewards/format_reward": 0.8095238357782364,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 2213.1131591796875,
"epoch": 1.6360000000000001,
"grad_norm": 0.7274454832077026,
"kl": 0.3466796875,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0553,
"reward": 0.6235681027173996,
"reward_std": 0.6233258098363876,
"rewards/cosine_scaled_reward": -0.10488261096179485,
"rewards/format_reward": 0.833333358168602,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 2276.5774536132812,
"epoch": 1.6400000000000001,
"grad_norm": 1.2181180715560913,
"kl": 0.357421875,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.1457,
"reward": 0.6739452332258224,
"reward_std": 0.7620265781879425,
"rewards/cosine_scaled_reward": -0.0350511996075511,
"rewards/format_reward": 0.7440476268529892,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 2513.0833740234375,
"epoch": 1.6440000000000001,
"grad_norm": 0.3816966414451599,
"kl": 0.37060546875,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0631,
"reward": 0.503595694899559,
"reward_std": 0.6215758174657822,
"rewards/cosine_scaled_reward": -0.08748787135118619,
"rewards/format_reward": 0.6785714328289032,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 2348.7916870117188,
"epoch": 1.6480000000000001,
"grad_norm": 0.647936224937439,
"kl": 0.3623046875,
"learning_rate": 1.822847957491922e-07,
"loss": 0.1147,
"reward": 0.7675136551260948,
"reward_std": 0.7814928591251373,
"rewards/cosine_scaled_reward": 0.020661589689552784,
"rewards/format_reward": 0.7261904925107956,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 2329.9464721679688,
"epoch": 1.6520000000000001,
"grad_norm": 0.7966573238372803,
"kl": 0.3505859375,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0507,
"reward": 0.7220299392938614,
"reward_std": 0.7220810800790787,
"rewards/cosine_scaled_reward": -0.01993740734178573,
"rewards/format_reward": 0.761904776096344,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2510.83935546875,
"epoch": 1.6560000000000001,
"grad_norm": 0.3402910828590393,
"kl": 0.36767578125,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0717,
"reward": 0.6566540375351906,
"reward_std": 0.7324352562427521,
"rewards/cosine_scaled_reward": -0.016911087092012167,
"rewards/format_reward": 0.6904762089252472,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2362.952423095703,
"epoch": 1.6600000000000001,
"grad_norm": 0.5068221688270569,
"kl": 0.41357421875,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0817,
"reward": 0.5704538598656654,
"reward_std": 0.83931764960289,
"rewards/cosine_scaled_reward": -0.08977308124303818,
"rewards/format_reward": 0.7500000298023224,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 2197.886962890625,
"epoch": 1.6640000000000001,
"grad_norm": 0.5192682147026062,
"kl": 0.349609375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0456,
"reward": 0.7760029062628746,
"reward_std": 0.7372387051582336,
"rewards/cosine_scaled_reward": -0.0048556849360466,
"rewards/format_reward": 0.7857142984867096,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2365.0655517578125,
"epoch": 1.6680000000000001,
"grad_norm": 0.7471702098846436,
"kl": 0.357421875,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0258,
"reward": 0.6658978462219238,
"reward_std": 0.7144315093755722,
"rewards/cosine_scaled_reward": -0.02419395267497748,
"rewards/format_reward": 0.7142857313156128,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2210.7500610351562,
"epoch": 1.6720000000000002,
"grad_norm": 0.4305538833141327,
"kl": 0.37060546875,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0715,
"reward": 0.6991885676980019,
"reward_std": 0.6301053613424301,
"rewards/cosine_scaled_reward": -0.03433429542928934,
"rewards/format_reward": 0.7678571492433548,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 2631.9048461914062,
"epoch": 1.6760000000000002,
"grad_norm": 0.31225350499153137,
"kl": 0.35888671875,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0697,
"reward": 0.6943976636976004,
"reward_std": 0.7306639850139618,
"rewards/cosine_scaled_reward": 0.02874644659459591,
"rewards/format_reward": 0.636904776096344,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 2425.6607055664062,
"epoch": 1.6800000000000002,
"grad_norm": 0.6987324953079224,
"kl": 0.38330078125,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0585,
"reward": 0.7563354596495628,
"reward_std": 0.7114580571651459,
"rewards/cosine_scaled_reward": -0.038498950423672795,
"rewards/format_reward": 0.8333333432674408,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 2277.4464721679688,
"epoch": 1.6840000000000002,
"grad_norm": 0.34894487261772156,
"kl": 0.373046875,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0935,
"reward": 0.6365625336766243,
"reward_std": 0.7153737097978592,
"rewards/cosine_scaled_reward": -0.05969492206349969,
"rewards/format_reward": 0.7559524029493332,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 2504.916748046875,
"epoch": 1.688,
"grad_norm": 0.6229146718978882,
"kl": 0.35498046875,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0181,
"reward": 0.5784893482923508,
"reward_std": 0.6405449658632278,
"rewards/cosine_scaled_reward": -0.06194583047181368,
"rewards/format_reward": 0.7023809552192688,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2308.52978515625,
"epoch": 1.692,
"grad_norm": 0.4013311266899109,
"kl": 0.328125,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0574,
"reward": 0.6359338611364365,
"reward_std": 0.7808969020843506,
"rewards/cosine_scaled_reward": -0.03917593788355589,
"rewards/format_reward": 0.714285746216774,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 2253.6785888671875,
"epoch": 1.696,
"grad_norm": 0.7038490176200867,
"kl": 0.288330078125,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0231,
"reward": 0.6301399618387222,
"reward_std": 0.7140125781297684,
"rewards/cosine_scaled_reward": -0.07481098547577858,
"rewards/format_reward": 0.7797619104385376,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 2256.464385986328,
"epoch": 1.7,
"grad_norm": 0.4849310517311096,
"kl": 0.34228515625,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0754,
"reward": 0.6902762800455093,
"reward_std": 0.7732263505458832,
"rewards/cosine_scaled_reward": -0.029861881979741156,
"rewards/format_reward": 0.7500000149011612,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 2638.3750610351562,
"epoch": 1.704,
"grad_norm": 0.4661174416542053,
"kl": 0.34716796875,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0631,
"reward": 0.4628839958459139,
"reward_std": 0.6522120535373688,
"rewards/cosine_scaled_reward": -0.06915326602756977,
"rewards/format_reward": 0.601190485060215,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 2258.279815673828,
"epoch": 1.708,
"grad_norm": 0.5582512021064758,
"kl": 0.325439453125,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0416,
"reward": 0.6841344758868217,
"reward_std": 0.716413825750351,
"rewards/cosine_scaled_reward": -0.04483753815293312,
"rewards/format_reward": 0.7738095298409462,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 2152.732208251953,
"epoch": 1.712,
"grad_norm": 0.362613320350647,
"kl": 0.35546875,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0663,
"reward": 0.8665853589773178,
"reward_std": 0.746289573609829,
"rewards/cosine_scaled_reward": 0.013649825006723404,
"rewards/format_reward": 0.8392857164144516,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 2589.8690795898438,
"epoch": 1.716,
"grad_norm": 0.392411470413208,
"kl": 0.35693359375,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0781,
"reward": 0.6071142517030239,
"reward_std": 0.7519797533750534,
"rewards/cosine_scaled_reward": -0.044657152146101,
"rewards/format_reward": 0.696428582072258,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 2220.1012573242188,
"epoch": 1.72,
"grad_norm": 0.5445396900177002,
"kl": 0.341796875,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0534,
"reward": 0.701055221259594,
"reward_std": 0.6740739792585373,
"rewards/cosine_scaled_reward": -0.04232952371239662,
"rewards/format_reward": 0.7857142984867096,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 1902.5595397949219,
"epoch": 1.724,
"grad_norm": 0.3450181186199188,
"kl": 0.2607421875,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0472,
"reward": 1.0963951796293259,
"reward_std": 0.746511772274971,
"rewards/cosine_scaled_reward": 0.11962614580988884,
"rewards/format_reward": 0.8571428805589676,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 2478.732177734375,
"epoch": 1.728,
"grad_norm": 0.48547589778900146,
"kl": 0.373046875,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0409,
"reward": 0.5677317231893539,
"reward_std": 0.6051659360527992,
"rewards/cosine_scaled_reward": -0.09113414993043989,
"rewards/format_reward": 0.7500000149011612,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2518.7560424804688,
"epoch": 1.732,
"grad_norm": 0.9139849543571472,
"kl": 0.3701171875,
"learning_rate": 1.483363816965435e-07,
"loss": 0.01,
"reward": 0.6564267948269844,
"reward_std": 0.6321954727172852,
"rewards/cosine_scaled_reward": -0.04083424177952111,
"rewards/format_reward": 0.7380952388048172,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 2439.119110107422,
"epoch": 1.736,
"grad_norm": 0.4629580080509186,
"kl": 0.3349609375,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0828,
"reward": 0.6547855883836746,
"reward_std": 0.6274382770061493,
"rewards/cosine_scaled_reward": -0.05653578881174326,
"rewards/format_reward": 0.767857164144516,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2385.047637939453,
"epoch": 1.74,
"grad_norm": 0.4398196041584015,
"kl": 0.342529296875,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.034,
"reward": 0.59782674908638,
"reward_std": 0.7165493220090866,
"rewards/cosine_scaled_reward": -0.09394377004355192,
"rewards/format_reward": 0.7857142984867096,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 2235.4405212402344,
"epoch": 1.744,
"grad_norm": 0.5894522070884705,
"kl": 0.321044921875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0578,
"reward": 0.7302387952804565,
"reward_std": 0.7528126537799835,
"rewards/cosine_scaled_reward": -0.018809196539223194,
"rewards/format_reward": 0.7678571492433548,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 2364.2083740234375,
"epoch": 1.748,
"grad_norm": 0.517291784286499,
"kl": 0.262451171875,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0626,
"reward": 0.5654323399066925,
"reward_std": 0.6579017788171768,
"rewards/cosine_scaled_reward": -0.03276003524661064,
"rewards/format_reward": 0.630952388048172,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 2464.0238647460938,
"epoch": 1.752,
"grad_norm": 0.719810426235199,
"kl": 0.34619140625,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0289,
"reward": 0.5253645405173302,
"reward_std": 0.593732014298439,
"rewards/cosine_scaled_reward": -0.10934155760332942,
"rewards/format_reward": 0.7440476268529892,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 2688.2559814453125,
"epoch": 1.756,
"grad_norm": 0.47081246972084045,
"kl": 0.320068359375,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0737,
"reward": 0.5551765933632851,
"reward_std": 0.750535324215889,
"rewards/cosine_scaled_reward": -0.043840276543051004,
"rewards/format_reward": 0.6428571492433548,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 2691.5298461914062,
"epoch": 1.76,
"grad_norm": 0.3561669588088989,
"kl": 0.30517578125,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0412,
"reward": 0.6305891573429108,
"reward_std": 0.7718498408794403,
"rewards/cosine_scaled_reward": -0.02399112842977047,
"rewards/format_reward": 0.6785714477300644,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 2423.3274536132812,
"epoch": 1.764,
"grad_norm": 0.8448560237884521,
"kl": 0.34130859375,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0974,
"reward": 0.6258707121014595,
"reward_std": 0.7022215574979782,
"rewards/cosine_scaled_reward": -0.06504084914922714,
"rewards/format_reward": 0.7559524029493332,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2509.6845703125,
"epoch": 1.768,
"grad_norm": 0.49845507740974426,
"kl": 0.3271484375,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0368,
"reward": 0.6952026858925819,
"reward_std": 0.7732700109481812,
"rewards/cosine_scaled_reward": -0.03037486458197236,
"rewards/format_reward": 0.7559524029493332,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 2251.6370239257812,
"epoch": 1.772,
"grad_norm": 1.6570687294006348,
"kl": 0.32861328125,
"learning_rate": 1.351615817851748e-07,
"loss": 0.1251,
"reward": 0.5299716778099537,
"reward_std": 0.6262076199054718,
"rewards/cosine_scaled_reward": -0.10108558752108365,
"rewards/format_reward": 0.7321428507566452,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 2569.8392944335938,
"epoch": 1.776,
"grad_norm": 0.5071792602539062,
"kl": 0.285400390625,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0461,
"reward": 0.3558937795460224,
"reward_std": 0.6386721879243851,
"rewards/cosine_scaled_reward": -0.12860072287730873,
"rewards/format_reward": 0.6130952537059784,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 2434.52978515625,
"epoch": 1.78,
"grad_norm": 0.6472364068031311,
"kl": 0.33935546875,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0324,
"reward": 0.5342502817511559,
"reward_std": 0.7027776390314102,
"rewards/cosine_scaled_reward": -0.08108916692435741,
"rewards/format_reward": 0.696428582072258,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 2600.7678833007812,
"epoch": 1.784,
"grad_norm": 0.4613596796989441,
"kl": 0.36767578125,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0366,
"reward": 0.3209609054028988,
"reward_std": 0.6079899072647095,
"rewards/cosine_scaled_reward": -0.15499573945999146,
"rewards/format_reward": 0.630952388048172,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2406.6131286621094,
"epoch": 1.788,
"grad_norm": 0.5609318017959595,
"kl": 0.3349609375,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.1069,
"reward": 0.8279012702405453,
"reward_std": 0.6648521721363068,
"rewards/cosine_scaled_reward": 0.035974426195025444,
"rewards/format_reward": 0.755952388048172,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 2292.5238647460938,
"epoch": 1.792,
"grad_norm": 0.6498924493789673,
"kl": 0.3251953125,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.1002,
"reward": 0.9468577206134796,
"reward_std": 0.8284895867109299,
"rewards/cosine_scaled_reward": 0.1073574130423367,
"rewards/format_reward": 0.7321428656578064,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 2359.7203063964844,
"epoch": 1.796,
"grad_norm": 0.8151586651802063,
"kl": 0.29248046875,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.1025,
"reward": 0.6154885776340961,
"reward_std": 0.643234595656395,
"rewards/cosine_scaled_reward": -0.028565243119373918,
"rewards/format_reward": 0.6726190596818924,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 2456.5952758789062,
"epoch": 1.8,
"grad_norm": 0.6727307438850403,
"kl": 0.3369140625,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0518,
"reward": 0.5276128388941288,
"reward_std": 0.6850098147988319,
"rewards/cosine_scaled_reward": -0.05166977294720709,
"rewards/format_reward": 0.6309523805975914,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 2180.250030517578,
"epoch": 1.804,
"grad_norm": 0.4327715039253235,
"kl": 0.2841796875,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0396,
"reward": 0.9219238460063934,
"reward_std": 0.8085188716650009,
"rewards/cosine_scaled_reward": 0.029414291959255934,
"rewards/format_reward": 0.8630952537059784,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 2414.9584045410156,
"epoch": 1.808,
"grad_norm": 0.5890040993690491,
"kl": 0.33642578125,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0116,
"reward": 0.45377534069120884,
"reward_std": 0.6864534169435501,
"rewards/cosine_scaled_reward": -0.07668375968933105,
"rewards/format_reward": 0.607142873108387,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2325.2619018554688,
"epoch": 1.812,
"grad_norm": 0.8580995798110962,
"kl": 0.38330078125,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.1211,
"reward": 0.41875267028808594,
"reward_std": 0.5978472009301186,
"rewards/cosine_scaled_reward": -0.17455224692821503,
"rewards/format_reward": 0.7678571492433548,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 2524.8452758789062,
"epoch": 1.8159999999999998,
"grad_norm": 0.6263750195503235,
"kl": 0.369140625,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0599,
"reward": 0.5164637118577957,
"reward_std": 0.7428598999977112,
"rewards/cosine_scaled_reward": -0.0989109962247312,
"rewards/format_reward": 0.7142857313156128,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 2222.684600830078,
"epoch": 1.8199999999999998,
"grad_norm": 0.46227335929870605,
"kl": 0.257568359375,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0452,
"reward": 0.6773854792118073,
"reward_std": 0.6582589149475098,
"rewards/cosine_scaled_reward": -0.03333106730133295,
"rewards/format_reward": 0.744047611951828,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 2381.2916870117188,
"epoch": 1.8239999999999998,
"grad_norm": 1.395836591720581,
"kl": 0.346923828125,
"learning_rate": 1.2106419949317388e-07,
"loss": -0.0036,
"reward": 0.6790124624967575,
"reward_std": 0.7677509784698486,
"rewards/cosine_scaled_reward": -0.0354937631636858,
"rewards/format_reward": 0.7500000298023224,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2236.1369018554688,
"epoch": 1.8279999999999998,
"grad_norm": 0.363459974527359,
"kl": 0.359375,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0897,
"reward": 0.8419362753629684,
"reward_std": 0.8306869268417358,
"rewards/cosine_scaled_reward": 0.034063366474583745,
"rewards/format_reward": 0.7738095223903656,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 2312.5059814453125,
"epoch": 1.8319999999999999,
"grad_norm": 0.5052822232246399,
"kl": 0.35791015625,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0619,
"reward": 0.744497187435627,
"reward_std": 0.6325250118970871,
"rewards/cosine_scaled_reward": -0.02953713061287999,
"rewards/format_reward": 0.8035714477300644,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 2242.0357971191406,
"epoch": 1.8359999999999999,
"grad_norm": 0.4124799966812134,
"kl": 0.30615234375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0801,
"reward": 0.7117128595709801,
"reward_std": 0.7263730615377426,
"rewards/cosine_scaled_reward": -0.031048328906763345,
"rewards/format_reward": 0.773809552192688,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2795.3512573242188,
"epoch": 1.8399999999999999,
"grad_norm": 0.5879099369049072,
"kl": 0.341796875,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0299,
"reward": 0.41843298077583313,
"reward_std": 0.6329772919416428,
"rewards/cosine_scaled_reward": -0.11518826894462109,
"rewards/format_reward": 0.6488095223903656,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2640.6904907226562,
"epoch": 1.8439999999999999,
"grad_norm": 0.3294979929924011,
"kl": 0.38818359375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0662,
"reward": 0.6549192667007446,
"reward_std": 0.8022814393043518,
"rewards/cosine_scaled_reward": -0.02373085916042328,
"rewards/format_reward": 0.7023809552192688,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 2345.4285888671875,
"epoch": 1.8479999999999999,
"grad_norm": 0.3771924376487732,
"kl": 0.35595703125,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.105,
"reward": 0.6399585753679276,
"reward_std": 0.7968022599816322,
"rewards/cosine_scaled_reward": -0.08478261809796095,
"rewards/format_reward": 0.8095238208770752,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 2589.1666870117188,
"epoch": 1.8519999999999999,
"grad_norm": 0.3122951090335846,
"kl": 0.3798828125,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0565,
"reward": 0.43525535613298416,
"reward_std": 0.7020779103040695,
"rewards/cosine_scaled_reward": -0.09784852154552937,
"rewards/format_reward": 0.6309523954987526,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2481.261962890625,
"epoch": 1.8559999999999999,
"grad_norm": 0.5730969905853271,
"kl": 0.33984375,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0699,
"reward": 0.7782276198267937,
"reward_std": 0.6072199195623398,
"rewards/cosine_scaled_reward": 0.02304239757359028,
"rewards/format_reward": 0.7321428656578064,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 2644.619140625,
"epoch": 1.8599999999999999,
"grad_norm": 0.39649447798728943,
"kl": 0.373046875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0712,
"reward": 0.6695687249302864,
"reward_std": 0.8249562680721283,
"rewards/cosine_scaled_reward": 0.007403409108519554,
"rewards/format_reward": 0.6547619178891182,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2491.0119018554688,
"epoch": 1.8639999999999999,
"grad_norm": 0.4567016065120697,
"kl": 0.36376953125,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0702,
"reward": 0.46852924674749374,
"reward_std": 0.7603975385427475,
"rewards/cosine_scaled_reward": -0.1199020454660058,
"rewards/format_reward": 0.708333358168602,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 2532.6607666015625,
"epoch": 1.8679999999999999,
"grad_norm": 0.36717355251312256,
"kl": 0.330078125,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0491,
"reward": 0.5859625339508057,
"reward_std": 0.6559573635458946,
"rewards/cosine_scaled_reward": -0.028447304794099182,
"rewards/format_reward": 0.6428571492433548,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 2622.2559814453125,
"epoch": 1.8719999999999999,
"grad_norm": 0.4103451669216156,
"kl": 0.36279296875,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0606,
"reward": 0.6576881408691406,
"reward_std": 0.8001267910003662,
"rewards/cosine_scaled_reward": -0.007465461269021034,
"rewards/format_reward": 0.6726190596818924,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 2134.279815673828,
"epoch": 1.876,
"grad_norm": 0.5138155817985535,
"kl": 0.256103515625,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0759,
"reward": 0.6648233011364937,
"reward_std": 0.7618712484836578,
"rewards/cosine_scaled_reward": -0.06044549681246281,
"rewards/format_reward": 0.7857143133878708,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2160.202392578125,
"epoch": 1.88,
"grad_norm": 0.3472672700881958,
"kl": 0.30126953125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0461,
"reward": 0.7422109395265579,
"reward_std": 0.6609758958220482,
"rewards/cosine_scaled_reward": -0.012823125813156366,
"rewards/format_reward": 0.767857164144516,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 2532.3809814453125,
"epoch": 1.884,
"grad_norm": 0.2868484556674957,
"kl": 0.2763671875,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0493,
"reward": 0.48562416061758995,
"reward_std": 0.7373960316181183,
"rewards/cosine_scaled_reward": -0.08159269354655407,
"rewards/format_reward": 0.648809552192688,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 2409.3392944335938,
"epoch": 1.888,
"grad_norm": 0.4656960964202881,
"kl": 0.390380859375,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0561,
"reward": 0.6008469248190522,
"reward_std": 0.6282935440540314,
"rewards/cosine_scaled_reward": -0.05374322272837162,
"rewards/format_reward": 0.708333358168602,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2294.0178833007812,
"epoch": 1.892,
"grad_norm": 0.5274000763893127,
"kl": 0.3369140625,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0877,
"reward": 0.7744100838899612,
"reward_std": 0.7935537397861481,
"rewards/cosine_scaled_reward": 0.00030028633773326874,
"rewards/format_reward": 0.7738095372915268,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2676.6727294921875,
"epoch": 1.896,
"grad_norm": 0.5600417256355286,
"kl": 0.302978515625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0338,
"reward": 0.47137061692774296,
"reward_std": 0.7821067273616791,
"rewards/cosine_scaled_reward": -0.07681469712406397,
"rewards/format_reward": 0.6250000074505806,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 2442.0952758789062,
"epoch": 1.9,
"grad_norm": 0.5208225846290588,
"kl": 0.32470703125,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0837,
"reward": 0.38532300293445587,
"reward_std": 0.5505756810307503,
"rewards/cosine_scaled_reward": -0.1287670750170946,
"rewards/format_reward": 0.6428571492433548,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 2030.7857360839844,
"epoch": 1.904,
"grad_norm": 0.6633386611938477,
"kl": 0.269287109375,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0183,
"reward": 0.796258918941021,
"reward_std": 0.8103819191455841,
"rewards/cosine_scaled_reward": 0.017177060712128878,
"rewards/format_reward": 0.761904776096344,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 2734.2262573242188,
"epoch": 1.908,
"grad_norm": 0.5043067932128906,
"kl": 0.35791015625,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0378,
"reward": 0.2551159653812647,
"reward_std": 0.5920611470937729,
"rewards/cosine_scaled_reward": -0.155180131085217,
"rewards/format_reward": 0.5654762089252472,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 2301.0952758789062,
"epoch": 1.912,
"grad_norm": 0.7771977186203003,
"kl": 0.28662109375,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0228,
"reward": 0.6790298409759998,
"reward_std": 0.6661486774682999,
"rewards/cosine_scaled_reward": -0.06524700409499928,
"rewards/format_reward": 0.8095238357782364,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2247.6012573242188,
"epoch": 1.916,
"grad_norm": 0.599141001701355,
"kl": 0.3212890625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0857,
"reward": 0.8667033798992634,
"reward_std": 0.8036679923534393,
"rewards/cosine_scaled_reward": 0.06132788397371769,
"rewards/format_reward": 0.7440476268529892,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 2093.3333740234375,
"epoch": 1.92,
"grad_norm": 0.5312609076499939,
"kl": 0.2958984375,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0735,
"reward": 0.7348574697971344,
"reward_std": 0.689183309674263,
"rewards/cosine_scaled_reward": -0.034356983145698905,
"rewards/format_reward": 0.8035714328289032,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 2448.1726684570312,
"epoch": 1.924,
"grad_norm": 0.5402917861938477,
"kl": 0.36669921875,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0831,
"reward": 0.43995974212884903,
"reward_std": 0.6862698197364807,
"rewards/cosine_scaled_reward": -0.13121061958372593,
"rewards/format_reward": 0.70238097012043,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 2658.7977294921875,
"epoch": 1.928,
"grad_norm": 0.5909121632575989,
"kl": 0.3623046875,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0547,
"reward": 0.5179771184921265,
"reward_std": 0.7944772690534592,
"rewards/cosine_scaled_reward": -0.0981543204979971,
"rewards/format_reward": 0.7142857313156128,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 2292.1428833007812,
"epoch": 1.932,
"grad_norm": 0.549201488494873,
"kl": 0.30615234375,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0906,
"reward": 0.6546563804149628,
"reward_std": 0.7558221146464348,
"rewards/cosine_scaled_reward": -0.017909929156303406,
"rewards/format_reward": 0.690476194024086,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 2001.2381286621094,
"epoch": 1.936,
"grad_norm": 0.9190180897712708,
"kl": 0.2490234375,
"learning_rate": 1.0280443637773163e-07,
"loss": -0.0131,
"reward": 0.6368911117315292,
"reward_std": 0.5770624950528145,
"rewards/cosine_scaled_reward": -0.07441157009452581,
"rewards/format_reward": 0.7857142835855484,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 2369.482177734375,
"epoch": 1.94,
"grad_norm": 0.48303577303886414,
"kl": 0.32861328125,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0806,
"reward": 0.5949805751442909,
"reward_std": 0.7235869467258453,
"rewards/cosine_scaled_reward": -0.04179543023929,
"rewards/format_reward": 0.6785714328289032,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2607.375030517578,
"epoch": 1.944,
"grad_norm": 0.45922327041625977,
"kl": 0.334228515625,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0495,
"reward": 0.4739008713513613,
"reward_std": 0.7411531507968903,
"rewards/cosine_scaled_reward": -0.10828767996281385,
"rewards/format_reward": 0.690476194024086,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2427.3929443359375,
"epoch": 1.948,
"grad_norm": 0.42017099261283875,
"kl": 0.281494140625,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0322,
"reward": 0.3904539607465267,
"reward_std": 0.6817184686660767,
"rewards/cosine_scaled_reward": -0.13810635451227427,
"rewards/format_reward": 0.6666666716337204,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 2579.0120239257812,
"epoch": 1.952,
"grad_norm": 0.7048377394676208,
"kl": 0.322265625,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.028,
"reward": 0.5259524993598461,
"reward_std": 0.6612162664532661,
"rewards/cosine_scaled_reward": -0.09714281000196934,
"rewards/format_reward": 0.7202381193637848,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 2246.9524536132812,
"epoch": 1.956,
"grad_norm": 0.4814748167991638,
"kl": 0.313720703125,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0807,
"reward": 0.7873745709657669,
"reward_std": 0.7711023241281509,
"rewards/cosine_scaled_reward": -0.005122252739965916,
"rewards/format_reward": 0.7976190745830536,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2849.761962890625,
"epoch": 1.96,
"grad_norm": 0.5227950215339661,
"kl": 0.37890625,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0171,
"reward": 0.4944131616503,
"reward_std": 0.6706523001194,
"rewards/cosine_scaled_reward": -0.06826960667967796,
"rewards/format_reward": 0.6309524029493332,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 2594.3631591796875,
"epoch": 1.964,
"grad_norm": 0.5116230249404907,
"kl": 0.40576171875,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0511,
"reward": 0.40869739279150963,
"reward_std": 0.703234076499939,
"rewards/cosine_scaled_reward": -0.12600845471024513,
"rewards/format_reward": 0.6607142984867096,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 2416.8632202148438,
"epoch": 1.968,
"grad_norm": 0.9567095637321472,
"kl": 0.27783203125,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.1235,
"reward": 0.5404094010591507,
"reward_std": 0.6638472378253937,
"rewards/cosine_scaled_reward": -0.054200079292058945,
"rewards/format_reward": 0.6488095372915268,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2514.1666870117188,
"epoch": 1.972,
"grad_norm": 0.4846276044845581,
"kl": 0.28466796875,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0436,
"reward": 0.5605661012232304,
"reward_std": 0.6418938338756561,
"rewards/cosine_scaled_reward": -0.03519314527511597,
"rewards/format_reward": 0.630952388048172,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2289.71435546875,
"epoch": 1.976,
"grad_norm": 0.6296063661575317,
"kl": 0.3212890625,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0439,
"reward": 0.6024229377508163,
"reward_std": 0.7128957360982895,
"rewards/cosine_scaled_reward": -0.07081234554061666,
"rewards/format_reward": 0.744047611951828,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 2491.3334045410156,
"epoch": 1.98,
"grad_norm": 0.622008204460144,
"kl": 0.283447265625,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0361,
"reward": 0.594695046544075,
"reward_std": 0.6841184943914413,
"rewards/cosine_scaled_reward": -0.03896199120208621,
"rewards/format_reward": 0.6726190745830536,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 2395.636962890625,
"epoch": 1.984,
"grad_norm": 0.30918648838996887,
"kl": 0.29931640625,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0827,
"reward": 0.5910248765721917,
"reward_std": 0.6182541996240616,
"rewards/cosine_scaled_reward": -0.05567805375903845,
"rewards/format_reward": 0.70238097012043,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 2368.8631591796875,
"epoch": 1.988,
"grad_norm": 1.1213865280151367,
"kl": 0.3515625,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0439,
"reward": 0.48846913129091263,
"reward_std": 0.6297848075628281,
"rewards/cosine_scaled_reward": -0.13374162535183132,
"rewards/format_reward": 0.755952388048172,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 2289.3690795898438,
"epoch": 1.992,
"grad_norm": 0.810573399066925,
"kl": 0.302734375,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0228,
"reward": 0.6517780050635338,
"reward_std": 0.7580654174089432,
"rewards/cosine_scaled_reward": -0.046134804193570744,
"rewards/format_reward": 0.7440476417541504,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 2452.2738647460938,
"epoch": 1.996,
"grad_norm": 0.357543408870697,
"kl": 0.32763671875,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0643,
"reward": 0.3793360572308302,
"reward_std": 0.7790006846189499,
"rewards/cosine_scaled_reward": -0.15557007491588593,
"rewards/format_reward": 0.6904762089252472,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 2528.7500610351562,
"epoch": 2.0,
"grad_norm": 0.9307948350906372,
"kl": 0.306640625,
"learning_rate": 1e-07,
"loss": 0.1364,
"reward": 0.5999207645654678,
"reward_std": 0.6981495916843414,
"rewards/cosine_scaled_reward": -0.03634915268048644,
"rewards/format_reward": 0.6726190596818924,
"step": 500
},
{
"epoch": 2.0,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.0725239302306436,
"train_runtime": 62033.0192,
"train_samples_per_second": 1.354,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}