{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 2822.7857666015625, "epoch": 0.004, "grad_norm": 0.12564538419246674, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0645, "reward": 0.09580668434500694, "reward_std": 0.5702872574329376, "rewards/cosine_scaled_reward": -0.14554904401302338, "rewards/format_reward": 0.3869047649204731, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2575.571533203125, "epoch": 0.008, "grad_norm": 0.15411853790283203, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0717, "reward": 0.5743008255958557, "reward_std": 0.7826777100563049, "rewards/cosine_scaled_reward": 0.03119804011657834, "rewards/format_reward": 0.5119047686457634, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2762.1190490722656, "epoch": 0.012, "grad_norm": 0.13477382063865662, "kl": 3.463029861450195e-05, "learning_rate": 6e-08, "loss": 0.0865, "reward": 0.21700193732976913, "reward_std": 0.6844624578952789, "rewards/cosine_scaled_reward": -0.10578475520014763, "rewards/format_reward": 0.4285714402794838, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2686.3214721679688, "epoch": 0.016, "grad_norm": 0.1282820850610733, "kl": 2.434849739074707e-05, "learning_rate": 8e-08, "loss": 0.0525, "reward": 0.4696298725903034, "reward_std": 0.7235232815146446, "rewards/cosine_scaled_reward": -0.0062565067782998085, "rewards/format_reward": 0.4821428582072258, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2917.5535888671875, "epoch": 0.02, "grad_norm": 0.14993517100811005, "kl": 3.725290298461914e-05, "learning_rate": 1e-07, "loss": 0.0762, "reward": 0.15318153076805174, "reward_std": 0.7213103845715523, "rewards/cosine_scaled_reward": -0.08412353717721999, "rewards/format_reward": 0.3214285857975483, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 2816.2559814453125, "epoch": 0.024, "grad_norm": 0.14960958063602448, "kl": 3.1054019927978516e-05, "learning_rate": 1.2e-07, "loss": 0.0537, "reward": 0.2950221598148346, "reward_std": 0.738863505423069, "rewards/cosine_scaled_reward": -0.057846077223075554, "rewards/format_reward": 0.410714291036129, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 2870.3988647460938, "epoch": 0.028, "grad_norm": 0.10985030233860016, "kl": 2.8133392333984375e-05, "learning_rate": 1.4e-07, "loss": 0.0068, "reward": 0.27893248095642775, "reward_std": 0.7550084367394447, "rewards/cosine_scaled_reward": -0.05993851972743869, "rewards/format_reward": 0.3988095410168171, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 3160.452392578125, "epoch": 0.032, "grad_norm": 0.10308283567428589, "kl": 3.8176774978637695e-05, "learning_rate": 1.6e-07, "loss": 0.0223, "reward": 0.07877065148204565, "reward_std": 0.6431715190410614, "rewards/cosine_scaled_reward": -0.08263849129434675, "rewards/format_reward": 0.2440476268529892, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 3020.607177734375, "epoch": 0.036, "grad_norm": 0.15057384967803955, "kl": 3.37064266204834e-05, "learning_rate": 1.8e-07, "loss": 0.0733, "reward": 0.06793000735342503, "reward_std": 0.6978132948279381, "rewards/cosine_scaled_reward": -0.12079690210521221, "rewards/format_reward": 0.3095238171517849, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 3089.84521484375, "epoch": 0.04, "grad_norm": 0.11256518214941025, "kl": 3.2395124435424805e-05, "learning_rate": 2e-07, "loss": 0.0413, "reward": 0.032662300392985344, "reward_std": 0.6881319805979729, "rewards/cosine_scaled_reward": -0.13545456249266863, "rewards/format_reward": 0.3035714365541935, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2851.946533203125, "epoch": 0.044, "grad_norm": 0.17106953263282776, "kl": 3.784894943237305e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0636, "reward": 0.3718952457420528, "reward_std": 0.6902545392513275, "rewards/cosine_scaled_reward": -0.03131428617052734, "rewards/format_reward": 0.4345238134264946, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2798.5178833007812, "epoch": 0.048, "grad_norm": 0.1335103064775467, "kl": 2.9146671295166016e-05, "learning_rate": 2.4e-07, "loss": 0.0543, "reward": 0.40071453526616096, "reward_std": 0.7024472132325172, "rewards/cosine_scaled_reward": -0.02285701408982277, "rewards/format_reward": 0.4464285746216774, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 2948.9464721679688, "epoch": 0.052, "grad_norm": 0.1271769255399704, "kl": 3.698468208312988e-05, "learning_rate": 2.6e-07, "loss": 0.0693, "reward": 0.47545497864484787, "reward_std": 0.7740402817726135, "rewards/cosine_scaled_reward": 0.002608438953757286, "rewards/format_reward": 0.470238097012043, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2679.3928833007812, "epoch": 0.056, "grad_norm": 0.12242422997951508, "kl": 2.8014183044433594e-05, "learning_rate": 2.8e-07, "loss": 0.0489, "reward": 0.40165250562131405, "reward_std": 0.7790777683258057, "rewards/cosine_scaled_reward": -0.028340420685708523, "rewards/format_reward": 0.4583333507180214, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2889.136962890625, "epoch": 0.06, "grad_norm": 0.19158992171287537, "kl": 3.224611282348633e-05, "learning_rate": 3e-07, "loss": 0.0704, "reward": 0.15117042418569326, "reward_std": 0.6893174201250076, "rewards/cosine_scaled_reward": -0.10893859504722059, "rewards/format_reward": 0.3690476231276989, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 2892.6488647460938, "epoch": 0.064, "grad_norm": 0.15633279085159302, "kl": 3.668665885925293e-05, "learning_rate": 3.2e-07, "loss": 0.0733, "reward": -0.09426919370889664, "reward_std": 0.5802397355437279, "rewards/cosine_scaled_reward": -0.18403935432434082, "rewards/format_reward": 0.2738095298409462, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 2920.3511962890625, "epoch": 0.068, "grad_norm": 0.12191536277532578, "kl": 3.221631050109863e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0214, "reward": 0.13339833123609424, "reward_std": 0.6428257077932358, "rewards/cosine_scaled_reward": -0.11187227349728346, "rewards/format_reward": 0.3571428619325161, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 2704.672607421875, "epoch": 0.072, "grad_norm": 0.2207571119070053, "kl": 2.4259090423583984e-05, "learning_rate": 3.6e-07, "loss": 0.0858, "reward": 0.4250662699341774, "reward_std": 0.7673918604850769, "rewards/cosine_scaled_reward": -0.019609727547504008, "rewards/format_reward": 0.4642857350409031, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 2800.6429443359375, "epoch": 0.076, "grad_norm": 0.12734168767929077, "kl": 2.4378299713134766e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0471, "reward": 0.4042445756494999, "reward_std": 0.6929292231798172, "rewards/cosine_scaled_reward": -0.02704438249929808, "rewards/format_reward": 0.4583333432674408, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 2697.8274536132812, "epoch": 0.08, "grad_norm": 0.15472018718719482, "kl": 2.35140323638916e-05, "learning_rate": 4e-07, "loss": 0.0265, "reward": 0.3560524769127369, "reward_std": 0.6769110411405563, "rewards/cosine_scaled_reward": -0.05114044318906963, "rewards/format_reward": 0.4583333432674408, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 2339.4405517578125, "epoch": 0.084, "grad_norm": 0.21466447412967682, "kl": 2.086162567138672e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0806, "reward": 0.7416469305753708, "reward_std": 0.841043546795845, "rewards/cosine_scaled_reward": 0.06725202780216932, "rewards/format_reward": 0.6071428656578064, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 2781.5238037109375, "epoch": 0.088, "grad_norm": 0.19139103591442108, "kl": 3.0338764190673828e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0773, "reward": 0.20257593411952257, "reward_std": 0.7891978472471237, "rewards/cosine_scaled_reward": -0.1129977386444807, "rewards/format_reward": 0.4285714365541935, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 3036.761962890625, "epoch": 0.092, "grad_norm": 0.1108132153749466, "kl": 2.6345252990722656e-05, "learning_rate": 4.6e-07, "loss": 0.0238, "reward": 0.20629926398396492, "reward_std": 0.7457813173532486, "rewards/cosine_scaled_reward": -0.07542179408483207, "rewards/format_reward": 0.3571428619325161, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 3143.1131591796875, "epoch": 0.096, "grad_norm": 0.10591176152229309, "kl": 2.703070640563965e-05, "learning_rate": 4.8e-07, "loss": 0.0637, "reward": 0.0749267227947712, "reward_std": 0.6808565855026245, "rewards/cosine_scaled_reward": -0.1292033027857542, "rewards/format_reward": 0.3333333395421505, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 2934.4227294921875, "epoch": 0.1, "grad_norm": 0.12180113047361374, "kl": 1.4990568161010742e-05, "learning_rate": 5e-07, "loss": 0.0525, "reward": 0.3605663161724806, "reward_std": 0.7757866084575653, "rewards/cosine_scaled_reward": -0.028050171211361885, "rewards/format_reward": 0.4166666716337204, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 3100.5357666015625, "epoch": 0.104, "grad_norm": 0.14736856520175934, "kl": 2.230703830718994e-05, "learning_rate": 5.2e-07, "loss": 0.087, "reward": 0.1489051878452301, "reward_std": 0.7608643025159836, "rewards/cosine_scaled_reward": -0.08923787740059197, "rewards/format_reward": 0.32738095708191395, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 2978.452392578125, "epoch": 0.108, "grad_norm": 0.1490376740694046, "kl": 2.7447938919067383e-05, "learning_rate": 5.4e-07, "loss": 0.0633, "reward": 0.16602796246297657, "reward_std": 0.762113556265831, "rewards/cosine_scaled_reward": -0.09555743727833033, "rewards/format_reward": 0.3571428656578064, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 2923.1012573242188, "epoch": 0.112, "grad_norm": 0.11410558968782425, "kl": 3.108382225036621e-05, "learning_rate": 5.6e-07, "loss": 0.0618, "reward": 0.058234728407114744, "reward_std": 0.5919530540704727, "rewards/cosine_scaled_reward": -0.14052549470216036, "rewards/format_reward": 0.3392857201397419, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2925.1011962890625, "epoch": 0.116, "grad_norm": 0.17734545469284058, "kl": 5.251169204711914e-05, "learning_rate": 5.8e-07, "loss": 0.0463, "reward": 0.24072746047750115, "reward_std": 0.7061209976673126, "rewards/cosine_scaled_reward": -0.061183891259133816, "rewards/format_reward": 0.3630952425301075, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 2882.39892578125, "epoch": 0.12, "grad_norm": 0.15557299554347992, "kl": 2.1502375602722168e-05, "learning_rate": 6e-07, "loss": 0.0703, "reward": 0.22035705484449863, "reward_std": 0.5751676708459854, "rewards/cosine_scaled_reward": -0.07136909663677216, "rewards/format_reward": 0.3630952462553978, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2717.607177734375, "epoch": 0.124, "grad_norm": 0.16903533041477203, "kl": 6.181001663208008e-05, "learning_rate": 6.2e-07, "loss": 0.07, "reward": 0.3481953740119934, "reward_std": 0.7361179888248444, "rewards/cosine_scaled_reward": -0.025307081639766693, "rewards/format_reward": 0.3988095298409462, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2585.244140625, "epoch": 0.128, "grad_norm": 0.1481872797012329, "kl": 0.00023996829986572266, "learning_rate": 6.4e-07, "loss": 0.0664, "reward": 0.5805501043796539, "reward_std": 0.805858314037323, "rewards/cosine_scaled_reward": 0.019441714510321617, "rewards/format_reward": 0.5416666865348816, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 2608.7083740234375, "epoch": 0.132, "grad_norm": 0.10800693184137344, "kl": 0.0002808570861816406, "learning_rate": 6.6e-07, "loss": 0.0255, "reward": 0.5432634204626083, "reward_std": 0.7363616675138474, "rewards/cosine_scaled_reward": 0.02460789866745472, "rewards/format_reward": 0.4940476268529892, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2769.1964721679688, "epoch": 0.136, "grad_norm": 0.12105516344308853, "kl": 0.00020498037338256836, "learning_rate": 6.800000000000001e-07, "loss": 0.0184, "reward": 0.18091929703950882, "reward_std": 0.6703035831451416, "rewards/cosine_scaled_reward": -0.10596893168985844, "rewards/format_reward": 0.3928571604192257, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3090.1607666015625, "epoch": 0.14, "grad_norm": 0.11914981156587601, "kl": 0.0002568960189819336, "learning_rate": 7e-07, "loss": 0.0617, "reward": 0.08196480484912172, "reward_std": 0.7742973417043686, "rewards/cosine_scaled_reward": -0.10782713070511818, "rewards/format_reward": 0.2976190559566021, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2790.0596313476562, "epoch": 0.144, "grad_norm": 0.10883598774671555, "kl": 0.00027942657470703125, "learning_rate": 7.2e-07, "loss": 0.0163, "reward": 0.31424143677577376, "reward_std": 0.669949933886528, "rewards/cosine_scaled_reward": -0.06311738677322865, "rewards/format_reward": 0.4404762014746666, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 2916.3452758789062, "epoch": 0.148, "grad_norm": 0.1492447555065155, "kl": 0.00025272369384765625, "learning_rate": 7.4e-07, "loss": 0.0592, "reward": 0.1357547640800476, "reward_std": 0.7365808188915253, "rewards/cosine_scaled_reward": -0.10771786456461996, "rewards/format_reward": 0.3511904813349247, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3342.6786499023438, "epoch": 0.152, "grad_norm": 0.08414288610219955, "kl": 0.00013870000839233398, "learning_rate": 7.599999999999999e-07, "loss": 0.0239, "reward": -0.1723631415516138, "reward_std": 0.6109825298190117, "rewards/cosine_scaled_reward": -0.16951490193605423, "rewards/format_reward": 0.16666666977107525, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2762.5774536132812, "epoch": 0.156, "grad_norm": 0.14042888581752777, "kl": 0.0005602836608886719, "learning_rate": 7.799999999999999e-07, "loss": 0.0588, "reward": 0.3224933594465256, "reward_std": 0.6976565718650818, "rewards/cosine_scaled_reward": -0.04708665423095226, "rewards/format_reward": 0.416666679084301, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 2729.9644165039062, "epoch": 0.16, "grad_norm": 0.110390305519104, "kl": 0.00016605854034423828, "learning_rate": 8e-07, "loss": 0.0549, "reward": 0.4423699714243412, "reward_std": 0.6286562532186508, "rewards/cosine_scaled_reward": -0.016910257749259472, "rewards/format_reward": 0.476190485060215, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2955.7084350585938, "epoch": 0.164, "grad_norm": 0.15253609418869019, "kl": 0.00040471553802490234, "learning_rate": 8.199999999999999e-07, "loss": 0.0718, "reward": 0.44552009692415595, "reward_std": 0.7759689763188362, "rewards/cosine_scaled_reward": 0.0144266925053671, "rewards/format_reward": 0.4166666753590107, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2961.0596313476562, "epoch": 0.168, "grad_norm": 0.22110103070735931, "kl": 0.0009613037109375, "learning_rate": 8.399999999999999e-07, "loss": 0.1186, "reward": 0.2217194978147745, "reward_std": 0.6207270994782448, "rewards/cosine_scaled_reward": -0.07366406172513962, "rewards/format_reward": 0.3690476268529892, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 3015.7738647460938, "epoch": 0.172, "grad_norm": 0.23720885813236237, "kl": 0.0005915164947509766, "learning_rate": 8.599999999999999e-07, "loss": 0.1245, "reward": -0.04521503113210201, "reward_std": 0.62105892598629, "rewards/cosine_scaled_reward": -0.16844085440970957, "rewards/format_reward": 0.2916666716337204, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 2937.2381591796875, "epoch": 0.176, "grad_norm": 0.09096106886863708, "kl": 0.00051116943359375, "learning_rate": 8.799999999999999e-07, "loss": 0.0249, "reward": 0.22813843563199043, "reward_std": 0.6727291792631149, "rewards/cosine_scaled_reward": -0.058549837151076645, "rewards/format_reward": 0.3452381007373333, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3149.511962890625, "epoch": 0.18, "grad_norm": 0.11164555698633194, "kl": 0.0004715919494628906, "learning_rate": 9e-07, "loss": 0.017, "reward": 0.05970348231494427, "reward_std": 0.7763290405273438, "rewards/cosine_scaled_reward": -0.11300540715456009, "rewards/format_reward": 0.285714291036129, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 3184.6845703125, "epoch": 0.184, "grad_norm": 0.1711866706609726, "kl": 0.0007777214050292969, "learning_rate": 9.2e-07, "loss": 0.0731, "reward": 0.1386737246066332, "reward_std": 0.7276585251092911, "rewards/cosine_scaled_reward": -0.06459171324968338, "rewards/format_reward": 0.26785714738070965, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 3015.386962890625, "epoch": 0.188, "grad_norm": 0.12470238655805588, "kl": 0.0014100074768066406, "learning_rate": 9.399999999999999e-07, "loss": 0.0653, "reward": 0.16049158992245793, "reward_std": 0.7017006278038025, "rewards/cosine_scaled_reward": -0.08642087457701564, "rewards/format_reward": 0.3333333432674408, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2909.96435546875, "epoch": 0.192, "grad_norm": 0.28355535864830017, "kl": 0.009288787841796875, "learning_rate": 9.6e-07, "loss": 0.0581, "reward": 0.0768200121819973, "reward_std": 0.7135801166296005, "rewards/cosine_scaled_reward": -0.14016142301261425, "rewards/format_reward": 0.3571428693830967, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2753.6845703125, "epoch": 0.196, "grad_norm": 0.6567728519439697, "kl": 0.023477554321289062, "learning_rate": 9.8e-07, "loss": 0.0866, "reward": 0.4337980281561613, "reward_std": 0.8068065047264099, "rewards/cosine_scaled_reward": -0.006315283477306366, "rewards/format_reward": 0.4464285746216774, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 2934.8274536132812, "epoch": 0.2, "grad_norm": 0.11382321268320084, "kl": 0.0025758743286132812, "learning_rate": 1e-06, "loss": 0.0714, "reward": 0.2880665063858032, "reward_std": 0.6403830945491791, "rewards/cosine_scaled_reward": -0.049419129034504294, "rewards/format_reward": 0.3869047649204731, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2840.7678833007812, "epoch": 0.204, "grad_norm": 0.11641126126050949, "kl": 0.0050792694091796875, "learning_rate": 9.999890338174275e-07, "loss": 0.023, "reward": 0.2840890493243933, "reward_std": 0.692708894610405, "rewards/cosine_scaled_reward": -0.06628882512450218, "rewards/format_reward": 0.4166666716337204, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3119.9881591796875, "epoch": 0.208, "grad_norm": 0.14652805030345917, "kl": 0.0032052993774414062, "learning_rate": 9.999561358041868e-07, "loss": 0.0641, "reward": 0.18620363296940923, "reward_std": 0.8490904271602631, "rewards/cosine_scaled_reward": -0.058683907613158226, "rewards/format_reward": 0.3035714365541935, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 2926.9940795898438, "epoch": 0.212, "grad_norm": 0.19241634011268616, "kl": 0.00447845458984375, "learning_rate": 9.999013075636804e-07, "loss": 0.0747, "reward": 0.36803684243932366, "reward_std": 0.823193870484829, "rewards/cosine_scaled_reward": -0.006457769020926207, "rewards/format_reward": 0.380952388048172, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3031.7203369140625, "epoch": 0.216, "grad_norm": 0.12843742966651917, "kl": 0.0029668807983398438, "learning_rate": 9.998245517681593e-07, "loss": 0.0153, "reward": 0.2685772944241762, "reward_std": 0.6489126533269882, "rewards/cosine_scaled_reward": -0.04428278561681509, "rewards/format_reward": 0.3571428693830967, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3123.136962890625, "epoch": 0.22, "grad_norm": 0.1504901498556137, "kl": 0.006084442138671875, "learning_rate": 9.997258721585931e-07, "loss": 0.0691, "reward": 0.03501664288341999, "reward_std": 0.6481388062238693, "rewards/cosine_scaled_reward": -0.11642025248147547, "rewards/format_reward": 0.2678571529686451, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 2673.434539794922, "epoch": 0.224, "grad_norm": 0.1224113255739212, "kl": 0.0033931732177734375, "learning_rate": 9.996052735444862e-07, "loss": 0.0269, "reward": 0.6296312126796693, "reward_std": 0.6751764714717865, "rewards/cosine_scaled_reward": 0.07374419644474983, "rewards/format_reward": 0.482142873108387, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3170.0535888671875, "epoch": 0.228, "grad_norm": 0.1044960618019104, "kl": 0.0024929046630859375, "learning_rate": 9.994627618036452e-07, "loss": 0.0262, "reward": 0.21022793278098106, "reward_std": 0.7194458544254303, "rewards/cosine_scaled_reward": -0.05560031719505787, "rewards/format_reward": 0.3214285857975483, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 2590.0179138183594, "epoch": 0.232, "grad_norm": 0.13768674433231354, "kl": 0.0045948028564453125, "learning_rate": 9.992983438818915e-07, "loss": 0.0592, "reward": 0.4493846707046032, "reward_std": 0.7118680775165558, "rewards/cosine_scaled_reward": -0.02233148762024939, "rewards/format_reward": 0.4940476231276989, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 3087.4702758789062, "epoch": 0.236, "grad_norm": 0.13870052993297577, "kl": 0.001789093017578125, "learning_rate": 9.991120277927223e-07, "loss": 0.0691, "reward": 0.3166997814550996, "reward_std": 0.7532177269458771, "rewards/cosine_scaled_reward": -0.011292967945337296, "rewards/format_reward": 0.3392857201397419, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 2717.3036499023438, "epoch": 0.24, "grad_norm": 0.14514827728271484, "kl": 0.0060253143310546875, "learning_rate": 9.989038226169207e-07, "loss": 0.0622, "reward": 0.16810212982818484, "reward_std": 0.5123014599084854, "rewards/cosine_scaled_reward": -0.10642512841150165, "rewards/format_reward": 0.3809523843228817, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 3105.7977294921875, "epoch": 0.244, "grad_norm": 0.10494968295097351, "kl": 0.0026693344116210938, "learning_rate": 9.98673738502114e-07, "loss": 0.0639, "reward": 0.08244643732905388, "reward_std": 0.7039294093847275, "rewards/cosine_scaled_reward": -0.10461012227460742, "rewards/format_reward": 0.29166666977107525, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 3213.3214721679688, "epoch": 0.248, "grad_norm": 0.0999075248837471, "kl": 0.0023097991943359375, "learning_rate": 9.98421786662277e-07, "loss": 0.053, "reward": 0.09679291397333145, "reward_std": 0.7138089835643768, "rewards/cosine_scaled_reward": -0.0914845080114901, "rewards/format_reward": 0.2797619104385376, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 2783.9584350585938, "epoch": 0.252, "grad_norm": 0.19832438230514526, "kl": 0.0027294158935546875, "learning_rate": 9.981479793771866e-07, "loss": 0.0773, "reward": 0.2238014191389084, "reward_std": 0.6036202609539032, "rewards/cosine_scaled_reward": -0.06369452457875013, "rewards/format_reward": 0.351190485060215, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 2976.6607666015625, "epoch": 0.256, "grad_norm": 0.12335456907749176, "kl": 0.0020885467529296875, "learning_rate": 9.97852329991824e-07, "loss": 0.0857, "reward": 0.27347568422555923, "reward_std": 0.7463532835245132, "rewards/cosine_scaled_reward": -0.03885740428813733, "rewards/format_reward": 0.3511904887855053, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 2825.6130981445312, "epoch": 0.26, "grad_norm": 0.13863790035247803, "kl": 0.002285003662109375, "learning_rate": 9.975348529157229e-07, "loss": 0.0349, "reward": 0.3712610546499491, "reward_std": 0.7277249395847321, "rewards/cosine_scaled_reward": -0.037583764642477036, "rewards/format_reward": 0.4464285746216774, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 2974.232177734375, "epoch": 0.264, "grad_norm": 0.11186616122722626, "kl": 0.002407073974609375, "learning_rate": 9.971955636222684e-07, "loss": 0.0025, "reward": 0.07817286718636751, "reward_std": 0.640307292342186, "rewards/cosine_scaled_reward": -0.12460404448211193, "rewards/format_reward": 0.3273809589445591, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 3152.702392578125, "epoch": 0.268, "grad_norm": 0.12694397568702698, "kl": 0.00250244140625, "learning_rate": 9.968344786479415e-07, "loss": 0.07, "reward": 0.1549822874367237, "reward_std": 0.6663320288062096, "rewards/cosine_scaled_reward": -0.08322314161341637, "rewards/format_reward": 0.3214285783469677, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 2938.0179443359375, "epoch": 0.272, "grad_norm": 0.12655070424079895, "kl": 0.00341033935546875, "learning_rate": 9.964516155915151e-07, "loss": 0.0545, "reward": 0.2076467089354992, "reward_std": 0.7705407291650772, "rewards/cosine_scaled_reward": -0.08070044964551926, "rewards/format_reward": 0.3690476268529892, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 3025.8095703125, "epoch": 0.276, "grad_norm": 0.12574610114097595, "kl": 0.003986358642578125, "learning_rate": 9.960469931131936e-07, "loss": 0.0227, "reward": 0.03160261735320091, "reward_std": 0.621779277920723, "rewards/cosine_scaled_reward": -0.13003203552216291, "rewards/format_reward": 0.2916666716337204, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 2877.5536499023438, "epoch": 0.28, "grad_norm": 0.12181198596954346, "kl": 0.0038127899169921875, "learning_rate": 9.956206309337066e-07, "loss": 0.0248, "reward": 0.2757916431874037, "reward_std": 0.7794490903615952, "rewards/cosine_scaled_reward": -0.06448512757197022, "rewards/format_reward": 0.4047619178891182, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 2776.6131591796875, "epoch": 0.284, "grad_norm": 0.17934156954288483, "kl": 0.0039825439453125, "learning_rate": 9.951725498333448e-07, "loss": 0.1072, "reward": 0.2793612889945507, "reward_std": 0.7607921361923218, "rewards/cosine_scaled_reward": -0.06865269318223, "rewards/format_reward": 0.4166666716337204, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 3037.5178833007812, "epoch": 0.288, "grad_norm": 0.1311851143836975, "kl": 0.0038604736328125, "learning_rate": 9.947027716509488e-07, "loss": 0.0745, "reward": 0.34610075503587723, "reward_std": 0.8604296147823334, "rewards/cosine_scaled_reward": -0.01444962713867426, "rewards/format_reward": 0.3750000074505806, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 2880.3036499023438, "epoch": 0.292, "grad_norm": 0.10903972387313843, "kl": 0.005123138427734375, "learning_rate": 9.942113192828444e-07, "loss": 0.0247, "reward": 0.41264417115598917, "reward_std": 0.6988394409418106, "rewards/cosine_scaled_reward": -0.00201124744489789, "rewards/format_reward": 0.4166666753590107, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 2689.5774536132812, "epoch": 0.296, "grad_norm": 0.1187940314412117, "kl": 0.00371551513671875, "learning_rate": 9.93698216681727e-07, "loss": 0.0343, "reward": 0.49459290131926537, "reward_std": 0.6582471132278442, "rewards/cosine_scaled_reward": -0.005679763096850365, "rewards/format_reward": 0.505952388048172, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 3011.2440795898438, "epoch": 0.3, "grad_norm": 0.15027488768100739, "kl": 0.00612640380859375, "learning_rate": 9.931634888554935e-07, "loss": 0.0922, "reward": -0.12369688227772713, "reward_std": 0.5938592255115509, "rewards/cosine_scaled_reward": -0.1957770138978958, "rewards/format_reward": 0.26785715110599995, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3087.9762573242188, "epoch": 0.304, "grad_norm": 0.1342087835073471, "kl": 0.004589080810546875, "learning_rate": 9.926071618660237e-07, "loss": 0.051, "reward": 0.029461721424013376, "reward_std": 0.5008194297552109, "rewards/cosine_scaled_reward": -0.12812629727704916, "rewards/format_reward": 0.2857142915017903, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 2708.6845703125, "epoch": 0.308, "grad_norm": 0.11936229467391968, "kl": 0.00518035888671875, "learning_rate": 9.9202926282791e-07, "loss": 0.0693, "reward": 0.38730931747704744, "reward_std": 0.6931557953357697, "rewards/cosine_scaled_reward": -0.0503929746337235, "rewards/format_reward": 0.4880952388048172, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 3123.4345703125, "epoch": 0.312, "grad_norm": 0.10926749557256699, "kl": 0.006168365478515625, "learning_rate": 9.91429819907136e-07, "loss": 0.0281, "reward": 0.14759791223332286, "reward_std": 0.6552696228027344, "rewards/cosine_scaled_reward": -0.08989150635898113, "rewards/format_reward": 0.3273809519596398, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 3104.886962890625, "epoch": 0.316, "grad_norm": 0.17055809497833252, "kl": 0.005580902099609375, "learning_rate": 9.908088623197048e-07, "loss": 0.122, "reward": 0.1187831275165081, "reward_std": 0.7589289993047714, "rewards/cosine_scaled_reward": -0.11917985696345568, "rewards/format_reward": 0.3571428656578064, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 2892.2261962890625, "epoch": 0.32, "grad_norm": 0.12601953744888306, "kl": 0.004444122314453125, "learning_rate": 9.901664203302124e-07, "loss": 0.0261, "reward": 0.2744547198526561, "reward_std": 0.686069905757904, "rewards/cosine_scaled_reward": -0.05920121353119612, "rewards/format_reward": 0.3928571492433548, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 3041.1488647460938, "epoch": 0.324, "grad_norm": 0.11552488803863525, "kl": 0.00655364990234375, "learning_rate": 9.895025252503755e-07, "loss": 0.0229, "reward": 0.12106413394212723, "reward_std": 0.675617903470993, "rewards/cosine_scaled_reward": -0.1061346041969955, "rewards/format_reward": 0.33333333767950535, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 3129.6429443359375, "epoch": 0.328, "grad_norm": 0.09706410765647888, "kl": 0.0055084228515625, "learning_rate": 9.888172094375033e-07, "loss": 0.0452, "reward": 0.12287123966962099, "reward_std": 0.7173575460910797, "rewards/cosine_scaled_reward": -0.09035009983927011, "rewards/format_reward": 0.3035714365541935, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 2602.047637939453, "epoch": 0.332, "grad_norm": 0.11233574151992798, "kl": 0.01010894775390625, "learning_rate": 9.881105062929221e-07, "loss": 0.0258, "reward": 0.35400932375341654, "reward_std": 0.6462785750627518, "rewards/cosine_scaled_reward": -0.06109058950096369, "rewards/format_reward": 0.476190485060215, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2894.011962890625, "epoch": 0.336, "grad_norm": 0.12059750407934189, "kl": 0.00872802734375, "learning_rate": 9.873824502603459e-07, "loss": 0.0806, "reward": 0.2941260803490877, "reward_std": 0.78522889316082, "rewards/cosine_scaled_reward": -0.06722268275916576, "rewards/format_reward": 0.4285714328289032, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 2979.226318359375, "epoch": 0.34, "grad_norm": 0.15206296741962433, "kl": 0.00783538818359375, "learning_rate": 9.866330768241983e-07, "loss": 0.0755, "reward": 0.21889091655611992, "reward_std": 0.6674999743700027, "rewards/cosine_scaled_reward": -0.06614979542791843, "rewards/format_reward": 0.3511904887855053, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 2819.5892944335938, "epoch": 0.344, "grad_norm": 0.1093529462814331, "kl": 0.00661468505859375, "learning_rate": 9.85862422507884e-07, "loss": 0.0353, "reward": 0.20389786185114644, "reward_std": 0.6942542195320129, "rewards/cosine_scaled_reward": -0.07959869271144271, "rewards/format_reward": 0.3630952425301075, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 2616.452392578125, "epoch": 0.348, "grad_norm": 0.15942011773586273, "kl": 0.00847625732421875, "learning_rate": 9.850705248720068e-07, "loss": 0.0773, "reward": 0.3760679364204407, "reward_std": 0.6467384025454521, "rewards/cosine_scaled_reward": -0.05898985452950001, "rewards/format_reward": 0.4940476194024086, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 2923.7500610351562, "epoch": 0.352, "grad_norm": 0.10565865784883499, "kl": 0.0073089599609375, "learning_rate": 9.8425742251254e-07, "loss": 0.0227, "reward": 0.3185804970562458, "reward_std": 0.6098055616021156, "rewards/cosine_scaled_reward": -0.010352615499868989, "rewards/format_reward": 0.3392857201397419, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 2752.6309814453125, "epoch": 0.356, "grad_norm": 0.1690302938222885, "kl": 0.0132293701171875, "learning_rate": 9.83423155058946e-07, "loss": 0.0312, "reward": 0.46914676763117313, "reward_std": 0.7854363918304443, "rewards/cosine_scaled_reward": 0.005406718701124191, "rewards/format_reward": 0.4583333432674408, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2769.1488647460938, "epoch": 0.36, "grad_norm": 0.17653611302375793, "kl": 0.011993408203125, "learning_rate": 9.825677631722435e-07, "loss": 0.0815, "reward": 0.3604448903352022, "reward_std": 0.798264317214489, "rewards/cosine_scaled_reward": -0.04894421715289354, "rewards/format_reward": 0.4583333432674408, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 3187.3631591796875, "epoch": 0.364, "grad_norm": 0.1068400964140892, "kl": 0.0078277587890625, "learning_rate": 9.816912885430258e-07, "loss": 0.0211, "reward": -0.03608314320445061, "reward_std": 0.5826699808239937, "rewards/cosine_scaled_reward": -0.14006539154797792, "rewards/format_reward": 0.2440476305782795, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 2924.916748046875, "epoch": 0.368, "grad_norm": 0.17665976285934448, "kl": 0.0091400146484375, "learning_rate": 9.807937738894303e-07, "loss": 0.0852, "reward": 0.2780441716313362, "reward_std": 0.7524297386407852, "rewards/cosine_scaled_reward": -0.07526363711804152, "rewards/format_reward": 0.4285714291036129, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 2939.3512573242188, "epoch": 0.372, "grad_norm": 0.13597099483013153, "kl": 0.0076446533203125, "learning_rate": 9.798752629550546e-07, "loss": 0.0324, "reward": 0.33797190338373184, "reward_std": 0.5880008786916733, "rewards/cosine_scaled_reward": -0.01851405529305339, "rewards/format_reward": 0.3750000111758709, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 2922.875, "epoch": 0.376, "grad_norm": 0.12246444076299667, "kl": 0.00815582275390625, "learning_rate": 9.78935800506826e-07, "loss": 0.0801, "reward": 0.14625070057809353, "reward_std": 0.690229170024395, "rewards/cosine_scaled_reward": -0.12330322340130806, "rewards/format_reward": 0.3928571492433548, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 3222.3632202148438, "epoch": 0.38, "grad_norm": 0.08992121368646622, "kl": 0.00704193115234375, "learning_rate": 9.779754323328192e-07, "loss": -0.001, "reward": 0.26800261437892914, "reward_std": 0.7103277295827866, "rewards/cosine_scaled_reward": -0.011832039803266525, "rewards/format_reward": 0.2916666716337204, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3215.2202758789062, "epoch": 0.384, "grad_norm": 0.12997964024543762, "kl": 0.00983428955078125, "learning_rate": 9.769942052400235e-07, "loss": 0.0591, "reward": -0.03136127255856991, "reward_std": 0.6326467096805573, "rewards/cosine_scaled_reward": -0.18234730698168278, "rewards/format_reward": 0.33333333767950535, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 2857.0416870117188, "epoch": 0.388, "grad_norm": 0.16558504104614258, "kl": 0.007293701171875, "learning_rate": 9.759921670520634e-07, "loss": 0.0821, "reward": 0.19707820191979408, "reward_std": 0.6188783794641495, "rewards/cosine_scaled_reward": -0.10681804455816746, "rewards/format_reward": 0.410714291036129, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 2983.9940795898438, "epoch": 0.392, "grad_norm": 0.1882523149251938, "kl": 0.00878143310546875, "learning_rate": 9.749693666068663e-07, "loss": 0.1027, "reward": 0.3959239423274994, "reward_std": 0.875861182808876, "rewards/cosine_scaled_reward": 0.004509590216912329, "rewards/format_reward": 0.3869047686457634, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 3077.0952758789062, "epoch": 0.396, "grad_norm": 0.1565464437007904, "kl": 0.011505126953125, "learning_rate": 9.739258537542835e-07, "loss": 0.0968, "reward": 0.027048692107200623, "reward_std": 0.7064545601606369, "rewards/cosine_scaled_reward": -0.1323089925572276, "rewards/format_reward": 0.2916666679084301, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 2358.2381591796875, "epoch": 0.4, "grad_norm": 0.14640696346759796, "kl": 0.00804901123046875, "learning_rate": 9.728616793536587e-07, "loss": 0.0774, "reward": 0.8142919540405273, "reward_std": 0.7067123055458069, "rewards/cosine_scaled_reward": 0.10357456840574741, "rewards/format_reward": 0.6071428507566452, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 3195.6428833007812, "epoch": 0.404, "grad_norm": 0.14821472764015198, "kl": 0.0126495361328125, "learning_rate": 9.717768952713511e-07, "loss": 0.0897, "reward": 0.2797414679080248, "reward_std": 0.8859200328588486, "rewards/cosine_scaled_reward": -0.0535816540941596, "rewards/format_reward": 0.3869047649204731, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 3046.7798461914062, "epoch": 0.408, "grad_norm": 0.13244982063770294, "kl": 0.0107879638671875, "learning_rate": 9.706715543782064e-07, "loss": 0.0638, "reward": 0.16126136109232903, "reward_std": 0.6933339387178421, "rewards/cosine_scaled_reward": -0.08305979892611504, "rewards/format_reward": 0.3273809552192688, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 2718.8275146484375, "epoch": 0.412, "grad_norm": 0.20267100632190704, "kl": 0.01003265380859375, "learning_rate": 9.695457105469804e-07, "loss": 0.1246, "reward": 0.4206714928150177, "reward_std": 0.6706456393003464, "rewards/cosine_scaled_reward": -0.0069261584430933, "rewards/format_reward": 0.43452382180839777, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 2973.7559814453125, "epoch": 0.416, "grad_norm": 0.12351427227258682, "kl": 0.01038360595703125, "learning_rate": 9.683994186497132e-07, "loss": 0.0716, "reward": 0.20578511937389976, "reward_std": 0.7138822227716446, "rewards/cosine_scaled_reward": -0.0846074327128008, "rewards/format_reward": 0.3750000037252903, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2596.1250915527344, "epoch": 0.42, "grad_norm": 0.13755492866039276, "kl": 0.0099945068359375, "learning_rate": 9.672327345550543e-07, "loss": 0.0544, "reward": 0.6021162122488022, "reward_std": 0.8435305505990982, "rewards/cosine_scaled_reward": 0.012367631308734417, "rewards/format_reward": 0.5773809663951397, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 2986.226318359375, "epoch": 0.424, "grad_norm": 0.10337146371603012, "kl": 0.014007568359375, "learning_rate": 9.66045715125541e-07, "loss": 0.0441, "reward": 0.24333537928760052, "reward_std": 0.7411631494760513, "rewards/cosine_scaled_reward": -0.08071326930075884, "rewards/format_reward": 0.4047619141638279, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 2804.7500610351562, "epoch": 0.428, "grad_norm": 0.1334601491689682, "kl": 0.010711669921875, "learning_rate": 9.648384182148252e-07, "loss": 0.0741, "reward": 0.21579574886709452, "reward_std": 0.559941440820694, "rewards/cosine_scaled_reward": -0.08257831074297428, "rewards/format_reward": 0.380952388048172, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 2947.9583740234375, "epoch": 0.432, "grad_norm": 0.15339502692222595, "kl": 0.0123443603515625, "learning_rate": 9.636109026648554e-07, "loss": 0.0797, "reward": 0.2714387159794569, "reward_std": 0.5535019040107727, "rewards/cosine_scaled_reward": -0.060709220357239246, "rewards/format_reward": 0.3928571492433548, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 3165.3631591796875, "epoch": 0.436, "grad_norm": 0.14555484056472778, "kl": 0.0123748779296875, "learning_rate": 9.623632283030077e-07, "loss": 0.0689, "reward": 0.3741426505148411, "reward_std": 0.7712415158748627, "rewards/cosine_scaled_reward": 0.011476085986942053, "rewards/format_reward": 0.351190485060215, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 2682.607177734375, "epoch": 0.44, "grad_norm": 3.0458719730377197, "kl": 0.1771697998046875, "learning_rate": 9.610954559391704e-07, "loss": 0.0576, "reward": 0.43371669203042984, "reward_std": 0.6959643810987473, "rewards/cosine_scaled_reward": -0.02718928176909685, "rewards/format_reward": 0.4880952462553978, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2601.636993408203, "epoch": 0.444, "grad_norm": 0.16071970760822296, "kl": 0.0132904052734375, "learning_rate": 9.598076473627796e-07, "loss": 0.0475, "reward": 0.3634342849254608, "reward_std": 0.5500286221504211, "rewards/cosine_scaled_reward": -0.06233047042042017, "rewards/format_reward": 0.4880952462553978, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 2848.0, "epoch": 0.448, "grad_norm": 0.11652833968400955, "kl": 0.01318359375, "learning_rate": 9.58499865339809e-07, "loss": 0.0216, "reward": 0.4104595482349396, "reward_std": 0.7775004655122757, "rewards/cosine_scaled_reward": -0.023936893790960312, "rewards/format_reward": 0.4583333358168602, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 3060.696533203125, "epoch": 0.452, "grad_norm": 0.11911512911319733, "kl": 0.019012451171875, "learning_rate": 9.571721736097088e-07, "loss": 0.0351, "reward": 0.14249714091420174, "reward_std": 0.5928184911608696, "rewards/cosine_scaled_reward": -0.08946572133572772, "rewards/format_reward": 0.32142857648432255, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 3025.3392944335938, "epoch": 0.456, "grad_norm": 0.11119002103805542, "kl": 0.013275146484375, "learning_rate": 9.55824636882301e-07, "loss": 0.0157, "reward": 0.2583576124161482, "reward_std": 0.5952321216464043, "rewards/cosine_scaled_reward": -0.05236881226301193, "rewards/format_reward": 0.3630952462553978, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2642.4524536132812, "epoch": 0.46, "grad_norm": 0.13256801664829254, "kl": 0.0142669677734375, "learning_rate": 9.54457320834625e-07, "loss": 0.0464, "reward": 0.26410975866019726, "reward_std": 0.7131557315587997, "rewards/cosine_scaled_reward": -0.10604035668075085, "rewards/format_reward": 0.476190485060215, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 2842.172607421875, "epoch": 0.464, "grad_norm": 0.14555224776268005, "kl": 0.0131683349609375, "learning_rate": 9.530702921077358e-07, "loss": 0.06, "reward": 0.7474905252456665, "reward_std": 0.9560296833515167, "rewards/cosine_scaled_reward": 0.10291192133445293, "rewards/format_reward": 0.5416666716337204, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 2644.851318359375, "epoch": 0.468, "grad_norm": 0.1801517903804779, "kl": 0.0159149169921875, "learning_rate": 9.516636183034564e-07, "loss": 0.0868, "reward": 0.3559920974075794, "reward_std": 0.6948887631297112, "rewards/cosine_scaled_reward": -0.05712300445884466, "rewards/format_reward": 0.4702381044626236, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 2898.8692016601562, "epoch": 0.472, "grad_norm": 0.11308304965496063, "kl": 0.01580810546875, "learning_rate": 9.502373679810839e-07, "loss": 0.0347, "reward": -0.02927885064855218, "reward_std": 0.5874328389763832, "rewards/cosine_scaled_reward": -0.18725845962762833, "rewards/format_reward": 0.3452381044626236, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2942.3869018554688, "epoch": 0.476, "grad_norm": 0.11883487552404404, "kl": 0.0146484375, "learning_rate": 9.487916106540465e-07, "loss": 0.0533, "reward": 0.2897670716047287, "reward_std": 0.624944195151329, "rewards/cosine_scaled_reward": -0.05452123726718128, "rewards/format_reward": 0.3988095298409462, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 2888.5537109375, "epoch": 0.48, "grad_norm": 0.18806912004947662, "kl": 0.0160980224609375, "learning_rate": 9.473264167865171e-07, "loss": 0.0988, "reward": 0.42369477450847626, "reward_std": 0.8195747882127762, "rewards/cosine_scaled_reward": -0.002438324736431241, "rewards/format_reward": 0.4285714365541935, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2582.732177734375, "epoch": 0.484, "grad_norm": 0.24437126517295837, "kl": 0.0138702392578125, "learning_rate": 9.458418577899774e-07, "loss": 0.1221, "reward": 0.5238880245015025, "reward_std": 0.7648549973964691, "rewards/cosine_scaled_reward": -0.005913139786571264, "rewards/format_reward": 0.535714291036129, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 2693.202392578125, "epoch": 0.488, "grad_norm": 0.1520787924528122, "kl": 0.016265869140625, "learning_rate": 9.443380060197385e-07, "loss": 0.075, "reward": 0.5096995830535889, "reward_std": 0.8008040487766266, "rewards/cosine_scaled_reward": 0.01675456203520298, "rewards/format_reward": 0.4761904776096344, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2794.4822387695312, "epoch": 0.492, "grad_norm": 0.276404470205307, "kl": 0.016845703125, "learning_rate": 9.428149347714143e-07, "loss": 0.1229, "reward": 0.1483494946733117, "reward_std": 0.7319334298372269, "rewards/cosine_scaled_reward": -0.11034906562417746, "rewards/format_reward": 0.3690476268529892, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 2586.9285583496094, "epoch": 0.496, "grad_norm": 0.19590145349502563, "kl": 0.0192413330078125, "learning_rate": 9.412727182773486e-07, "loss": 0.0668, "reward": 0.42041725292801857, "reward_std": 0.7440174072980881, "rewards/cosine_scaled_reward": -0.05169615335762501, "rewards/format_reward": 0.5238095372915268, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 2647.1309814453125, "epoch": 0.5, "grad_norm": 0.1762569099664688, "kl": 0.018951416015625, "learning_rate": 9.397114317029974e-07, "loss": 0.058, "reward": 0.26979109086096287, "reward_std": 0.7384046316146851, "rewards/cosine_scaled_reward": -0.10022350586950779, "rewards/format_reward": 0.4702380932867527, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 2329.4642639160156, "epoch": 0.504, "grad_norm": 0.2497519552707672, "kl": 0.016754150390625, "learning_rate": 9.381311511432658e-07, "loss": 0.1113, "reward": 0.5470606535673141, "reward_std": 0.7599766105413437, "rewards/cosine_scaled_reward": -0.04194586584344506, "rewards/format_reward": 0.630952388048172, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2630.851318359375, "epoch": 0.508, "grad_norm": 0.24775269627571106, "kl": 0.018798828125, "learning_rate": 9.36531953618799e-07, "loss": 0.0886, "reward": 0.5138388648629189, "reward_std": 0.8158636838197708, "rewards/cosine_scaled_reward": 0.00989562287577428, "rewards/format_reward": 0.4940476194024086, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 2193.1250610351562, "epoch": 0.512, "grad_norm": 0.3206213712692261, "kl": 0.0168914794921875, "learning_rate": 9.34913917072228e-07, "loss": 0.1596, "reward": 0.7910499274730682, "reward_std": 0.7149495184421539, "rewards/cosine_scaled_reward": 0.0502868490293622, "rewards/format_reward": 0.6904762089252472, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 2532.4642944335938, "epoch": 0.516, "grad_norm": 0.28250807523727417, "kl": 0.021270751953125, "learning_rate": 9.332771203643714e-07, "loss": 0.0845, "reward": 0.36558002047240734, "reward_std": 0.637114867568016, "rewards/cosine_scaled_reward": -0.05232903314754367, "rewards/format_reward": 0.470238097012043, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 2768.4702758789062, "epoch": 0.52, "grad_norm": 0.26120948791503906, "kl": 0.026092529296875, "learning_rate": 9.316216432703916e-07, "loss": 0.1052, "reward": 0.41776999086141586, "reward_std": 0.9506262838840485, "rewards/cosine_scaled_reward": -0.029210255946964025, "rewards/format_reward": 0.4761904776096344, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 2400.5000915527344, "epoch": 0.524, "grad_norm": 0.33211514353752136, "kl": 0.0215606689453125, "learning_rate": 9.299475664759068e-07, "loss": 0.1452, "reward": 0.42596414871513844, "reward_std": 0.6515605002641678, "rewards/cosine_scaled_reward": -0.04892268590629101, "rewards/format_reward": 0.5238095298409462, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 2368.9524536132812, "epoch": 0.528, "grad_norm": 0.41833382844924927, "kl": 0.020782470703125, "learning_rate": 9.282549715730579e-07, "loss": 0.1107, "reward": 0.5763177648186684, "reward_std": 0.7463207244873047, "rewards/cosine_scaled_reward": 0.005420786794275045, "rewards/format_reward": 0.5654762089252472, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 2800.4940795898438, "epoch": 0.532, "grad_norm": 0.21257169544696808, "kl": 0.031768798828125, "learning_rate": 9.265439410565328e-07, "loss": 0.0654, "reward": 0.27612858824431896, "reward_std": 0.6431511640548706, "rewards/cosine_scaled_reward": -0.07026905845850706, "rewards/format_reward": 0.4166666679084301, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 2558.386962890625, "epoch": 0.536, "grad_norm": 0.42514950037002563, "kl": 0.02978515625, "learning_rate": 9.248145583195447e-07, "loss": 0.1228, "reward": 0.4270520806312561, "reward_std": 0.7729989290237427, "rewards/cosine_scaled_reward": -0.02159299748018384, "rewards/format_reward": 0.4702381044626236, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2229.363067626953, "epoch": 0.54, "grad_norm": 0.33076903223991394, "kl": 0.03668212890625, "learning_rate": 9.230669076497687e-07, "loss": 0.1019, "reward": 0.28207028564065695, "reward_std": 0.6516975909471512, "rewards/cosine_scaled_reward": -0.10301248356699944, "rewards/format_reward": 0.4880952388048172, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 2353.029815673828, "epoch": 0.544, "grad_norm": 0.3519177734851837, "kl": 0.035308837890625, "learning_rate": 9.213010742252327e-07, "loss": 0.0997, "reward": 0.37077474407851696, "reward_std": 0.668467104434967, "rewards/cosine_scaled_reward": -0.0675888154655695, "rewards/format_reward": 0.505952388048172, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 2052.1964721679688, "epoch": 0.548, "grad_norm": 0.23379026353359222, "kl": 0.035400390625, "learning_rate": 9.195171441101668e-07, "loss": 0.058, "reward": 0.6550269052386284, "reward_std": 0.6502309143543243, "rewards/cosine_scaled_reward": 0.009061065968126059, "rewards/format_reward": 0.6369047611951828, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2325.732177734375, "epoch": 0.552, "grad_norm": 0.19041714072227478, "kl": 0.0423583984375, "learning_rate": 9.177152042508077e-07, "loss": 0.0232, "reward": 0.561458358541131, "reward_std": 0.9615298509597778, "rewards/cosine_scaled_reward": 0.012872030027210712, "rewards/format_reward": 0.5357142947614193, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 2440.327423095703, "epoch": 0.556, "grad_norm": 0.2846536934375763, "kl": 0.0457763671875, "learning_rate": 9.158953424711624e-07, "loss": 0.0439, "reward": 0.44825945422053337, "reward_std": 0.7441610246896744, "rewards/cosine_scaled_reward": -0.022894082590937614, "rewards/format_reward": 0.4940476417541504, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 2455.071533203125, "epoch": 0.56, "grad_norm": 0.5667356252670288, "kl": 0.05535888671875, "learning_rate": 9.140576474687263e-07, "loss": 0.1203, "reward": 0.42634591602836736, "reward_std": 0.6539553329348564, "rewards/cosine_scaled_reward": 0.007815815508365631, "rewards/format_reward": 0.4107142984867096, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 2289.0416870117188, "epoch": 0.564, "grad_norm": 0.2632148861885071, "kl": 0.06378173828125, "learning_rate": 9.122022088101613e-07, "loss": 0.0588, "reward": 0.25764250196516514, "reward_std": 0.5646726861596107, "rewards/cosine_scaled_reward": -0.07355970796197653, "rewards/format_reward": 0.4047619141638279, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 2396.1011962890625, "epoch": 0.568, "grad_norm": 0.48258456587791443, "kl": 0.070556640625, "learning_rate": 9.103291169269299e-07, "loss": 0.1188, "reward": 0.3830295614898205, "reward_std": 0.7874267548322678, "rewards/cosine_scaled_reward": 0.0069909729063510895, "rewards/format_reward": 0.3690476268529892, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2465.184539794922, "epoch": 0.572, "grad_norm": 0.3696215748786926, "kl": 0.083984375, "learning_rate": 9.084384631108882e-07, "loss": 0.0378, "reward": 0.17246808065101504, "reward_std": 0.7914570420980453, "rewards/cosine_scaled_reward": -0.11614691279828548, "rewards/format_reward": 0.4047619104385376, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 2460.184600830078, "epoch": 0.576, "grad_norm": 0.30795326828956604, "kl": 0.08349609375, "learning_rate": 9.065303395098358e-07, "loss": 0.0254, "reward": 0.23997123539447784, "reward_std": 0.7133302837610245, "rewards/cosine_scaled_reward": -0.09430009685456753, "rewards/format_reward": 0.4285714365541935, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 2376.089324951172, "epoch": 0.58, "grad_norm": 0.5491130352020264, "kl": 0.0899658203125, "learning_rate": 9.046048391230247e-07, "loss": 0.0823, "reward": 0.4339366629719734, "reward_std": 0.7774848788976669, "rewards/cosine_scaled_reward": 0.005658812588080764, "rewards/format_reward": 0.4226190559566021, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2340.136962890625, "epoch": 0.584, "grad_norm": 0.3470991551876068, "kl": 0.1185302734375, "learning_rate": 9.026620557966279e-07, "loss": 0.0431, "reward": 0.31324461475014687, "reward_std": 0.7800580561161041, "rewards/cosine_scaled_reward": -0.10528245754539967, "rewards/format_reward": 0.5238095298409462, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 2512.7262573242188, "epoch": 0.588, "grad_norm": 0.31661325693130493, "kl": 0.1114501953125, "learning_rate": 9.007020842191634e-07, "loss": 0.0189, "reward": 0.2997382581233978, "reward_std": 0.8120257556438446, "rewards/cosine_scaled_reward": -0.06144038587808609, "rewards/format_reward": 0.4226190485060215, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 2311.8035888671875, "epoch": 0.592, "grad_norm": 0.9110273718833923, "kl": 0.1287841796875, "learning_rate": 8.987250199168808e-07, "loss": 0.0875, "reward": 0.29848775546997786, "reward_std": 0.7376701682806015, "rewards/cosine_scaled_reward": -0.06206565350294113, "rewards/format_reward": 0.4226190596818924, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 2368.2500915527344, "epoch": 0.596, "grad_norm": 0.5145498514175415, "kl": 0.142333984375, "learning_rate": 8.967309592491052e-07, "loss": -0.0091, "reward": 0.1381237395107746, "reward_std": 0.7537627294659615, "rewards/cosine_scaled_reward": -0.12736669927835464, "rewards/format_reward": 0.3928571492433548, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 2541.4226684570312, "epoch": 0.6, "grad_norm": 0.4558282792568207, "kl": 0.1424560546875, "learning_rate": 8.9471999940354e-07, "loss": 0.0545, "reward": 0.517300067236647, "reward_std": 0.8328704386949539, "rewards/cosine_scaled_reward": 0.014602408395148814, "rewards/format_reward": 0.4880952388048172, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 2635.4226989746094, "epoch": 0.604, "grad_norm": 0.3748377859592438, "kl": 0.18310546875, "learning_rate": 8.926922383915315e-07, "loss": 0.0372, "reward": 0.09937155619263649, "reward_std": 0.7007840871810913, "rewards/cosine_scaled_reward": -0.1259094497654587, "rewards/format_reward": 0.3511904776096344, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 2366.8036499023438, "epoch": 0.608, "grad_norm": 0.7343178391456604, "kl": 0.196533203125, "learning_rate": 8.906477750432903e-07, "loss": 0.1069, "reward": 0.6113147716969252, "reward_std": 0.8982192724943161, "rewards/cosine_scaled_reward": 0.028871658723801374, "rewards/format_reward": 0.5535714402794838, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 2539.6726684570312, "epoch": 0.612, "grad_norm": 0.3661244213581085, "kl": 0.23095703125, "learning_rate": 8.88586709003076e-07, "loss": 0.0493, "reward": 0.3096798346377909, "reward_std": 0.6143878847360611, "rewards/cosine_scaled_reward": -0.07432675641030073, "rewards/format_reward": 0.4583333432674408, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 2693.077392578125, "epoch": 0.616, "grad_norm": 0.39779341220855713, "kl": 0.26123046875, "learning_rate": 8.865091407243394e-07, "loss": 0.0484, "reward": 0.20376494899392128, "reward_std": 0.7454717755317688, "rewards/cosine_scaled_reward": -0.0856175352819264, "rewards/format_reward": 0.3750000074505806, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 2522.8333740234375, "epoch": 0.62, "grad_norm": 0.7077339291572571, "kl": 0.275634765625, "learning_rate": 8.844151714648274e-07, "loss": 0.1048, "reward": 0.28493453562259674, "reward_std": 0.7751601040363312, "rewards/cosine_scaled_reward": -0.050985115580260754, "rewards/format_reward": 0.3869047649204731, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 2791.2083740234375, "epoch": 0.624, "grad_norm": 0.6277625560760498, "kl": 0.3359375, "learning_rate": 8.823049032816478e-07, "loss": 0.063, "reward": 0.15741928666830063, "reward_std": 0.7891719415783882, "rewards/cosine_scaled_reward": -0.1058141621761024, "rewards/format_reward": 0.3690476268529892, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2756.0596313476562, "epoch": 0.628, "grad_norm": 0.9464259147644043, "kl": 0.35107421875, "learning_rate": 8.801784390262943e-07, "loss": 0.1337, "reward": 0.20047340355813503, "reward_std": 0.7717511355876923, "rewards/cosine_scaled_reward": -0.10214426182210445, "rewards/format_reward": 0.4047619178891182, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 2546.386993408203, "epoch": 0.632, "grad_norm": 0.9672547578811646, "kl": 0.3583984375, "learning_rate": 8.780358823396352e-07, "loss": 0.1309, "reward": 0.3896455895155668, "reward_std": 0.8362017869949341, "rewards/cosine_scaled_reward": -0.025415319949388504, "rewards/format_reward": 0.4404761902987957, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 2490.3869018554688, "epoch": 0.636, "grad_norm": 0.5016924738883972, "kl": 0.37744140625, "learning_rate": 8.758773376468604e-07, "loss": 0.0548, "reward": 0.3971053212881088, "reward_std": 0.6817308068275452, "rewards/cosine_scaled_reward": -0.07823306252248585, "rewards/format_reward": 0.5535714328289032, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 2715.90478515625, "epoch": 0.64, "grad_norm": 0.776878833770752, "kl": 0.4169921875, "learning_rate": 8.737029101523929e-07, "loss": 0.1414, "reward": 0.37737663462758064, "reward_std": 0.8348212540149689, "rewards/cosine_scaled_reward": -0.03452597954310477, "rewards/format_reward": 0.4464285746216774, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 2636.0357666015625, "epoch": 0.644, "grad_norm": 1.2749825716018677, "kl": 0.48486328125, "learning_rate": 8.715127058347614e-07, "loss": 0.1445, "reward": 0.24750607460737228, "reward_std": 0.7917188853025436, "rewards/cosine_scaled_reward": -0.10541364271193743, "rewards/format_reward": 0.4583333469927311, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2622.9405517578125, "epoch": 0.648, "grad_norm": 1.3737562894821167, "kl": 0.5888671875, "learning_rate": 8.693068314414344e-07, "loss": 0.1549, "reward": 0.10282446062774397, "reward_std": 0.6833581179380417, "rewards/cosine_scaled_reward": -0.18073063343763351, "rewards/format_reward": 0.4642857275903225, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 2187.9286193847656, "epoch": 0.652, "grad_norm": 1.2476062774658203, "kl": 0.64453125, "learning_rate": 8.670853944836176e-07, "loss": 0.1442, "reward": 0.5821249708533287, "reward_std": 0.8525291532278061, "rewards/cosine_scaled_reward": -0.0303660926874727, "rewards/format_reward": 0.6428571492433548, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 2645.2202758789062, "epoch": 0.656, "grad_norm": 0.8759295344352722, "kl": 0.83203125, "learning_rate": 8.648485032310144e-07, "loss": 0.1369, "reward": 0.354750145226717, "reward_std": 0.6708278656005859, "rewards/cosine_scaled_reward": -0.09941063448786736, "rewards/format_reward": 0.5535714328289032, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 2744.5952758789062, "epoch": 0.66, "grad_norm": 1.443908452987671, "kl": 0.9365234375, "learning_rate": 8.625962667065487e-07, "loss": 0.1514, "reward": 0.07671361323446035, "reward_std": 0.7401341199874878, "rewards/cosine_scaled_reward": -0.16997654270380735, "rewards/format_reward": 0.4166666679084301, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 2762.2381591796875, "epoch": 0.664, "grad_norm": 2.171701192855835, "kl": 1.064453125, "learning_rate": 8.603287946810513e-07, "loss": 0.0493, "reward": 0.3810354620218277, "reward_std": 0.6359066590666771, "rewards/cosine_scaled_reward": -0.05650608614087105, "rewards/format_reward": 0.4940476231276989, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 2410.6726684570312, "epoch": 0.668, "grad_norm": 1.1915135383605957, "kl": 0.9716796875, "learning_rate": 8.580461976679099e-07, "loss": 0.1178, "reward": 0.5956609398126602, "reward_std": 0.7429262697696686, "rewards/cosine_scaled_reward": -0.011693337932229042, "rewards/format_reward": 0.6190476268529892, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 2624.7083740234375, "epoch": 0.672, "grad_norm": 1.2750567197799683, "kl": 1.111328125, "learning_rate": 8.557485869176825e-07, "loss": 0.1676, "reward": 0.35937594436109066, "reward_std": 0.7485721707344055, "rewards/cosine_scaled_reward": -0.0613834522664547, "rewards/format_reward": 0.482142873108387, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 2566.7857666015625, "epoch": 0.676, "grad_norm": 0.8985515832901001, "kl": 1.0439453125, "learning_rate": 8.534360744126753e-07, "loss": 0.1232, "reward": 0.23157138470560312, "reward_std": 0.6288014650344849, "rewards/cosine_scaled_reward": -0.14314288273453712, "rewards/format_reward": 0.5178571492433548, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 2820.5059814453125, "epoch": 0.68, "grad_norm": 1.1454522609710693, "kl": 0.9677734375, "learning_rate": 8.511087728614862e-07, "loss": 0.1412, "reward": 0.08721911488100886, "reward_std": 0.6948041319847107, "rewards/cosine_scaled_reward": -0.16769996285438538, "rewards/format_reward": 0.4226190522313118, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 2376.166748046875, "epoch": 0.684, "grad_norm": 0.9355194568634033, "kl": 0.9521484375, "learning_rate": 8.487667956935087e-07, "loss": 0.128, "reward": 0.41750996466726065, "reward_std": 0.7085302621126175, "rewards/cosine_scaled_reward": -0.05910217575728893, "rewards/format_reward": 0.535714291036129, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 2571.5000610351562, "epoch": 0.688, "grad_norm": 0.9496890902519226, "kl": 1.0341796875, "learning_rate": 8.464102570534061e-07, "loss": 0.147, "reward": 0.21527537889778614, "reward_std": 0.6487467139959335, "rewards/cosine_scaled_reward": -0.2048623152077198, "rewards/format_reward": 0.6250000149011612, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 2577.839324951172, "epoch": 0.692, "grad_norm": 1.125106692314148, "kl": 1.005859375, "learning_rate": 8.440392717955475e-07, "loss": 0.1126, "reward": 0.29065654147416353, "reward_std": 0.5777322202920914, "rewards/cosine_scaled_reward": -0.11955267190933228, "rewards/format_reward": 0.5297619178891182, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 2297.0416870117188, "epoch": 0.696, "grad_norm": 1.5477794408798218, "kl": 0.9482421875, "learning_rate": 8.416539554784089e-07, "loss": 0.0866, "reward": 0.35003964975476265, "reward_std": 0.7120198756456375, "rewards/cosine_scaled_reward": -0.13152779638767242, "rewards/format_reward": 0.6130952388048172, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 2239.952423095703, "epoch": 0.7, "grad_norm": 1.1404165029525757, "kl": 0.7734375, "learning_rate": 8.392544243589427e-07, "loss": 0.1326, "reward": 0.7693988904356956, "reward_std": 0.8029063045978546, "rewards/cosine_scaled_reward": 0.0543422931805253, "rewards/format_reward": 0.6607142984867096, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 2214.148895263672, "epoch": 0.704, "grad_norm": 0.976016104221344, "kl": 0.8193359375, "learning_rate": 8.368407953869103e-07, "loss": 0.1005, "reward": 0.5222894381731749, "reward_std": 0.6858630776405334, "rewards/cosine_scaled_reward": -0.07814099243842065, "rewards/format_reward": 0.6785714477300644, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 2167.4345092773438, "epoch": 0.708, "grad_norm": 1.6724809408187866, "kl": 0.740234375, "learning_rate": 8.344131861991828e-07, "loss": 0.1424, "reward": 0.3468378521502018, "reward_std": 0.6407709717750549, "rewards/cosine_scaled_reward": -0.16289059445261955, "rewards/format_reward": 0.6726190596818924, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 2593.5654907226562, "epoch": 0.712, "grad_norm": 1.3712421655654907, "kl": 0.9814453125, "learning_rate": 8.319717151140072e-07, "loss": 0.1121, "reward": 0.27433447539806366, "reward_std": 0.6857093423604965, "rewards/cosine_scaled_reward": -0.16342800296843052, "rewards/format_reward": 0.6011904925107956, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 2240.9583129882812, "epoch": 0.716, "grad_norm": 2.2109479904174805, "kl": 0.7880859375, "learning_rate": 8.295165011252396e-07, "loss": 0.0613, "reward": 0.3249462991952896, "reward_std": 0.7396285533905029, "rewards/cosine_scaled_reward": -0.12919352855533361, "rewards/format_reward": 0.5833333358168602, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 2391.261962890625, "epoch": 0.72, "grad_norm": 0.9252892136573792, "kl": 0.8369140625, "learning_rate": 8.270476638965461e-07, "loss": 0.0766, "reward": 0.37066294252872467, "reward_std": 0.5772489011287689, "rewards/cosine_scaled_reward": -0.15395426377654076, "rewards/format_reward": 0.6785714477300644, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 2188.21435546875, "epoch": 0.724, "grad_norm": 1.4679890871047974, "kl": 0.7177734375, "learning_rate": 8.245653237555705e-07, "loss": 0.1271, "reward": 0.47163213789463043, "reward_std": 0.7110278159379959, "rewards/cosine_scaled_reward": -0.10942202992737293, "rewards/format_reward": 0.6904762089252472, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 2330.3572387695312, "epoch": 0.728, "grad_norm": 0.8398174047470093, "kl": 0.71875, "learning_rate": 8.220696016880687e-07, "loss": 0.0837, "reward": 0.5167603380978107, "reward_std": 0.704664558172226, "rewards/cosine_scaled_reward": -0.08090554922819138, "rewards/format_reward": 0.6785714402794838, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 2319.154815673828, "epoch": 0.732, "grad_norm": 1.0028657913208008, "kl": 0.7421875, "learning_rate": 8.195606193320136e-07, "loss": 0.1069, "reward": 0.6520561873912811, "reward_std": 0.8034340292215347, "rewards/cosine_scaled_reward": -0.04301954247057438, "rewards/format_reward": 0.7380952537059784, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 2419.6131591796875, "epoch": 0.736, "grad_norm": 0.9799902439117432, "kl": 0.794921875, "learning_rate": 8.170384989716657e-07, "loss": 0.1, "reward": 0.5134465768933296, "reward_std": 0.7416307479143143, "rewards/cosine_scaled_reward": -0.10637196339666843, "rewards/format_reward": 0.7261904776096344, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2279.3630981445312, "epoch": 0.74, "grad_norm": 1.1403197050094604, "kl": 0.75390625, "learning_rate": 8.145033635316128e-07, "loss": 0.08, "reward": 0.5693989507853985, "reward_std": 0.6981105357408524, "rewards/cosine_scaled_reward": -0.06946719996631145, "rewards/format_reward": 0.7083333507180214, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2087.2679443359375, "epoch": 0.744, "grad_norm": 0.8785580992698669, "kl": 0.6123046875, "learning_rate": 8.119553365707802e-07, "loss": 0.0849, "reward": 0.4244233965873718, "reward_std": 0.718925341963768, "rewards/cosine_scaled_reward": -0.1449311599135399, "rewards/format_reward": 0.7142857313156128, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 2415.8036499023438, "epoch": 0.748, "grad_norm": 1.325434684753418, "kl": 0.6298828125, "learning_rate": 8.093945422764069e-07, "loss": 0.0477, "reward": 0.594460990279913, "reward_std": 0.7041322290897369, "rewards/cosine_scaled_reward": -0.021221883594989777, "rewards/format_reward": 0.636904776096344, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 2371.0714721679688, "epoch": 0.752, "grad_norm": 1.3853912353515625, "kl": 0.638671875, "learning_rate": 8.068211054579943e-07, "loss": 0.1131, "reward": 0.5956445932388306, "reward_std": 0.7780069708824158, "rewards/cosine_scaled_reward": -0.062296761316247284, "rewards/format_reward": 0.7202381044626236, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 2243.3333740234375, "epoch": 0.756, "grad_norm": 0.7066504955291748, "kl": 0.564453125, "learning_rate": 8.04235151541222e-07, "loss": 0.043, "reward": 0.7391829118132591, "reward_std": 0.6626263409852982, "rewards/cosine_scaled_reward": -0.014337139204144478, "rewards/format_reward": 0.767857164144516, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 2111.875030517578, "epoch": 0.76, "grad_norm": 1.1808303594589233, "kl": 0.5361328125, "learning_rate": 8.01636806561836e-07, "loss": 0.0212, "reward": 0.6303885579109192, "reward_std": 0.7266089022159576, "rewards/cosine_scaled_reward": -0.04790095146745443, "rewards/format_reward": 0.7261904925107956, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2512.1607666015625, "epoch": 0.764, "grad_norm": 1.169936180114746, "kl": 0.54736328125, "learning_rate": 7.990261971595048e-07, "loss": 0.0239, "reward": 0.4208872392773628, "reward_std": 0.6789906620979309, "rewards/cosine_scaled_reward": -0.12288972595706582, "rewards/format_reward": 0.6666666716337204, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 2421.7381591796875, "epoch": 0.768, "grad_norm": 1.9125944375991821, "kl": 0.44970703125, "learning_rate": 7.964034505716476e-07, "loss": 0.162, "reward": 0.6703099310398102, "reward_std": 0.7079124301671982, "rewards/cosine_scaled_reward": 0.022654948756098747, "rewards/format_reward": 0.625, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 2342.3333435058594, "epoch": 0.772, "grad_norm": 1.1848394870758057, "kl": 0.4287109375, "learning_rate": 7.93768694627233e-07, "loss": 0.1217, "reward": 0.3946942985057831, "reward_std": 0.7293716818094254, "rewards/cosine_scaled_reward": -0.14789094775915146, "rewards/format_reward": 0.6904762089252472, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 2488.1786193847656, "epoch": 0.776, "grad_norm": 0.8427687883377075, "kl": 0.40673828125, "learning_rate": 7.911220577405484e-07, "loss": 0.0681, "reward": 0.33857931289821863, "reward_std": 0.7693478316068649, "rewards/cosine_scaled_reward": -0.11642462853342295, "rewards/format_reward": 0.5714285671710968, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 2235.4762573242188, "epoch": 0.78, "grad_norm": 1.9778449535369873, "kl": 0.4599609375, "learning_rate": 7.884636689049422e-07, "loss": 0.1203, "reward": 0.7276730462908745, "reward_std": 0.8504652380943298, "rewards/cosine_scaled_reward": 0.02455079648643732, "rewards/format_reward": 0.6785714328289032, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2252.7262268066406, "epoch": 0.784, "grad_norm": 1.251224398612976, "kl": 0.49169921875, "learning_rate": 7.857936576865356e-07, "loss": 0.0753, "reward": 0.6360676661133766, "reward_std": 0.8185366541147232, "rewards/cosine_scaled_reward": -0.01827568793669343, "rewards/format_reward": 0.6726190596818924, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 2399.7500610351562, "epoch": 0.788, "grad_norm": 0.9470409154891968, "kl": 0.517578125, "learning_rate": 7.831121542179086e-07, "loss": 0.1036, "reward": 0.550631508231163, "reward_std": 0.7208298593759537, "rewards/cosine_scaled_reward": -0.037184251472353935, "rewards/format_reward": 0.625, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2269.5000610351562, "epoch": 0.792, "grad_norm": 2.047698974609375, "kl": 0.595703125, "learning_rate": 7.804192891917571e-07, "loss": 0.1831, "reward": 0.29151881486177444, "reward_std": 0.666583925485611, "rewards/cosine_scaled_reward": -0.15483582392334938, "rewards/format_reward": 0.601190485060215, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 2316.7857666015625, "epoch": 0.796, "grad_norm": 1.6713296175003052, "kl": 0.60205078125, "learning_rate": 7.777151938545235e-07, "loss": 0.1356, "reward": 0.5018086154013872, "reward_std": 0.8012387007474899, "rewards/cosine_scaled_reward": -0.0615957040572539, "rewards/format_reward": 0.6250000223517418, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 2471.1964721679688, "epoch": 0.8, "grad_norm": 0.9633775949478149, "kl": 0.740234375, "learning_rate": 7.75e-07, "loss": 0.1277, "reward": 0.3792301341891289, "reward_std": 0.76199010014534, "rewards/cosine_scaled_reward": -0.11693255044519901, "rewards/format_reward": 0.6130952537059784, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 2280.6607971191406, "epoch": 0.804, "grad_norm": 1.1369765996932983, "kl": 0.7587890625, "learning_rate": 7.72273839962904e-07, "loss": 0.1174, "reward": 0.4361310079693794, "reward_std": 0.7977508455514908, "rewards/cosine_scaled_reward": -0.0736011671833694, "rewards/format_reward": 0.5833333432674408, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 2239.3809814453125, "epoch": 0.808, "grad_norm": 1.1852681636810303, "kl": 0.80078125, "learning_rate": 7.695368466124296e-07, "loss": 0.1861, "reward": 0.4273875653743744, "reward_std": 0.7939650565385818, "rewards/cosine_scaled_reward": -0.08987765479832888, "rewards/format_reward": 0.6071428656578064, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 2270.9524536132812, "epoch": 0.812, "grad_norm": 2.2510244846343994, "kl": 1.05859375, "learning_rate": 7.667891533457718e-07, "loss": 0.1778, "reward": 0.5268369093537331, "reward_std": 0.7606751769781113, "rewards/cosine_scaled_reward": -0.05205773119814694, "rewards/format_reward": 0.630952388048172, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 2307.83935546875, "epoch": 0.816, "grad_norm": 3.0754034519195557, "kl": 1.107421875, "learning_rate": 7.640308940816239e-07, "loss": 0.1251, "reward": 0.046380717772990465, "reward_std": 0.6517826318740845, "rewards/cosine_scaled_reward": -0.23573821783065796, "rewards/format_reward": 0.5178571566939354, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 2065.482177734375, "epoch": 0.82, "grad_norm": 3.317054033279419, "kl": 0.8037109375, "learning_rate": 7.612622032536507e-07, "loss": 0.1229, "reward": 0.629617914557457, "reward_std": 0.7360707223415375, "rewards/cosine_scaled_reward": -0.012572012841701508, "rewards/format_reward": 0.654761902987957, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 2119.2679443359375, "epoch": 0.824, "grad_norm": 1.985148310661316, "kl": 0.70849609375, "learning_rate": 7.584832158039378e-07, "loss": 0.1697, "reward": 0.4503296762704849, "reward_std": 0.7717154771089554, "rewards/cosine_scaled_reward": -0.07840658072382212, "rewards/format_reward": 0.6071428656578064, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 2085.839324951172, "epoch": 0.828, "grad_norm": 2.4033172130584717, "kl": 0.6025390625, "learning_rate": 7.556940671764124e-07, "loss": 0.1578, "reward": 0.4145805863663554, "reward_std": 0.7361099421977997, "rewards/cosine_scaled_reward": -0.11116209626197815, "rewards/format_reward": 0.6369047611951828, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1872.482177734375, "epoch": 0.832, "grad_norm": 2.11576247215271, "kl": 0.408203125, "learning_rate": 7.528948933102438e-07, "loss": 0.0839, "reward": 0.4670650511980057, "reward_std": 0.7250475585460663, "rewards/cosine_scaled_reward": -0.10872937482781708, "rewards/format_reward": 0.6845238208770752, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2085.6786499023438, "epoch": 0.836, "grad_norm": 0.8786793351173401, "kl": 0.51806640625, "learning_rate": 7.500858306332172e-07, "loss": 0.0649, "reward": 0.46545055881142616, "reward_std": 0.6805593073368073, "rewards/cosine_scaled_reward": -0.09465568419545889, "rewards/format_reward": 0.6547619104385376, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 2425.7916870117188, "epoch": 0.84, "grad_norm": 1.3337445259094238, "kl": 0.58837890625, "learning_rate": 7.472670160550848e-07, "loss": 0.1561, "reward": 0.4684627018868923, "reward_std": 0.824245348572731, "rewards/cosine_scaled_reward": -0.04553056287113577, "rewards/format_reward": 0.5595238208770752, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 2630.5655517578125, "epoch": 0.844, "grad_norm": 1.3039979934692383, "kl": 0.732421875, "learning_rate": 7.444385869608921e-07, "loss": 0.1559, "reward": 0.1796425711363554, "reward_std": 0.6979469060897827, "rewards/cosine_scaled_reward": -0.17803586274385452, "rewards/format_reward": 0.5357143059372902, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1983.9762268066406, "epoch": 0.848, "grad_norm": 0.9129418134689331, "kl": 0.546875, "learning_rate": 7.416006812042827e-07, "loss": 0.1352, "reward": 0.4564796891063452, "reward_std": 0.6133182421326637, "rewards/cosine_scaled_reward": -0.11402205377817154, "rewards/format_reward": 0.6845238208770752, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2315.5119018554688, "epoch": 0.852, "grad_norm": 1.2220977544784546, "kl": 0.5751953125, "learning_rate": 7.387534371007797e-07, "loss": 0.1683, "reward": 0.6708191484212875, "reward_std": 0.9547160714864731, "rewards/cosine_scaled_reward": 0.016957183834165335, "rewards/format_reward": 0.636904776096344, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2233.0655212402344, "epoch": 0.856, "grad_norm": 0.7978451251983643, "kl": 0.61474609375, "learning_rate": 7.358969934210438e-07, "loss": 0.1304, "reward": 0.40765415877103806, "reward_std": 0.7158278822898865, "rewards/cosine_scaled_reward": -0.12057768838712946, "rewards/format_reward": 0.6488095372915268, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 2194.791748046875, "epoch": 0.86, "grad_norm": 1.0176509618759155, "kl": 0.61181640625, "learning_rate": 7.330314893841101e-07, "loss": 0.0991, "reward": 0.5912733934819698, "reward_std": 0.6540912538766861, "rewards/cosine_scaled_reward": -0.013887112960219383, "rewards/format_reward": 0.6190476194024086, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 2189.5535888671875, "epoch": 0.864, "grad_norm": 0.7862021923065186, "kl": 0.6572265625, "learning_rate": 7.301570646506027e-07, "loss": 0.1369, "reward": 0.4810000769793987, "reward_std": 0.6697472035884857, "rewards/cosine_scaled_reward": -0.09283328615128994, "rewards/format_reward": 0.6666666716337204, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 2238.6487731933594, "epoch": 0.868, "grad_norm": 0.675116240978241, "kl": 0.662109375, "learning_rate": 7.27273859315928e-07, "loss": 0.1234, "reward": 0.3597661480307579, "reward_std": 0.6638298779726028, "rewards/cosine_scaled_reward": -0.13559313118457794, "rewards/format_reward": 0.6309523731470108, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2330.7560424804688, "epoch": 0.872, "grad_norm": 0.7294526696205139, "kl": 0.6875, "learning_rate": 7.243820139034464e-07, "loss": 0.1182, "reward": 0.5070892386138439, "reward_std": 0.770987793803215, "rewards/cosine_scaled_reward": -0.029193488880991936, "rewards/format_reward": 0.565476194024086, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 2425.4048461914062, "epoch": 0.876, "grad_norm": 0.9955194592475891, "kl": 0.76171875, "learning_rate": 7.214816693576234e-07, "loss": 0.145, "reward": 0.3850390911102295, "reward_std": 0.72886823117733, "rewards/cosine_scaled_reward": -0.11700426135212183, "rewards/format_reward": 0.6190476417541504, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 2443.869110107422, "epoch": 0.88, "grad_norm": 0.8245673179626465, "kl": 0.7412109375, "learning_rate": 7.185729670371604e-07, "loss": 0.1517, "reward": 0.3367026010528207, "reward_std": 0.6719767898321152, "rewards/cosine_scaled_reward": -0.1471248921006918, "rewards/format_reward": 0.630952388048172, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 2401.65478515625, "epoch": 0.884, "grad_norm": 0.6434879302978516, "kl": 0.552734375, "learning_rate": 7.156560487081051e-07, "loss": 0.0964, "reward": 0.5589644331485033, "reward_std": 0.6387112140655518, "rewards/cosine_scaled_reward": -0.030041599762625992, "rewards/format_reward": 0.619047611951828, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 2159.9107666015625, "epoch": 0.888, "grad_norm": 0.8747764229774475, "kl": 0.48974609375, "learning_rate": 7.127310565369415e-07, "loss": 0.0648, "reward": 1.0575831979513168, "reward_std": 0.8345089554786682, "rewards/cosine_scaled_reward": 0.15379157848656178, "rewards/format_reward": 0.7500000149011612, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 2736.8095703125, "epoch": 0.892, "grad_norm": 1.644534707069397, "kl": 0.650390625, "learning_rate": 7.097981330836616e-07, "loss": 0.0898, "reward": 0.28782752249389887, "reward_std": 0.6842672526836395, "rewards/cosine_scaled_reward": -0.11799101112410426, "rewards/format_reward": 0.5238095298409462, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 2585.4464721679688, "epoch": 0.896, "grad_norm": 0.5411848425865173, "kl": 0.603515625, "learning_rate": 7.068574212948169e-07, "loss": 0.1243, "reward": 0.49723897874355316, "reward_std": 0.810086615383625, "rewards/cosine_scaled_reward": -0.07280909270048141, "rewards/format_reward": 0.6428571492433548, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 2394.8988647460938, "epoch": 0.9, "grad_norm": 0.7165555357933044, "kl": 0.52783203125, "learning_rate": 7.039090644965509e-07, "loss": 0.0888, "reward": 0.5129196643829346, "reward_std": 0.787805512547493, "rewards/cosine_scaled_reward": -0.056040180614218116, "rewards/format_reward": 0.6250000298023224, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2482.7678833007812, "epoch": 0.904, "grad_norm": 0.5211958289146423, "kl": 0.51416015625, "learning_rate": 7.009532063876148e-07, "loss": 0.0812, "reward": 0.4906727410852909, "reward_std": 0.7880082875490189, "rewards/cosine_scaled_reward": -0.0641874436987564, "rewards/format_reward": 0.6190476417541504, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2290.0000610351562, "epoch": 0.908, "grad_norm": 0.5630519986152649, "kl": 0.382568359375, "learning_rate": 6.979899910323624e-07, "loss": 0.1034, "reward": 0.6861637309193611, "reward_std": 0.7359699308872223, "rewards/cosine_scaled_reward": -0.049775293562561274, "rewards/format_reward": 0.7857142984867096, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 2686.7440795898438, "epoch": 0.912, "grad_norm": 0.6647688746452332, "kl": 0.4326171875, "learning_rate": 6.950195628537299e-07, "loss": 0.0594, "reward": 0.5352285588160157, "reward_std": 0.7634364515542984, "rewards/cosine_scaled_reward": -0.047861908678896725, "rewards/format_reward": 0.6309524029493332, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2687.5416564941406, "epoch": 0.916, "grad_norm": 0.37424567341804504, "kl": 0.39208984375, "learning_rate": 6.920420666261961e-07, "loss": 0.0465, "reward": 0.43462158273905516, "reward_std": 0.6648337990045547, "rewards/cosine_scaled_reward": -0.1070939814671874, "rewards/format_reward": 0.6488095298409462, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 2462.8452758789062, "epoch": 0.92, "grad_norm": 0.52641361951828, "kl": 0.37158203125, "learning_rate": 6.890576474687263e-07, "loss": 0.1061, "reward": 0.5536616146564484, "reward_std": 0.6706894189119339, "rewards/cosine_scaled_reward": -0.06840727850794792, "rewards/format_reward": 0.690476194024086, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 2395.279754638672, "epoch": 0.924, "grad_norm": 0.5165700912475586, "kl": 0.369140625, "learning_rate": 6.860664508377001e-07, "loss": 0.0951, "reward": 0.4786584824323654, "reward_std": 0.774825245141983, "rewards/cosine_scaled_reward": -0.1267421804368496, "rewards/format_reward": 0.7321428656578064, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2468.2857666015625, "epoch": 0.928, "grad_norm": 0.4581441879272461, "kl": 0.31591796875, "learning_rate": 6.83068622519821e-07, "loss": 0.0555, "reward": 0.6299031171947718, "reward_std": 0.7808382511138916, "rewards/cosine_scaled_reward": -0.045167478267103434, "rewards/format_reward": 0.7202381044626236, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 2712.58935546875, "epoch": 0.932, "grad_norm": 0.7744795083999634, "kl": 0.333984375, "learning_rate": 6.800643086250121e-07, "loss": 0.0623, "reward": 0.6912369206547737, "reward_std": 0.7789230197668076, "rewards/cosine_scaled_reward": 0.04204704426229, "rewards/format_reward": 0.607142873108387, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 2556.619110107422, "epoch": 0.936, "grad_norm": 0.8385416865348816, "kl": 0.38427734375, "learning_rate": 6.770536555792944e-07, "loss": 0.0662, "reward": 0.4830031730234623, "reward_std": 0.7291474640369415, "rewards/cosine_scaled_reward": -0.07397460378706455, "rewards/format_reward": 0.6309523731470108, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 2724.8809814453125, "epoch": 0.94, "grad_norm": 0.6943939328193665, "kl": 0.330078125, "learning_rate": 6.740368101176495e-07, "loss": 0.1008, "reward": 0.38701344281435013, "reward_std": 0.7834271490573883, "rewards/cosine_scaled_reward": -0.11304090730845928, "rewards/format_reward": 0.6130952388048172, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 2819.7738647460938, "epoch": 0.944, "grad_norm": 0.3916683495044708, "kl": 0.32177734375, "learning_rate": 6.710139192768694e-07, "loss": 0.0365, "reward": 0.5249419808387756, "reward_std": 0.8138006925582886, "rewards/cosine_scaled_reward": -0.023243289440870285, "rewards/format_reward": 0.571428582072258, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 2499.0536499023438, "epoch": 0.948, "grad_norm": 0.9175835847854614, "kl": 0.321533203125, "learning_rate": 6.679851303883891e-07, "loss": 0.1055, "reward": 0.6389507204294205, "reward_std": 0.8023868650197983, "rewards/cosine_scaled_reward": -0.04064369201660156, "rewards/format_reward": 0.7202381044626236, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 2557.0655517578125, "epoch": 0.952, "grad_norm": 0.4397272765636444, "kl": 0.30810546875, "learning_rate": 6.649505910711058e-07, "loss": 0.0869, "reward": 0.4888541977852583, "reward_std": 0.7550098150968552, "rewards/cosine_scaled_reward": -0.09783481806516647, "rewards/format_reward": 0.684523805975914, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 2600.422637939453, "epoch": 0.956, "grad_norm": 0.9344379305839539, "kl": 0.345703125, "learning_rate": 6.619104492241847e-07, "loss": 0.1329, "reward": 0.27865387313067913, "reward_std": 0.6713129729032516, "rewards/cosine_scaled_reward": -0.18210165202617645, "rewards/format_reward": 0.6428571417927742, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 2331.8928833007812, "epoch": 0.96, "grad_norm": 0.5995355248451233, "kl": 0.326904296875, "learning_rate": 6.588648530198504e-07, "loss": 0.0705, "reward": 0.7613647617399693, "reward_std": 0.8133140057325363, "rewards/cosine_scaled_reward": 0.023539513116702437, "rewards/format_reward": 0.7142857164144516, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2355.52978515625, "epoch": 0.964, "grad_norm": 0.3258729875087738, "kl": 0.307861328125, "learning_rate": 6.558139508961654e-07, "loss": 0.0608, "reward": 0.6096780672669411, "reward_std": 0.7518916502594948, "rewards/cosine_scaled_reward": -0.05230383496382274, "rewards/format_reward": 0.7142857313156128, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 2414.0834350585938, "epoch": 0.968, "grad_norm": 0.3521139919757843, "kl": 0.3212890625, "learning_rate": 6.527578915497951e-07, "loss": 0.0742, "reward": 0.6254040375351906, "reward_std": 0.8331593424081802, "rewards/cosine_scaled_reward": -0.03551226551644504, "rewards/format_reward": 0.696428582072258, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 2138.107177734375, "epoch": 0.972, "grad_norm": 0.5599615573883057, "kl": 0.33251953125, "learning_rate": 6.496968239287603e-07, "loss": 0.0315, "reward": 0.8373362571001053, "reward_std": 0.6551230400800705, "rewards/cosine_scaled_reward": 0.04366813227534294, "rewards/format_reward": 0.7500000149011612, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2277.714324951172, "epoch": 0.976, "grad_norm": 0.6147165298461914, "kl": 0.336181640625, "learning_rate": 6.466308972251785e-07, "loss": 0.1075, "reward": 0.46155789494514465, "reward_std": 0.6391154229640961, "rewards/cosine_scaled_reward": -0.14124487387016416, "rewards/format_reward": 0.7440476417541504, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 2401.232208251953, "epoch": 0.98, "grad_norm": 0.8454631567001343, "kl": 0.4814453125, "learning_rate": 6.435602608679916e-07, "loss": 0.0417, "reward": 0.5565547049045563, "reward_std": 0.6768698394298553, "rewards/cosine_scaled_reward": -0.04315121428226121, "rewards/format_reward": 0.6428571492433548, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2027.3453369140625, "epoch": 0.984, "grad_norm": 0.38341155648231506, "kl": 0.289794921875, "learning_rate": 6.404850645156841e-07, "loss": 0.0993, "reward": 0.7784423977136612, "reward_std": 0.6467820554971695, "rewards/cosine_scaled_reward": -0.009588314220309258, "rewards/format_reward": 0.7976190596818924, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 2170.5416564941406, "epoch": 0.988, "grad_norm": 0.445024311542511, "kl": 0.4326171875, "learning_rate": 6.374054580489873e-07, "loss": 0.1244, "reward": 0.6971250772476196, "reward_std": 0.7919557690620422, "rewards/cosine_scaled_reward": -0.026437478853040375, "rewards/format_reward": 0.7500000149011612, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 2060.2559814453125, "epoch": 0.992, "grad_norm": 0.49659866094589233, "kl": 0.36865234375, "learning_rate": 6.343215915635761e-07, "loss": 0.0959, "reward": 0.6287773251533508, "reward_std": 0.7386345416307449, "rewards/cosine_scaled_reward": -0.060611339285969734, "rewards/format_reward": 0.7500000149011612, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 2148.869140625, "epoch": 0.996, "grad_norm": 0.4539166986942291, "kl": 0.3701171875, "learning_rate": 6.31233615362752e-07, "loss": 0.1019, "reward": 0.4996798560023308, "reward_std": 0.6163481399416924, "rewards/cosine_scaled_reward": -0.11027912324061617, "rewards/format_reward": 0.7202381044626236, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2330.3482971191406, "epoch": 1.0, "grad_norm": 0.5344291925430298, "kl": 0.5205078125, "learning_rate": 6.281416799501187e-07, "loss": 0.0866, "reward": 0.42578159645199776, "reward_std": 0.7348527163267136, "rewards/cosine_scaled_reward": -0.08472825400531292, "rewards/format_reward": 0.5952381044626236, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2106.059539794922, "epoch": 1.004, "grad_norm": 0.5930522680282593, "kl": 0.38232421875, "learning_rate": 6.25045936022246e-07, "loss": 0.1423, "reward": 0.5456876549869776, "reward_std": 0.6847013607621193, "rewards/cosine_scaled_reward": -0.0842990386299789, "rewards/format_reward": 0.7142857164144516, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2112.4762573242188, "epoch": 1.008, "grad_norm": 0.4610899090766907, "kl": 0.39111328125, "learning_rate": 6.219465344613258e-07, "loss": 0.0641, "reward": 0.6148004308342934, "reward_std": 0.6790047585964203, "rewards/cosine_scaled_reward": -0.05867121648043394, "rewards/format_reward": 0.7321428805589676, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2289.309600830078, "epoch": 1.012, "grad_norm": 0.3950199782848358, "kl": 0.387451171875, "learning_rate": 6.188436263278172e-07, "loss": 0.1336, "reward": 0.626802071928978, "reward_std": 0.6337872818112373, "rewards/cosine_scaled_reward": -0.028860883321613073, "rewards/format_reward": 0.6845238283276558, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2465.6130981445312, "epoch": 1.016, "grad_norm": 0.6084108352661133, "kl": 0.45947265625, "learning_rate": 6.157373628530852e-07, "loss": 0.0932, "reward": 0.6250473670661449, "reward_std": 0.7445118278264999, "rewards/cosine_scaled_reward": 2.3671891540288925e-05, "rewards/format_reward": 0.6250000074505806, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2127.8988647460938, "epoch": 1.02, "grad_norm": 0.8596522212028503, "kl": 0.368896484375, "learning_rate": 6.126278954320294e-07, "loss": 0.0589, "reward": 0.4597589522600174, "reward_std": 0.710930123925209, "rewards/cosine_scaled_reward": -0.1361919562332332, "rewards/format_reward": 0.7321428656578064, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 2113.029815673828, "epoch": 1.024, "grad_norm": 0.6557802557945251, "kl": 0.39306640625, "learning_rate": 6.095153756157051e-07, "loss": 0.0808, "reward": 0.7969172149896622, "reward_std": 0.7165066450834274, "rewards/cosine_scaled_reward": 0.02643477637320757, "rewards/format_reward": 0.7440476268529892, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 2350.4881591796875, "epoch": 1.028, "grad_norm": 0.7259902954101562, "kl": 0.37548828125, "learning_rate": 6.06399955103937e-07, "loss": 0.0556, "reward": 0.6144686937332153, "reward_std": 0.7161982655525208, "rewards/cosine_scaled_reward": -0.0052656568586826324, "rewards/format_reward": 0.6250000149011612, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 2659.2500610351562, "epoch": 1.032, "grad_norm": 0.6974296569824219, "kl": 0.4482421875, "learning_rate": 6.032817857379256e-07, "loss": 0.1425, "reward": 0.38613639771938324, "reward_std": 0.7526693046092987, "rewards/cosine_scaled_reward": -0.10455084778368473, "rewards/format_reward": 0.595238097012043, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2219.6964721679688, "epoch": 1.036, "grad_norm": 0.5528798699378967, "kl": 0.33984375, "learning_rate": 6.001610194928464e-07, "loss": 0.1191, "reward": 0.6971464306116104, "reward_std": 0.7383679300546646, "rewards/cosine_scaled_reward": -0.023450596883776598, "rewards/format_reward": 0.744047611951828, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 2010.1726989746094, "epoch": 1.04, "grad_norm": 0.36631372570991516, "kl": 0.30126953125, "learning_rate": 5.97037808470444e-07, "loss": 0.0699, "reward": 0.771461233496666, "reward_std": 0.5148339942097664, "rewards/cosine_scaled_reward": -0.01307891309261322, "rewards/format_reward": 0.7976190596818924, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 2153.559539794922, "epoch": 1.044, "grad_norm": 0.48435378074645996, "kl": 0.3251953125, "learning_rate": 5.939123048916173e-07, "loss": 0.0931, "reward": 0.5015835016965866, "reward_std": 0.69777412712574, "rewards/cosine_scaled_reward": -0.121232058852911, "rewards/format_reward": 0.7440476268529892, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 2309.1131591796875, "epoch": 1.048, "grad_norm": 0.6150787472724915, "kl": 0.37060546875, "learning_rate": 5.907846610890011e-07, "loss": 0.1074, "reward": 0.656824603676796, "reward_std": 0.7539815902709961, "rewards/cosine_scaled_reward": -0.025754368398338556, "rewards/format_reward": 0.7083333283662796, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 2073.1488647460938, "epoch": 1.052, "grad_norm": 0.5915967226028442, "kl": 0.32958984375, "learning_rate": 5.87655029499542e-07, "loss": 0.1016, "reward": 0.5839189141988754, "reward_std": 0.6906930133700371, "rewards/cosine_scaled_reward": -0.10089768993202597, "rewards/format_reward": 0.7857142984867096, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 2258.6607971191406, "epoch": 1.056, "grad_norm": 0.5032393932342529, "kl": 0.421875, "learning_rate": 5.845235626570683e-07, "loss": 0.0833, "reward": 0.7445018216967583, "reward_std": 0.7239043861627579, "rewards/cosine_scaled_reward": 0.0002271006815135479, "rewards/format_reward": 0.7440476417541504, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 2421.386962890625, "epoch": 1.06, "grad_norm": 0.5948444604873657, "kl": 0.46826171875, "learning_rate": 5.813904131848564e-07, "loss": 0.1342, "reward": 0.3432777523994446, "reward_std": 0.7306928038597107, "rewards/cosine_scaled_reward": -0.1527658887207508, "rewards/format_reward": 0.6488095223903656, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1943.8214416503906, "epoch": 1.064, "grad_norm": 0.672618567943573, "kl": 0.33251953125, "learning_rate": 5.78255733788191e-07, "loss": 0.074, "reward": 0.5523176118731499, "reward_std": 0.6472664028406143, "rewards/cosine_scaled_reward": -0.08693643007427454, "rewards/format_reward": 0.7261904776096344, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 2289.4107666015625, "epoch": 1.068, "grad_norm": 0.43480613827705383, "kl": 0.41064453125, "learning_rate": 5.751196772469237e-07, "loss": 0.116, "reward": 0.6816908866167068, "reward_std": 0.7700821459293365, "rewards/cosine_scaled_reward": -0.01034504920244217, "rewards/format_reward": 0.70238097012043, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2265.166748046875, "epoch": 1.072, "grad_norm": 0.8894410729408264, "kl": 0.37890625, "learning_rate": 5.71982396408026e-07, "loss": 0.1102, "reward": 0.5768959820270538, "reward_std": 0.7392304837703705, "rewards/cosine_scaled_reward": -0.04786152858287096, "rewards/format_reward": 0.6726190447807312, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 2102.4345092773438, "epoch": 1.076, "grad_norm": 1.40628182888031, "kl": 0.34814453125, "learning_rate": 5.688440441781398e-07, "loss": 0.1523, "reward": 0.6540864631533623, "reward_std": 0.7483679950237274, "rewards/cosine_scaled_reward": -0.030099631054326892, "rewards/format_reward": 0.7142857164144516, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1760.5893249511719, "epoch": 1.08, "grad_norm": 0.655262291431427, "kl": 0.34228515625, "learning_rate": 5.657047735161255e-07, "loss": 0.0938, "reward": 0.7075737789273262, "reward_std": 0.712226152420044, "rewards/cosine_scaled_reward": -0.045022654812783, "rewards/format_reward": 0.7976190596818924, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1989.1488037109375, "epoch": 1.084, "grad_norm": 0.5984042286872864, "kl": 0.3974609375, "learning_rate": 5.625647374256061e-07, "loss": 0.0893, "reward": 0.5623346008360386, "reward_std": 0.7052316814661026, "rewards/cosine_scaled_reward": -0.08192794572096318, "rewards/format_reward": 0.7261904925107956, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1998.327392578125, "epoch": 1.088, "grad_norm": 0.41462650895118713, "kl": 0.37939453125, "learning_rate": 5.594240889475106e-07, "loss": 0.1384, "reward": 0.6586858294904232, "reward_std": 0.8071554154157639, "rewards/cosine_scaled_reward": -0.018871376756578684, "rewards/format_reward": 0.696428582072258, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1905.9880981445312, "epoch": 1.092, "grad_norm": 1.1817877292633057, "kl": 0.4287109375, "learning_rate": 5.562829811526154e-07, "loss": 0.108, "reward": 0.585694283246994, "reward_std": 0.6987177431583405, "rewards/cosine_scaled_reward": -0.08512906730175018, "rewards/format_reward": 0.755952388048172, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1958.0892639160156, "epoch": 1.096, "grad_norm": 0.6756201982498169, "kl": 0.44580078125, "learning_rate": 5.531415671340826e-07, "loss": 0.1298, "reward": 0.5423668641597033, "reward_std": 0.5766877979040146, "rewards/cosine_scaled_reward": -0.09786419570446014, "rewards/format_reward": 0.7380952537059784, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1423.4405212402344, "epoch": 1.1, "grad_norm": 0.9936150908470154, "kl": 0.283203125, "learning_rate": 5.5e-07, "loss": 0.0068, "reward": 0.8336242958903313, "reward_std": 0.6556554213166237, "rewards/cosine_scaled_reward": -0.02366404954227619, "rewards/format_reward": 0.8809524178504944, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1477.011962890625, "epoch": 1.104, "grad_norm": 1.4654834270477295, "kl": 0.30419921875, "learning_rate": 5.468584328659172e-07, "loss": 0.1583, "reward": 0.9086148589849472, "reward_std": 0.7289283871650696, "rewards/cosine_scaled_reward": 0.0197836235165596, "rewards/format_reward": 0.8690476417541504, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1600.8333740234375, "epoch": 1.108, "grad_norm": 0.5991122126579285, "kl": 0.39697265625, "learning_rate": 5.437170188473847e-07, "loss": 0.0615, "reward": 0.6998666599392891, "reward_std": 0.6800315380096436, "rewards/cosine_scaled_reward": -0.06375712971203029, "rewards/format_reward": 0.82738097012043, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2001.3035583496094, "epoch": 1.112, "grad_norm": 0.9033568501472473, "kl": 0.4404296875, "learning_rate": 5.405759110524894e-07, "loss": 0.0566, "reward": 0.5947119817137718, "reward_std": 0.6757695525884628, "rewards/cosine_scaled_reward": -0.0806201882660389, "rewards/format_reward": 0.755952388048172, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1884.9286499023438, "epoch": 1.116, "grad_norm": 1.0505043268203735, "kl": 0.41162109375, "learning_rate": 5.37435262574394e-07, "loss": 0.0838, "reward": 0.546771340072155, "reward_std": 0.5643983408808708, "rewards/cosine_scaled_reward": -0.13137624226510525, "rewards/format_reward": 0.8095238208770752, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1721.6309814453125, "epoch": 1.12, "grad_norm": 2.6171982288360596, "kl": 0.35400390625, "learning_rate": 5.342952264838747e-07, "loss": 0.119, "reward": 0.7959851026535034, "reward_std": 0.6236628741025925, "rewards/cosine_scaled_reward": -0.03057891083881259, "rewards/format_reward": 0.8571428805589676, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1974.3809814453125, "epoch": 1.124, "grad_norm": 0.9569424390792847, "kl": 0.4814453125, "learning_rate": 5.311559558218603e-07, "loss": 0.1494, "reward": 0.573462575674057, "reward_std": 0.6640851646661758, "rewards/cosine_scaled_reward": -0.09422110859304667, "rewards/format_reward": 0.761904776096344, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1699.2142944335938, "epoch": 1.1280000000000001, "grad_norm": 0.5432654619216919, "kl": 0.33935546875, "learning_rate": 5.28017603591974e-07, "loss": 0.0877, "reward": 0.7524446099996567, "reward_std": 0.6557567343115807, "rewards/cosine_scaled_reward": -0.04342056508176029, "rewards/format_reward": 0.8392857164144516, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2023.1012573242188, "epoch": 1.1320000000000001, "grad_norm": 1.5788854360580444, "kl": 0.498046875, "learning_rate": 5.248803227530763e-07, "loss": 0.1449, "reward": 0.471544723957777, "reward_std": 0.7016247361898422, "rewards/cosine_scaled_reward": -0.14220385067164898, "rewards/format_reward": 0.755952388048172, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1751.6190795898438, "epoch": 1.1360000000000001, "grad_norm": 0.8654693365097046, "kl": 0.45654296875, "learning_rate": 5.21744266211809e-07, "loss": 0.096, "reward": 0.8401590138673782, "reward_std": 0.7027324140071869, "rewards/cosine_scaled_reward": -0.008491916581988335, "rewards/format_reward": 0.8571428805589676, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1953.1309814453125, "epoch": 1.1400000000000001, "grad_norm": 0.7724223732948303, "kl": 0.43017578125, "learning_rate": 5.186095868151436e-07, "loss": 0.1257, "reward": 0.5251086875796318, "reward_std": 0.75553198158741, "rewards/cosine_scaled_reward": -0.11244566680397838, "rewards/format_reward": 0.7500000298023224, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1932.9940795898438, "epoch": 1.144, "grad_norm": 0.6642920970916748, "kl": 0.470703125, "learning_rate": 5.154764373429315e-07, "loss": 0.096, "reward": 0.8715938031673431, "reward_std": 0.7678115516901016, "rewards/cosine_scaled_reward": 0.036987369414418936, "rewards/format_reward": 0.7976190745830536, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1784.1428527832031, "epoch": 1.148, "grad_norm": 0.9823849201202393, "kl": 0.38134765625, "learning_rate": 5.123449705004581e-07, "loss": 0.0437, "reward": 0.7326274067163467, "reward_std": 0.6021066680550575, "rewards/cosine_scaled_reward": -0.044400574173778296, "rewards/format_reward": 0.8214285969734192, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1929.0536193847656, "epoch": 1.152, "grad_norm": 2.430745840072632, "kl": 0.45458984375, "learning_rate": 5.09215338910999e-07, "loss": 0.1275, "reward": 0.76754130423069, "reward_std": 0.6635829508304596, "rewards/cosine_scaled_reward": -0.0001579252420924604, "rewards/format_reward": 0.7678571492433548, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1567.4226684570312, "epoch": 1.156, "grad_norm": 1.8522855043411255, "kl": 0.35302734375, "learning_rate": 5.060876951083828e-07, "loss": 0.0659, "reward": 0.8090793639421463, "reward_std": 0.6970714181661606, "rewards/cosine_scaled_reward": -0.02105556521564722, "rewards/format_reward": 0.8511905074119568, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1916.0298156738281, "epoch": 1.16, "grad_norm": 0.8320524096488953, "kl": 0.353515625, "learning_rate": 5.02962191529556e-07, "loss": 0.0397, "reward": 0.8147249445319176, "reward_std": 0.7559010833501816, "rewards/cosine_scaled_reward": 0.014505308354273438, "rewards/format_reward": 0.7857143133878708, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1871.5595703125, "epoch": 1.164, "grad_norm": 1.639461636543274, "kl": 0.44482421875, "learning_rate": 4.998389805071536e-07, "loss": 0.02, "reward": 0.7966814041137695, "reward_std": 0.6868171393871307, "rewards/cosine_scaled_reward": -0.03320692107081413, "rewards/format_reward": 0.8630952686071396, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1849.0714721679688, "epoch": 1.168, "grad_norm": 0.9159106016159058, "kl": 0.41357421875, "learning_rate": 4.967182142620745e-07, "loss": 0.1098, "reward": 0.8123535662889481, "reward_std": 0.7406510710716248, "rewards/cosine_scaled_reward": -0.0075137000530958176, "rewards/format_reward": 0.82738097012043, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1627.2262268066406, "epoch": 1.172, "grad_norm": 1.2907826900482178, "kl": 0.3271484375, "learning_rate": 4.93600044896063e-07, "loss": 0.028, "reward": 0.7378726750612259, "reward_std": 0.6904594451189041, "rewards/cosine_scaled_reward": -0.07153987139463425, "rewards/format_reward": 0.8809524029493332, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 2141.2202758789062, "epoch": 1.176, "grad_norm": 0.7737708687782288, "kl": 0.4482421875, "learning_rate": 4.904846243842949e-07, "loss": 0.0644, "reward": 0.7625293210148811, "reward_std": 0.7152971476316452, "rewards/cosine_scaled_reward": 0.009240844286978245, "rewards/format_reward": 0.7440476417541504, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 2187.3809814453125, "epoch": 1.18, "grad_norm": 1.1525542736053467, "kl": 0.51025390625, "learning_rate": 4.873721045679706e-07, "loss": 0.0634, "reward": 0.5901899486780167, "reward_std": 0.6728092133998871, "rewards/cosine_scaled_reward": -0.0739526596153155, "rewards/format_reward": 0.7380952537059784, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2388.577392578125, "epoch": 1.184, "grad_norm": 0.9084761738777161, "kl": 0.52734375, "learning_rate": 4.842626371469149e-07, "loss": 0.0587, "reward": 0.4302752036601305, "reward_std": 0.615352213382721, "rewards/cosine_scaled_reward": -0.12117192603182048, "rewards/format_reward": 0.6726190596818924, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1568.6786193847656, "epoch": 1.188, "grad_norm": 0.852024495601654, "kl": 0.1533203125, "learning_rate": 4.811563736721829e-07, "loss": 0.0574, "reward": 0.7380149587988853, "reward_std": 0.7155523598194122, "rewards/cosine_scaled_reward": -0.029802043922245502, "rewards/format_reward": 0.7976190596818924, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1983.4524230957031, "epoch": 1.192, "grad_norm": 0.7617373466491699, "kl": 0.303955078125, "learning_rate": 4.780534655386743e-07, "loss": 0.068, "reward": 0.7127486318349838, "reward_std": 0.7076264545321465, "rewards/cosine_scaled_reward": -0.018625682685524225, "rewards/format_reward": 0.7500000149011612, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 2038.3750305175781, "epoch": 1.196, "grad_norm": 0.8094474673271179, "kl": 0.25830078125, "learning_rate": 4.749540639777539e-07, "loss": 0.0566, "reward": 0.6301854252815247, "reward_std": 0.6336864829063416, "rewards/cosine_scaled_reward": -0.036097751930356026, "rewards/format_reward": 0.7023809552192688, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1825.9643249511719, "epoch": 1.2, "grad_norm": 1.8039993047714233, "kl": 0.225830078125, "learning_rate": 4.7185832004988133e-07, "loss": 0.0501, "reward": 0.9151953011751175, "reward_std": 0.6518659368157387, "rewards/cosine_scaled_reward": 0.03200240898877382, "rewards/format_reward": 0.8511905074119568, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1954.3809814453125, "epoch": 1.204, "grad_norm": 0.9098180532455444, "kl": 0.2685546875, "learning_rate": 4.68766384637248e-07, "loss": 0.08, "reward": 0.903901144862175, "reward_std": 0.7074443101882935, "rewards/cosine_scaled_reward": 0.059093400835990906, "rewards/format_reward": 0.7857142835855484, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 2221.6012268066406, "epoch": 1.208, "grad_norm": 0.628447949886322, "kl": 0.297119140625, "learning_rate": 4.656784084364238e-07, "loss": 0.0225, "reward": 0.7435066364705563, "reward_std": 0.7286128550767899, "rewards/cosine_scaled_reward": 0.002705696038901806, "rewards/format_reward": 0.7380952537059784, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1967.6607360839844, "epoch": 1.212, "grad_norm": 1.4870760440826416, "kl": 0.2193603515625, "learning_rate": 4.6259454195101267e-07, "loss": 0.0076, "reward": 0.6118638888001442, "reward_std": 0.6256552934646606, "rewards/cosine_scaled_reward": -0.08990138117223978, "rewards/format_reward": 0.7916666716337204, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 2091.8810119628906, "epoch": 1.216, "grad_norm": 1.0213916301727295, "kl": 0.28857421875, "learning_rate": 4.59514935484316e-07, "loss": 0.0819, "reward": 0.8858746439218521, "reward_std": 0.760543704032898, "rewards/cosine_scaled_reward": 0.04412779211997986, "rewards/format_reward": 0.7976190596818924, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 2117.5357971191406, "epoch": 1.22, "grad_norm": 1.1696289777755737, "kl": 0.266845703125, "learning_rate": 4.5643973913200837e-07, "loss": 0.0319, "reward": 0.4878672659397125, "reward_std": 0.5883132815361023, "rewards/cosine_scaled_reward": -0.13999494537711143, "rewards/format_reward": 0.7678571492433548, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1962.8512268066406, "epoch": 1.224, "grad_norm": 1.1181604862213135, "kl": 0.26171875, "learning_rate": 4.5336910277482155e-07, "loss": 0.0553, "reward": 0.8040256127715111, "reward_std": 0.7542890757322311, "rewards/cosine_scaled_reward": 0.00915566342882812, "rewards/format_reward": 0.7857143059372902, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 2503.541717529297, "epoch": 1.228, "grad_norm": 1.0075181722640991, "kl": 0.274658203125, "learning_rate": 4.503031760712397e-07, "loss": 0.0639, "reward": 0.5502185635268688, "reward_std": 0.7036140263080597, "rewards/cosine_scaled_reward": -0.043343101628124714, "rewards/format_reward": 0.6369047686457634, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2027.6905212402344, "epoch": 1.232, "grad_norm": 2.7786951065063477, "kl": 0.265380859375, "learning_rate": 4.4724210845020494e-07, "loss": 0.1529, "reward": 0.8017951250076294, "reward_std": 0.7912951856851578, "rewards/cosine_scaled_reward": 0.005064212018623948, "rewards/format_reward": 0.7916666865348816, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 2560.6845703125, "epoch": 1.236, "grad_norm": 1.6693713665008545, "kl": 0.2939453125, "learning_rate": 4.441860491038345e-07, "loss": 0.1046, "reward": 0.6068699322640896, "reward_std": 0.7445466667413712, "rewards/cosine_scaled_reward": -0.00013647368177771568, "rewards/format_reward": 0.6071428656578064, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 2221.3988037109375, "epoch": 1.24, "grad_norm": 0.7167072892189026, "kl": 0.253173828125, "learning_rate": 4.4113514698014953e-07, "loss": 0.046, "reward": 0.5108997635543346, "reward_std": 0.6983606815338135, "rewards/cosine_scaled_reward": -0.09276440553367138, "rewards/format_reward": 0.696428582072258, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 2308.1012573242188, "epoch": 1.244, "grad_norm": 1.289093255996704, "kl": 0.24072265625, "learning_rate": 4.3808955077581546e-07, "loss": 0.074, "reward": 0.49418094009160995, "reward_std": 0.6803844273090363, "rewards/cosine_scaled_reward": -0.0862428704276681, "rewards/format_reward": 0.6666666716337204, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 2221.500030517578, "epoch": 1.248, "grad_norm": 0.7747544646263123, "kl": 0.284423828125, "learning_rate": 4.350494089288943e-07, "loss": 0.0127, "reward": 0.5928547494113445, "reward_std": 0.6995180547237396, "rewards/cosine_scaled_reward": -0.0875012082979083, "rewards/format_reward": 0.7678571492433548, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2528.3750610351562, "epoch": 1.252, "grad_norm": 0.9067274928092957, "kl": 0.261962890625, "learning_rate": 4.3201486961161093e-07, "loss": 0.0588, "reward": 0.580617468804121, "reward_std": 0.7565959244966507, "rewards/cosine_scaled_reward": -0.05195318069308996, "rewards/format_reward": 0.6845238208770752, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2290.434539794922, "epoch": 1.256, "grad_norm": 1.0397149324417114, "kl": 0.2939453125, "learning_rate": 4.2898608072313045e-07, "loss": 0.0843, "reward": 0.923637330532074, "reward_std": 0.8029050081968307, "rewards/cosine_scaled_reward": 0.07491390081122518, "rewards/format_reward": 0.7738095372915268, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 2221.1785583496094, "epoch": 1.26, "grad_norm": 0.8451793789863586, "kl": 0.2978515625, "learning_rate": 4.2596318988235037e-07, "loss": 0.0794, "reward": 0.9175606220960617, "reward_std": 0.6950835883617401, "rewards/cosine_scaled_reward": 0.06592314876616001, "rewards/format_reward": 0.7857142984867096, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 2737.1131591796875, "epoch": 1.264, "grad_norm": 0.8914613723754883, "kl": 0.4072265625, "learning_rate": 4.2294634442070553e-07, "loss": 0.0266, "reward": 0.39799112919718027, "reward_std": 0.5211281925439835, "rewards/cosine_scaled_reward": -0.0896949004381895, "rewards/format_reward": 0.5773809626698494, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2356.8155517578125, "epoch": 1.268, "grad_norm": 0.8658885955810547, "kl": 0.32080078125, "learning_rate": 4.1993569137498776e-07, "loss": 0.0558, "reward": 0.4528093598783016, "reward_std": 0.5718662440776825, "rewards/cosine_scaled_reward": -0.11585722491145134, "rewards/format_reward": 0.6845238283276558, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 2067.0952758789062, "epoch": 1.272, "grad_norm": 0.6174459457397461, "kl": 0.2724609375, "learning_rate": 4.1693137748017915e-07, "loss": 0.0532, "reward": 0.9527914822101593, "reward_std": 0.7573249191045761, "rewards/cosine_scaled_reward": 0.059729063883423805, "rewards/format_reward": 0.833333358168602, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2158.3631286621094, "epoch": 1.276, "grad_norm": 0.5749858617782593, "kl": 0.2744140625, "learning_rate": 4.1393354916230005e-07, "loss": 0.0398, "reward": 0.7759583368897438, "reward_std": 0.7076128423213959, "rewards/cosine_scaled_reward": 0.00702677620574832, "rewards/format_reward": 0.761904776096344, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2376.7857666015625, "epoch": 1.28, "grad_norm": 0.4824450612068176, "kl": 0.358642578125, "learning_rate": 4.1094235253127374e-07, "loss": 0.0579, "reward": 0.5863704346120358, "reward_std": 0.69185970723629, "rewards/cosine_scaled_reward": -0.058005278930068016, "rewards/format_reward": 0.7023809552192688, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 2094.6786193847656, "epoch": 1.284, "grad_norm": 1.153307318687439, "kl": 0.32373046875, "learning_rate": 4.079579333738039e-07, "loss": 0.0147, "reward": 0.5667938031256199, "reward_std": 0.6206858605146408, "rewards/cosine_scaled_reward": -0.12434119766112417, "rewards/format_reward": 0.8154762089252472, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 2360.970245361328, "epoch": 1.288, "grad_norm": 0.7556703090667725, "kl": 0.314697265625, "learning_rate": 4.0498043714627006e-07, "loss": 0.0413, "reward": 0.54334956407547, "reward_std": 0.7112371101975441, "rewards/cosine_scaled_reward": -0.10927761369384825, "rewards/format_reward": 0.761904776096344, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 2466.9703369140625, "epoch": 1.292, "grad_norm": 0.6241899728775024, "kl": 0.345703125, "learning_rate": 4.020100089676376e-07, "loss": 0.0463, "reward": 0.5620089694857597, "reward_std": 0.6381285488605499, "rewards/cosine_scaled_reward": -0.0672098146751523, "rewards/format_reward": 0.696428582072258, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 2297.2916564941406, "epoch": 1.296, "grad_norm": 1.050784945487976, "kl": 0.289306640625, "learning_rate": 3.9904679361238526e-07, "loss": 0.095, "reward": 0.6569867879152298, "reward_std": 0.6581598520278931, "rewards/cosine_scaled_reward": -0.034601859748363495, "rewards/format_reward": 0.7261904925107956, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2391.8095703125, "epoch": 1.3, "grad_norm": 0.5910518169403076, "kl": 0.327392578125, "learning_rate": 3.9609093550344907e-07, "loss": 0.065, "reward": 0.6689947620034218, "reward_std": 0.5862837731838226, "rewards/cosine_scaled_reward": -0.0434788279235363, "rewards/format_reward": 0.7559524029493332, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2161.34521484375, "epoch": 1.304, "grad_norm": 1.3934383392333984, "kl": 0.2412109375, "learning_rate": 3.931425787051832e-07, "loss": 0.0952, "reward": 0.7927189618349075, "reward_std": 0.8861154615879059, "rewards/cosine_scaled_reward": 0.03624042624142021, "rewards/format_reward": 0.7202381044626236, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2205.75, "epoch": 1.308, "grad_norm": 0.5909004211425781, "kl": 0.276611328125, "learning_rate": 3.902018669163384e-07, "loss": 0.0265, "reward": 0.7868844717741013, "reward_std": 0.6631656885147095, "rewards/cosine_scaled_reward": 0.024394613516051322, "rewards/format_reward": 0.7380952388048172, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2526.71435546875, "epoch": 1.312, "grad_norm": 0.37658610939979553, "kl": 0.30908203125, "learning_rate": 3.872689434630585e-07, "loss": 0.0593, "reward": 0.3922804482281208, "reward_std": 0.7164648473262787, "rewards/cosine_scaled_reward": -0.11933596897870302, "rewards/format_reward": 0.6309523731470108, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 2406.6726684570312, "epoch": 1.316, "grad_norm": 0.5439748764038086, "kl": 0.28759765625, "learning_rate": 3.843439512918949e-07, "loss": 0.0395, "reward": 0.457830130122602, "reward_std": 0.6897861212491989, "rewards/cosine_scaled_reward": -0.10739446245133877, "rewards/format_reward": 0.6726190447807312, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2233.6548461914062, "epoch": 1.32, "grad_norm": 1.2243571281433105, "kl": 0.289306640625, "learning_rate": 3.8142703296283953e-07, "loss": 0.1087, "reward": 0.6516863703727722, "reward_std": 0.7036527991294861, "rewards/cosine_scaled_reward": -0.08189492486417294, "rewards/format_reward": 0.8154762089252472, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 2212.7560119628906, "epoch": 1.324, "grad_norm": 0.8144615888595581, "kl": 0.28857421875, "learning_rate": 3.785183306423767e-07, "loss": 0.0775, "reward": 0.5815620422363281, "reward_std": 0.5177476480603218, "rewards/cosine_scaled_reward": -0.042552310740575194, "rewards/format_reward": 0.6666666865348816, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 2412.0059814453125, "epoch": 1.328, "grad_norm": 0.42855292558670044, "kl": 0.3232421875, "learning_rate": 3.7561798609655373e-07, "loss": 0.0791, "reward": 0.642042949795723, "reward_std": 0.6289803832769394, "rewards/cosine_scaled_reward": -0.04207377042621374, "rewards/format_reward": 0.7261904925107956, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2163.9822387695312, "epoch": 1.332, "grad_norm": 1.0114275217056274, "kl": 0.255859375, "learning_rate": 3.72726140684072e-07, "loss": 0.088, "reward": 0.811268161451153, "reward_std": 0.6822613030672073, "rewards/cosine_scaled_reward": 0.042538831010460854, "rewards/format_reward": 0.7261904925107956, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 2329.0178833007812, "epoch": 1.336, "grad_norm": 0.7170870900154114, "kl": 0.3486328125, "learning_rate": 3.6984293534939737e-07, "loss": 0.0455, "reward": 0.8848401606082916, "reward_std": 0.7328508943319321, "rewards/cosine_scaled_reward": 0.04361054569017142, "rewards/format_reward": 0.7976190745830536, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 2709.2262573242188, "epoch": 1.34, "grad_norm": 0.47293010354042053, "kl": 0.38037109375, "learning_rate": 3.6696851061588994e-07, "loss": 0.0416, "reward": 0.3898888286203146, "reward_std": 0.6401937156915665, "rewards/cosine_scaled_reward": -0.10862701199948788, "rewards/format_reward": 0.6071428582072258, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 2530.416748046875, "epoch": 1.3439999999999999, "grad_norm": 0.4423607885837555, "kl": 0.282958984375, "learning_rate": 3.641030065789562e-07, "loss": 0.0446, "reward": 0.6725399196147919, "reward_std": 0.7871751934289932, "rewards/cosine_scaled_reward": 0.0059128133580088615, "rewards/format_reward": 0.6607142984867096, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2311.5358276367188, "epoch": 1.3479999999999999, "grad_norm": 0.5007253885269165, "kl": 0.3203125, "learning_rate": 3.612465628992203e-07, "loss": 0.0455, "reward": 0.8073793947696686, "reward_std": 0.7870100140571594, "rewards/cosine_scaled_reward": 0.010832530329935253, "rewards/format_reward": 0.7857142984867096, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2489.7500610351562, "epoch": 1.3519999999999999, "grad_norm": 0.36444640159606934, "kl": 0.305908203125, "learning_rate": 3.5839931879571725e-07, "loss": 0.0652, "reward": 0.6751855611801147, "reward_std": 0.6701688021421432, "rewards/cosine_scaled_reward": 0.001283254474401474, "rewards/format_reward": 0.6726190596818924, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 2460.8154907226562, "epoch": 1.3559999999999999, "grad_norm": 0.43892228603363037, "kl": 0.3369140625, "learning_rate": 3.555614130391079e-07, "loss": 0.0519, "reward": 0.6638183146715164, "reward_std": 0.770327016711235, "rewards/cosine_scaled_reward": 0.010480590397492051, "rewards/format_reward": 0.6428571566939354, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2244.3631896972656, "epoch": 1.3599999999999999, "grad_norm": 0.6102768778800964, "kl": 0.31201171875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0694, "reward": 0.8422182202339172, "reward_std": 0.6671302318572998, "rewards/cosine_scaled_reward": 0.04610910080373287, "rewards/format_reward": 0.7500000074505806, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 2239.75, "epoch": 1.3639999999999999, "grad_norm": 0.6582260727882385, "kl": 0.3271484375, "learning_rate": 3.4991416936678276e-07, "loss": 0.076, "reward": 0.6709855943918228, "reward_std": 0.7041856721043587, "rewards/cosine_scaled_reward": -0.03355482150800526, "rewards/format_reward": 0.7380952537059784, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 2438.3214721679688, "epoch": 1.3679999999999999, "grad_norm": 0.5521511435508728, "kl": 0.320556640625, "learning_rate": 3.471051066897562e-07, "loss": 0.047, "reward": 0.6942454129457474, "reward_std": 0.6340186148881912, "rewards/cosine_scaled_reward": 0.010813180379045662, "rewards/format_reward": 0.6726190745830536, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 2433.6488647460938, "epoch": 1.3719999999999999, "grad_norm": 0.928674042224884, "kl": 0.40478515625, "learning_rate": 3.4430593282358777e-07, "loss": 0.0328, "reward": 0.5231252759695053, "reward_std": 0.7485495656728745, "rewards/cosine_scaled_reward": -0.11641356535255909, "rewards/format_reward": 0.755952388048172, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2388.2202758789062, "epoch": 1.376, "grad_norm": 0.43529966473579407, "kl": 0.32080078125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0303, "reward": 0.7449862584471703, "reward_std": 0.6971839666366577, "rewards/cosine_scaled_reward": 0.024278827477246523, "rewards/format_reward": 0.696428582072258, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 2686.15478515625, "epoch": 1.38, "grad_norm": 0.5191164016723633, "kl": 0.36328125, "learning_rate": 3.387377967463493e-07, "loss": 0.0602, "reward": 0.4467791821807623, "reward_std": 0.6689166128635406, "rewards/cosine_scaled_reward": -0.07125327130779624, "rewards/format_reward": 0.5892857238650322, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 2138.119110107422, "epoch": 1.384, "grad_norm": 0.40859875082969666, "kl": 0.344970703125, "learning_rate": 3.359691059183761e-07, "loss": 0.0894, "reward": 0.7263324186205864, "reward_std": 0.7082626074552536, "rewards/cosine_scaled_reward": -0.029690947383642197, "rewards/format_reward": 0.7857143133878708, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2158.6429138183594, "epoch": 1.388, "grad_norm": 0.35558465123176575, "kl": 0.29638671875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0262, "reward": 0.6269577667117119, "reward_std": 0.5908889323472977, "rewards/cosine_scaled_reward": -0.04664018237963319, "rewards/format_reward": 0.7202381119132042, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 2144.619110107422, "epoch": 1.392, "grad_norm": 1.211071491241455, "kl": 0.306640625, "learning_rate": 3.3046315338757026e-07, "loss": -0.0105, "reward": 0.6653935462236404, "reward_std": 0.6245283707976341, "rewards/cosine_scaled_reward": -0.04230323247611523, "rewards/format_reward": 0.75, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 2366.4940795898438, "epoch": 1.396, "grad_norm": 0.5814414620399475, "kl": 0.33154296875, "learning_rate": 3.2772616003709616e-07, "loss": 0.0485, "reward": 0.5602632537484169, "reward_std": 0.5761818215250969, "rewards/cosine_scaled_reward": -0.0978445541113615, "rewards/format_reward": 0.755952388048172, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 2348.6607666015625, "epoch": 1.4, "grad_norm": 0.675369918346405, "kl": 0.29931640625, "learning_rate": 3.250000000000001e-07, "loss": 0.0825, "reward": 0.475093599408865, "reward_std": 0.604865163564682, "rewards/cosine_scaled_reward": -0.07792939431965351, "rewards/format_reward": 0.6309523731470108, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 2099.279815673828, "epoch": 1.404, "grad_norm": 0.5227596163749695, "kl": 0.33447265625, "learning_rate": 3.222848061454764e-07, "loss": 0.0454, "reward": 0.6502892896533012, "reward_std": 0.676431730389595, "rewards/cosine_scaled_reward": -0.05878393305465579, "rewards/format_reward": 0.7678571492433548, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2465.202392578125, "epoch": 1.408, "grad_norm": 0.4936739206314087, "kl": 0.33154296875, "learning_rate": 3.195807108082429e-07, "loss": 0.0349, "reward": 0.51472207903862, "reward_std": 0.6474315822124481, "rewards/cosine_scaled_reward": -0.05216278973966837, "rewards/format_reward": 0.6190476417541504, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 2241.089324951172, "epoch": 1.412, "grad_norm": 0.4653976857662201, "kl": 0.3046875, "learning_rate": 3.168878457820915e-07, "loss": 0.0576, "reward": 0.7246856689453125, "reward_std": 0.7023278325796127, "rewards/cosine_scaled_reward": -0.02456192229874432, "rewards/format_reward": 0.7738095223903656, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 2174.4107666015625, "epoch": 1.416, "grad_norm": 1.179158091545105, "kl": 0.31982421875, "learning_rate": 3.142063423134644e-07, "loss": 0.1321, "reward": 0.4120100736618042, "reward_std": 0.5803252756595612, "rewards/cosine_scaled_reward": -0.19280448742210865, "rewards/format_reward": 0.7976190596818924, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2560.9405517578125, "epoch": 1.42, "grad_norm": 0.6409890651702881, "kl": 0.3291015625, "learning_rate": 3.115363310950578e-07, "loss": 0.0637, "reward": 0.6557277590036392, "reward_std": 0.8805683702230453, "rewards/cosine_scaled_reward": -0.02332661801483482, "rewards/format_reward": 0.70238097012043, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1872.1012573242188, "epoch": 1.424, "grad_norm": 0.4570577144622803, "kl": 0.244873046875, "learning_rate": 3.0887794225945143e-07, "loss": 0.0537, "reward": 0.8301898017525673, "reward_std": 0.6987727582454681, "rewards/cosine_scaled_reward": -0.016452712705358863, "rewards/format_reward": 0.8630952537059784, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 2199.4880981445312, "epoch": 1.428, "grad_norm": 0.5453688502311707, "kl": 0.3388671875, "learning_rate": 3.062313053727671e-07, "loss": 0.1004, "reward": 0.5429714322090149, "reward_std": 0.757801964879036, "rewards/cosine_scaled_reward": -0.11244285944849253, "rewards/format_reward": 0.7678571492433548, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 2392.5536193847656, "epoch": 1.432, "grad_norm": 0.4179025888442993, "kl": 0.36767578125, "learning_rate": 3.0359654942835247e-07, "loss": 0.057, "reward": 0.6754717975854874, "reward_std": 0.8176562935113907, "rewards/cosine_scaled_reward": -0.004526023752987385, "rewards/format_reward": 0.6845238357782364, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 2280.5000915527344, "epoch": 1.436, "grad_norm": 0.5272053480148315, "kl": 0.273681640625, "learning_rate": 3.0097380284049523e-07, "loss": 0.0565, "reward": 0.644446611404419, "reward_std": 0.7567472010850906, "rewards/cosine_scaled_reward": -0.02896718680858612, "rewards/format_reward": 0.70238097012043, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 2270.3928833007812, "epoch": 1.44, "grad_norm": 0.8152810335159302, "kl": 0.34619140625, "learning_rate": 2.9836319343816397e-07, "loss": 0.0306, "reward": 0.7786325067281723, "reward_std": 0.559767447412014, "rewards/cosine_scaled_reward": -0.012469482608139515, "rewards/format_reward": 0.8035714477300644, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 2094.2083740234375, "epoch": 1.444, "grad_norm": 0.9731494188308716, "kl": 0.33203125, "learning_rate": 2.9576484845877793e-07, "loss": 0.0315, "reward": 0.7239094823598862, "reward_std": 0.6780030280351639, "rewards/cosine_scaled_reward": -0.057688117027282715, "rewards/format_reward": 0.839285746216774, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 2515.3809814453125, "epoch": 1.448, "grad_norm": 0.5006127953529358, "kl": 0.3583984375, "learning_rate": 2.931788945420058e-07, "loss": 0.0632, "reward": 0.5585716450586915, "reward_std": 0.6955743506550789, "rewards/cosine_scaled_reward": -0.08976180851459503, "rewards/format_reward": 0.7380952462553978, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 2665.2560424804688, "epoch": 1.452, "grad_norm": 0.4868517220020294, "kl": 0.373046875, "learning_rate": 2.9060545772359305e-07, "loss": 0.0555, "reward": 0.5607914663851261, "reward_std": 0.6483574956655502, "rewards/cosine_scaled_reward": -0.07377092959359288, "rewards/format_reward": 0.7083333432674408, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 2244.7262573242188, "epoch": 1.456, "grad_norm": 0.6844132542610168, "kl": 0.3173828125, "learning_rate": 2.8804466342921987e-07, "loss": 0.0109, "reward": 0.7073798812925816, "reward_std": 0.6621369272470474, "rewards/cosine_scaled_reward": -0.01833386719226837, "rewards/format_reward": 0.7440476417541504, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 2576.1905517578125, "epoch": 1.46, "grad_norm": 0.5755227208137512, "kl": 0.35400390625, "learning_rate": 2.854966364683872e-07, "loss": 0.0531, "reward": 0.6706622801721096, "reward_std": 0.8000525310635567, "rewards/cosine_scaled_reward": -0.03371649980545044, "rewards/format_reward": 0.7380952537059784, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 2664.3928833007812, "epoch": 1.464, "grad_norm": 0.6695978045463562, "kl": 0.4052734375, "learning_rate": 2.829615010283344e-07, "loss": 0.1001, "reward": 0.6332942470908165, "reward_std": 0.9363250732421875, "rewards/cosine_scaled_reward": -0.04049574676901102, "rewards/format_reward": 0.7142857313156128, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2493.2857666015625, "epoch": 1.468, "grad_norm": 0.41825661063194275, "kl": 0.269775390625, "learning_rate": 2.8043938066798645e-07, "loss": 0.0634, "reward": 0.6000736728310585, "reward_std": 0.6958686709403992, "rewards/cosine_scaled_reward": -0.04520127363502979, "rewards/format_reward": 0.690476194024086, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 2441.184600830078, "epoch": 1.472, "grad_norm": 0.6742368936538696, "kl": 0.29248046875, "learning_rate": 2.7793039831193133e-07, "loss": 0.0205, "reward": 0.7077510952949524, "reward_std": 0.8173489719629288, "rewards/cosine_scaled_reward": -0.003267320804297924, "rewards/format_reward": 0.7142857313156128, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 2645.202392578125, "epoch": 1.476, "grad_norm": 0.6914957761764526, "kl": 0.298095703125, "learning_rate": 2.7543467624442956e-07, "loss": 0.0967, "reward": 0.2303389220032841, "reward_std": 0.6355866640806198, "rewards/cosine_scaled_reward": -0.1616162583231926, "rewards/format_reward": 0.5535714328289032, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 2256.9286193847656, "epoch": 1.48, "grad_norm": 0.9714637994766235, "kl": 0.255126953125, "learning_rate": 2.729523361034538e-07, "loss": 0.0866, "reward": 0.7436040937900543, "reward_std": 0.6377575844526291, "rewards/cosine_scaled_reward": -0.012126525864005089, "rewards/format_reward": 0.767857164144516, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 2519.7202758789062, "epoch": 1.484, "grad_norm": 0.6541756987571716, "kl": 0.32470703125, "learning_rate": 2.7048349887476037e-07, "loss": 0.0731, "reward": 0.8480066582560539, "reward_std": 0.7711106240749359, "rewards/cosine_scaled_reward": 0.031146179419010878, "rewards/format_reward": 0.7857142984867096, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 2420.970245361328, "epoch": 1.488, "grad_norm": 0.5346278548240662, "kl": 0.2998046875, "learning_rate": 2.6802828488599294e-07, "loss": 0.0556, "reward": 0.6287192776799202, "reward_std": 0.6931318640708923, "rewards/cosine_scaled_reward": -0.03683085576631129, "rewards/format_reward": 0.7023809552192688, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 2542.5654907226562, "epoch": 1.492, "grad_norm": 0.43199771642684937, "kl": 0.33544921875, "learning_rate": 2.655868138008171e-07, "loss": 0.0657, "reward": 0.4730634540319443, "reward_std": 0.5836888402700424, "rewards/cosine_scaled_reward": -0.1533492412418127, "rewards/format_reward": 0.7797619104385376, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 2848.3572387695312, "epoch": 1.496, "grad_norm": 0.6630088686943054, "kl": 0.35009765625, "learning_rate": 2.631592046130896e-07, "loss": 0.0207, "reward": 0.2956889607012272, "reward_std": 0.614417277276516, "rewards/cosine_scaled_reward": -0.1319174226373434, "rewards/format_reward": 0.5595238283276558, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 2754.0060424804688, "epoch": 1.5, "grad_norm": 0.4504316449165344, "kl": 0.302490234375, "learning_rate": 2.6074557564105724e-07, "loss": 0.0225, "reward": 0.42709287256002426, "reward_std": 0.6112170070409775, "rewards/cosine_scaled_reward": -0.07514405064284801, "rewards/format_reward": 0.5773809552192688, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 2403.52978515625, "epoch": 1.504, "grad_norm": 0.4335888624191284, "kl": 0.266845703125, "learning_rate": 2.583460445215911e-07, "loss": 0.0347, "reward": 0.37878482323139906, "reward_std": 0.5512942001223564, "rewards/cosine_scaled_reward": -0.14691711403429508, "rewards/format_reward": 0.6726190745830536, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 2539.71435546875, "epoch": 1.508, "grad_norm": 0.6600142121315002, "kl": 0.35546875, "learning_rate": 2.5596072820445254e-07, "loss": 0.0879, "reward": 0.6289402991533279, "reward_std": 0.7740087658166885, "rewards/cosine_scaled_reward": -0.04564890172332525, "rewards/format_reward": 0.7202381044626236, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 2566.4107666015625, "epoch": 1.512, "grad_norm": 0.5574218034744263, "kl": 0.310791015625, "learning_rate": 2.5358974294659373e-07, "loss": 0.0794, "reward": 0.5734596885740757, "reward_std": 0.6776000708341599, "rewards/cosine_scaled_reward": -0.058508249232545495, "rewards/format_reward": 0.6904762089252472, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 2749.4642944335938, "epoch": 1.516, "grad_norm": 0.4314301908016205, "kl": 0.330078125, "learning_rate": 2.512332043064913e-07, "loss": 0.0655, "reward": 0.42858002707362175, "reward_std": 0.7303398549556732, "rewards/cosine_scaled_reward": -0.10416238568723202, "rewards/format_reward": 0.6369047611951828, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 2403.684539794922, "epoch": 1.52, "grad_norm": 0.42673397064208984, "kl": 0.299560546875, "learning_rate": 2.488912271385139e-07, "loss": 0.0799, "reward": 0.7241241782903671, "reward_std": 0.7478837221860886, "rewards/cosine_scaled_reward": -0.006985542830079794, "rewards/format_reward": 0.7380952388048172, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 2391.5179443359375, "epoch": 1.524, "grad_norm": 0.8130372762680054, "kl": 0.3203125, "learning_rate": 2.465639255873246e-07, "loss": 0.0286, "reward": 0.49668359011411667, "reward_std": 0.6931805461645126, "rewards/cosine_scaled_reward": -0.12665820121765137, "rewards/format_reward": 0.7500000149011612, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 2430.5654907226562, "epoch": 1.528, "grad_norm": 0.4374740719795227, "kl": 0.310302734375, "learning_rate": 2.4425141308231765e-07, "loss": 0.0341, "reward": 0.6685771271586418, "reward_std": 0.8352404981851578, "rewards/cosine_scaled_reward": -0.016901913098990917, "rewards/format_reward": 0.7023809552192688, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 2296.1607666015625, "epoch": 1.532, "grad_norm": 0.5494891405105591, "kl": 0.30224609375, "learning_rate": 2.4195380233209006e-07, "loss": 0.0859, "reward": 0.7063075229525566, "reward_std": 0.7431895136833191, "rewards/cosine_scaled_reward": -0.009941489901393652, "rewards/format_reward": 0.7261904925107956, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 2517.827392578125, "epoch": 1.536, "grad_norm": 0.5410645604133606, "kl": 0.33203125, "learning_rate": 2.3967120531894857e-07, "loss": 0.0459, "reward": 0.42114658281207085, "reward_std": 0.6721706539392471, "rewards/cosine_scaled_reward": -0.12573623820208013, "rewards/format_reward": 0.6726190596818924, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 2484.96435546875, "epoch": 1.54, "grad_norm": 0.4815540313720703, "kl": 0.3095703125, "learning_rate": 2.374037332934512e-07, "loss": 0.0759, "reward": 0.7441006675362587, "reward_std": 0.8601991981267929, "rewards/cosine_scaled_reward": 0.029788417392410338, "rewards/format_reward": 0.6845238208770752, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 2389.9703063964844, "epoch": 1.544, "grad_norm": 0.6783538460731506, "kl": 0.2802734375, "learning_rate": 2.3515149676898552e-07, "loss": 0.0716, "reward": 0.479885321110487, "reward_std": 0.7240753322839737, "rewards/cosine_scaled_reward": -0.06958115100860596, "rewards/format_reward": 0.6190476417541504, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 2359.964324951172, "epoch": 1.548, "grad_norm": 0.8481286764144897, "kl": 0.296630859375, "learning_rate": 2.3291460551638237e-07, "loss": 0.0148, "reward": 0.5802747337147593, "reward_std": 0.5601852983236313, "rewards/cosine_scaled_reward": -0.04617217415943742, "rewards/format_reward": 0.672619067132473, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 2159.5655212402344, "epoch": 1.552, "grad_norm": 0.5140231251716614, "kl": 0.302001953125, "learning_rate": 2.306931685585657e-07, "loss": 0.063, "reward": 0.5727366209030151, "reward_std": 0.6229267343878746, "rewards/cosine_scaled_reward": -0.11244121752679348, "rewards/format_reward": 0.7976190745830536, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 2253.7559814453125, "epoch": 1.556, "grad_norm": 0.4566425681114197, "kl": 0.292724609375, "learning_rate": 2.2848729416523859e-07, "loss": 0.0398, "reward": 0.6296885460615158, "reward_std": 0.7193648666143417, "rewards/cosine_scaled_reward": -0.04825095273554325, "rewards/format_reward": 0.7261904925107956, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 2653.6250610351562, "epoch": 1.56, "grad_norm": 0.6326945424079895, "kl": 0.38818359375, "learning_rate": 2.2629708984760706e-07, "loss": 0.1043, "reward": 0.531873881816864, "reward_std": 0.7026529461145401, "rewards/cosine_scaled_reward": -0.0822773426771164, "rewards/format_reward": 0.696428582072258, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 2613.7261962890625, "epoch": 1.564, "grad_norm": 0.398603618144989, "kl": 0.305908203125, "learning_rate": 2.2412266235313973e-07, "loss": 0.0464, "reward": 0.2553718090057373, "reward_std": 0.6311058104038239, "rewards/cosine_scaled_reward": -0.1699331346899271, "rewards/format_reward": 0.595238097012043, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 2196.5833435058594, "epoch": 1.568, "grad_norm": 1.320838451385498, "kl": 0.2607421875, "learning_rate": 2.2196411766036487e-07, "loss": 0.1165, "reward": 0.6960010007023811, "reward_std": 0.8236257880926132, "rewards/cosine_scaled_reward": -0.044856662629172206, "rewards/format_reward": 0.7857143133878708, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 2663.9285888671875, "epoch": 1.572, "grad_norm": 0.5183250904083252, "kl": 0.328857421875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0708, "reward": 0.34957781434059143, "reward_std": 0.6104390919208527, "rewards/cosine_scaled_reward": -0.12878252286463976, "rewards/format_reward": 0.6071428805589676, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 2630.2560424804688, "epoch": 1.576, "grad_norm": 0.2792785167694092, "kl": 0.34619140625, "learning_rate": 2.1769509671835223e-07, "loss": 0.0772, "reward": 0.45473287999629974, "reward_std": 0.7525355666875839, "rewards/cosine_scaled_reward": -0.0940621355548501, "rewards/format_reward": 0.6428571492433548, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 2484.2679138183594, "epoch": 1.58, "grad_norm": 0.47966378927230835, "kl": 0.3330078125, "learning_rate": 2.1558482853517253e-07, "loss": 0.0783, "reward": 0.5024484526365995, "reward_std": 0.6865183711051941, "rewards/cosine_scaled_reward": -0.07615673809777945, "rewards/format_reward": 0.6547619104385376, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 2481.96435546875, "epoch": 1.584, "grad_norm": 0.4925695061683655, "kl": 0.34228515625, "learning_rate": 2.134908592756607e-07, "loss": 0.1001, "reward": 0.6872217282652855, "reward_std": 0.7295544147491455, "rewards/cosine_scaled_reward": -0.037341527407988906, "rewards/format_reward": 0.761904776096344, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 2154.494110107422, "epoch": 1.588, "grad_norm": 0.635874330997467, "kl": 0.281494140625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0341, "reward": 0.6967096533626318, "reward_std": 0.7243114337325096, "rewards/cosine_scaled_reward": -0.04450232535600662, "rewards/format_reward": 0.7857143059372902, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 2704.7322387695312, "epoch": 1.592, "grad_norm": 0.36841636896133423, "kl": 0.390625, "learning_rate": 2.0935222495670968e-07, "loss": 0.0683, "reward": 0.35482142120599747, "reward_std": 0.6981697529554367, "rewards/cosine_scaled_reward": -0.1410416765138507, "rewards/format_reward": 0.6369047611951828, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 2155.1905212402344, "epoch": 1.596, "grad_norm": 0.6153831481933594, "kl": 0.2763671875, "learning_rate": 2.0730776160846853e-07, "loss": 0.075, "reward": 0.7129835858941078, "reward_std": 0.7049887701869011, "rewards/cosine_scaled_reward": -0.04827013239264488, "rewards/format_reward": 0.8095238208770752, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 2736.9286499023438, "epoch": 1.6, "grad_norm": 0.4656315743923187, "kl": 0.38330078125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0486, "reward": 0.3270074762403965, "reward_std": 0.6684512719511986, "rewards/cosine_scaled_reward": -0.17578197922557592, "rewards/format_reward": 0.6785714477300644, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 2684.994140625, "epoch": 1.604, "grad_norm": 0.4505021274089813, "kl": 0.34423828125, "learning_rate": 2.032690407508949e-07, "loss": 0.0772, "reward": 0.5096228048205376, "reward_std": 0.7098966240882874, "rewards/cosine_scaled_reward": -0.0814981039147824, "rewards/format_reward": 0.6726190522313118, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2508.327423095703, "epoch": 1.608, "grad_norm": 0.3696132302284241, "kl": 0.34033203125, "learning_rate": 2.0127498008311922e-07, "loss": 0.0554, "reward": 0.6811544820666313, "reward_std": 0.7352585643529892, "rewards/cosine_scaled_reward": -0.03442276082932949, "rewards/format_reward": 0.7500000149011612, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 2329.654815673828, "epoch": 1.612, "grad_norm": 0.5500597953796387, "kl": 0.310302734375, "learning_rate": 1.9929791578083655e-07, "loss": 0.0912, "reward": 0.5886539276689291, "reward_std": 0.6953590214252472, "rewards/cosine_scaled_reward": -0.05091113201342523, "rewards/format_reward": 0.6904762089252472, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 2279.5238342285156, "epoch": 1.616, "grad_norm": 0.825627326965332, "kl": 0.314208984375, "learning_rate": 1.9733794420337213e-07, "loss": 0.0303, "reward": 0.4903724156320095, "reward_std": 0.6759866625070572, "rewards/cosine_scaled_reward": -0.1268376000225544, "rewards/format_reward": 0.7440476417541504, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 2298.607208251953, "epoch": 1.62, "grad_norm": 0.7521853446960449, "kl": 0.300048828125, "learning_rate": 1.9539516087697517e-07, "loss": 0.0742, "reward": 0.6187992710620165, "reward_std": 0.6595779061317444, "rewards/cosine_scaled_reward": -0.050719428109005094, "rewards/format_reward": 0.7202381044626236, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 2462.0416870117188, "epoch": 1.624, "grad_norm": 0.4565219581127167, "kl": 0.36181640625, "learning_rate": 1.934696604901642e-07, "loss": 0.0655, "reward": 0.7676936537027359, "reward_std": 0.8463387489318848, "rewards/cosine_scaled_reward": 0.032656354829669, "rewards/format_reward": 0.7023809552192688, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 2593.9285888671875, "epoch": 1.6280000000000001, "grad_norm": 0.7805240154266357, "kl": 0.37109375, "learning_rate": 1.915615368891117e-07, "loss": 0.0336, "reward": 0.4898635447025299, "reward_std": 0.6100385710597038, "rewards/cosine_scaled_reward": -0.0943539384752512, "rewards/format_reward": 0.6785714477300644, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 2179.8810119628906, "epoch": 1.6320000000000001, "grad_norm": 0.7494162321090698, "kl": 0.3154296875, "learning_rate": 1.8967088307307e-07, "loss": 0.0819, "reward": 0.5508405864238739, "reward_std": 0.6755202859640121, "rewards/cosine_scaled_reward": -0.12934163073077798, "rewards/format_reward": 0.8095238357782364, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 2213.1131591796875, "epoch": 1.6360000000000001, "grad_norm": 0.7274454832077026, "kl": 0.3466796875, "learning_rate": 1.8779779118983867e-07, "loss": 0.0553, "reward": 0.6235681027173996, "reward_std": 0.6233258098363876, "rewards/cosine_scaled_reward": -0.10488261096179485, "rewards/format_reward": 0.833333358168602, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 2276.5774536132812, "epoch": 1.6400000000000001, "grad_norm": 1.2181180715560913, "kl": 0.357421875, "learning_rate": 1.8594235253127372e-07, "loss": 0.1457, "reward": 0.6739452332258224, "reward_std": 0.7620265781879425, "rewards/cosine_scaled_reward": -0.0350511996075511, "rewards/format_reward": 0.7440476268529892, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 2513.0833740234375, "epoch": 1.6440000000000001, "grad_norm": 0.3816966414451599, "kl": 0.37060546875, "learning_rate": 1.8410465752883758e-07, "loss": 0.0631, "reward": 0.503595694899559, "reward_std": 0.6215758174657822, "rewards/cosine_scaled_reward": -0.08748787135118619, "rewards/format_reward": 0.6785714328289032, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 2348.7916870117188, "epoch": 1.6480000000000001, "grad_norm": 0.647936224937439, "kl": 0.3623046875, "learning_rate": 1.822847957491922e-07, "loss": 0.1147, "reward": 0.7675136551260948, "reward_std": 0.7814928591251373, "rewards/cosine_scaled_reward": 0.020661589689552784, "rewards/format_reward": 0.7261904925107956, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 2329.9464721679688, "epoch": 1.6520000000000001, "grad_norm": 0.7966573238372803, "kl": 0.3505859375, "learning_rate": 1.804828558898332e-07, "loss": 0.0507, "reward": 0.7220299392938614, "reward_std": 0.7220810800790787, "rewards/cosine_scaled_reward": -0.01993740734178573, "rewards/format_reward": 0.761904776096344, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 2510.83935546875, "epoch": 1.6560000000000001, "grad_norm": 0.3402910828590393, "kl": 0.36767578125, "learning_rate": 1.7869892577476722e-07, "loss": 0.0717, "reward": 0.6566540375351906, "reward_std": 0.7324352562427521, "rewards/cosine_scaled_reward": -0.016911087092012167, "rewards/format_reward": 0.6904762089252472, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2362.952423095703, "epoch": 1.6600000000000001, "grad_norm": 0.5068221688270569, "kl": 0.41357421875, "learning_rate": 1.7693309235023127e-07, "loss": 0.0817, "reward": 0.5704538598656654, "reward_std": 0.83931764960289, "rewards/cosine_scaled_reward": -0.08977308124303818, "rewards/format_reward": 0.7500000298023224, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 2197.886962890625, "epoch": 1.6640000000000001, "grad_norm": 0.5192682147026062, "kl": 0.349609375, "learning_rate": 1.7518544168045524e-07, "loss": 0.0456, "reward": 0.7760029062628746, "reward_std": 0.7372387051582336, "rewards/cosine_scaled_reward": -0.0048556849360466, "rewards/format_reward": 0.7857142984867096, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 2365.0655517578125, "epoch": 1.6680000000000001, "grad_norm": 0.7471702098846436, "kl": 0.357421875, "learning_rate": 1.7345605894346726e-07, "loss": 0.0258, "reward": 0.6658978462219238, "reward_std": 0.7144315093755722, "rewards/cosine_scaled_reward": -0.02419395267497748, "rewards/format_reward": 0.7142857313156128, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 2210.7500610351562, "epoch": 1.6720000000000002, "grad_norm": 0.4305538833141327, "kl": 0.37060546875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0715, "reward": 0.6991885676980019, "reward_std": 0.6301053613424301, "rewards/cosine_scaled_reward": -0.03433429542928934, "rewards/format_reward": 0.7678571492433548, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 2631.9048461914062, "epoch": 1.6760000000000002, "grad_norm": 0.31225350499153137, "kl": 0.35888671875, "learning_rate": 1.7005243352409333e-07, "loss": 0.0697, "reward": 0.6943976636976004, "reward_std": 0.7306639850139618, "rewards/cosine_scaled_reward": 0.02874644659459591, "rewards/format_reward": 0.636904776096344, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 2425.6607055664062, "epoch": 1.6800000000000002, "grad_norm": 0.6987324953079224, "kl": 0.38330078125, "learning_rate": 1.6837835672960831e-07, "loss": 0.0585, "reward": 0.7563354596495628, "reward_std": 0.7114580571651459, "rewards/cosine_scaled_reward": -0.038498950423672795, "rewards/format_reward": 0.8333333432674408, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 2277.4464721679688, "epoch": 1.6840000000000002, "grad_norm": 0.34894487261772156, "kl": 0.373046875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0935, "reward": 0.6365625336766243, "reward_std": 0.7153737097978592, "rewards/cosine_scaled_reward": -0.05969492206349969, "rewards/format_reward": 0.7559524029493332, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 2504.916748046875, "epoch": 1.688, "grad_norm": 0.6229146718978882, "kl": 0.35498046875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0181, "reward": 0.5784893482923508, "reward_std": 0.6405449658632278, "rewards/cosine_scaled_reward": -0.06194583047181368, "rewards/format_reward": 0.7023809552192688, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 2308.52978515625, "epoch": 1.692, "grad_norm": 0.4013311266899109, "kl": 0.328125, "learning_rate": 1.6346804638120098e-07, "loss": 0.0574, "reward": 0.6359338611364365, "reward_std": 0.7808969020843506, "rewards/cosine_scaled_reward": -0.03917593788355589, "rewards/format_reward": 0.714285746216774, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 2253.6785888671875, "epoch": 1.696, "grad_norm": 0.7038490176200867, "kl": 0.288330078125, "learning_rate": 1.6186884885673413e-07, "loss": 0.0231, "reward": 0.6301399618387222, "reward_std": 0.7140125781297684, "rewards/cosine_scaled_reward": -0.07481098547577858, "rewards/format_reward": 0.7797619104385376, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 2256.464385986328, "epoch": 1.7, "grad_norm": 0.4849310517311096, "kl": 0.34228515625, "learning_rate": 1.6028856829700258e-07, "loss": 0.0754, "reward": 0.6902762800455093, "reward_std": 0.7732263505458832, "rewards/cosine_scaled_reward": -0.029861881979741156, "rewards/format_reward": 0.7500000149011612, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 2638.3750610351562, "epoch": 1.704, "grad_norm": 0.4661174416542053, "kl": 0.34716796875, "learning_rate": 1.5872728172265146e-07, "loss": 0.0631, "reward": 0.4628839958459139, "reward_std": 0.6522120535373688, "rewards/cosine_scaled_reward": -0.06915326602756977, "rewards/format_reward": 0.601190485060215, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 2258.279815673828, "epoch": 1.708, "grad_norm": 0.5582512021064758, "kl": 0.325439453125, "learning_rate": 1.5718506522858572e-07, "loss": 0.0416, "reward": 0.6841344758868217, "reward_std": 0.716413825750351, "rewards/cosine_scaled_reward": -0.04483753815293312, "rewards/format_reward": 0.7738095298409462, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 2152.732208251953, "epoch": 1.712, "grad_norm": 0.362613320350647, "kl": 0.35546875, "learning_rate": 1.5566199398026147e-07, "loss": 0.0663, "reward": 0.8665853589773178, "reward_std": 0.746289573609829, "rewards/cosine_scaled_reward": 0.013649825006723404, "rewards/format_reward": 0.8392857164144516, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 2589.8690795898438, "epoch": 1.716, "grad_norm": 0.392411470413208, "kl": 0.35693359375, "learning_rate": 1.5415814221002265e-07, "loss": 0.0781, "reward": 0.6071142517030239, "reward_std": 0.7519797533750534, "rewards/cosine_scaled_reward": -0.044657152146101, "rewards/format_reward": 0.696428582072258, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 2220.1012573242188, "epoch": 1.72, "grad_norm": 0.5445396900177002, "kl": 0.341796875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0534, "reward": 0.701055221259594, "reward_std": 0.6740739792585373, "rewards/cosine_scaled_reward": -0.04232952371239662, "rewards/format_reward": 0.7857142984867096, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1902.5595397949219, "epoch": 1.724, "grad_norm": 0.3450181186199188, "kl": 0.2607421875, "learning_rate": 1.5120838934595337e-07, "loss": 0.0472, "reward": 1.0963951796293259, "reward_std": 0.746511772274971, "rewards/cosine_scaled_reward": 0.11962614580988884, "rewards/format_reward": 0.8571428805589676, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 2478.732177734375, "epoch": 1.728, "grad_norm": 0.48547589778900146, "kl": 0.373046875, "learning_rate": 1.4976263201891613e-07, "loss": 0.0409, "reward": 0.5677317231893539, "reward_std": 0.6051659360527992, "rewards/cosine_scaled_reward": -0.09113414993043989, "rewards/format_reward": 0.7500000149011612, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 2518.7560424804688, "epoch": 1.732, "grad_norm": 0.9139849543571472, "kl": 0.3701171875, "learning_rate": 1.483363816965435e-07, "loss": 0.01, "reward": 0.6564267948269844, "reward_std": 0.6321954727172852, "rewards/cosine_scaled_reward": -0.04083424177952111, "rewards/format_reward": 0.7380952388048172, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 2439.119110107422, "epoch": 1.736, "grad_norm": 0.4629580080509186, "kl": 0.3349609375, "learning_rate": 1.469297078922642e-07, "loss": 0.0828, "reward": 0.6547855883836746, "reward_std": 0.6274382770061493, "rewards/cosine_scaled_reward": -0.05653578881174326, "rewards/format_reward": 0.767857164144516, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 2385.047637939453, "epoch": 1.74, "grad_norm": 0.4398196041584015, "kl": 0.342529296875, "learning_rate": 1.4554267916537495e-07, "loss": 0.034, "reward": 0.59782674908638, "reward_std": 0.7165493220090866, "rewards/cosine_scaled_reward": -0.09394377004355192, "rewards/format_reward": 0.7857142984867096, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 2235.4405212402344, "epoch": 1.744, "grad_norm": 0.5894522070884705, "kl": 0.321044921875, "learning_rate": 1.4417536311769885e-07, "loss": 0.0578, "reward": 0.7302387952804565, "reward_std": 0.7528126537799835, "rewards/cosine_scaled_reward": -0.018809196539223194, "rewards/format_reward": 0.7678571492433548, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 2364.2083740234375, "epoch": 1.748, "grad_norm": 0.517291784286499, "kl": 0.262451171875, "learning_rate": 1.4282782639029128e-07, "loss": 0.0626, "reward": 0.5654323399066925, "reward_std": 0.6579017788171768, "rewards/cosine_scaled_reward": -0.03276003524661064, "rewards/format_reward": 0.630952388048172, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 2464.0238647460938, "epoch": 1.752, "grad_norm": 0.719810426235199, "kl": 0.34619140625, "learning_rate": 1.4150013466019114e-07, "loss": 0.0289, "reward": 0.5253645405173302, "reward_std": 0.593732014298439, "rewards/cosine_scaled_reward": -0.10934155760332942, "rewards/format_reward": 0.7440476268529892, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 2688.2559814453125, "epoch": 1.756, "grad_norm": 0.47081246972084045, "kl": 0.320068359375, "learning_rate": 1.4019235263722034e-07, "loss": 0.0737, "reward": 0.5551765933632851, "reward_std": 0.750535324215889, "rewards/cosine_scaled_reward": -0.043840276543051004, "rewards/format_reward": 0.6428571492433548, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 2691.5298461914062, "epoch": 1.76, "grad_norm": 0.3561669588088989, "kl": 0.30517578125, "learning_rate": 1.3890454406082956e-07, "loss": 0.0412, "reward": 0.6305891573429108, "reward_std": 0.7718498408794403, "rewards/cosine_scaled_reward": -0.02399112842977047, "rewards/format_reward": 0.6785714477300644, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 2423.3274536132812, "epoch": 1.764, "grad_norm": 0.8448560237884521, "kl": 0.34130859375, "learning_rate": 1.3763677169699217e-07, "loss": 0.0974, "reward": 0.6258707121014595, "reward_std": 0.7022215574979782, "rewards/cosine_scaled_reward": -0.06504084914922714, "rewards/format_reward": 0.7559524029493332, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 2509.6845703125, "epoch": 1.768, "grad_norm": 0.49845507740974426, "kl": 0.3271484375, "learning_rate": 1.3638909733514452e-07, "loss": 0.0368, "reward": 0.6952026858925819, "reward_std": 0.7732700109481812, "rewards/cosine_scaled_reward": -0.03037486458197236, "rewards/format_reward": 0.7559524029493332, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 2251.6370239257812, "epoch": 1.772, "grad_norm": 1.6570687294006348, "kl": 0.32861328125, "learning_rate": 1.351615817851748e-07, "loss": 0.1251, "reward": 0.5299716778099537, "reward_std": 0.6262076199054718, "rewards/cosine_scaled_reward": -0.10108558752108365, "rewards/format_reward": 0.7321428507566452, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 2569.8392944335938, "epoch": 1.776, "grad_norm": 0.5071792602539062, "kl": 0.285400390625, "learning_rate": 1.3395428487445914e-07, "loss": 0.0461, "reward": 0.3558937795460224, "reward_std": 0.6386721879243851, "rewards/cosine_scaled_reward": -0.12860072287730873, "rewards/format_reward": 0.6130952537059784, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 2434.52978515625, "epoch": 1.78, "grad_norm": 0.6472364068031311, "kl": 0.33935546875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0324, "reward": 0.5342502817511559, "reward_std": 0.7027776390314102, "rewards/cosine_scaled_reward": -0.08108916692435741, "rewards/format_reward": 0.696428582072258, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 2600.7678833007812, "epoch": 1.784, "grad_norm": 0.4613596796989441, "kl": 0.36767578125, "learning_rate": 1.316005813502869e-07, "loss": 0.0366, "reward": 0.3209609054028988, "reward_std": 0.6079899072647095, "rewards/cosine_scaled_reward": -0.15499573945999146, "rewards/format_reward": 0.630952388048172, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 2406.6131286621094, "epoch": 1.788, "grad_norm": 0.5609318017959595, "kl": 0.3349609375, "learning_rate": 1.3045428945301953e-07, "loss": 0.1069, "reward": 0.8279012702405453, "reward_std": 0.6648521721363068, "rewards/cosine_scaled_reward": 0.035974426195025444, "rewards/format_reward": 0.755952388048172, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 2292.5238647460938, "epoch": 1.792, "grad_norm": 0.6498924493789673, "kl": 0.3251953125, "learning_rate": 1.2932844562179352e-07, "loss": 0.1002, "reward": 0.9468577206134796, "reward_std": 0.8284895867109299, "rewards/cosine_scaled_reward": 0.1073574130423367, "rewards/format_reward": 0.7321428656578064, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 2359.7203063964844, "epoch": 1.796, "grad_norm": 0.8151586651802063, "kl": 0.29248046875, "learning_rate": 1.2822310472864885e-07, "loss": 0.1025, "reward": 0.6154885776340961, "reward_std": 0.643234595656395, "rewards/cosine_scaled_reward": -0.028565243119373918, "rewards/format_reward": 0.6726190596818924, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 2456.5952758789062, "epoch": 1.8, "grad_norm": 0.6727307438850403, "kl": 0.3369140625, "learning_rate": 1.2713832064634125e-07, "loss": 0.0518, "reward": 0.5276128388941288, "reward_std": 0.6850098147988319, "rewards/cosine_scaled_reward": -0.05166977294720709, "rewards/format_reward": 0.6309523805975914, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 2180.250030517578, "epoch": 1.804, "grad_norm": 0.4327715039253235, "kl": 0.2841796875, "learning_rate": 1.260741462457165e-07, "loss": 0.0396, "reward": 0.9219238460063934, "reward_std": 0.8085188716650009, "rewards/cosine_scaled_reward": 0.029414291959255934, "rewards/format_reward": 0.8630952537059784, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 2414.9584045410156, "epoch": 1.808, "grad_norm": 0.5890040993690491, "kl": 0.33642578125, "learning_rate": 1.2503063339313356e-07, "loss": 0.0116, "reward": 0.45377534069120884, "reward_std": 0.6864534169435501, "rewards/cosine_scaled_reward": -0.07668375968933105, "rewards/format_reward": 0.607142873108387, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 2325.2619018554688, "epoch": 1.812, "grad_norm": 0.8580995798110962, "kl": 0.38330078125, "learning_rate": 1.2400783294793668e-07, "loss": 0.1211, "reward": 0.41875267028808594, "reward_std": 0.5978472009301186, "rewards/cosine_scaled_reward": -0.17455224692821503, "rewards/format_reward": 0.7678571492433548, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 2524.8452758789062, "epoch": 1.8159999999999998, "grad_norm": 0.6263750195503235, "kl": 0.369140625, "learning_rate": 1.2300579475997657e-07, "loss": 0.0599, "reward": 0.5164637118577957, "reward_std": 0.7428598999977112, "rewards/cosine_scaled_reward": -0.0989109962247312, "rewards/format_reward": 0.7142857313156128, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 2222.684600830078, "epoch": 1.8199999999999998, "grad_norm": 0.46227335929870605, "kl": 0.257568359375, "learning_rate": 1.220245676671809e-07, "loss": 0.0452, "reward": 0.6773854792118073, "reward_std": 0.6582589149475098, "rewards/cosine_scaled_reward": -0.03333106730133295, "rewards/format_reward": 0.744047611951828, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 2381.2916870117188, "epoch": 1.8239999999999998, "grad_norm": 1.395836591720581, "kl": 0.346923828125, "learning_rate": 1.2106419949317388e-07, "loss": -0.0036, "reward": 0.6790124624967575, "reward_std": 0.7677509784698486, "rewards/cosine_scaled_reward": -0.0354937631636858, "rewards/format_reward": 0.7500000298023224, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2236.1369018554688, "epoch": 1.8279999999999998, "grad_norm": 0.363459974527359, "kl": 0.359375, "learning_rate": 1.2012473704494537e-07, "loss": 0.0897, "reward": 0.8419362753629684, "reward_std": 0.8306869268417358, "rewards/cosine_scaled_reward": 0.034063366474583745, "rewards/format_reward": 0.7738095223903656, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 2312.5059814453125, "epoch": 1.8319999999999999, "grad_norm": 0.5052822232246399, "kl": 0.35791015625, "learning_rate": 1.1920622611056974e-07, "loss": 0.0619, "reward": 0.744497187435627, "reward_std": 0.6325250118970871, "rewards/cosine_scaled_reward": -0.02953713061287999, "rewards/format_reward": 0.8035714477300644, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 2242.0357971191406, "epoch": 1.8359999999999999, "grad_norm": 0.4124799966812134, "kl": 0.30615234375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0801, "reward": 0.7117128595709801, "reward_std": 0.7263730615377426, "rewards/cosine_scaled_reward": -0.031048328906763345, "rewards/format_reward": 0.773809552192688, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2795.3512573242188, "epoch": 1.8399999999999999, "grad_norm": 0.5879099369049072, "kl": 0.341796875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0299, "reward": 0.41843298077583313, "reward_std": 0.6329772919416428, "rewards/cosine_scaled_reward": -0.11518826894462109, "rewards/format_reward": 0.6488095223903656, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 2640.6904907226562, "epoch": 1.8439999999999999, "grad_norm": 0.3294979929924011, "kl": 0.38818359375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0662, "reward": 0.6549192667007446, "reward_std": 0.8022814393043518, "rewards/cosine_scaled_reward": -0.02373085916042328, "rewards/format_reward": 0.7023809552192688, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 2345.4285888671875, "epoch": 1.8479999999999999, "grad_norm": 0.3771924376487732, "kl": 0.35595703125, "learning_rate": 1.1574257748745986e-07, "loss": 0.105, "reward": 0.6399585753679276, "reward_std": 0.7968022599816322, "rewards/cosine_scaled_reward": -0.08478261809796095, "rewards/format_reward": 0.8095238208770752, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 2589.1666870117188, "epoch": 1.8519999999999999, "grad_norm": 0.3122951090335846, "kl": 0.3798828125, "learning_rate": 1.1492947512799328e-07, "loss": 0.0565, "reward": 0.43525535613298416, "reward_std": 0.7020779103040695, "rewards/cosine_scaled_reward": -0.09784852154552937, "rewards/format_reward": 0.6309523954987526, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 2481.261962890625, "epoch": 1.8559999999999999, "grad_norm": 0.5730969905853271, "kl": 0.33984375, "learning_rate": 1.1413757749211602e-07, "loss": 0.0699, "reward": 0.7782276198267937, "reward_std": 0.6072199195623398, "rewards/cosine_scaled_reward": 0.02304239757359028, "rewards/format_reward": 0.7321428656578064, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 2644.619140625, "epoch": 1.8599999999999999, "grad_norm": 0.39649447798728943, "kl": 0.373046875, "learning_rate": 1.1336692317580158e-07, "loss": 0.0712, "reward": 0.6695687249302864, "reward_std": 0.8249562680721283, "rewards/cosine_scaled_reward": 0.007403409108519554, "rewards/format_reward": 0.6547619178891182, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 2491.0119018554688, "epoch": 1.8639999999999999, "grad_norm": 0.4567016065120697, "kl": 0.36376953125, "learning_rate": 1.1261754973965422e-07, "loss": 0.0702, "reward": 0.46852924674749374, "reward_std": 0.7603975385427475, "rewards/cosine_scaled_reward": -0.1199020454660058, "rewards/format_reward": 0.708333358168602, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 2532.6607666015625, "epoch": 1.8679999999999999, "grad_norm": 0.36717355251312256, "kl": 0.330078125, "learning_rate": 1.1188949370707787e-07, "loss": 0.0491, "reward": 0.5859625339508057, "reward_std": 0.6559573635458946, "rewards/cosine_scaled_reward": -0.028447304794099182, "rewards/format_reward": 0.6428571492433548, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 2622.2559814453125, "epoch": 1.8719999999999999, "grad_norm": 0.4103451669216156, "kl": 0.36279296875, "learning_rate": 1.1118279056249653e-07, "loss": 0.0606, "reward": 0.6576881408691406, "reward_std": 0.8001267910003662, "rewards/cosine_scaled_reward": -0.007465461269021034, "rewards/format_reward": 0.6726190596818924, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 2134.279815673828, "epoch": 1.876, "grad_norm": 0.5138155817985535, "kl": 0.256103515625, "learning_rate": 1.1049747474962444e-07, "loss": 0.0759, "reward": 0.6648233011364937, "reward_std": 0.7618712484836578, "rewards/cosine_scaled_reward": -0.06044549681246281, "rewards/format_reward": 0.7857143133878708, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 2160.202392578125, "epoch": 1.88, "grad_norm": 0.3472672700881958, "kl": 0.30126953125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0461, "reward": 0.7422109395265579, "reward_std": 0.6609758958220482, "rewards/cosine_scaled_reward": -0.012823125813156366, "rewards/format_reward": 0.767857164144516, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 2532.3809814453125, "epoch": 1.884, "grad_norm": 0.2868484556674957, "kl": 0.2763671875, "learning_rate": 1.0919113768029517e-07, "loss": 0.0493, "reward": 0.48562416061758995, "reward_std": 0.7373960316181183, "rewards/cosine_scaled_reward": -0.08159269354655407, "rewards/format_reward": 0.648809552192688, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 2409.3392944335938, "epoch": 1.888, "grad_norm": 0.4656960964202881, "kl": 0.390380859375, "learning_rate": 1.0857018009286381e-07, "loss": 0.0561, "reward": 0.6008469248190522, "reward_std": 0.6282935440540314, "rewards/cosine_scaled_reward": -0.05374322272837162, "rewards/format_reward": 0.708333358168602, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 2294.0178833007812, "epoch": 1.892, "grad_norm": 0.5274000763893127, "kl": 0.3369140625, "learning_rate": 1.0797073717209013e-07, "loss": 0.0877, "reward": 0.7744100838899612, "reward_std": 0.7935537397861481, "rewards/cosine_scaled_reward": 0.00030028633773326874, "rewards/format_reward": 0.7738095372915268, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2676.6727294921875, "epoch": 1.896, "grad_norm": 0.5600417256355286, "kl": 0.302978515625, "learning_rate": 1.0739283813397639e-07, "loss": 0.0338, "reward": 0.47137061692774296, "reward_std": 0.7821067273616791, "rewards/cosine_scaled_reward": -0.07681469712406397, "rewards/format_reward": 0.6250000074505806, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 2442.0952758789062, "epoch": 1.9, "grad_norm": 0.5208225846290588, "kl": 0.32470703125, "learning_rate": 1.068365111445064e-07, "loss": 0.0837, "reward": 0.38532300293445587, "reward_std": 0.5505756810307503, "rewards/cosine_scaled_reward": -0.1287670750170946, "rewards/format_reward": 0.6428571492433548, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 2030.7857360839844, "epoch": 1.904, "grad_norm": 0.6633386611938477, "kl": 0.269287109375, "learning_rate": 1.063017833182728e-07, "loss": 0.0183, "reward": 0.796258918941021, "reward_std": 0.8103819191455841, "rewards/cosine_scaled_reward": 0.017177060712128878, "rewards/format_reward": 0.761904776096344, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 2734.2262573242188, "epoch": 1.908, "grad_norm": 0.5043067932128906, "kl": 0.35791015625, "learning_rate": 1.0578868071715544e-07, "loss": 0.0378, "reward": 0.2551159653812647, "reward_std": 0.5920611470937729, "rewards/cosine_scaled_reward": -0.155180131085217, "rewards/format_reward": 0.5654762089252472, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 2301.0952758789062, "epoch": 1.912, "grad_norm": 0.7771977186203003, "kl": 0.28662109375, "learning_rate": 1.0529722834905125e-07, "loss": 0.0228, "reward": 0.6790298409759998, "reward_std": 0.6661486774682999, "rewards/cosine_scaled_reward": -0.06524700409499928, "rewards/format_reward": 0.8095238357782364, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 2247.6012573242188, "epoch": 1.916, "grad_norm": 0.599141001701355, "kl": 0.3212890625, "learning_rate": 1.0482745016665526e-07, "loss": 0.0857, "reward": 0.8667033798992634, "reward_std": 0.8036679923534393, "rewards/cosine_scaled_reward": 0.06132788397371769, "rewards/format_reward": 0.7440476268529892, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 2093.3333740234375, "epoch": 1.92, "grad_norm": 0.5312609076499939, "kl": 0.2958984375, "learning_rate": 1.0437936906629334e-07, "loss": 0.0735, "reward": 0.7348574697971344, "reward_std": 0.689183309674263, "rewards/cosine_scaled_reward": -0.034356983145698905, "rewards/format_reward": 0.8035714328289032, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 2448.1726684570312, "epoch": 1.924, "grad_norm": 0.5402917861938477, "kl": 0.36669921875, "learning_rate": 1.0395300688680625e-07, "loss": 0.0831, "reward": 0.43995974212884903, "reward_std": 0.6862698197364807, "rewards/cosine_scaled_reward": -0.13121061958372593, "rewards/format_reward": 0.70238097012043, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 2658.7977294921875, "epoch": 1.928, "grad_norm": 0.5909121632575989, "kl": 0.3623046875, "learning_rate": 1.0354838440848501e-07, "loss": 0.0547, "reward": 0.5179771184921265, "reward_std": 0.7944772690534592, "rewards/cosine_scaled_reward": -0.0981543204979971, "rewards/format_reward": 0.7142857313156128, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 2292.1428833007812, "epoch": 1.932, "grad_norm": 0.549201488494873, "kl": 0.30615234375, "learning_rate": 1.0316552135205837e-07, "loss": 0.0906, "reward": 0.6546563804149628, "reward_std": 0.7558221146464348, "rewards/cosine_scaled_reward": -0.017909929156303406, "rewards/format_reward": 0.690476194024086, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 2001.2381286621094, "epoch": 1.936, "grad_norm": 0.9190180897712708, "kl": 0.2490234375, "learning_rate": 1.0280443637773163e-07, "loss": -0.0131, "reward": 0.6368911117315292, "reward_std": 0.5770624950528145, "rewards/cosine_scaled_reward": -0.07441157009452581, "rewards/format_reward": 0.7857142835855484, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 2369.482177734375, "epoch": 1.94, "grad_norm": 0.48303577303886414, "kl": 0.32861328125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0806, "reward": 0.5949805751442909, "reward_std": 0.7235869467258453, "rewards/cosine_scaled_reward": -0.04179543023929, "rewards/format_reward": 0.6785714328289032, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2607.375030517578, "epoch": 1.944, "grad_norm": 0.45922327041625977, "kl": 0.334228515625, "learning_rate": 1.0214767000817596e-07, "loss": 0.0495, "reward": 0.4739008713513613, "reward_std": 0.7411531507968903, "rewards/cosine_scaled_reward": -0.10828767996281385, "rewards/format_reward": 0.690476194024086, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 2427.3929443359375, "epoch": 1.948, "grad_norm": 0.42017099261283875, "kl": 0.281494140625, "learning_rate": 1.0185202062281336e-07, "loss": 0.0322, "reward": 0.3904539607465267, "reward_std": 0.6817184686660767, "rewards/cosine_scaled_reward": -0.13810635451227427, "rewards/format_reward": 0.6666666716337204, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 2579.0120239257812, "epoch": 1.952, "grad_norm": 0.7048377394676208, "kl": 0.322265625, "learning_rate": 1.0157821333772304e-07, "loss": 0.028, "reward": 0.5259524993598461, "reward_std": 0.6612162664532661, "rewards/cosine_scaled_reward": -0.09714281000196934, "rewards/format_reward": 0.7202381193637848, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 2246.9524536132812, "epoch": 1.956, "grad_norm": 0.4814748167991638, "kl": 0.313720703125, "learning_rate": 1.013262614978859e-07, "loss": 0.0807, "reward": 0.7873745709657669, "reward_std": 0.7711023241281509, "rewards/cosine_scaled_reward": -0.005122252739965916, "rewards/format_reward": 0.7976190745830536, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 2849.761962890625, "epoch": 1.96, "grad_norm": 0.5227950215339661, "kl": 0.37890625, "learning_rate": 1.0109617738307911e-07, "loss": 0.0171, "reward": 0.4944131616503, "reward_std": 0.6706523001194, "rewards/cosine_scaled_reward": -0.06826960667967796, "rewards/format_reward": 0.6309524029493332, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 2594.3631591796875, "epoch": 1.964, "grad_norm": 0.5116230249404907, "kl": 0.40576171875, "learning_rate": 1.0088797220727779e-07, "loss": 0.0511, "reward": 0.40869739279150963, "reward_std": 0.703234076499939, "rewards/cosine_scaled_reward": -0.12600845471024513, "rewards/format_reward": 0.6607142984867096, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 2416.8632202148438, "epoch": 1.968, "grad_norm": 0.9567095637321472, "kl": 0.27783203125, "learning_rate": 1.0070165611810855e-07, "loss": 0.1235, "reward": 0.5404094010591507, "reward_std": 0.6638472378253937, "rewards/cosine_scaled_reward": -0.054200079292058945, "rewards/format_reward": 0.6488095372915268, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 2514.1666870117188, "epoch": 1.972, "grad_norm": 0.4846276044845581, "kl": 0.28466796875, "learning_rate": 1.005372381963547e-07, "loss": 0.0436, "reward": 0.5605661012232304, "reward_std": 0.6418938338756561, "rewards/cosine_scaled_reward": -0.03519314527511597, "rewards/format_reward": 0.630952388048172, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 2289.71435546875, "epoch": 1.976, "grad_norm": 0.6296063661575317, "kl": 0.3212890625, "learning_rate": 1.0039472645551372e-07, "loss": 0.0439, "reward": 0.6024229377508163, "reward_std": 0.7128957360982895, "rewards/cosine_scaled_reward": -0.07081234554061666, "rewards/format_reward": 0.744047611951828, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 2491.3334045410156, "epoch": 1.98, "grad_norm": 0.622008204460144, "kl": 0.283447265625, "learning_rate": 1.002741278414069e-07, "loss": 0.0361, "reward": 0.594695046544075, "reward_std": 0.6841184943914413, "rewards/cosine_scaled_reward": -0.03896199120208621, "rewards/format_reward": 0.6726190745830536, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 2395.636962890625, "epoch": 1.984, "grad_norm": 0.30918648838996887, "kl": 0.29931640625, "learning_rate": 1.0017544823184055e-07, "loss": 0.0827, "reward": 0.5910248765721917, "reward_std": 0.6182541996240616, "rewards/cosine_scaled_reward": -0.05567805375903845, "rewards/format_reward": 0.70238097012043, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 2368.8631591796875, "epoch": 1.988, "grad_norm": 1.1213865280151367, "kl": 0.3515625, "learning_rate": 1.0009869243631952e-07, "loss": 0.0439, "reward": 0.48846913129091263, "reward_std": 0.6297848075628281, "rewards/cosine_scaled_reward": -0.13374162535183132, "rewards/format_reward": 0.755952388048172, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 2289.3690795898438, "epoch": 1.992, "grad_norm": 0.810573399066925, "kl": 0.302734375, "learning_rate": 1.000438641958131e-07, "loss": 0.0228, "reward": 0.6517780050635338, "reward_std": 0.7580654174089432, "rewards/cosine_scaled_reward": -0.046134804193570744, "rewards/format_reward": 0.7440476417541504, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 2452.2738647460938, "epoch": 1.996, "grad_norm": 0.357543408870697, "kl": 0.32763671875, "learning_rate": 1.0001096618257236e-07, "loss": 0.0643, "reward": 0.3793360572308302, "reward_std": 0.7790006846189499, "rewards/cosine_scaled_reward": -0.15557007491588593, "rewards/format_reward": 0.6904762089252472, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 2528.7500610351562, "epoch": 2.0, "grad_norm": 0.9307948350906372, "kl": 0.306640625, "learning_rate": 1e-07, "loss": 0.1364, "reward": 0.5999207645654678, "reward_std": 0.6981495916843414, "rewards/cosine_scaled_reward": -0.03634915268048644, "rewards/format_reward": 0.6726190596818924, "step": 500 }, { "epoch": 2.0, "step": 500, "total_flos": 0.0, "train_loss": 0.0725239302306436, "train_runtime": 62033.0192, "train_samples_per_second": 1.354, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }