{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2857142857142857, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 3140.2083435058594, "epoch": 0.0005714285714285715, "grad_norm": 0.18733426928520203, "kl": 0.1341552734375, "learning_rate": 0.0, "loss": -0.0125, "reward": 0.13575429469347, "reward_std": 0.2010277360677719, "rewards/cosine_scaled_reward": -0.057122852653265, "rewards/format_reward": 0.25, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 3231.1666870117188, "epoch": 0.001142857142857143, "grad_norm": 0.3353370130062103, "kl": 0.05010986328125, "learning_rate": 2e-08, "loss": 0.1382, "reward": -0.5267819836735725, "reward_std": 0.38683023303747177, "rewards/cosine_scaled_reward": -0.36755766719579697, "rewards/format_reward": 0.2083333358168602, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 3376.1250610351562, "epoch": 0.0017142857142857142, "grad_norm": 0.29977869987487793, "kl": 0.0494384765625, "learning_rate": 4e-08, "loss": 0.0883, "reward": -0.0896148718893528, "reward_std": 0.8756385631859303, "rewards/cosine_scaled_reward": -0.12814077525399625, "rewards/format_reward": 0.1666666716337204, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 3431.9583740234375, "epoch": 0.002285714285714286, "grad_norm": 0.1965445876121521, "kl": 0.0457763671875, "learning_rate": 6e-08, "loss": 0.0477, "reward": -0.19004566967487335, "reward_std": 0.6523252762854099, "rewards/cosine_scaled_reward": -0.17835617810487747, "rewards/format_reward": 0.1666666679084301, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 3436.8333740234375, "epoch": 0.002857142857142857, "grad_norm": 0.20483434200286865, "kl": 0.060546875, "learning_rate": 8e-08, "loss": 0.0513, "reward": -0.4698427654802799, "reward_std": 0.36434993892908096, "rewards/cosine_scaled_reward": -0.2974213883280754, "rewards/format_reward": 0.125, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 3070.0416870117188, "epoch": 0.0034285714285714284, "grad_norm": 0.28048911690711975, "kl": 0.044189453125, "learning_rate": 1e-07, "loss": 0.153, "reward": -0.19587285071611404, "reward_std": 0.4000375494360924, "rewards/cosine_scaled_reward": -0.1812697658315301, "rewards/format_reward": 0.1666666716337204, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 3141.4583435058594, "epoch": 0.004, "grad_norm": 0.17636096477508545, "kl": 0.047271728515625, "learning_rate": 1.2e-07, "loss": 0.0253, "reward": -0.18599995225667953, "reward_std": 0.2766329199075699, "rewards/cosine_scaled_reward": -0.19716664776206017, "rewards/format_reward": 0.2083333432674408, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 3150.5833435058594, "epoch": 0.004571428571428572, "grad_norm": 0.23299627006053925, "kl": 0.042236328125, "learning_rate": 1.4e-07, "loss": 0.0593, "reward": -0.5499451458454132, "reward_std": 0.25627370551228523, "rewards/cosine_scaled_reward": -0.3791392296552658, "rewards/format_reward": 0.2083333432674408, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 3046.666717529297, "epoch": 0.005142857142857143, "grad_norm": 0.23352369666099548, "kl": 0.04388427734375, "learning_rate": 1.6e-07, "loss": 0.0801, "reward": 0.21459830552339554, "reward_std": 0.5846549328416586, "rewards/cosine_scaled_reward": -0.05936753377318382, "rewards/format_reward": 0.3333333358168602, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 3295.9583740234375, "epoch": 0.005714285714285714, "grad_norm": 0.1519400179386139, "kl": 0.03594970703125, "learning_rate": 1.8e-07, "loss": 0.0366, "reward": 0.6032139137387276, "reward_std": 1.0609627589583397, "rewards/cosine_scaled_reward": 0.11410695873200893, "rewards/format_reward": 0.3750000111758709, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2851.3333740234375, "epoch": 0.006285714285714286, "grad_norm": 0.2642011344432831, "kl": 0.05914306640625, "learning_rate": 2e-07, "loss": 0.0972, "reward": -0.20808421075344086, "reward_std": 0.41022560093551874, "rewards/cosine_scaled_reward": -0.24987544119358063, "rewards/format_reward": 0.2916666679084301, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 3317.0833740234375, "epoch": 0.006857142857142857, "grad_norm": 0.18985775113105774, "kl": 0.0611572265625, "learning_rate": 2.1999999999999998e-07, "loss": 0.0187, "reward": -0.09276259876787663, "reward_std": 0.3626005630940199, "rewards/cosine_scaled_reward": -0.19221464078873396, "rewards/format_reward": 0.2916666679084301, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 3030.250030517578, "epoch": 0.0074285714285714285, "grad_norm": 0.3722766935825348, "kl": 0.043060302734375, "learning_rate": 2.4e-07, "loss": 0.203, "reward": 0.2502866378054023, "reward_std": 0.5806700736284256, "rewards/cosine_scaled_reward": -0.020690007135272026, "rewards/format_reward": 0.2916666716337204, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 3173.0833740234375, "epoch": 0.008, "grad_norm": 0.21710394322872162, "kl": 0.039794921875, "learning_rate": 2.6e-07, "loss": 0.128, "reward": -0.06073956936597824, "reward_std": 0.5397324226796627, "rewards/cosine_scaled_reward": -0.15536978468298912, "rewards/format_reward": 0.2500000111758709, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 3022.125, "epoch": 0.008571428571428572, "grad_norm": 0.2290153205394745, "kl": 0.0469970703125, "learning_rate": 2.8e-07, "loss": 0.1717, "reward": -0.36692222114652395, "reward_std": 0.1796758584678173, "rewards/cosine_scaled_reward": -0.2876277659088373, "rewards/format_reward": 0.2083333432674408, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 3381.4583740234375, "epoch": 0.009142857142857144, "grad_norm": 0.18353527784347534, "kl": 0.045654296875, "learning_rate": 3e-07, "loss": 0.035, "reward": 0.09733710438013077, "reward_std": 0.7802678793668747, "rewards/cosine_scaled_reward": -0.05549812689423561, "rewards/format_reward": 0.2083333395421505, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.009714285714285713, "grad_norm": 0.17221970856189728, "kl": 0.05108642578125, "learning_rate": 3.2e-07, "loss": 0.0002, "reward": -0.521545228548348, "reward_std": 0.3114899694919586, "rewards/cosine_scaled_reward": -0.28160594776272774, "rewards/format_reward": 0.0416666679084301, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 3568.375, "epoch": 0.010285714285714285, "grad_norm": 0.146370992064476, "kl": 0.04718017578125, "learning_rate": 3.4000000000000003e-07, "loss": 0.0092, "reward": -0.44301459565758705, "reward_std": 0.5276463013142347, "rewards/cosine_scaled_reward": -0.24234064668416977, "rewards/format_reward": 0.0416666679084301, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 3245.1666870117188, "epoch": 0.010857142857142857, "grad_norm": 0.1941138356924057, "kl": 0.04986572265625, "learning_rate": 3.6e-07, "loss": 0.0432, "reward": 0.06469105184078217, "reward_std": 0.7173988372087479, "rewards/cosine_scaled_reward": -0.11348781548440456, "rewards/format_reward": 0.2916666716337204, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 2396.625030517578, "epoch": 0.011428571428571429, "grad_norm": 0.2624322772026062, "kl": 0.0740966796875, "learning_rate": 3.7999999999999996e-07, "loss": 0.034, "reward": 0.6277919709682465, "reward_std": 0.7424036711454391, "rewards/cosine_scaled_reward": 0.06389598548412323, "rewards/format_reward": 0.5000000111758709, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 2653.6250610351562, "epoch": 0.012, "grad_norm": 0.2804762125015259, "kl": 0.05694580078125, "learning_rate": 4e-07, "loss": -0.0701, "reward": -0.026539891958236694, "reward_std": 0.4789135903120041, "rewards/cosine_scaled_reward": -0.26326995715498924, "rewards/format_reward": 0.5000000111758709, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 3540.2083740234375, "epoch": 0.012571428571428572, "grad_norm": 0.13607840240001678, "kl": 0.04083251953125, "learning_rate": 4.1999999999999995e-07, "loss": 0.0263, "reward": -0.28362276405096054, "reward_std": 0.4733579605817795, "rewards/cosine_scaled_reward": -0.16264472343027592, "rewards/format_reward": 0.0416666679084301, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 3071.9166870117188, "epoch": 0.013142857142857144, "grad_norm": 0.3339034914970398, "kl": 0.05059814453125, "learning_rate": 4.3999999999999997e-07, "loss": 0.1545, "reward": -0.3420925512909889, "reward_std": 0.4987642988562584, "rewards/cosine_scaled_reward": -0.27521293610334396, "rewards/format_reward": 0.2083333358168602, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 3343.0000610351562, "epoch": 0.013714285714285714, "grad_norm": 0.19908681511878967, "kl": 0.04998779296875, "learning_rate": 4.6e-07, "loss": 0.0843, "reward": -0.06604887545108795, "reward_std": 0.8876266591250896, "rewards/cosine_scaled_reward": -0.11635777913033962, "rewards/format_reward": 0.1666666679084301, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 2447.2500610351562, "epoch": 0.014285714285714285, "grad_norm": 0.2169143408536911, "kl": 0.044097900390625, "learning_rate": 4.8e-07, "loss": 0.014, "reward": 0.9716870114207268, "reward_std": 0.41379065811634064, "rewards/cosine_scaled_reward": 0.1733434647321701, "rewards/format_reward": 0.625, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 3371.125, "epoch": 0.014857142857142857, "grad_norm": 0.1605193167924881, "kl": 0.049896240234375, "learning_rate": 5e-07, "loss": 0.0591, "reward": -0.2308735428377986, "reward_std": 0.41561231948435307, "rewards/cosine_scaled_reward": -0.1779367751441896, "rewards/format_reward": 0.125, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3489.125, "epoch": 0.015428571428571429, "grad_norm": 0.14547424018383026, "kl": 0.043914794921875, "learning_rate": 5.2e-07, "loss": 0.0077, "reward": -0.3666146732866764, "reward_std": 0.313198696821928, "rewards/cosine_scaled_reward": -0.24580733105540276, "rewards/format_reward": 0.125, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 3418.5833740234375, "epoch": 0.016, "grad_norm": 0.19328951835632324, "kl": 0.04888916015625, "learning_rate": 5.4e-07, "loss": 0.057, "reward": -0.2427891194820404, "reward_std": 0.4138132072985172, "rewards/cosine_scaled_reward": -0.18389457929879427, "rewards/format_reward": 0.125, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2894.2083435058594, "epoch": 0.01657142857142857, "grad_norm": 0.20895135402679443, "kl": 0.0489501953125, "learning_rate": 5.6e-07, "loss": -0.0056, "reward": 0.09402500465512276, "reward_std": 0.35269030928611755, "rewards/cosine_scaled_reward": -0.07798751257359982, "rewards/format_reward": 0.25, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.017142857142857144, "grad_norm": 0.2097480595111847, "kl": 0.066162109375, "learning_rate": 5.8e-07, "loss": 0.0003, "reward": -0.6841397285461426, "reward_std": 0.1242629922926426, "rewards/cosine_scaled_reward": -0.3420698642730713, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2740.9166870117188, "epoch": 0.017714285714285714, "grad_norm": 0.2932406961917877, "kl": 0.0579833984375, "learning_rate": 6e-07, "loss": 0.1181, "reward": -0.3630891740322113, "reward_std": 0.44184136018157005, "rewards/cosine_scaled_reward": -0.32737791910767555, "rewards/format_reward": 0.2916666679084301, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2855.375, "epoch": 0.018285714285714287, "grad_norm": 0.18361368775367737, "kl": 0.04327392578125, "learning_rate": 6.2e-07, "loss": -0.0227, "reward": 0.5959962904453278, "reward_std": 0.762365136295557, "rewards/cosine_scaled_reward": 0.06883148103952408, "rewards/format_reward": 0.4583333432674408, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.018857142857142857, "grad_norm": 0.16273260116577148, "kl": 0.0531005859375, "learning_rate": 6.4e-07, "loss": 0.0002, "reward": -0.4899278059601784, "reward_std": 0.16865173168480396, "rewards/cosine_scaled_reward": -0.2449638955295086, "rewards/format_reward": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 3541.4166870117188, "epoch": 0.019428571428571427, "grad_norm": 0.18354347348213196, "kl": 0.045654296875, "learning_rate": 6.6e-07, "loss": 0.0147, "reward": -0.531697541475296, "reward_std": 0.3807820826768875, "rewards/cosine_scaled_reward": -0.2866821028292179, "rewards/format_reward": 0.0416666679084301, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3476.3333740234375, "epoch": 0.02, "grad_norm": 0.15978805720806122, "kl": 0.0550537109375, "learning_rate": 6.800000000000001e-07, "loss": 0.0515, "reward": -0.6665981858968735, "reward_std": 0.2222603689879179, "rewards/cosine_scaled_reward": -0.3541324511170387, "rewards/format_reward": 0.0416666679084301, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 3353.0416870117188, "epoch": 0.02057142857142857, "grad_norm": 0.22307412326335907, "kl": 0.05029296875, "learning_rate": 7e-07, "loss": 0.0934, "reward": -0.26416327990591526, "reward_std": 0.2276486847549677, "rewards/cosine_scaled_reward": -0.1945816483348608, "rewards/format_reward": 0.1250000037252903, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.021142857142857144, "grad_norm": 0.1473287045955658, "kl": 0.040771484375, "learning_rate": 7.2e-07, "loss": 0.0002, "reward": -0.50784532725811, "reward_std": 0.24061324447393417, "rewards/cosine_scaled_reward": -0.2539226710796356, "rewards/format_reward": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3406.416748046875, "epoch": 0.021714285714285714, "grad_norm": 0.22248540818691254, "kl": 0.0460205078125, "learning_rate": 7.4e-07, "loss": 0.0835, "reward": -0.5920155718922615, "reward_std": 0.3137332946062088, "rewards/cosine_scaled_reward": -0.37934111058712006, "rewards/format_reward": 0.1666666679084301, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2601.416717529297, "epoch": 0.022285714285714287, "grad_norm": 0.4459127187728882, "kl": 0.0550537109375, "learning_rate": 7.599999999999999e-07, "loss": 0.3106, "reward": 0.2684231176972389, "reward_std": 0.47980744019150734, "rewards/cosine_scaled_reward": -0.09495509788393974, "rewards/format_reward": 0.4583333544433117, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.022857142857142857, "grad_norm": 0.15880325436592102, "kl": 0.04510498046875, "learning_rate": 7.799999999999999e-07, "loss": 0.0002, "reward": -0.6260051801800728, "reward_std": 0.1923852041363716, "rewards/cosine_scaled_reward": -0.3130025826394558, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 3001.625030517578, "epoch": 0.023428571428571427, "grad_norm": 0.16983532905578613, "kl": 0.05487060546875, "learning_rate": 8e-07, "loss": 0.041, "reward": 0.07866484671831131, "reward_std": 0.4578991234302521, "rewards/cosine_scaled_reward": -0.10650091245770454, "rewards/format_reward": 0.2916666679084301, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2764.000030517578, "epoch": 0.024, "grad_norm": 0.24087683856487274, "kl": 0.038360595703125, "learning_rate": 8.199999999999999e-07, "loss": 0.0531, "reward": 0.32172612100839615, "reward_std": 0.48358193784952164, "rewards/cosine_scaled_reward": -0.026636939495801926, "rewards/format_reward": 0.375, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.02457142857142857, "grad_norm": 0.21731670200824738, "kl": 0.0496826171875, "learning_rate": 8.399999999999999e-07, "loss": 0.0002, "reward": -0.5966501906514168, "reward_std": 0.23214636743068695, "rewards/cosine_scaled_reward": -0.298325102776289, "rewards/format_reward": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.025142857142857144, "grad_norm": 0.16481076180934906, "kl": 0.044677734375, "learning_rate": 8.599999999999999e-07, "loss": 0.0002, "reward": -0.6713002845644951, "reward_std": 0.19106930866837502, "rewards/cosine_scaled_reward": -0.33565014228224754, "rewards/format_reward": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3372.2500610351562, "epoch": 0.025714285714285714, "grad_norm": 0.17415094375610352, "kl": 0.0469970703125, "learning_rate": 8.799999999999999e-07, "loss": 0.0527, "reward": -0.5838596299290657, "reward_std": 0.42229044809937477, "rewards/cosine_scaled_reward": -0.31276314333081245, "rewards/format_reward": 0.0416666679084301, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 3363.375, "epoch": 0.026285714285714287, "grad_norm": 0.21891918778419495, "kl": 0.04705810546875, "learning_rate": 9e-07, "loss": 0.08, "reward": -0.1459154188632965, "reward_std": 0.8675736896693707, "rewards/cosine_scaled_reward": -0.17712438106536865, "rewards/format_reward": 0.2083333358168602, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2864.9583435058594, "epoch": 0.026857142857142857, "grad_norm": 0.20759688317775726, "kl": 0.03955078125, "learning_rate": 9.2e-07, "loss": 0.0259, "reward": -0.1351792812347412, "reward_std": 0.3917945884168148, "rewards/cosine_scaled_reward": -0.19258963316679, "rewards/format_reward": 0.25, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2618.416717529297, "epoch": 0.027428571428571427, "grad_norm": 0.19388090074062347, "kl": 0.04571533203125, "learning_rate": 9.399999999999999e-07, "loss": 0.072, "reward": 0.5992091596126556, "reward_std": 1.018235296010971, "rewards/cosine_scaled_reward": 0.007937910500913858, "rewards/format_reward": 0.5833333432674408, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2918.5833435058594, "epoch": 0.028, "grad_norm": 0.27869799733161926, "kl": 0.0540771484375, "learning_rate": 9.6e-07, "loss": 0.1327, "reward": 0.18005166947841644, "reward_std": 0.7894617840647697, "rewards/cosine_scaled_reward": -0.07664081640541553, "rewards/format_reward": 0.3333333432674408, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 3575.375, "epoch": 0.02857142857142857, "grad_norm": 0.16276530921459198, "kl": 0.04864501953125, "learning_rate": 9.8e-07, "loss": 0.0023, "reward": -0.4990532919764519, "reward_std": 0.28135714679956436, "rewards/cosine_scaled_reward": -0.29119331762194633, "rewards/format_reward": 0.0833333358168602, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2936.0, "epoch": 0.029142857142857144, "grad_norm": 0.14888997375965118, "kl": 0.04449462890625, "learning_rate": 1e-06, "loss": -0.043, "reward": 0.0526563823223114, "reward_std": 0.32037340477108955, "rewards/cosine_scaled_reward": -0.0986718013882637, "rewards/format_reward": 0.25, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.029714285714285714, "grad_norm": 0.16475068032741547, "kl": 0.04327392578125, "learning_rate": 9.999890338174275e-07, "loss": 0.0002, "reward": -0.734376922249794, "reward_std": 0.2161643784493208, "rewards/cosine_scaled_reward": -0.3671884685754776, "rewards/format_reward": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3400.1666870117188, "epoch": 0.030285714285714287, "grad_norm": 0.17514079809188843, "kl": 0.03863525390625, "learning_rate": 9.999561358041868e-07, "loss": 0.0608, "reward": -0.09651139751076698, "reward_std": 0.5052468162029982, "rewards/cosine_scaled_reward": -0.11075571551918983, "rewards/format_reward": 0.125, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3467.6666870117188, "epoch": 0.030857142857142857, "grad_norm": 0.19022035598754883, "kl": 0.05419921875, "learning_rate": 9.999013075636804e-07, "loss": 0.0556, "reward": -0.5887648984789848, "reward_std": 0.2710861638188362, "rewards/cosine_scaled_reward": -0.3360491245985031, "rewards/format_reward": 0.0833333358168602, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.03142857142857143, "grad_norm": 0.15405318140983582, "kl": 0.05474853515625, "learning_rate": 9.998245517681593e-07, "loss": 0.0002, "reward": -0.6332229375839233, "reward_std": 0.44320254772901535, "rewards/cosine_scaled_reward": -0.33744481950998306, "rewards/format_reward": 0.0416666679084301, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3520.0, "epoch": 0.032, "grad_norm": 0.16492366790771484, "kl": 0.0576171875, "learning_rate": 9.997258721585931e-07, "loss": 0.0222, "reward": -0.5373398922383785, "reward_std": 0.3259655721485615, "rewards/cosine_scaled_reward": -0.31033661775290966, "rewards/format_reward": 0.0833333358168602, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3196.5416870117188, "epoch": 0.03257142857142857, "grad_norm": 0.3228585124015808, "kl": 0.09649658203125, "learning_rate": 9.996052735444862e-07, "loss": -0.0253, "reward": 0.02994374930858612, "reward_std": 0.391297597438097, "rewards/cosine_scaled_reward": -0.11002812534570694, "rewards/format_reward": 0.25, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.03314285714285714, "grad_norm": 0.15809538960456848, "kl": 0.0439453125, "learning_rate": 9.994627618036452e-07, "loss": 0.0002, "reward": -0.7625578194856644, "reward_std": 0.20482752844691277, "rewards/cosine_scaled_reward": -0.381278894841671, "rewards/format_reward": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.03371428571428572, "grad_norm": 0.16887404024600983, "kl": 0.0562744140625, "learning_rate": 9.992983438818915e-07, "loss": 0.0002, "reward": -0.511197448708117, "reward_std": 0.14204201754182577, "rewards/cosine_scaled_reward": -0.2555987243540585, "rewards/format_reward": 0.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3469.0416870117188, "epoch": 0.03428571428571429, "grad_norm": 0.14915870130062103, "kl": 0.03826904296875, "learning_rate": 9.991120277927223e-07, "loss": 0.0642, "reward": -0.617660641670227, "reward_std": 0.24892418831586838, "rewards/cosine_scaled_reward": -0.329663660377264, "rewards/format_reward": 0.0416666679084301, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 2802.791717529297, "epoch": 0.03485714285714286, "grad_norm": 0.20380046963691711, "kl": 0.0477294921875, "learning_rate": 9.989038226169207e-07, "loss": 0.1085, "reward": -0.008406132459640503, "reward_std": 0.8550728969275951, "rewards/cosine_scaled_reward": -0.17086973786354065, "rewards/format_reward": 0.3333333358168602, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2853.2083435058594, "epoch": 0.03542857142857143, "grad_norm": 0.31721076369285583, "kl": 0.06878662109375, "learning_rate": 9.98673738502114e-07, "loss": 0.0961, "reward": -0.38323642313480377, "reward_std": 0.25505492370575666, "rewards/cosine_scaled_reward": -0.3166182152926922, "rewards/format_reward": 0.25, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.036, "grad_norm": 0.15038253366947174, "kl": 0.04864501953125, "learning_rate": 9.98421786662277e-07, "loss": 0.0002, "reward": -0.4516504108905792, "reward_std": 0.26408347859978676, "rewards/cosine_scaled_reward": -0.22582519799470901, "rewards/format_reward": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 3534.2916870117188, "epoch": 0.036571428571428574, "grad_norm": 0.1737290471792221, "kl": 0.049896240234375, "learning_rate": 9.981479793771866e-07, "loss": 0.0283, "reward": -0.6200313568115234, "reward_std": 0.1568075306713581, "rewards/cosine_scaled_reward": -0.3308490067720413, "rewards/format_reward": 0.0416666679084301, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.037142857142857144, "grad_norm": 0.15397769212722778, "kl": 0.05743408203125, "learning_rate": 9.97852329991824e-07, "loss": 0.0002, "reward": -0.4742198493331671, "reward_std": 0.20335539802908897, "rewards/cosine_scaled_reward": -0.2371099255979061, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 3236.666748046875, "epoch": 0.037714285714285714, "grad_norm": 0.19795703887939453, "kl": 0.03839111328125, "learning_rate": 9.975348529157229e-07, "loss": 0.1441, "reward": 0.2815367206931114, "reward_std": 1.0324797630310059, "rewards/cosine_scaled_reward": -0.025898311287164688, "rewards/format_reward": 0.3333333395421505, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 2816.375, "epoch": 0.038285714285714284, "grad_norm": 0.19631832838058472, "kl": 0.04827880859375, "learning_rate": 9.971955636222684e-07, "loss": -0.0521, "reward": -0.030950482934713364, "reward_std": 0.6847976944409311, "rewards/cosine_scaled_reward": -0.16130859032273293, "rewards/format_reward": 0.2916666679084301, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.038857142857142854, "grad_norm": 0.271883100271225, "kl": 0.07073974609375, "learning_rate": 9.968344786479415e-07, "loss": 0.0003, "reward": -0.6612199693918228, "reward_std": 0.1911415420472622, "rewards/cosine_scaled_reward": -0.3306099846959114, "rewards/format_reward": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 3492.9583740234375, "epoch": 0.03942857142857143, "grad_norm": 0.14704446494579315, "kl": 0.04931640625, "learning_rate": 9.964516155915151e-07, "loss": 0.0101, "reward": -0.46425507962703705, "reward_std": 0.4483284428715706, "rewards/cosine_scaled_reward": -0.2946275472640991, "rewards/format_reward": 0.125, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 3439.3750610351562, "epoch": 0.04, "grad_norm": 0.2434043288230896, "kl": 0.05535888671875, "learning_rate": 9.960469931131936e-07, "loss": 0.0798, "reward": -0.6196610480546951, "reward_std": 0.322536863386631, "rewards/cosine_scaled_reward": -0.35149718821048737, "rewards/format_reward": 0.0833333358168602, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 2774.1250610351562, "epoch": 0.04057142857142857, "grad_norm": 0.32268911600112915, "kl": 0.0482177734375, "learning_rate": 9.956206309337066e-07, "loss": 0.1546, "reward": 0.546534039080143, "reward_std": 0.8966285344213247, "rewards/cosine_scaled_reward": 0.0024337023496627808, "rewards/format_reward": 0.5416666865348816, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 3378.0833740234375, "epoch": 0.04114285714285714, "grad_norm": 0.17585237324237823, "kl": 0.0487060546875, "learning_rate": 9.951725498333448e-07, "loss": 0.066, "reward": -0.14741092920303345, "reward_std": 0.6694340538233519, "rewards/cosine_scaled_reward": -0.15703882090747356, "rewards/format_reward": 0.1666666679084301, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 3107.8750610351562, "epoch": 0.04171428571428572, "grad_norm": 0.24229124188423157, "kl": 0.05169677734375, "learning_rate": 9.947027716509488e-07, "loss": 0.0981, "reward": -0.270541962236166, "reward_std": 0.5891504883766174, "rewards/cosine_scaled_reward": -0.30193765088915825, "rewards/format_reward": 0.3333333432674408, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 3386.25, "epoch": 0.04228571428571429, "grad_norm": 0.16542011499404907, "kl": 0.04302978515625, "learning_rate": 9.942113192828444e-07, "loss": 0.0643, "reward": -0.47412845492362976, "reward_std": 0.3468447830528021, "rewards/cosine_scaled_reward": -0.2787308990955353, "rewards/format_reward": 0.0833333358168602, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 3475.6666870117188, "epoch": 0.04285714285714286, "grad_norm": 0.1598564237356186, "kl": 0.04718017578125, "learning_rate": 9.93698216681727e-07, "loss": 0.0421, "reward": -0.3513486757874489, "reward_std": 0.7791556939482689, "rewards/cosine_scaled_reward": -0.23817433044314384, "rewards/format_reward": 0.1250000037252903, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3499.7083740234375, "epoch": 0.04342857142857143, "grad_norm": 0.15245261788368225, "kl": 0.05072021484375, "learning_rate": 9.931634888554935e-07, "loss": 0.0273, "reward": -0.26299357414245605, "reward_std": 0.615978293120861, "rewards/cosine_scaled_reward": -0.19399680197238922, "rewards/format_reward": 0.125, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 3544.5416870117188, "epoch": 0.044, "grad_norm": 0.17183570563793182, "kl": 0.05072021484375, "learning_rate": 9.926071618660237e-07, "loss": 0.015, "reward": -0.29701984860002995, "reward_std": 0.553566699847579, "rewards/cosine_scaled_reward": -0.23184325452893972, "rewards/format_reward": 0.1666666679084301, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 3498.0000610351562, "epoch": 0.044571428571428574, "grad_norm": 0.16006886959075928, "kl": 0.04241943359375, "learning_rate": 9.9202926282791e-07, "loss": 0.0266, "reward": -0.0037414096295833588, "reward_std": 0.9226736649870872, "rewards/cosine_scaled_reward": -0.10603736154735088, "rewards/format_reward": 0.2083333358168602, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 3099.75, "epoch": 0.045142857142857144, "grad_norm": 0.15230302512645721, "kl": 0.0433349609375, "learning_rate": 9.91429819907136e-07, "loss": -0.0371, "reward": -0.008567571640014648, "reward_std": 0.4445110894739628, "rewards/cosine_scaled_reward": -0.12928379327058792, "rewards/format_reward": 0.25, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 3559.0833740234375, "epoch": 0.045714285714285714, "grad_norm": 0.1428958624601364, "kl": 0.0391845703125, "learning_rate": 9.908088623197048e-07, "loss": 0.0109, "reward": -0.22227831184864044, "reward_std": 0.46194060891866684, "rewards/cosine_scaled_reward": -0.13197248615324497, "rewards/format_reward": 0.0416666679084301, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 3474.541748046875, "epoch": 0.046285714285714284, "grad_norm": 0.19979149103164673, "kl": 0.0528564453125, "learning_rate": 9.901664203302124e-07, "loss": 0.0265, "reward": 0.20810034382157028, "reward_std": 0.55513103492558, "rewards/cosine_scaled_reward": -0.00011649727821350098, "rewards/format_reward": 0.2083333395421505, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 3402.75, "epoch": 0.046857142857142854, "grad_norm": 0.16143812239170074, "kl": 0.04791259765625, "learning_rate": 9.895025252503755e-07, "loss": 0.0423, "reward": -0.17914994060993195, "reward_std": 0.677577305585146, "rewards/cosine_scaled_reward": -0.17290829867124557, "rewards/format_reward": 0.1666666716337204, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 2067.291702270508, "epoch": 0.04742857142857143, "grad_norm": 0.2830398380756378, "kl": 0.073974609375, "learning_rate": 9.888172094375033e-07, "loss": 0.1007, "reward": 0.6157565079629421, "reward_std": 0.6514625661075115, "rewards/cosine_scaled_reward": -0.025455085560679436, "rewards/format_reward": 0.6666666716337204, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 3327.916748046875, "epoch": 0.048, "grad_norm": 0.18658600747585297, "kl": 0.0465087890625, "learning_rate": 9.881105062929221e-07, "loss": 0.049, "reward": -0.15906250849366188, "reward_std": 0.772390453144908, "rewards/cosine_scaled_reward": -0.2045312598347664, "rewards/format_reward": 0.2500000074505806, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 3180.3333740234375, "epoch": 0.04857142857142857, "grad_norm": 0.16970154643058777, "kl": 0.05596923828125, "learning_rate": 9.873824502603459e-07, "loss": 0.0349, "reward": 0.2883519548922777, "reward_std": 0.6625581197440624, "rewards/cosine_scaled_reward": -0.02249070629477501, "rewards/format_reward": 0.3333333432674408, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 3541.875, "epoch": 0.04914285714285714, "grad_norm": 0.15287643671035767, "kl": 0.04180908203125, "learning_rate": 9.866330768241983e-07, "loss": 0.0253, "reward": -0.5468939123675227, "reward_std": 0.47659813798964024, "rewards/cosine_scaled_reward": -0.29428029619157314, "rewards/format_reward": 0.0416666679084301, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 3427.7916870117188, "epoch": 0.04971428571428571, "grad_norm": 0.1800970584154129, "kl": 0.052978515625, "learning_rate": 9.85862422507884e-07, "loss": 0.0337, "reward": -0.22589577734470367, "reward_std": 0.6039449013769627, "rewards/cosine_scaled_reward": -0.19628122448921204, "rewards/format_reward": 0.1666666716337204, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 3364.125, "epoch": 0.05028571428571429, "grad_norm": 0.1843470185995102, "kl": 0.05328369140625, "learning_rate": 9.850705248720068e-07, "loss": 0.0534, "reward": -0.588555134832859, "reward_std": 0.29554150719195604, "rewards/cosine_scaled_reward": -0.37761090695858, "rewards/format_reward": 0.1666666716337204, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 3422.7916870117188, "epoch": 0.05085714285714286, "grad_norm": 0.16204486787319183, "kl": 0.050537109375, "learning_rate": 9.8425742251254e-07, "loss": 0.0307, "reward": -0.49031344801187515, "reward_std": 0.3072348916903138, "rewards/cosine_scaled_reward": -0.3284900598227978, "rewards/format_reward": 0.1666666716337204, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2920.5833740234375, "epoch": 0.05142857142857143, "grad_norm": 0.2106999307870865, "kl": 0.041412353515625, "learning_rate": 9.83423155058946e-07, "loss": 0.1641, "reward": 0.3364746905863285, "reward_std": 0.8183911889791489, "rewards/cosine_scaled_reward": -0.04009598679840565, "rewards/format_reward": 0.4166666828095913, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 3493.7083740234375, "epoch": 0.052, "grad_norm": 0.20229995250701904, "kl": 0.0487060546875, "learning_rate": 9.825677631722435e-07, "loss": 0.0375, "reward": -0.22211312502622604, "reward_std": 0.800605058670044, "rewards/cosine_scaled_reward": -0.17355656623840332, "rewards/format_reward": 0.1250000037252903, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 3018.25, "epoch": 0.052571428571428575, "grad_norm": 0.2712525427341461, "kl": 0.04443359375, "learning_rate": 9.816912885430258e-07, "loss": 0.1815, "reward": 0.07752631604671478, "reward_std": 0.3367920182645321, "rewards/cosine_scaled_reward": -0.065403513610363, "rewards/format_reward": 0.2083333432674408, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 3192.416748046875, "epoch": 0.053142857142857144, "grad_norm": 0.2336161732673645, "kl": 0.05108642578125, "learning_rate": 9.807937738894303e-07, "loss": 0.1053, "reward": -0.15976980328559875, "reward_std": 0.66871527582407, "rewards/cosine_scaled_reward": -0.20488492399454117, "rewards/format_reward": 0.25, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 3344.7916870117188, "epoch": 0.053714285714285714, "grad_norm": 0.19864843785762787, "kl": 0.0494384765625, "learning_rate": 9.798752629550546e-07, "loss": 0.1064, "reward": -0.45881245099008083, "reward_std": 0.6017686780542135, "rewards/cosine_scaled_reward": -0.3127395585179329, "rewards/format_reward": 0.1666666716337204, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 3160.8751220703125, "epoch": 0.054285714285714284, "grad_norm": 0.7792258262634277, "kl": 0.142333984375, "learning_rate": 9.78935800506826e-07, "loss": 0.1646, "reward": -0.11449402663856745, "reward_std": 0.608274769037962, "rewards/cosine_scaled_reward": -0.18224701657891273, "rewards/format_reward": 0.2500000037252903, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3546.5, "epoch": 0.054857142857142854, "grad_norm": 0.15461793541908264, "kl": 0.045196533203125, "learning_rate": 9.779754323328192e-07, "loss": 0.0222, "reward": -0.1999459322541952, "reward_std": 0.6907303184270859, "rewards/cosine_scaled_reward": -0.12080629542469978, "rewards/format_reward": 0.0416666679084301, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 3418.7083740234375, "epoch": 0.05542857142857143, "grad_norm": 0.17061863839626312, "kl": 0.05731201171875, "learning_rate": 9.769942052400235e-07, "loss": 0.0459, "reward": -0.578557875007391, "reward_std": 0.35796352103352547, "rewards/cosine_scaled_reward": -0.35177892446517944, "rewards/format_reward": 0.125, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 2118.3333435058594, "epoch": 0.056, "grad_norm": 0.28902173042297363, "kl": 0.03961181640625, "learning_rate": 9.759921670520634e-07, "loss": -0.1185, "reward": 0.6223690360784531, "reward_std": 0.46096891909837723, "rewards/cosine_scaled_reward": 0.08201783150434494, "rewards/format_reward": 0.4583333432674408, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 2805.000030517578, "epoch": 0.05657142857142857, "grad_norm": 0.21270987391471863, "kl": 0.0479736328125, "learning_rate": 9.749693666068663e-07, "loss": -0.0392, "reward": 0.10187321389093995, "reward_std": 0.4792479854077101, "rewards/cosine_scaled_reward": -0.09489673189818859, "rewards/format_reward": 0.2916666679084301, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 2614.625045776367, "epoch": 0.05714285714285714, "grad_norm": 0.3168502449989319, "kl": 0.05035400390625, "learning_rate": 9.739258537542835e-07, "loss": 0.0458, "reward": 0.4905807599425316, "reward_std": 0.6621165350079536, "rewards/cosine_scaled_reward": 0.0161236971616745, "rewards/format_reward": 0.4583333432674408, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 3412.8750610351562, "epoch": 0.05771428571428571, "grad_norm": 0.17035293579101562, "kl": 0.0479736328125, "learning_rate": 9.728616793536587e-07, "loss": 0.0864, "reward": -0.51438994333148, "reward_std": 0.4058373123407364, "rewards/cosine_scaled_reward": -0.2988616116344929, "rewards/format_reward": 0.0833333358168602, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.05828571428571429, "grad_norm": 0.1507677137851715, "kl": 0.0511474609375, "learning_rate": 9.717768952713511e-07, "loss": 0.0002, "reward": -0.6419448927044868, "reward_std": 0.14684983156621456, "rewards/cosine_scaled_reward": -0.3209724463522434, "rewards/format_reward": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.05885714285714286, "grad_norm": 0.17621943354606628, "kl": 0.0430908203125, "learning_rate": 9.706715543782064e-07, "loss": 0.0002, "reward": -0.704314574599266, "reward_std": 0.20182611048221588, "rewards/cosine_scaled_reward": -0.3521573022007942, "rewards/format_reward": 0.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.05942857142857143, "grad_norm": 0.15782414376735687, "kl": 0.04888916015625, "learning_rate": 9.695457105469804e-07, "loss": 0.0002, "reward": -0.5785035863518715, "reward_std": 0.25241581723093987, "rewards/cosine_scaled_reward": -0.28925178572535515, "rewards/format_reward": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 3385.5416870117188, "epoch": 0.06, "grad_norm": 0.19064433872699738, "kl": 0.05072021484375, "learning_rate": 9.683994186497132e-07, "loss": 0.0669, "reward": -0.5965285524725914, "reward_std": 0.3510228842496872, "rewards/cosine_scaled_reward": -0.360764279961586, "rewards/format_reward": 0.125, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 2878.3333435058594, "epoch": 0.060571428571428575, "grad_norm": 0.28683388233184814, "kl": 0.0538330078125, "learning_rate": 9.672327345550543e-07, "loss": -0.0562, "reward": 0.14185508340597153, "reward_std": 0.477683924138546, "rewards/cosine_scaled_reward": -0.05407246574759483, "rewards/format_reward": 0.25, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 3124.2916870117188, "epoch": 0.061142857142857145, "grad_norm": 0.20470094680786133, "kl": 0.05194091796875, "learning_rate": 9.66045715125541e-07, "loss": 0.0054, "reward": -0.014784537255764008, "reward_std": 0.49005767330527306, "rewards/cosine_scaled_reward": -0.1115589402616024, "rewards/format_reward": 0.2083333432674408, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 2828.9583435058594, "epoch": 0.061714285714285715, "grad_norm": 0.2078002393245697, "kl": 0.05157470703125, "learning_rate": 9.648384182148252e-07, "loss": 0.13, "reward": -0.3206620067358017, "reward_std": 0.299712959676981, "rewards/cosine_scaled_reward": -0.30616434663534164, "rewards/format_reward": 0.2916666679084301, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.062285714285714285, "grad_norm": 0.16639988124370575, "kl": 0.052459716796875, "learning_rate": 9.636109026648554e-07, "loss": 0.0002, "reward": -0.6066285073757172, "reward_std": 0.14198161102831364, "rewards/cosine_scaled_reward": -0.30331425555050373, "rewards/format_reward": 0.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 3380.2916870117188, "epoch": 0.06285714285714286, "grad_norm": 0.14965060353279114, "kl": 0.04302978515625, "learning_rate": 9.623632283030077e-07, "loss": 0.0395, "reward": -0.39130744338035583, "reward_std": 0.36896876990795135, "rewards/cosine_scaled_reward": -0.2998203821480274, "rewards/format_reward": 0.2083333432674408, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2828.125, "epoch": 0.06342857142857143, "grad_norm": 0.24873219430446625, "kl": 0.0533447265625, "learning_rate": 9.610954559391704e-07, "loss": -0.0065, "reward": 0.19809278845787048, "reward_std": 0.43846164271235466, "rewards/cosine_scaled_reward": -0.025953616946935654, "rewards/format_reward": 0.25, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 3578.875, "epoch": 0.064, "grad_norm": 0.14656062424182892, "kl": 0.0477294921875, "learning_rate": 9.598076473627796e-07, "loss": 0.003, "reward": -0.35288260877132416, "reward_std": 0.4478282080963254, "rewards/cosine_scaled_reward": -0.19727462995797396, "rewards/format_reward": 0.0416666679084301, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 3449.7083740234375, "epoch": 0.06457142857142857, "grad_norm": 0.1971622109413147, "kl": 0.0526123046875, "learning_rate": 9.58499865339809e-07, "loss": 0.0869, "reward": -0.35047246143221855, "reward_std": 0.272010013461113, "rewards/cosine_scaled_reward": -0.19606954976916313, "rewards/format_reward": 0.0416666679084301, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 3555.75, "epoch": 0.06514285714285714, "grad_norm": 0.14575761556625366, "kl": 0.0478515625, "learning_rate": 9.571721736097088e-07, "loss": 0.0159, "reward": -0.647302895784378, "reward_std": 0.2911082152277231, "rewards/cosine_scaled_reward": -0.34448477625846863, "rewards/format_reward": 0.0416666679084301, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 3405.875, "epoch": 0.06571428571428571, "grad_norm": 0.19001184403896332, "kl": 0.05657958984375, "learning_rate": 9.55824636882301e-07, "loss": 0.0777, "reward": -0.39468052983283997, "reward_std": 0.31587182730436325, "rewards/cosine_scaled_reward": -0.23900692909955978, "rewards/format_reward": 0.0833333358168602, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.06628571428571428, "grad_norm": 0.14631710946559906, "kl": 0.04296875, "learning_rate": 9.54457320834625e-07, "loss": 0.0002, "reward": -0.8244208693504333, "reward_std": 0.12861562799662352, "rewards/cosine_scaled_reward": -0.41221044957637787, "rewards/format_reward": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 3315.2083740234375, "epoch": 0.06685714285714285, "grad_norm": 0.18092887103557587, "kl": 0.041412353515625, "learning_rate": 9.530702921077358e-07, "loss": 0.0548, "reward": 0.3274298645555973, "reward_std": 0.6206382885575294, "rewards/cosine_scaled_reward": 0.03871491365134716, "rewards/format_reward": 0.2500000111758709, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 3117.3333435058594, "epoch": 0.06742857142857143, "grad_norm": 0.2688407003879547, "kl": 0.0552978515625, "learning_rate": 9.516636183034564e-07, "loss": 0.1092, "reward": -0.10805931687355042, "reward_std": 0.5824379585683346, "rewards/cosine_scaled_reward": -0.1790296584367752, "rewards/format_reward": 0.2500000111758709, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2461.9166870117188, "epoch": 0.068, "grad_norm": 0.3144585192203522, "kl": 0.07818603515625, "learning_rate": 9.502373679810839e-07, "loss": 0.0948, "reward": 0.7282524108886719, "reward_std": 0.7472279723733664, "rewards/cosine_scaled_reward": 0.13495950400829315, "rewards/format_reward": 0.4583333395421505, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.06857142857142857, "grad_norm": 0.15482838451862335, "kl": 0.05096435546875, "learning_rate": 9.487916106540465e-07, "loss": 0.0002, "reward": -0.7398529201745987, "reward_std": 0.2509063072502613, "rewards/cosine_scaled_reward": -0.36992645263671875, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 3361.0833740234375, "epoch": 0.06914285714285714, "grad_norm": 0.21992942690849304, "kl": 0.0552978515625, "learning_rate": 9.473264167865171e-07, "loss": 0.1049, "reward": -0.4808087758719921, "reward_std": 0.3338266760110855, "rewards/cosine_scaled_reward": -0.2820710465312004, "rewards/format_reward": 0.0833333358168602, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 2814.4583740234375, "epoch": 0.06971428571428571, "grad_norm": 0.23910638689994812, "kl": 0.0614013671875, "learning_rate": 9.458418577899774e-07, "loss": 0.0569, "reward": 0.9797220379114151, "reward_std": 1.128523275256157, "rewards/cosine_scaled_reward": 0.17736097052693367, "rewards/format_reward": 0.6250000149011612, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 3343.7083740234375, "epoch": 0.07028571428571428, "grad_norm": 0.17705029249191284, "kl": 0.0452880859375, "learning_rate": 9.443380060197385e-07, "loss": 0.0985, "reward": -0.2491093035787344, "reward_std": 0.7347416132688522, "rewards/cosine_scaled_reward": -0.24955465272068977, "rewards/format_reward": 0.2500000074505806, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 3565.7083740234375, "epoch": 0.07085714285714285, "grad_norm": 0.14711324870586395, "kl": 0.05926513671875, "learning_rate": 9.428149347714143e-07, "loss": 0.0103, "reward": -0.35953105124644935, "reward_std": 0.38775753043591976, "rewards/cosine_scaled_reward": -0.20059886015951633, "rewards/format_reward": 0.0416666679084301, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 3302.3750610351562, "epoch": 0.07142857142857142, "grad_norm": 0.19406969845294952, "kl": 0.05615234375, "learning_rate": 9.412727182773486e-07, "loss": 0.0624, "reward": -0.2552947551012039, "reward_std": 0.4256477430462837, "rewards/cosine_scaled_reward": -0.23181404545903206, "rewards/format_reward": 0.2083333358168602, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 2113.375015258789, "epoch": 0.072, "grad_norm": 0.25354713201522827, "kl": 0.05657958984375, "learning_rate": 9.397114317029974e-07, "loss": 0.134, "reward": 0.26037219166755676, "reward_std": 0.45451565831899643, "rewards/cosine_scaled_reward": -0.09898056834936142, "rewards/format_reward": 0.4583333432674408, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 3431.375, "epoch": 0.07257142857142856, "grad_norm": 0.1632765680551529, "kl": 0.05291748046875, "learning_rate": 9.381311511432658e-07, "loss": 0.0309, "reward": -0.036101870238780975, "reward_std": 0.7900894656777382, "rewards/cosine_scaled_reward": -0.10138426348567009, "rewards/format_reward": 0.1666666716337204, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3377.8333740234375, "epoch": 0.07314285714285715, "grad_norm": 0.1895456165075302, "kl": 0.048583984375, "learning_rate": 9.36531953618799e-07, "loss": 0.0875, "reward": 0.3225628361105919, "reward_std": 0.5909937657415867, "rewards/cosine_scaled_reward": 0.015448085963726044, "rewards/format_reward": 0.291666679084301, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 3365.6666870117188, "epoch": 0.07371428571428572, "grad_norm": 0.19840599596500397, "kl": 0.04510498046875, "learning_rate": 9.34913917072228e-07, "loss": 0.0565, "reward": -0.14107680320739746, "reward_std": 0.6309686824679375, "rewards/cosine_scaled_reward": -0.21637173369526863, "rewards/format_reward": 0.2916666716337204, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 3476.4583740234375, "epoch": 0.07428571428571429, "grad_norm": 0.1595059335231781, "kl": 0.051025390625, "learning_rate": 9.332771203643714e-07, "loss": -0.0039, "reward": -0.14000652357935905, "reward_std": 0.44728637486696243, "rewards/cosine_scaled_reward": -0.11166992946527898, "rewards/format_reward": 0.0833333358168602, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 3205.7916870117188, "epoch": 0.07485714285714286, "grad_norm": 0.15057101845741272, "kl": 0.0469970703125, "learning_rate": 9.316216432703916e-07, "loss": -0.0239, "reward": -0.4409569948911667, "reward_std": 0.21845832839608192, "rewards/cosine_scaled_reward": -0.34547850489616394, "rewards/format_reward": 0.25, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 3533.7083740234375, "epoch": 0.07542857142857143, "grad_norm": 0.14339689910411835, "kl": 0.04742431640625, "learning_rate": 9.299475664759068e-07, "loss": 0.0081, "reward": -0.17459139972925186, "reward_std": 0.366999352700077, "rewards/cosine_scaled_reward": -0.19146236404776573, "rewards/format_reward": 0.2083333432674408, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 3345.7500610351562, "epoch": 0.076, "grad_norm": 0.28945690393447876, "kl": 0.068115234375, "learning_rate": 9.282549715730579e-07, "loss": 0.0761, "reward": -0.205301433801651, "reward_std": 0.7304530702531338, "rewards/cosine_scaled_reward": -0.185984056442976, "rewards/format_reward": 0.1666666716337204, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.07657142857142857, "grad_norm": 0.13178248703479767, "kl": 0.0406494140625, "learning_rate": 9.265439410565328e-07, "loss": 0.0002, "reward": -0.21339796762913465, "reward_std": 0.14427685737609863, "rewards/cosine_scaled_reward": -0.10669897636398673, "rewards/format_reward": 0.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2877.0833740234375, "epoch": 0.07714285714285714, "grad_norm": 0.6117133498191833, "kl": 0.062744140625, "learning_rate": 9.248145583195447e-07, "loss": -0.1135, "reward": -0.3104187399148941, "reward_std": 0.4863443411886692, "rewards/cosine_scaled_reward": -0.30104270949959755, "rewards/format_reward": 0.2916666679084301, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 2646.166748046875, "epoch": 0.07771428571428571, "grad_norm": 0.2174384891986847, "kl": 0.078369140625, "learning_rate": 9.230669076497687e-07, "loss": 0.0099, "reward": 0.501250134781003, "reward_std": 0.6959330216050148, "rewards/cosine_scaled_reward": 0.0006250720471143723, "rewards/format_reward": 0.5000000149011612, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 2857.291717529297, "epoch": 0.07828571428571429, "grad_norm": 0.19513949751853943, "kl": 0.03631591796875, "learning_rate": 9.213010742252327e-07, "loss": 0.0129, "reward": 0.05199408531188965, "reward_std": 0.5263700187206268, "rewards/cosine_scaled_reward": -0.16150296479463577, "rewards/format_reward": 0.375, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2722.4583435058594, "epoch": 0.07885714285714286, "grad_norm": 0.22437696158885956, "kl": 0.048919677734375, "learning_rate": 9.195171441101668e-07, "loss": 0.153, "reward": 0.05756654590368271, "reward_std": 0.4655684223398566, "rewards/cosine_scaled_reward": -0.17955005168914795, "rewards/format_reward": 0.4166666865348816, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 3546.625, "epoch": 0.07942857142857143, "grad_norm": 0.14434434473514557, "kl": 0.04290771484375, "learning_rate": 9.177152042508077e-07, "loss": 0.016, "reward": -0.4627462103962898, "reward_std": 0.31377890706062317, "rewards/cosine_scaled_reward": -0.2730397693812847, "rewards/format_reward": 0.0833333358168602, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 3403.2916870117188, "epoch": 0.08, "grad_norm": 0.2034330666065216, "kl": 0.041259765625, "learning_rate": 9.158953424711624e-07, "loss": 0.0195, "reward": 0.1140199825167656, "reward_std": 0.5117178149521351, "rewards/cosine_scaled_reward": -0.0471566803753376, "rewards/format_reward": 0.2083333432674408, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 3293.041748046875, "epoch": 0.08057142857142857, "grad_norm": 0.1963217407464981, "kl": 0.06451416015625, "learning_rate": 9.140576474687263e-07, "loss": 0.0816, "reward": -0.41971880942583084, "reward_std": 0.46679312735795975, "rewards/cosine_scaled_reward": -0.3140260688960552, "rewards/format_reward": 0.2083333358168602, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 3501.2916870117188, "epoch": 0.08114285714285714, "grad_norm": 0.15945497155189514, "kl": 0.06207275390625, "learning_rate": 9.122022088101613e-07, "loss": 0.0237, "reward": 0.19435557164251804, "reward_std": 0.5239780992269516, "rewards/cosine_scaled_reward": 0.034677786752581596, "rewards/format_reward": 0.1250000037252903, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2787.458335876465, "epoch": 0.08171428571428571, "grad_norm": 0.2447752207517624, "kl": 0.0467529296875, "learning_rate": 9.103291169269299e-07, "loss": 0.0355, "reward": 0.38285309448838234, "reward_std": 0.41509686410427094, "rewards/cosine_scaled_reward": 0.045593203976750374, "rewards/format_reward": 0.2916666679084301, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 3170.9584350585938, "epoch": 0.08228571428571428, "grad_norm": 38.13862228393555, "kl": 6.65838623046875, "learning_rate": 9.084384631108882e-07, "loss": 0.1426, "reward": 0.41588541213423014, "reward_std": 1.0277538150548935, "rewards/cosine_scaled_reward": -0.04205727390944958, "rewards/format_reward": 0.5000000074505806, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 3023.7083435058594, "epoch": 0.08285714285714285, "grad_norm": 0.24795998632907867, "kl": 0.23541259765625, "learning_rate": 9.065303395098358e-07, "loss": 0.1092, "reward": 0.08817495405673981, "reward_std": 0.6404087841510773, "rewards/cosine_scaled_reward": -0.0809125080704689, "rewards/format_reward": 0.2500000111758709, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2834.2916870117188, "epoch": 0.08342857142857144, "grad_norm": 0.1742977350950241, "kl": 0.05084228515625, "learning_rate": 9.046048391230247e-07, "loss": 0.0127, "reward": -0.02124733477830887, "reward_std": 0.6222574003040791, "rewards/cosine_scaled_reward": -0.17729033529758453, "rewards/format_reward": 0.3333333358168602, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 3126.5, "epoch": 0.084, "grad_norm": 0.2359555959701538, "kl": 0.0499267578125, "learning_rate": 9.026620557966279e-07, "loss": 0.0858, "reward": 0.02666241116821766, "reward_std": 0.4308905638754368, "rewards/cosine_scaled_reward": -0.09083548840135336, "rewards/format_reward": 0.2083333432674408, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 3065.1666870117188, "epoch": 0.08457142857142858, "grad_norm": 0.16794665157794952, "kl": 0.05181884765625, "learning_rate": 9.007020842191634e-07, "loss": -0.0189, "reward": -0.25704628229141235, "reward_std": 0.5457647405564785, "rewards/cosine_scaled_reward": -0.2951898043975234, "rewards/format_reward": 0.3333333358168602, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 3517.9583740234375, "epoch": 0.08514285714285715, "grad_norm": 0.1912597119808197, "kl": 0.078857421875, "learning_rate": 8.987250199168808e-07, "loss": 0.0277, "reward": -0.3515687808394432, "reward_std": 0.3064217194914818, "rewards/cosine_scaled_reward": -0.2382843866944313, "rewards/format_reward": 0.1250000037252903, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 3309.8333740234375, "epoch": 0.08571428571428572, "grad_norm": 0.17105424404144287, "kl": 0.0648193359375, "learning_rate": 8.967309592491052e-07, "loss": 0.0291, "reward": -0.26190581917762756, "reward_std": 0.4774948377162218, "rewards/cosine_scaled_reward": -0.19345290772616863, "rewards/format_reward": 0.125, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 3336.7500610351562, "epoch": 0.08628571428571429, "grad_norm": 0.15193696320056915, "kl": 0.0372314453125, "learning_rate": 8.9471999940354e-07, "loss": 0.0409, "reward": 0.2141360342502594, "reward_std": 0.7047755531966686, "rewards/cosine_scaled_reward": -0.05959864519536495, "rewards/format_reward": 0.3333333432674408, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 3453.0833740234375, "epoch": 0.08685714285714285, "grad_norm": 0.17530380189418793, "kl": 0.0565185546875, "learning_rate": 8.926922383915315e-07, "loss": 0.0401, "reward": -0.20536936819553375, "reward_std": 0.6435524728149176, "rewards/cosine_scaled_reward": -0.16518468409776688, "rewards/format_reward": 0.125, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 3076.2083435058594, "epoch": 0.08742857142857142, "grad_norm": 0.2100270837545395, "kl": 0.05108642578125, "learning_rate": 8.906477750432903e-07, "loss": 0.1391, "reward": -0.07981336116790771, "reward_std": 0.5872795805335045, "rewards/cosine_scaled_reward": -0.14407330751419067, "rewards/format_reward": 0.2083333432674408, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 3504.4166870117188, "epoch": 0.088, "grad_norm": 0.1730157434940338, "kl": 0.05682373046875, "learning_rate": 8.88586709003076e-07, "loss": 0.026, "reward": -0.5636200718581676, "reward_std": 0.1353142261505127, "rewards/cosine_scaled_reward": -0.30264334939420223, "rewards/format_reward": 0.0416666679084301, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3428.7916870117188, "epoch": 0.08857142857142856, "grad_norm": 0.1775522381067276, "kl": 0.050537109375, "learning_rate": 8.865091407243394e-07, "loss": 0.0571, "reward": -0.22037950158119202, "reward_std": 0.46343767642974854, "rewards/cosine_scaled_reward": -0.1518564149737358, "rewards/format_reward": 0.0833333358168602, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 3031.916717529297, "epoch": 0.08914285714285715, "grad_norm": 0.2878535985946655, "kl": 0.06085205078125, "learning_rate": 8.844151714648274e-07, "loss": 0.1139, "reward": 0.004119843244552612, "reward_std": 0.5550502277910709, "rewards/cosine_scaled_reward": -0.1437734253704548, "rewards/format_reward": 0.291666679084301, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 3031.9166870117188, "epoch": 0.08971428571428572, "grad_norm": 0.1691678762435913, "kl": 0.04876708984375, "learning_rate": 8.823049032816478e-07, "loss": 0.0713, "reward": -0.09758711606264114, "reward_std": 0.5461144000291824, "rewards/cosine_scaled_reward": -0.21546022966504097, "rewards/format_reward": 0.3333333358168602, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 3353.8333740234375, "epoch": 0.09028571428571429, "grad_norm": 0.1852702796459198, "kl": 0.04815673828125, "learning_rate": 8.801784390262943e-07, "loss": 0.0788, "reward": -0.16877157613635063, "reward_std": 0.5984261110424995, "rewards/cosine_scaled_reward": -0.20938578806817532, "rewards/format_reward": 0.2500000111758709, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3472.5833740234375, "epoch": 0.09085714285714286, "grad_norm": 0.2011410892009735, "kl": 0.05548095703125, "learning_rate": 8.780358823396352e-07, "loss": 0.0726, "reward": -0.5162428542971611, "reward_std": 0.5920832827687263, "rewards/cosine_scaled_reward": -0.29978808760643005, "rewards/format_reward": 0.0833333358168602, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09142857142857143, "grad_norm": 0.15513016283512115, "kl": 0.0465087890625, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": -0.45894186943769455, "reward_std": 0.1620104108005762, "rewards/cosine_scaled_reward": -0.22947093471884727, "rewards/format_reward": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.092, "grad_norm": 0.1544281244277954, "kl": 0.05731201171875, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": -0.5140766091644764, "reward_std": 0.2021910808980465, "rewards/cosine_scaled_reward": -0.25703830271959305, "rewards/format_reward": 0.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 3211.875, "epoch": 0.09257142857142857, "grad_norm": 0.18425820767879486, "kl": 0.04522705078125, "learning_rate": 8.715127058347614e-07, "loss": 0.0104, "reward": 0.019165851175785065, "reward_std": 0.553180105984211, "rewards/cosine_scaled_reward": -0.09458375349640846, "rewards/format_reward": 0.2083333432674408, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 3459.7083740234375, "epoch": 0.09314285714285714, "grad_norm": 0.19152286648750305, "kl": 0.068115234375, "learning_rate": 8.693068314414344e-07, "loss": 0.0666, "reward": -0.32729392871260643, "reward_std": 0.5201658196747303, "rewards/cosine_scaled_reward": -0.18448030017316341, "rewards/format_reward": 0.0416666679084301, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 3519.9583740234375, "epoch": 0.09371428571428571, "grad_norm": 0.15800310671329498, "kl": 0.04876708984375, "learning_rate": 8.670853944836176e-07, "loss": 0.0335, "reward": -0.4632922485470772, "reward_std": 0.4524738918989897, "rewards/cosine_scaled_reward": -0.27331279031932354, "rewards/format_reward": 0.0833333358168602, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09428571428571429, "grad_norm": 0.1423046737909317, "kl": 0.04443359375, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": -0.6841404587030411, "reward_std": 0.12779409438371658, "rewards/cosine_scaled_reward": -0.34207022935152054, "rewards/format_reward": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09485714285714286, "grad_norm": 0.15924735367298126, "kl": 0.0531005859375, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": -0.8217453360557556, "reward_std": 0.14010480791330338, "rewards/cosine_scaled_reward": -0.4108726605772972, "rewards/format_reward": 0.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 3299.3750610351562, "epoch": 0.09542857142857143, "grad_norm": 0.1801840364933014, "kl": 0.04791259765625, "learning_rate": 8.603287946810513e-07, "loss": 0.0996, "reward": 0.3397903465665877, "reward_std": 1.7125960290431976, "rewards/cosine_scaled_reward": 0.003228497225791216, "rewards/format_reward": 0.3333333395421505, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 3347.5416870117188, "epoch": 0.096, "grad_norm": 0.1715705245733261, "kl": 0.05438232421875, "learning_rate": 8.580461976679099e-07, "loss": 0.0074, "reward": -0.49858966283500195, "reward_std": 0.5019380133599043, "rewards/cosine_scaled_reward": -0.33262816444039345, "rewards/format_reward": 0.1666666679084301, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 3189.25, "epoch": 0.09657142857142857, "grad_norm": 0.2528628408908844, "kl": 0.04998779296875, "learning_rate": 8.557485869176825e-07, "loss": 0.0895, "reward": -0.13017432391643524, "reward_std": 0.38139653019607067, "rewards/cosine_scaled_reward": -0.16925383359193802, "rewards/format_reward": 0.2083333432674408, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09714285714285714, "grad_norm": 0.14756138622760773, "kl": 0.0531005859375, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": -0.36126868799328804, "reward_std": 0.25860733538866043, "rewards/cosine_scaled_reward": -0.18063434585928917, "rewards/format_reward": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 3454.625, "epoch": 0.09771428571428571, "grad_norm": 0.163909912109375, "kl": 0.050048828125, "learning_rate": 8.511087728614862e-07, "loss": -0.0115, "reward": -0.11545135825872421, "reward_std": 0.5508421286940575, "rewards/cosine_scaled_reward": -0.0785590149462223, "rewards/format_reward": 0.0416666679084301, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 3559.1666870117188, "epoch": 0.09828571428571428, "grad_norm": 0.17620068788528442, "kl": 0.056884765625, "learning_rate": 8.487667956935087e-07, "loss": 0.01, "reward": -0.5306723043322563, "reward_std": 0.20070407167077065, "rewards/cosine_scaled_reward": -0.30700283497571945, "rewards/format_reward": 0.0833333358168602, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 3125.8750610351562, "epoch": 0.09885714285714285, "grad_norm": 0.16078141331672668, "kl": 0.06512451171875, "learning_rate": 8.464102570534061e-07, "loss": -0.0308, "reward": -0.16053162794560194, "reward_std": 0.5758852250874043, "rewards/cosine_scaled_reward": -0.2469324842095375, "rewards/format_reward": 0.3333333358168602, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09942857142857142, "grad_norm": 0.15852725505828857, "kl": 0.06536865234375, "learning_rate": 8.440392717955475e-07, "loss": 0.0003, "reward": -0.5079451501369476, "reward_std": 0.22260741889476776, "rewards/cosine_scaled_reward": -0.2539725750684738, "rewards/format_reward": 0.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 3431.7500610351562, "epoch": 0.1, "grad_norm": 0.15926480293273926, "kl": 0.044921875, "learning_rate": 8.416539554784089e-07, "loss": 0.0673, "reward": -0.14099129289388657, "reward_std": 0.7832577079534531, "rewards/cosine_scaled_reward": -0.1746623208746314, "rewards/format_reward": 0.2083333358168602, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 3245.3750610351562, "epoch": 0.10057142857142858, "grad_norm": 0.17859475314617157, "kl": 0.068115234375, "learning_rate": 8.392544243589427e-07, "loss": 0.0869, "reward": -0.0380060151219368, "reward_std": 0.6192945204675198, "rewards/cosine_scaled_reward": -0.1440030261874199, "rewards/format_reward": 0.2500000074505806, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 3133.875, "epoch": 0.10114285714285715, "grad_norm": 0.15891003608703613, "kl": 0.04754638671875, "learning_rate": 8.368407953869103e-07, "loss": 0.0181, "reward": 0.08751339092850685, "reward_std": 0.16736168786883354, "rewards/cosine_scaled_reward": -0.08124331943690777, "rewards/format_reward": 0.25, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 3558.6666870117188, "epoch": 0.10171428571428572, "grad_norm": 0.14599938690662384, "kl": 0.04962158203125, "learning_rate": 8.344131861991828e-07, "loss": 0.0082, "reward": -0.16755510121583939, "reward_std": 0.7793578952550888, "rewards/cosine_scaled_reward": -0.1462775506079197, "rewards/format_reward": 0.125, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 3514.75, "epoch": 0.10228571428571429, "grad_norm": 0.2012149542570114, "kl": 0.055419921875, "learning_rate": 8.319717151140072e-07, "loss": 0.027, "reward": -0.6090946160256863, "reward_std": 0.3212553486227989, "rewards/cosine_scaled_reward": -0.3462139815092087, "rewards/format_reward": 0.0833333358168602, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.10285714285714286, "grad_norm": 0.15520262718200684, "kl": 0.05035400390625, "learning_rate": 8.295165011252396e-07, "loss": 0.0002, "reward": -0.44433633610606194, "reward_std": 0.4537891875952482, "rewards/cosine_scaled_reward": -0.2638348173350096, "rewards/format_reward": 0.0833333358168602, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.10342857142857143, "grad_norm": 0.1958397924900055, "kl": 0.04217529296875, "learning_rate": 8.270476638965461e-07, "loss": 0.0002, "reward": -0.19999001920223236, "reward_std": 0.5615897215902805, "rewards/cosine_scaled_reward": -0.20416167378425598, "rewards/format_reward": 0.2083333395421505, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.104, "grad_norm": 0.15346062183380127, "kl": 0.05303955078125, "learning_rate": 8.245653237555705e-07, "loss": 0.0002, "reward": -0.7306937873363495, "reward_std": 0.16790879145264626, "rewards/cosine_scaled_reward": -0.36534689366817474, "rewards/format_reward": 0.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 3526.5000610351562, "epoch": 0.10457142857142857, "grad_norm": 0.16117896139621735, "kl": 0.052734375, "learning_rate": 8.220696016880687e-07, "loss": 0.0222, "reward": -0.4183296374976635, "reward_std": 0.4902483597397804, "rewards/cosine_scaled_reward": -0.2924981564283371, "rewards/format_reward": 0.1666666679084301, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 3427.625, "epoch": 0.10514285714285715, "grad_norm": 0.1658228635787964, "kl": 0.0543212890625, "learning_rate": 8.195606193320136e-07, "loss": 0.0467, "reward": -0.23194494098424911, "reward_std": 0.825510136783123, "rewards/cosine_scaled_reward": -0.15763913467526436, "rewards/format_reward": 0.0833333358168602, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2791.125, "epoch": 0.10571428571428572, "grad_norm": 0.42574554681777954, "kl": 0.0634765625, "learning_rate": 8.170384989716657e-07, "loss": 0.0135, "reward": -0.48800092935562134, "reward_std": 0.21989084407687187, "rewards/cosine_scaled_reward": -0.3481671344488859, "rewards/format_reward": 0.2083333432674408, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2866.3333740234375, "epoch": 0.10628571428571429, "grad_norm": 0.20383749902248383, "kl": 0.05389404296875, "learning_rate": 8.145033635316128e-07, "loss": 0.0027, "reward": 0.5907801762223244, "reward_std": 0.6398285947507247, "rewards/cosine_scaled_reward": 0.08705675601959229, "rewards/format_reward": 0.4166666679084301, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.10685714285714286, "grad_norm": 0.1624538004398346, "kl": 0.05572509765625, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": -0.5849527418613434, "reward_std": 0.18283319287002087, "rewards/cosine_scaled_reward": -0.2924763709306717, "rewards/format_reward": 0.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.10742857142857143, "grad_norm": 0.14820487797260284, "kl": 0.05047607421875, "learning_rate": 8.093945422764069e-07, "loss": 0.0002, "reward": -0.5387367829680443, "reward_std": 0.39942592941224575, "rewards/cosine_scaled_reward": -0.2902017207816243, "rewards/format_reward": 0.0416666679084301, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 3558.5833740234375, "epoch": 0.108, "grad_norm": 0.1541958600282669, "kl": 0.04547119140625, "learning_rate": 8.068211054579943e-07, "loss": 0.0146, "reward": -0.5106032621115446, "reward_std": 0.5608320534229279, "rewards/cosine_scaled_reward": -0.2969682924449444, "rewards/format_reward": 0.0833333358168602, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 3313.5833740234375, "epoch": 0.10857142857142857, "grad_norm": 0.2825907766819, "kl": 0.0518798828125, "learning_rate": 8.04235151541222e-07, "loss": 0.0952, "reward": -0.5725184977054596, "reward_std": 0.32253118604421616, "rewards/cosine_scaled_reward": -0.3487592488527298, "rewards/format_reward": 0.125, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2769.500045776367, "epoch": 0.10914285714285714, "grad_norm": 0.751027524471283, "kl": 0.06817626953125, "learning_rate": 8.01636806561836e-07, "loss": -0.0603, "reward": -0.14048952236771584, "reward_std": 0.46144552156329155, "rewards/cosine_scaled_reward": -0.23691142722964287, "rewards/format_reward": 0.3333333432674408, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 3254.0833740234375, "epoch": 0.10971428571428571, "grad_norm": 0.17224395275115967, "kl": 0.06939697265625, "learning_rate": 7.990261971595048e-07, "loss": -0.0575, "reward": 0.1011834591627121, "reward_std": 0.3935512360185385, "rewards/cosine_scaled_reward": -0.07440828531980515, "rewards/format_reward": 0.25, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.11028571428571429, "grad_norm": 0.13426987826824188, "kl": 0.04534912109375, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": -0.6477106511592865, "reward_std": 0.0925671923905611, "rewards/cosine_scaled_reward": -0.32385531067848206, "rewards/format_reward": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 2811.875045776367, "epoch": 0.11085714285714286, "grad_norm": 0.2597994804382324, "kl": 0.08905029296875, "learning_rate": 7.93768694627233e-07, "loss": -0.0486, "reward": 0.23529191315174103, "reward_std": 0.8221290409564972, "rewards/cosine_scaled_reward": -0.04902072250843048, "rewards/format_reward": 0.3333333358168602, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 3352.5833740234375, "epoch": 0.11142857142857143, "grad_norm": 0.23991723358631134, "kl": 0.060546875, "learning_rate": 7.911220577405484e-07, "loss": 0.0555, "reward": -0.0027063414454460144, "reward_std": 0.608617402613163, "rewards/cosine_scaled_reward": -0.1263531558215618, "rewards/format_reward": 0.2500000074505806, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2819.0416717529297, "epoch": 0.112, "grad_norm": 0.24572671949863434, "kl": 0.07049560546875, "learning_rate": 7.884636689049422e-07, "loss": 0.0432, "reward": -0.14430035650730133, "reward_std": 0.4976443909108639, "rewards/cosine_scaled_reward": -0.17631685733795166, "rewards/format_reward": 0.2083333432674408, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 3383.0416870117188, "epoch": 0.11257142857142857, "grad_norm": 0.15762414038181305, "kl": 0.05340576171875, "learning_rate": 7.857936576865356e-07, "loss": 0.0622, "reward": -0.433723047375679, "reward_std": 0.5919782798737288, "rewards/cosine_scaled_reward": -0.2793615125119686, "rewards/format_reward": 0.125, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2997.4166870117188, "epoch": 0.11314285714285714, "grad_norm": 0.1949368119239807, "kl": 0.0482177734375, "learning_rate": 7.831121542179086e-07, "loss": 0.0038, "reward": 0.30401055328547955, "reward_std": 0.3298298269510269, "rewards/cosine_scaled_reward": 0.02700526174157858, "rewards/format_reward": 0.25, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 3476.9583740234375, "epoch": 0.11371428571428571, "grad_norm": 0.16105806827545166, "kl": 0.04974365234375, "learning_rate": 7.804192891917571e-07, "loss": 0.0267, "reward": 0.08356641232967377, "reward_std": 0.5651987139135599, "rewards/cosine_scaled_reward": -0.062383463606238365, "rewards/format_reward": 0.2083333395421505, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 3510.125, "epoch": 0.11428571428571428, "grad_norm": 0.1825430691242218, "kl": 0.06591796875, "learning_rate": 7.777151938545235e-07, "loss": 0.0202, "reward": -0.3780656084418297, "reward_std": 0.22097079828381538, "rewards/cosine_scaled_reward": -0.25153281539678574, "rewards/format_reward": 0.125, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 3476.125, "epoch": 0.11485714285714285, "grad_norm": 0.15311142802238464, "kl": 0.06298828125, "learning_rate": 7.75e-07, "loss": 0.068, "reward": -0.3960627820342779, "reward_std": 0.26806606631726027, "rewards/cosine_scaled_reward": -0.21886471938341856, "rewards/format_reward": 0.0416666679084301, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 3063.3333435058594, "epoch": 0.11542857142857142, "grad_norm": 0.2661282420158386, "kl": 0.050048828125, "learning_rate": 7.72273839962904e-07, "loss": 0.1064, "reward": -0.12841053307056427, "reward_std": 0.5312525816261768, "rewards/cosine_scaled_reward": -0.16837193816900253, "rewards/format_reward": 0.2083333432674408, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 3433.9583740234375, "epoch": 0.116, "grad_norm": 0.20914843678474426, "kl": 0.049560546875, "learning_rate": 7.695368466124296e-07, "loss": 0.0853, "reward": -0.22245256043970585, "reward_std": 0.3599269762635231, "rewards/cosine_scaled_reward": -0.15289294999092817, "rewards/format_reward": 0.0833333358168602, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 2902.2083435058594, "epoch": 0.11657142857142858, "grad_norm": 0.18525813519954681, "kl": 0.04925537109375, "learning_rate": 7.667891533457718e-07, "loss": -0.0045, "reward": -0.38538965582847595, "reward_std": 0.27233337238430977, "rewards/cosine_scaled_reward": -0.29686148278415203, "rewards/format_reward": 0.2083333432674408, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 3465.875, "epoch": 0.11714285714285715, "grad_norm": 0.27074506878852844, "kl": 0.0501708984375, "learning_rate": 7.640308940816239e-07, "loss": 0.0666, "reward": -0.20100652147084475, "reward_std": 0.5179510489106178, "rewards/cosine_scaled_reward": -0.14216993749141693, "rewards/format_reward": 0.0833333358168602, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 3581.5833740234375, "epoch": 0.11771428571428572, "grad_norm": 0.14444300532341003, "kl": 0.06109619140625, "learning_rate": 7.612622032536507e-07, "loss": 0.0009, "reward": -0.007464568130671978, "reward_std": 0.5384093690663576, "rewards/cosine_scaled_reward": -0.0662322910502553, "rewards/format_reward": 0.1250000037252903, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 3480.5833740234375, "epoch": 0.11828571428571429, "grad_norm": 0.17702296376228333, "kl": 0.04974365234375, "learning_rate": 7.584832158039378e-07, "loss": 0.0625, "reward": -0.43253427371382713, "reward_std": 0.3365586632862687, "rewards/cosine_scaled_reward": -0.2579338103532791, "rewards/format_reward": 0.0833333358168602, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 2894.500030517578, "epoch": 0.11885714285714286, "grad_norm": 0.17304745316505432, "kl": 0.06671142578125, "learning_rate": 7.556940671764124e-07, "loss": -0.0349, "reward": 0.5662415772676468, "reward_std": 0.7728973366320133, "rewards/cosine_scaled_reward": 0.0747874528169632, "rewards/format_reward": 0.4166666716337204, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2965.875, "epoch": 0.11942857142857143, "grad_norm": 0.18019767105579376, "kl": 0.04754638671875, "learning_rate": 7.528948933102438e-07, "loss": 0.0153, "reward": 0.2692076712846756, "reward_std": 0.359660305082798, "rewards/cosine_scaled_reward": -0.0320628322660923, "rewards/format_reward": 0.3333333358168602, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 3572.5, "epoch": 0.12, "grad_norm": 0.17177176475524902, "kl": 0.06103515625, "learning_rate": 7.500858306332172e-07, "loss": 0.0069, "reward": -0.5149929281324148, "reward_std": 0.27825676556676626, "rewards/cosine_scaled_reward": -0.278329792432487, "rewards/format_reward": 0.0416666679084301, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 3285.2083740234375, "epoch": 0.12057142857142857, "grad_norm": 0.1756087690591812, "kl": 0.05584716796875, "learning_rate": 7.472670160550848e-07, "loss": 0.0255, "reward": -0.08769790083169937, "reward_std": 0.4134560525417328, "rewards/cosine_scaled_reward": -0.12718229368329048, "rewards/format_reward": 0.1666666716337204, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.12114285714285715, "grad_norm": 0.15736572444438934, "kl": 0.06146240234375, "learning_rate": 7.444385869608921e-07, "loss": 0.0002, "reward": -0.09432668518275023, "reward_std": 0.6111114136874676, "rewards/cosine_scaled_reward": -0.06799666350707412, "rewards/format_reward": 0.0416666679084301, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 3303.0833740234375, "epoch": 0.12171428571428572, "grad_norm": 0.13703764975070953, "kl": 0.04400634765625, "learning_rate": 7.416006812042827e-07, "loss": -0.0403, "reward": 0.23149769008159637, "reward_std": 0.8395795077085495, "rewards/cosine_scaled_reward": -0.009251154959201813, "rewards/format_reward": 0.2500000111758709, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2751.875, "epoch": 0.12228571428571429, "grad_norm": 0.22418977320194244, "kl": 0.05072021484375, "learning_rate": 7.387534371007797e-07, "loss": 0.0599, "reward": -0.3867769241333008, "reward_std": 0.32847015745937824, "rewards/cosine_scaled_reward": -0.3392217978835106, "rewards/format_reward": 0.291666679084301, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.12285714285714286, "grad_norm": 0.16271401941776276, "kl": 0.0491943359375, "learning_rate": 7.358969934210438e-07, "loss": 0.0002, "reward": -0.763099730014801, "reward_std": 0.22296234592795372, "rewards/cosine_scaled_reward": -0.3815498650074005, "rewards/format_reward": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 2543.2084350585938, "epoch": 0.12342857142857143, "grad_norm": 0.19669151306152344, "kl": 0.06500244140625, "learning_rate": 7.330314893841101e-07, "loss": 0.0565, "reward": 1.6257893741130829, "reward_std": 0.8823383823037148, "rewards/cosine_scaled_reward": 0.4378946740180254, "rewards/format_reward": 0.7500000074505806, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 3438.5833740234375, "epoch": 0.124, "grad_norm": 0.19945161044597626, "kl": 0.044189453125, "learning_rate": 7.301570646506027e-07, "loss": 0.06, "reward": -0.2967698462307453, "reward_std": 0.30110811814665794, "rewards/cosine_scaled_reward": -0.19005159474909306, "rewards/format_reward": 0.0833333358168602, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 3469.9584350585938, "epoch": 0.12457142857142857, "grad_norm": 0.24981477856636047, "kl": 0.056640625, "learning_rate": 7.27273859315928e-07, "loss": 0.0418, "reward": 0.14413084089756012, "reward_std": 0.7537662945687771, "rewards/cosine_scaled_reward": -0.05293455999344587, "rewards/format_reward": 0.2500000037252903, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 2854.4583435058594, "epoch": 0.12514285714285714, "grad_norm": 0.1918228268623352, "kl": 0.0614013671875, "learning_rate": 7.243820139034464e-07, "loss": -0.027, "reward": 0.359823577105999, "reward_std": 0.3150374963879585, "rewards/cosine_scaled_reward": -0.0700882226228714, "rewards/format_reward": 0.5, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 3304.8333740234375, "epoch": 0.12571428571428572, "grad_norm": 0.17970924079418182, "kl": 0.04547119140625, "learning_rate": 7.214816693576234e-07, "loss": 0.0824, "reward": 0.4905561991035938, "reward_std": 1.0811701826751232, "rewards/cosine_scaled_reward": 0.036944760009646416, "rewards/format_reward": 0.4166666716337204, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 2944.4583740234375, "epoch": 0.12628571428571428, "grad_norm": 0.20005832612514496, "kl": 0.0582275390625, "learning_rate": 7.185729670371604e-07, "loss": 0.1705, "reward": 0.11963904649019241, "reward_std": 0.9095522128045559, "rewards/cosine_scaled_reward": -0.10684715583920479, "rewards/format_reward": 0.3333333395421505, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 2707.9583587646484, "epoch": 0.12685714285714286, "grad_norm": 0.29521119594573975, "kl": 0.0628662109375, "learning_rate": 7.156560487081051e-07, "loss": -0.0572, "reward": 0.17322428710758686, "reward_std": 0.6296472698450089, "rewards/cosine_scaled_reward": -0.12172118946909904, "rewards/format_reward": 0.4166666716337204, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 3465.9583740234375, "epoch": 0.12742857142857142, "grad_norm": 0.20125603675842285, "kl": 0.06634521484375, "learning_rate": 7.127310565369415e-07, "loss": 0.0676, "reward": -0.5744385868310928, "reward_std": 0.25765218771994114, "rewards/cosine_scaled_reward": -0.308052621781826, "rewards/format_reward": 0.0416666679084301, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 3561.2916870117188, "epoch": 0.128, "grad_norm": 0.21267585456371307, "kl": 0.0467529296875, "learning_rate": 7.097981330836616e-07, "loss": 0.0093, "reward": -0.16986877843737602, "reward_std": 0.8435531742870808, "rewards/cosine_scaled_reward": -0.1474343854933977, "rewards/format_reward": 0.1250000037252903, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 3497.125, "epoch": 0.12857142857142856, "grad_norm": 0.18112824857234955, "kl": 0.05999755859375, "learning_rate": 7.068574212948169e-07, "loss": 0.0182, "reward": -0.0508711040019989, "reward_std": 0.6743626110255718, "rewards/cosine_scaled_reward": -0.10876888036727905, "rewards/format_reward": 0.1666666716337204, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 3313.5, "epoch": 0.12914285714285714, "grad_norm": 0.176024928689003, "kl": 0.05694580078125, "learning_rate": 7.039090644965509e-07, "loss": -0.0426, "reward": 0.4725576564669609, "reward_std": 0.18281785771250725, "rewards/cosine_scaled_reward": 0.11127886641770601, "rewards/format_reward": 0.25, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2694.4166717529297, "epoch": 0.12971428571428573, "grad_norm": 0.20780931413173676, "kl": 0.04412841796875, "learning_rate": 7.009532063876148e-07, "loss": 0.1619, "reward": 0.05987407639622688, "reward_std": 0.787646472454071, "rewards/cosine_scaled_reward": -0.1575629599392414, "rewards/format_reward": 0.3750000037252903, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 2700.6666870117188, "epoch": 0.13028571428571428, "grad_norm": 0.20493757724761963, "kl": 0.0606689453125, "learning_rate": 6.979899910323624e-07, "loss": 0.0601, "reward": 0.9252843372523785, "reward_std": 0.8674749806523323, "rewards/cosine_scaled_reward": 0.21264216862618923, "rewards/format_reward": 0.5, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2944.9583740234375, "epoch": 0.13085714285714287, "grad_norm": 0.17230452597141266, "kl": 0.052490234375, "learning_rate": 6.950195628537299e-07, "loss": -0.0099, "reward": -0.023678046971326694, "reward_std": 0.5056402957998216, "rewards/cosine_scaled_reward": -0.19933902844786644, "rewards/format_reward": 0.3750000037252903, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 3560.6666870117188, "epoch": 0.13142857142857142, "grad_norm": 0.16569606959819794, "kl": 0.048095703125, "learning_rate": 6.920420666261961e-07, "loss": 0.013, "reward": -0.08609153889119625, "reward_std": 0.5831933580338955, "rewards/cosine_scaled_reward": -0.10554578248411417, "rewards/format_reward": 0.1250000037252903, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 3443.7916870117188, "epoch": 0.132, "grad_norm": 0.17575590312480927, "kl": 0.0565185546875, "learning_rate": 6.890576474687263e-07, "loss": 0.0443, "reward": 0.044117134995758533, "reward_std": 0.6273631011135876, "rewards/cosine_scaled_reward": -0.040441428776830435, "rewards/format_reward": 0.1250000037252903, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2883.8333435058594, "epoch": 0.13257142857142856, "grad_norm": 0.21763095259666443, "kl": 0.0523681640625, "learning_rate": 6.860664508377001e-07, "loss": 0.026, "reward": -0.18567287921905518, "reward_std": 0.4521710118278861, "rewards/cosine_scaled_reward": -0.217836432158947, "rewards/format_reward": 0.25, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.13314285714285715, "grad_norm": 0.16012197732925415, "kl": 0.05657958984375, "learning_rate": 6.83068622519821e-07, "loss": 0.0002, "reward": -0.4914732947945595, "reward_std": 0.17538534849882126, "rewards/cosine_scaled_reward": -0.24573664739727974, "rewards/format_reward": 0.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1337142857142857, "grad_norm": 0.17368990182876587, "kl": 0.0556640625, "learning_rate": 6.800643086250121e-07, "loss": 0.0002, "reward": -0.5374301336705685, "reward_std": 0.2229807935655117, "rewards/cosine_scaled_reward": -0.26871506683528423, "rewards/format_reward": 0.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 2889.9583435058594, "epoch": 0.13428571428571429, "grad_norm": 0.18229198455810547, "kl": 0.06298828125, "learning_rate": 6.770536555792944e-07, "loss": 0.0052, "reward": 0.22926755994558334, "reward_std": 0.5730621181428432, "rewards/cosine_scaled_reward": -0.010366253554821014, "rewards/format_reward": 0.25, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 2836.5000610351562, "epoch": 0.13485714285714287, "grad_norm": 0.19761309027671814, "kl": 0.06787109375, "learning_rate": 6.740368101176495e-07, "loss": 0.0007, "reward": 0.009470507502555847, "reward_std": 0.4301174655556679, "rewards/cosine_scaled_reward": -0.30776476114988327, "rewards/format_reward": 0.625, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 2920.541717529297, "epoch": 0.13542857142857143, "grad_norm": 0.4432980418205261, "kl": 0.07293701171875, "learning_rate": 6.710139192768694e-07, "loss": 0.2146, "reward": -0.30785553343594074, "reward_std": 0.3011339120566845, "rewards/cosine_scaled_reward": -0.299761101603508, "rewards/format_reward": 0.291666679084301, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 3406.5000610351562, "epoch": 0.136, "grad_norm": 0.1461995542049408, "kl": 0.0609130859375, "learning_rate": 6.679851303883891e-07, "loss": 0.0953, "reward": -0.4275861941277981, "reward_std": 0.42392516881227493, "rewards/cosine_scaled_reward": -0.3179597780108452, "rewards/format_reward": 0.2083333395421505, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 3582.4583740234375, "epoch": 0.13657142857142857, "grad_norm": 0.14785300195217133, "kl": 0.05682373046875, "learning_rate": 6.649505910711058e-07, "loss": 0.0009, "reward": -0.2121518924832344, "reward_std": 0.45589612051844597, "rewards/cosine_scaled_reward": -0.1269092820584774, "rewards/format_reward": 0.0416666679084301, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.13714285714285715, "grad_norm": 0.17068707942962646, "kl": 0.0528564453125, "learning_rate": 6.619104492241847e-07, "loss": 0.0002, "reward": -0.6798624247312546, "reward_std": 0.14933781139552593, "rewards/cosine_scaled_reward": -0.3399312049150467, "rewards/format_reward": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2797.2916870117188, "epoch": 0.1377142857142857, "grad_norm": 0.2396436184644699, "kl": 0.05389404296875, "learning_rate": 6.588648530198504e-07, "loss": 0.0737, "reward": 0.3082697440404445, "reward_std": 0.345156442373991, "rewards/cosine_scaled_reward": -0.033365145325660706, "rewards/format_reward": 0.375, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1382857142857143, "grad_norm": 0.21391235291957855, "kl": 0.07708740234375, "learning_rate": 6.558139508961654e-07, "loss": 0.0003, "reward": -0.508253276348114, "reward_std": 0.14199923910200596, "rewards/cosine_scaled_reward": -0.254126638174057, "rewards/format_reward": 0.0, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.13885714285714285, "grad_norm": 0.16383755207061768, "kl": 0.070068359375, "learning_rate": 6.527578915497951e-07, "loss": 0.0003, "reward": -0.6054684594273567, "reward_std": 0.18681432865560055, "rewards/cosine_scaled_reward": -0.30273422971367836, "rewards/format_reward": 0.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2971.6666870117188, "epoch": 0.13942857142857143, "grad_norm": 0.5523943305015564, "kl": 0.07171630859375, "learning_rate": 6.496968239287603e-07, "loss": 0.2458, "reward": -0.2852616235613823, "reward_std": 0.6826003473252058, "rewards/cosine_scaled_reward": -0.28846415132284164, "rewards/format_reward": 0.2916666753590107, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 3252.5833740234375, "epoch": 0.14, "grad_norm": 0.34070122241973877, "kl": 0.0665283203125, "learning_rate": 6.466308972251785e-07, "loss": 0.1237, "reward": -0.17518402566201985, "reward_std": 0.9328324533998966, "rewards/cosine_scaled_reward": -0.19175868202000856, "rewards/format_reward": 0.2083333358168602, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 3163.6666870117188, "epoch": 0.14057142857142857, "grad_norm": 0.21714583039283752, "kl": 0.0450439453125, "learning_rate": 6.435602608679916e-07, "loss": 0.0759, "reward": 0.32051989436149597, "reward_std": 0.785211868584156, "rewards/cosine_scaled_reward": 0.03525993227958679, "rewards/format_reward": 0.2500000111758709, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.14114285714285715, "grad_norm": 0.14294424653053284, "kl": 0.05615234375, "learning_rate": 6.404850645156841e-07, "loss": 0.0002, "reward": -0.5327610298991203, "reward_std": 0.23994574137032032, "rewards/cosine_scaled_reward": -0.26638051867485046, "rewards/format_reward": 0.0, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 2089.9166870117188, "epoch": 0.1417142857142857, "grad_norm": 0.6865227222442627, "kl": 0.06201171875, "learning_rate": 6.374054580489873e-07, "loss": 0.2536, "reward": 0.7073055021464825, "reward_std": 0.8850769177079201, "rewards/cosine_scaled_reward": 0.04115273058414459, "rewards/format_reward": 0.6250000149011612, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 3493.0, "epoch": 0.1422857142857143, "grad_norm": 0.2024909406900406, "kl": 0.052978515625, "learning_rate": 6.343215915635761e-07, "loss": 0.0052, "reward": -0.4589925929903984, "reward_std": 0.3254140354692936, "rewards/cosine_scaled_reward": -0.3336629644036293, "rewards/format_reward": 0.2083333432674408, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 3328.75, "epoch": 0.14285714285714285, "grad_norm": 0.16820330917835236, "kl": 0.0540771484375, "learning_rate": 6.31233615362752e-07, "loss": 0.0172, "reward": 0.03026793897151947, "reward_std": 0.5065001584589481, "rewards/cosine_scaled_reward": -0.08903270214796066, "rewards/format_reward": 0.2083333432674408, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2929.4583435058594, "epoch": 0.14342857142857143, "grad_norm": 0.16301412880420685, "kl": 0.05047607421875, "learning_rate": 6.281416799501187e-07, "loss": -0.0337, "reward": 0.12225878238677979, "reward_std": 0.5798847824335098, "rewards/cosine_scaled_reward": -0.1472039744257927, "rewards/format_reward": 0.4166666716337204, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 3335.9166870117188, "epoch": 0.144, "grad_norm": 0.41827014088630676, "kl": 0.0655517578125, "learning_rate": 6.25045936022246e-07, "loss": 0.1143, "reward": -0.3226817846298218, "reward_std": 0.5383487045764923, "rewards/cosine_scaled_reward": -0.2446742206811905, "rewards/format_reward": 0.1666666679084301, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 3154.5416870117188, "epoch": 0.14457142857142857, "grad_norm": 0.2202560156583786, "kl": 0.320556640625, "learning_rate": 6.219465344613258e-07, "loss": 0.0519, "reward": 0.08730845898389816, "reward_std": 0.3982619745656848, "rewards/cosine_scaled_reward": -0.060512442141771317, "rewards/format_reward": 0.2083333432674408, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2991.75, "epoch": 0.14514285714285713, "grad_norm": 0.1752176582813263, "kl": 0.037750244140625, "learning_rate": 6.188436263278172e-07, "loss": 0.0455, "reward": -0.2120664268732071, "reward_std": 0.19734735041856766, "rewards/cosine_scaled_reward": -0.23103319853544235, "rewards/format_reward": 0.25, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 3577.4583740234375, "epoch": 0.1457142857142857, "grad_norm": 0.19093064963817596, "kl": 0.0672607421875, "learning_rate": 6.157373628530852e-07, "loss": 0.0024, "reward": -0.43620166182518005, "reward_std": 0.5630457103252411, "rewards/cosine_scaled_reward": -0.259767509996891, "rewards/format_reward": 0.0833333358168602, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1462857142857143, "grad_norm": 0.27532970905303955, "kl": 0.04736328125, "learning_rate": 6.126278954320294e-07, "loss": 0.0002, "reward": -0.7111312747001648, "reward_std": 0.19976815208792686, "rewards/cosine_scaled_reward": -0.3555656373500824, "rewards/format_reward": 0.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 3072.7916870117188, "epoch": 0.14685714285714285, "grad_norm": 0.2951967418193817, "kl": 0.06939697265625, "learning_rate": 6.095153756157051e-07, "loss": 0.1916, "reward": -0.30195215344429016, "reward_std": 0.5967418141663074, "rewards/cosine_scaled_reward": -0.23430940508842468, "rewards/format_reward": 0.1666666716337204, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 2571.916717529297, "epoch": 0.14742857142857144, "grad_norm": 0.24137046933174133, "kl": 0.0477294921875, "learning_rate": 6.06399955103937e-07, "loss": -0.0004, "reward": 0.16775222728028893, "reward_std": 0.5024616029113531, "rewards/cosine_scaled_reward": -0.1661239117383957, "rewards/format_reward": 0.5000000111758709, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.148, "grad_norm": 0.15327315032482147, "kl": 0.055908203125, "learning_rate": 6.032817857379256e-07, "loss": 0.0002, "reward": -0.6158068254590034, "reward_std": 0.23382795974612236, "rewards/cosine_scaled_reward": -0.3079034052789211, "rewards/format_reward": 0.0, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 3555.75, "epoch": 0.14857142857142858, "grad_norm": 0.13946880400180817, "kl": 0.0531005859375, "learning_rate": 6.001610194928464e-07, "loss": 0.0039, "reward": -0.1873398944735527, "reward_std": 0.41095768846571445, "rewards/cosine_scaled_reward": -0.13533661514520645, "rewards/format_reward": 0.0833333358168602, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.14914285714285713, "grad_norm": 0.14360758662223816, "kl": 0.05487060546875, "learning_rate": 5.97037808470444e-07, "loss": 0.0002, "reward": -0.652643047273159, "reward_std": 0.15021498315036297, "rewards/cosine_scaled_reward": -0.3263215161859989, "rewards/format_reward": 0.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 3575.9583740234375, "epoch": 0.14971428571428572, "grad_norm": 0.16494929790496826, "kl": 0.05615234375, "learning_rate": 5.939123048916173e-07, "loss": 0.0024, "reward": -0.3416307270526886, "reward_std": 0.6418765336275101, "rewards/cosine_scaled_reward": -0.2333153709769249, "rewards/format_reward": 0.125, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.15028571428571427, "grad_norm": 0.16168098151683807, "kl": 0.068115234375, "learning_rate": 5.907846610890011e-07, "loss": 0.0003, "reward": -0.47282079607248306, "reward_std": 0.17454615235328674, "rewards/cosine_scaled_reward": -0.23641039803624153, "rewards/format_reward": 0.0, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.15085714285714286, "grad_norm": 0.17631350457668304, "kl": 0.065673828125, "learning_rate": 5.87655029499542e-07, "loss": 0.0003, "reward": -0.388326043728739, "reward_std": 0.3333720900118351, "rewards/cosine_scaled_reward": -0.19416302209720016, "rewards/format_reward": 0.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 3424.2083740234375, "epoch": 0.15142857142857144, "grad_norm": 0.19585396349430084, "kl": 0.070556640625, "learning_rate": 5.845235626570683e-07, "loss": 0.057, "reward": -0.058371422346681356, "reward_std": 0.7044498100876808, "rewards/cosine_scaled_reward": -0.15418571420013905, "rewards/format_reward": 0.25, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 3448.2083740234375, "epoch": 0.152, "grad_norm": 0.15063370764255524, "kl": 0.061279296875, "learning_rate": 5.813904131848564e-07, "loss": 0.0162, "reward": -0.41802845895290375, "reward_std": 0.29004839062690735, "rewards/cosine_scaled_reward": -0.31318090111017227, "rewards/format_reward": 0.2083333432674408, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 2616.7500915527344, "epoch": 0.15257142857142858, "grad_norm": 0.27274543046951294, "kl": 0.1700439453125, "learning_rate": 5.78255733788191e-07, "loss": 0.0972, "reward": -0.0867045596241951, "reward_std": 0.6231234706938267, "rewards/cosine_scaled_reward": -0.2516856137663126, "rewards/format_reward": 0.4166666716337204, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2716.875030517578, "epoch": 0.15314285714285714, "grad_norm": 0.29783037304878235, "kl": 0.05804443359375, "learning_rate": 5.751196772469237e-07, "loss": 0.0792, "reward": 0.11836675927042961, "reward_std": 0.5197452530264854, "rewards/cosine_scaled_reward": -0.12831661850214005, "rewards/format_reward": 0.375, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.15371428571428572, "grad_norm": 0.16401593387126923, "kl": 0.053466796875, "learning_rate": 5.71982396408026e-07, "loss": 0.0002, "reward": -0.5338539285585284, "reward_std": 0.09281859919428825, "rewards/cosine_scaled_reward": -0.2669269605539739, "rewards/format_reward": 0.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2750.0416870117188, "epoch": 0.15428571428571428, "grad_norm": 0.2087016999721527, "kl": 0.07025146484375, "learning_rate": 5.688440441781398e-07, "loss": -0.0195, "reward": 0.8994439318776131, "reward_std": 0.3248627707362175, "rewards/cosine_scaled_reward": 0.22055532410740852, "rewards/format_reward": 0.4583333432674408, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 3442.2083740234375, "epoch": 0.15485714285714286, "grad_norm": 0.21406613290309906, "kl": 0.08978271484375, "learning_rate": 5.657047735161255e-07, "loss": 0.0622, "reward": -0.3841980127617717, "reward_std": 0.4643286466598511, "rewards/cosine_scaled_reward": -0.2754323445260525, "rewards/format_reward": 0.1666666716337204, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2941.0833435058594, "epoch": 0.15542857142857142, "grad_norm": 0.30245617032051086, "kl": 0.053955078125, "learning_rate": 5.625647374256061e-07, "loss": 0.1571, "reward": -0.32398582744644955, "reward_std": 0.7070891531184316, "rewards/cosine_scaled_reward": -0.18282624892890453, "rewards/format_reward": 0.0416666679084301, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 3562.5833740234375, "epoch": 0.156, "grad_norm": 0.2364557981491089, "kl": 0.08203125, "learning_rate": 5.594240889475106e-07, "loss": 0.0027, "reward": -0.5826010927557945, "reward_std": 0.2715669982135296, "rewards/cosine_scaled_reward": -0.29130054637789726, "rewards/format_reward": 0.0, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 3581.625, "epoch": 0.15657142857142858, "grad_norm": 0.15081574022769928, "kl": 0.0504150390625, "learning_rate": 5.562829811526154e-07, "loss": 0.0015, "reward": -0.5524590611457825, "reward_std": 0.2481101332232356, "rewards/cosine_scaled_reward": -0.29706286266446114, "rewards/format_reward": 0.0416666679084301, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 3234.7500610351562, "epoch": 0.15714285714285714, "grad_norm": 0.152909055352211, "kl": 0.05181884765625, "learning_rate": 5.531415671340826e-07, "loss": 0.1079, "reward": -0.4288216568529606, "reward_std": 0.3044823817908764, "rewards/cosine_scaled_reward": -0.29774416238069534, "rewards/format_reward": 0.1666666679084301, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 2614.8333587646484, "epoch": 0.15771428571428572, "grad_norm": 0.23602361977100372, "kl": 0.05712890625, "learning_rate": 5.5e-07, "loss": -0.0228, "reward": 0.7694906368851662, "reward_std": 1.0586964339017868, "rewards/cosine_scaled_reward": 0.11391199752688408, "rewards/format_reward": 0.5416666865348816, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 3485.3333740234375, "epoch": 0.15828571428571428, "grad_norm": 0.1500847041606903, "kl": 0.0594482421875, "learning_rate": 5.468584328659172e-07, "loss": 0.0609, "reward": -0.2842923626303673, "reward_std": 0.6474459134042263, "rewards/cosine_scaled_reward": -0.18381286598742008, "rewards/format_reward": 0.0833333358168602, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 3267.916748046875, "epoch": 0.15885714285714286, "grad_norm": 0.15897172689437866, "kl": 0.0579833984375, "learning_rate": 5.437170188473847e-07, "loss": 0.0396, "reward": -0.12434278428554535, "reward_std": 0.8399608321487904, "rewards/cosine_scaled_reward": -0.18717139214277267, "rewards/format_reward": 0.25, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 3488.3333740234375, "epoch": 0.15942857142857142, "grad_norm": 0.23828986287117004, "kl": 0.0665283203125, "learning_rate": 5.405759110524894e-07, "loss": 0.0221, "reward": 0.005788974463939667, "reward_std": 0.5269475094974041, "rewards/cosine_scaled_reward": -0.08043883368372917, "rewards/format_reward": 0.1666666679084301, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2997.0833740234375, "epoch": 0.16, "grad_norm": 0.21499881148338318, "kl": 0.04998779296875, "learning_rate": 5.37435262574394e-07, "loss": 0.0863, "reward": 0.2994392313994467, "reward_std": 0.5071588382124901, "rewards/cosine_scaled_reward": -0.05861368915066123, "rewards/format_reward": 0.4166666865348816, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 2781.791748046875, "epoch": 0.16057142857142856, "grad_norm": 0.3882482647895813, "kl": 0.0693359375, "learning_rate": 5.342952264838747e-07, "loss": 0.2062, "reward": 0.33036062493920326, "reward_std": 0.5028475373983383, "rewards/cosine_scaled_reward": -0.02231965959072113, "rewards/format_reward": 0.3750000037252903, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 3425.9166870117188, "epoch": 0.16114285714285714, "grad_norm": 0.15672989189624786, "kl": 0.05950927734375, "learning_rate": 5.311559558218603e-07, "loss": 0.0345, "reward": 0.03875131532549858, "reward_std": 0.6108375154435635, "rewards/cosine_scaled_reward": -0.06395764835178852, "rewards/format_reward": 0.1666666716337204, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 3141.7916870117188, "epoch": 0.16171428571428573, "grad_norm": 0.24408189952373505, "kl": 0.06939697265625, "learning_rate": 5.28017603591974e-07, "loss": 0.1504, "reward": -0.5937509834766388, "reward_std": 0.25269549153745174, "rewards/cosine_scaled_reward": -0.3802088275551796, "rewards/format_reward": 0.1666666716337204, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2734.750045776367, "epoch": 0.16228571428571428, "grad_norm": 0.3049091100692749, "kl": 0.06610107421875, "learning_rate": 5.248803227530763e-07, "loss": 0.1375, "reward": 0.437591552734375, "reward_std": 0.3647673763334751, "rewards/cosine_scaled_reward": 0.07296241726726294, "rewards/format_reward": 0.2916666679084301, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.16285714285714287, "grad_norm": 0.18352235853672028, "kl": 0.07177734375, "learning_rate": 5.21744266211809e-07, "loss": 0.0003, "reward": -0.5737389139831066, "reward_std": 0.1539888195693493, "rewards/cosine_scaled_reward": -0.2868694569915533, "rewards/format_reward": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 3354.1250610351562, "epoch": 0.16342857142857142, "grad_norm": 0.2930234968662262, "kl": 0.0418701171875, "learning_rate": 5.186095868151436e-07, "loss": 0.1145, "reward": -0.2171124890446663, "reward_std": 0.7043565139174461, "rewards/cosine_scaled_reward": -0.1918895822018385, "rewards/format_reward": 0.1666666716337204, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 3343.3750610351562, "epoch": 0.164, "grad_norm": 0.16565990447998047, "kl": 0.05560302734375, "learning_rate": 5.154764373429315e-07, "loss": 0.0172, "reward": -0.2116563618183136, "reward_std": 0.49668116495013237, "rewards/cosine_scaled_reward": -0.25166154466569424, "rewards/format_reward": 0.2916666679084301, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 3557.3750610351562, "epoch": 0.16457142857142856, "grad_norm": 0.14387962222099304, "kl": 0.06109619140625, "learning_rate": 5.123449705004581e-07, "loss": 0.0158, "reward": -0.7241450697183609, "reward_std": 0.3718634694814682, "rewards/cosine_scaled_reward": -0.40373920649290085, "rewards/format_reward": 0.0833333358168602, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.16514285714285715, "grad_norm": 0.1413969099521637, "kl": 0.06207275390625, "learning_rate": 5.09215338910999e-07, "loss": 0.0002, "reward": -0.49317351169884205, "reward_std": 0.11529102176427841, "rewards/cosine_scaled_reward": -0.24658673349767923, "rewards/format_reward": 0.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 3508.0416870117188, "epoch": 0.1657142857142857, "grad_norm": 0.16138982772827148, "kl": 0.09033203125, "learning_rate": 5.060876951083828e-07, "loss": 0.0174, "reward": -0.308999203145504, "reward_std": 0.2485736645758152, "rewards/cosine_scaled_reward": -0.1961662769317627, "rewards/format_reward": 0.0833333358168602, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1662857142857143, "grad_norm": 0.6058784127235413, "kl": 0.123291015625, "learning_rate": 5.02962191529556e-07, "loss": 0.0005, "reward": -0.41384733468294144, "reward_std": 0.3912395089864731, "rewards/cosine_scaled_reward": -0.20692366734147072, "rewards/format_reward": 0.0, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 3541.9166870117188, "epoch": 0.16685714285714287, "grad_norm": 0.1335904747247696, "kl": 0.0677490234375, "learning_rate": 4.998389805071536e-07, "loss": 0.0181, "reward": -0.5098201781511307, "reward_std": 0.5878917332738638, "rewards/cosine_scaled_reward": -0.29657675698399544, "rewards/format_reward": 0.0833333358168602, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 3127.541748046875, "epoch": 0.16742857142857143, "grad_norm": 0.19865606725215912, "kl": 0.07427978515625, "learning_rate": 4.967182142620745e-07, "loss": 0.133, "reward": 0.4655437543988228, "reward_std": 1.0733295306563377, "rewards/cosine_scaled_reward": 0.04527186043560505, "rewards/format_reward": 0.3750000074505806, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 3428.7083740234375, "epoch": 0.168, "grad_norm": 0.17549487948417664, "kl": 0.0758056640625, "learning_rate": 4.93600044896063e-07, "loss": 0.0327, "reward": -0.4640468708239496, "reward_std": 0.46714043989777565, "rewards/cosine_scaled_reward": -0.2736901044845581, "rewards/format_reward": 0.0833333358168602, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 3540.375, "epoch": 0.16857142857142857, "grad_norm": 0.17328360676765442, "kl": 0.065399169921875, "learning_rate": 4.904846243842949e-07, "loss": 0.0115, "reward": -0.21514444053173065, "reward_std": 0.19357968866825104, "rewards/cosine_scaled_reward": -0.14923888631165028, "rewards/format_reward": 0.0833333358168602, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 3369.0416870117188, "epoch": 0.16914285714285715, "grad_norm": 0.26708632707595825, "kl": 0.075439453125, "learning_rate": 4.873721045679706e-07, "loss": 0.0851, "reward": -0.5817175265401602, "reward_std": 0.30820662481710315, "rewards/cosine_scaled_reward": -0.35335876047611237, "rewards/format_reward": 0.125, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1697142857142857, "grad_norm": 0.14792469143867493, "kl": 0.0599365234375, "learning_rate": 4.842626371469149e-07, "loss": 0.0002, "reward": -0.22593041975051165, "reward_std": 0.6252758391201496, "rewards/cosine_scaled_reward": -0.15463187545537949, "rewards/format_reward": 0.0833333358168602, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 3517.875, "epoch": 0.1702857142857143, "grad_norm": 0.17152658104896545, "kl": 0.06243896484375, "learning_rate": 4.811563736721829e-07, "loss": 0.0254, "reward": -0.43114787340164185, "reward_std": 0.5913300961256027, "rewards/cosine_scaled_reward": -0.2572406269609928, "rewards/format_reward": 0.0833333358168602, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 3537.6666870117188, "epoch": 0.17085714285714285, "grad_norm": 0.16120056807994843, "kl": 0.0836181640625, "learning_rate": 4.780534655386743e-07, "loss": 0.028, "reward": -0.09139684634283185, "reward_std": 0.4525203984230757, "rewards/cosine_scaled_reward": -0.06653175503015518, "rewards/format_reward": 0.0416666679084301, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 3293.8333740234375, "epoch": 0.17142857142857143, "grad_norm": 0.20001089572906494, "kl": 0.0758056640625, "learning_rate": 4.749540639777539e-07, "loss": 0.0915, "reward": -0.4263968728482723, "reward_std": 0.5731936097145081, "rewards/cosine_scaled_reward": -0.2965317729394883, "rewards/format_reward": 0.1666666679084301, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 3485.8333740234375, "epoch": 0.172, "grad_norm": 0.1472051441669464, "kl": 0.05682373046875, "learning_rate": 4.7185832004988133e-07, "loss": 0.0364, "reward": -0.38035064563155174, "reward_std": 0.330845657736063, "rewards/cosine_scaled_reward": -0.2526753172278404, "rewards/format_reward": 0.125, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 2926.625, "epoch": 0.17257142857142857, "grad_norm": 0.17583613097667694, "kl": 0.0887451171875, "learning_rate": 4.68766384637248e-07, "loss": 0.0105, "reward": 0.0778973288834095, "reward_std": 0.35484304931014776, "rewards/cosine_scaled_reward": -0.08605133555829525, "rewards/format_reward": 0.25, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 3239.125, "epoch": 0.17314285714285715, "grad_norm": 0.22526656091213226, "kl": 0.0518798828125, "learning_rate": 4.656784084364238e-07, "loss": -0.0222, "reward": 0.1513216346502304, "reward_std": 0.2562936134636402, "rewards/cosine_scaled_reward": -0.04933919757604599, "rewards/format_reward": 0.25, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 3572.5833740234375, "epoch": 0.1737142857142857, "grad_norm": 0.1886323094367981, "kl": 0.0618896484375, "learning_rate": 4.6259454195101267e-07, "loss": 0.0052, "reward": -0.6805952340364456, "reward_std": 0.24055694788694382, "rewards/cosine_scaled_reward": -0.361130952835083, "rewards/format_reward": 0.0416666679084301, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 3441.7083740234375, "epoch": 0.1742857142857143, "grad_norm": 0.33955273032188416, "kl": 0.16815185546875, "learning_rate": 4.59514935484316e-07, "loss": 0.0358, "reward": -0.30776484683156013, "reward_std": 0.5681664384901524, "rewards/cosine_scaled_reward": -0.2788824327290058, "rewards/format_reward": 0.2500000111758709, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 2418.8333435058594, "epoch": 0.17485714285714285, "grad_norm": 0.2670734226703644, "kl": 0.156005859375, "learning_rate": 4.5643973913200837e-07, "loss": 0.2228, "reward": 0.33872567117214203, "reward_std": 0.8834966868162155, "rewards/cosine_scaled_reward": -0.03897051140666008, "rewards/format_reward": 0.4166666716337204, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.17542857142857143, "grad_norm": 0.1793275624513626, "kl": 0.04876708984375, "learning_rate": 4.5336910277482155e-07, "loss": 0.0002, "reward": -0.7549219056963921, "reward_std": 0.10495427064597607, "rewards/cosine_scaled_reward": -0.37746093794703484, "rewards/format_reward": 0.0, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2914.875, "epoch": 0.176, "grad_norm": 0.16645893454551697, "kl": 0.0543212890625, "learning_rate": 4.503031760712397e-07, "loss": -0.0321, "reward": 0.7862786203622818, "reward_std": 0.4856582581996918, "rewards/cosine_scaled_reward": 0.1431393399834633, "rewards/format_reward": 0.5, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.17657142857142857, "grad_norm": 0.1512133628129959, "kl": 0.06036376953125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0002, "reward": -0.5015019737184048, "reward_std": 0.18755416758358479, "rewards/cosine_scaled_reward": -0.25075098779052496, "rewards/format_reward": 0.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.17714285714285713, "grad_norm": 0.1387099325656891, "kl": 0.0472412109375, "learning_rate": 4.441860491038345e-07, "loss": 0.0002, "reward": -0.7481062561273575, "reward_std": 0.17375030741095543, "rewards/cosine_scaled_reward": -0.37405312806367874, "rewards/format_reward": 0.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 3508.6666870117188, "epoch": 0.1777142857142857, "grad_norm": 0.15475568175315857, "kl": 0.05950927734375, "learning_rate": 4.4113514698014953e-07, "loss": 0.0364, "reward": -0.0750106479972601, "reward_std": 0.5773040950298309, "rewards/cosine_scaled_reward": -0.12083865702152252, "rewards/format_reward": 0.1666666679084301, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1782857142857143, "grad_norm": 0.1638628989458084, "kl": 0.084228515625, "learning_rate": 4.3808955077581546e-07, "loss": 0.0003, "reward": -0.566048726439476, "reward_std": 0.20274320989847183, "rewards/cosine_scaled_reward": -0.2830243557691574, "rewards/format_reward": 0.0, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2778.333335876465, "epoch": 0.17885714285714285, "grad_norm": 0.2880570888519287, "kl": 0.05511474609375, "learning_rate": 4.350494089288943e-07, "loss": -0.0187, "reward": 0.12140727043151855, "reward_std": 0.5429899077862501, "rewards/cosine_scaled_reward": -0.06429639086127281, "rewards/format_reward": 0.25, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 3279.2916870117188, "epoch": 0.17942857142857144, "grad_norm": 0.2017597258090973, "kl": 0.06085205078125, "learning_rate": 4.3201486961161093e-07, "loss": 0.0592, "reward": -0.1488794982433319, "reward_std": 0.4336536340415478, "rewards/cosine_scaled_reward": -0.17860640585422516, "rewards/format_reward": 0.2083333432674408, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 3494.0, "epoch": 0.18, "grad_norm": 0.1533028930425644, "kl": 0.06219482421875, "learning_rate": 4.2898608072313045e-07, "loss": 0.017, "reward": -0.6007172577083111, "reward_std": 0.23412644118070602, "rewards/cosine_scaled_reward": -0.3420252874493599, "rewards/format_reward": 0.0833333358168602, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 3558.875, "epoch": 0.18057142857142858, "grad_norm": 0.1556541472673416, "kl": 0.06085205078125, "learning_rate": 4.2596318988235037e-07, "loss": 0.0149, "reward": -0.5843707770109177, "reward_std": 0.26823178865015507, "rewards/cosine_scaled_reward": -0.31301872432231903, "rewards/format_reward": 0.0416666679084301, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 3410.5000610351562, "epoch": 0.18114285714285713, "grad_norm": 0.30716508626937866, "kl": 0.0994873046875, "learning_rate": 4.2294634442070553e-07, "loss": 0.0859, "reward": -0.14803938567638397, "reward_std": 0.9198054745793343, "rewards/cosine_scaled_reward": -0.15735302958637476, "rewards/format_reward": 0.1666666716337204, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 3349.4166870117188, "epoch": 0.18171428571428572, "grad_norm": 0.16601794958114624, "kl": 0.06646728515625, "learning_rate": 4.1993569137498776e-07, "loss": 0.0599, "reward": -0.34297938644886017, "reward_std": 0.745935533195734, "rewards/cosine_scaled_reward": -0.2548230402171612, "rewards/format_reward": 0.1666666679084301, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 3559.7083740234375, "epoch": 0.18228571428571427, "grad_norm": 0.18891586363315582, "kl": 0.056884765625, "learning_rate": 4.1693137748017915e-07, "loss": 0.0138, "reward": -0.45162222534418106, "reward_std": 0.5305716693401337, "rewards/cosine_scaled_reward": -0.26747776684351265, "rewards/format_reward": 0.0833333358168602, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 3171.75, "epoch": 0.18285714285714286, "grad_norm": 0.14246085286140442, "kl": 0.055816650390625, "learning_rate": 4.1393354916230005e-07, "loss": -0.044, "reward": 0.17042651772499084, "reward_std": 0.24396423622965813, "rewards/cosine_scaled_reward": -0.03978674113750458, "rewards/format_reward": 0.25, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.18342857142857144, "grad_norm": 0.16783183813095093, "kl": 0.07696533203125, "learning_rate": 4.1094235253127374e-07, "loss": 0.0003, "reward": -0.5164474323391914, "reward_std": 0.16403124667704105, "rewards/cosine_scaled_reward": -0.25822371058166027, "rewards/format_reward": 0.0, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.184, "grad_norm": 0.13648982346057892, "kl": 0.05108642578125, "learning_rate": 4.079579333738039e-07, "loss": 0.0002, "reward": -0.3795193247497082, "reward_std": 0.49757753871381283, "rewards/cosine_scaled_reward": -0.2105929981917143, "rewards/format_reward": 0.0416666679084301, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.18457142857142858, "grad_norm": 0.14335662126541138, "kl": 0.05615234375, "learning_rate": 4.0498043714627006e-07, "loss": 0.0002, "reward": -0.499904029071331, "reward_std": 0.40482280403375626, "rewards/cosine_scaled_reward": -0.2499520145356655, "rewards/format_reward": 0.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 3524.3333740234375, "epoch": 0.18514285714285714, "grad_norm": 0.1648027002811432, "kl": 0.05670166015625, "learning_rate": 4.020100089676376e-07, "loss": 0.0232, "reward": -0.22896066308021545, "reward_std": 0.8391835503280163, "rewards/cosine_scaled_reward": -0.17698032222688198, "rewards/format_reward": 0.1250000037252903, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 3188.75, "epoch": 0.18571428571428572, "grad_norm": 0.30512839555740356, "kl": 0.0650634765625, "learning_rate": 3.9904679361238526e-07, "loss": 0.1489, "reward": -0.47016472124960274, "reward_std": 0.2999837603420019, "rewards/cosine_scaled_reward": -0.31841566786170006, "rewards/format_reward": 0.1666666716337204, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2809.458335876465, "epoch": 0.18628571428571428, "grad_norm": 0.502647876739502, "kl": 0.085205078125, "learning_rate": 3.9609093550344907e-07, "loss": 0.1019, "reward": -0.14913707971572876, "reward_std": 0.15667949616909027, "rewards/cosine_scaled_reward": -0.19956854358315468, "rewards/format_reward": 0.25, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.18685714285714286, "grad_norm": 0.15863609313964844, "kl": 0.058349609375, "learning_rate": 3.931425787051832e-07, "loss": 0.0002, "reward": -0.6654567644000053, "reward_std": 0.10124669130891562, "rewards/cosine_scaled_reward": -0.33272838313132524, "rewards/format_reward": 0.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 3116.291717529297, "epoch": 0.18742857142857142, "grad_norm": 0.3484804332256317, "kl": 0.09393310546875, "learning_rate": 3.902018669163384e-07, "loss": 0.1693, "reward": -0.508688498288393, "reward_std": 0.37443263083696365, "rewards/cosine_scaled_reward": -0.37934424355626106, "rewards/format_reward": 0.2500000074505806, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 3456.5, "epoch": 0.188, "grad_norm": 0.20245981216430664, "kl": 0.05224609375, "learning_rate": 3.872689434630585e-07, "loss": 0.0368, "reward": -0.6058483272790909, "reward_std": 0.33944240026175976, "rewards/cosine_scaled_reward": -0.36542417109012604, "rewards/format_reward": 0.125, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 3498.375, "epoch": 0.18857142857142858, "grad_norm": 0.15449191629886627, "kl": 0.05731201171875, "learning_rate": 3.843439512918949e-07, "loss": 0.0257, "reward": -0.7321149110794067, "reward_std": 0.17255932837724686, "rewards/cosine_scaled_reward": -0.40772412717342377, "rewards/format_reward": 0.0833333358168602, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 3003.0416870117188, "epoch": 0.18914285714285714, "grad_norm": 0.1611107587814331, "kl": 0.07293701171875, "learning_rate": 3.8142703296283953e-07, "loss": -0.0101, "reward": 0.07398295402526855, "reward_std": 0.43574428372085094, "rewards/cosine_scaled_reward": -0.10884186625480652, "rewards/format_reward": 0.2916666679084301, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 3474.4166870117188, "epoch": 0.18971428571428572, "grad_norm": 0.1410750448703766, "kl": 0.0550537109375, "learning_rate": 3.785183306423767e-07, "loss": 0.0602, "reward": -0.3694458119571209, "reward_std": 0.21785772289149463, "rewards/cosine_scaled_reward": -0.2055562452878803, "rewards/format_reward": 0.0416666679084301, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 3202.625, "epoch": 0.19028571428571428, "grad_norm": 0.2355586290359497, "kl": 0.069091796875, "learning_rate": 3.7561798609655373e-07, "loss": 0.1097, "reward": -0.24160130321979523, "reward_std": 0.5544586107134819, "rewards/cosine_scaled_reward": -0.20413399254903197, "rewards/format_reward": 0.1666666716337204, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.19085714285714286, "grad_norm": 0.1450648009777069, "kl": 0.0533447265625, "learning_rate": 3.72726140684072e-07, "loss": 0.0002, "reward": -0.5695072785019875, "reward_std": 0.1969633586704731, "rewards/cosine_scaled_reward": -0.284753642976284, "rewards/format_reward": 0.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 3126.4583740234375, "epoch": 0.19142857142857142, "grad_norm": 0.2336735874414444, "kl": 0.058837890625, "learning_rate": 3.6984293534939737e-07, "loss": 0.1574, "reward": 0.08427366428077221, "reward_std": 0.6991970017552376, "rewards/cosine_scaled_reward": -0.12452983297407627, "rewards/format_reward": 0.3333333395421505, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 3331.25, "epoch": 0.192, "grad_norm": 0.15116646885871887, "kl": 0.05267333984375, "learning_rate": 3.6696851061588994e-07, "loss": 0.0781, "reward": -0.09475219994783401, "reward_std": 0.5551117174327374, "rewards/cosine_scaled_reward": -0.13070943020284176, "rewards/format_reward": 0.1666666716337204, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 3105.75, "epoch": 0.19257142857142856, "grad_norm": 0.3058691918849945, "kl": 0.05621337890625, "learning_rate": 3.641030065789562e-07, "loss": 0.1142, "reward": 0.07824870198965073, "reward_std": 0.3396912142634392, "rewards/cosine_scaled_reward": -0.06504232808947563, "rewards/format_reward": 0.2083333432674408, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2867.8333740234375, "epoch": 0.19314285714285714, "grad_norm": 0.2673719525337219, "kl": 0.070068359375, "learning_rate": 3.612465628992203e-07, "loss": 0.0946, "reward": -0.12158103799447417, "reward_std": 0.7564724162220955, "rewards/cosine_scaled_reward": -0.20662385318428278, "rewards/format_reward": 0.2916666679084301, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.19371428571428573, "grad_norm": 0.19467608630657196, "kl": 0.0643310546875, "learning_rate": 3.5839931879571725e-07, "loss": 0.0003, "reward": -0.6717243790626526, "reward_std": 0.19412844628095627, "rewards/cosine_scaled_reward": -0.3358621746301651, "rewards/format_reward": 0.0, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2584.916717529297, "epoch": 0.19428571428571428, "grad_norm": 0.22200323641300201, "kl": 0.0587158203125, "learning_rate": 3.555614130391079e-07, "loss": 0.111, "reward": -0.06582178920507431, "reward_std": 0.39784812927246094, "rewards/cosine_scaled_reward": -0.26207755878567696, "rewards/format_reward": 0.4583333432674408, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 3294.4583740234375, "epoch": 0.19485714285714287, "grad_norm": 0.19403870403766632, "kl": 0.063079833984375, "learning_rate": 3.5273298394491515e-07, "loss": 0.1289, "reward": -0.3598571866750717, "reward_std": 0.6343272626399994, "rewards/cosine_scaled_reward": -0.26326192915439606, "rewards/format_reward": 0.1666666716337204, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 3205.5, "epoch": 0.19542857142857142, "grad_norm": 0.26232898235321045, "kl": 0.06964111328125, "learning_rate": 3.4991416936678276e-07, "loss": 0.1101, "reward": -0.16020217537879944, "reward_std": 0.5374426189810038, "rewards/cosine_scaled_reward": -0.16343442350625992, "rewards/format_reward": 0.1666666716337204, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 3284.3333740234375, "epoch": 0.196, "grad_norm": 0.1842142790555954, "kl": 0.08135986328125, "learning_rate": 3.471051066897562e-07, "loss": 0.0697, "reward": -0.3795701861381531, "reward_std": 0.38023433461785316, "rewards/cosine_scaled_reward": -0.2939517763443291, "rewards/format_reward": 0.2083333395421505, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 3421.7916870117188, "epoch": 0.19657142857142856, "grad_norm": 0.17274045944213867, "kl": 0.05596923828125, "learning_rate": 3.4430593282358777e-07, "loss": 0.0354, "reward": -0.0571894496679306, "reward_std": 0.5195095278322697, "rewards/cosine_scaled_reward": -0.11192803084850311, "rewards/format_reward": 0.1666666716337204, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 2716.4583740234375, "epoch": 0.19714285714285715, "grad_norm": 0.3007984757423401, "kl": 0.07659912109375, "learning_rate": 3.4151678419606233e-07, "loss": 0.071, "reward": 0.6559105515480042, "reward_std": 0.8571383208036423, "rewards/cosine_scaled_reward": 0.09878861159086227, "rewards/format_reward": 0.4583333395421505, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1977142857142857, "grad_norm": 0.16842573881149292, "kl": 0.056671142578125, "learning_rate": 3.387377967463493e-07, "loss": 0.0002, "reward": -0.4263014793395996, "reward_std": 0.184997221454978, "rewards/cosine_scaled_reward": -0.23398405965417624, "rewards/format_reward": 0.0416666679084301, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.1982857142857143, "grad_norm": 0.17800471186637878, "kl": 0.05902099609375, "learning_rate": 3.359691059183761e-07, "loss": 0.0002, "reward": -0.43403539061546326, "reward_std": 0.5467140041291714, "rewards/cosine_scaled_reward": -0.23785103484988213, "rewards/format_reward": 0.0416666679084301, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 3102.75, "epoch": 0.19885714285714284, "grad_norm": 0.24414986371994019, "kl": 0.075439453125, "learning_rate": 3.3321084665422803e-07, "loss": 0.0301, "reward": -0.3239244371652603, "reward_std": 0.2671828344464302, "rewards/cosine_scaled_reward": -0.28696221858263016, "rewards/format_reward": 0.25, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 3567.25, "epoch": 0.19942857142857143, "grad_norm": 0.1622619926929474, "kl": 0.060791015625, "learning_rate": 3.3046315338757026e-07, "loss": 0.0097, "reward": -0.4406840596348047, "reward_std": 0.6408121213316917, "rewards/cosine_scaled_reward": -0.24117536237463355, "rewards/format_reward": 0.0416666679084301, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 3151.7083740234375, "epoch": 0.2, "grad_norm": 0.298951655626297, "kl": 0.064453125, "learning_rate": 3.2772616003709616e-07, "loss": 0.1421, "reward": 0.4877912010997534, "reward_std": 0.32348718494176865, "rewards/cosine_scaled_reward": 0.11889559030532837, "rewards/format_reward": 0.2500000074505806, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 3521.25, "epoch": 0.20057142857142857, "grad_norm": 0.14912287890911102, "kl": 0.06524658203125, "learning_rate": 3.250000000000001e-07, "loss": 0.0257, "reward": -0.12844760343432426, "reward_std": 0.9212811887264252, "rewards/cosine_scaled_reward": -0.12672380730509758, "rewards/format_reward": 0.1250000037252903, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2800.875030517578, "epoch": 0.20114285714285715, "grad_norm": 0.2803266644477844, "kl": 0.058837890625, "learning_rate": 3.222848061454764e-07, "loss": 0.1305, "reward": 0.3184015303850174, "reward_std": 0.8227717503905296, "rewards/cosine_scaled_reward": -0.049132585525512695, "rewards/format_reward": 0.4166666716337204, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 3494.6666870117188, "epoch": 0.2017142857142857, "grad_norm": 0.13803733885288239, "kl": 0.06195068359375, "learning_rate": 3.195807108082429e-07, "loss": 0.0364, "reward": -0.3796301782131195, "reward_std": 0.3742275796830654, "rewards/cosine_scaled_reward": -0.23148176074028015, "rewards/format_reward": 0.0833333358168602, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 3425.7083740234375, "epoch": 0.2022857142857143, "grad_norm": 0.14631807804107666, "kl": 0.053466796875, "learning_rate": 3.168878457820915e-07, "loss": 0.0276, "reward": -0.3159081442281604, "reward_std": 0.33451056107878685, "rewards/cosine_scaled_reward": -0.24128740979358554, "rewards/format_reward": 0.1666666716337204, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2898.375, "epoch": 0.20285714285714285, "grad_norm": 0.18180125951766968, "kl": 0.06915283203125, "learning_rate": 3.142063423134644e-07, "loss": 0.0021, "reward": 0.19555070251226425, "reward_std": 0.2758802194148302, "rewards/cosine_scaled_reward": -0.08972465991973877, "rewards/format_reward": 0.375, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 3481.7500610351562, "epoch": 0.20342857142857143, "grad_norm": 0.18392685055732727, "kl": 0.05126953125, "learning_rate": 3.115363310950578e-07, "loss": 0.0401, "reward": -0.02315519005060196, "reward_std": 0.9286646004766226, "rewards/cosine_scaled_reward": -0.11574426759034395, "rewards/format_reward": 0.2083333358168602, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 3214.5833740234375, "epoch": 0.204, "grad_norm": 0.14562208950519562, "kl": 0.056884765625, "learning_rate": 3.0887794225945143e-07, "loss": -0.0449, "reward": -0.42156238853931427, "reward_std": 0.2620309516787529, "rewards/cosine_scaled_reward": -0.33578120172023773, "rewards/format_reward": 0.25, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 3117.2083740234375, "epoch": 0.20457142857142857, "grad_norm": 0.17316700518131256, "kl": 0.06512451171875, "learning_rate": 3.062313053727671e-07, "loss": -0.0734, "reward": 0.12614673376083374, "reward_std": 0.474518496543169, "rewards/cosine_scaled_reward": -0.08275998383760452, "rewards/format_reward": 0.2916666679084301, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 3429.8333740234375, "epoch": 0.20514285714285715, "grad_norm": 0.23892873525619507, "kl": 0.072265625, "learning_rate": 3.0359654942835247e-07, "loss": 0.0686, "reward": -0.5188581272959709, "reward_std": 0.37433599308133125, "rewards/cosine_scaled_reward": -0.32192906737327576, "rewards/format_reward": 0.1250000037252903, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 3555.5416870117188, "epoch": 0.2057142857142857, "grad_norm": 0.12861715257167816, "kl": 0.0487060546875, "learning_rate": 3.0097380284049523e-07, "loss": 0.0163, "reward": -0.41498488187789917, "reward_std": 0.29756803065538406, "rewards/cosine_scaled_reward": -0.22832577303051949, "rewards/format_reward": 0.0416666679084301, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 3466.9583740234375, "epoch": 0.2062857142857143, "grad_norm": 0.19341325759887695, "kl": 0.06231689453125, "learning_rate": 2.9836319343816397e-07, "loss": 0.0662, "reward": -0.44232044741511345, "reward_std": 0.38765473291277885, "rewards/cosine_scaled_reward": -0.24199354834854603, "rewards/format_reward": 0.0416666679084301, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.20685714285714285, "grad_norm": 0.13692274689674377, "kl": 0.04925537109375, "learning_rate": 2.9576484845877793e-07, "loss": 0.0002, "reward": -0.47414352430496365, "reward_std": 0.39991648495197296, "rewards/cosine_scaled_reward": -0.27873843535780907, "rewards/format_reward": 0.0833333358168602, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 3512.75, "epoch": 0.20742857142857143, "grad_norm": 0.2301596701145172, "kl": 0.1016845703125, "learning_rate": 2.931788945420058e-07, "loss": 0.0428, "reward": -0.3924715518951416, "reward_std": 0.4111638516187668, "rewards/cosine_scaled_reward": -0.2170691154897213, "rewards/format_reward": 0.0416666679084301, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 3534.4583740234375, "epoch": 0.208, "grad_norm": 0.13970957696437836, "kl": 0.0601806640625, "learning_rate": 2.9060545772359305e-07, "loss": 0.03, "reward": -0.5889072120189667, "reward_std": 0.3112984448671341, "rewards/cosine_scaled_reward": -0.31528693437576294, "rewards/format_reward": 0.0416666679084301, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 3378.7083740234375, "epoch": 0.20857142857142857, "grad_norm": 0.22361378371715546, "kl": 0.06732177734375, "learning_rate": 2.8804466342921987e-07, "loss": 0.0314, "reward": -0.39835788309574127, "reward_std": 0.3161539016291499, "rewards/cosine_scaled_reward": -0.30334562435746193, "rewards/format_reward": 0.2083333432674408, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 3558.625, "epoch": 0.20914285714285713, "grad_norm": 0.1580444574356079, "kl": 0.06201171875, "learning_rate": 2.854966364683872e-07, "loss": 0.0145, "reward": -0.4920261278748512, "reward_std": 0.28505855053663254, "rewards/cosine_scaled_reward": -0.2876797318458557, "rewards/format_reward": 0.0833333358168602, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2983.9583435058594, "epoch": 0.20971428571428571, "grad_norm": 0.16354723274707794, "kl": 0.06024169921875, "learning_rate": 2.829615010283344e-07, "loss": -0.0499, "reward": -0.0459320992231369, "reward_std": 0.4094668244943023, "rewards/cosine_scaled_reward": -0.14796602725982666, "rewards/format_reward": 0.25, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 2961.625, "epoch": 0.2102857142857143, "grad_norm": 0.1965043544769287, "kl": 0.06488037109375, "learning_rate": 2.8043938066798645e-07, "loss": -0.0578, "reward": -0.2956502139568329, "reward_std": 0.37837161868810654, "rewards/cosine_scaled_reward": -0.27282512187957764, "rewards/format_reward": 0.25, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 3540.8333740234375, "epoch": 0.21085714285714285, "grad_norm": 0.1523246318101883, "kl": 0.062255859375, "learning_rate": 2.7793039831193133e-07, "loss": 0.0014, "reward": -0.48877737671136856, "reward_std": 0.40059489756822586, "rewards/cosine_scaled_reward": -0.2860553557984531, "rewards/format_reward": 0.0833333358168602, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 3571.0833740234375, "epoch": 0.21142857142857144, "grad_norm": 0.14142955839633942, "kl": 0.063232421875, "learning_rate": 2.7543467624442956e-07, "loss": 0.005, "reward": -0.4047308452427387, "reward_std": 0.5568485893309116, "rewards/cosine_scaled_reward": -0.2440320923924446, "rewards/format_reward": 0.0833333358168602, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 3227.9583740234375, "epoch": 0.212, "grad_norm": 0.2218811810016632, "kl": 0.0560302734375, "learning_rate": 2.729523361034538e-07, "loss": 0.0255, "reward": 0.3210463747382164, "reward_std": 0.4591142237186432, "rewards/cosine_scaled_reward": 0.0355231873691082, "rewards/format_reward": 0.25, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.21257142857142858, "grad_norm": 0.15657925605773926, "kl": 0.06109619140625, "learning_rate": 2.7048349887476037e-07, "loss": 0.0002, "reward": -0.44183915108442307, "reward_std": 0.3711034543812275, "rewards/cosine_scaled_reward": -0.22091957554221153, "rewards/format_reward": 0.0, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 3060.75, "epoch": 0.21314285714285713, "grad_norm": 0.26730263233184814, "kl": 0.059326171875, "learning_rate": 2.6802828488599294e-07, "loss": -0.0419, "reward": 0.1390758454799652, "reward_std": 0.4297582358121872, "rewards/cosine_scaled_reward": -0.07629543542861938, "rewards/format_reward": 0.2916666679084301, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.21371428571428572, "grad_norm": 0.18330542743206024, "kl": 0.0675048828125, "learning_rate": 2.655868138008171e-07, "loss": 0.0003, "reward": -0.6386789381504059, "reward_std": 0.2875619940459728, "rewards/cosine_scaled_reward": -0.31933946907520294, "rewards/format_reward": 0.0, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 3327.7083740234375, "epoch": 0.21428571428571427, "grad_norm": 0.29869186878204346, "kl": 0.0648193359375, "learning_rate": 2.631592046130896e-07, "loss": 0.0974, "reward": 0.09292280673980713, "reward_std": 0.8725230973213911, "rewards/cosine_scaled_reward": -0.036871928721666336, "rewards/format_reward": 0.1666666716337204, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 3528.3333740234375, "epoch": 0.21485714285714286, "grad_norm": 0.18041333556175232, "kl": 0.05859375, "learning_rate": 2.6074557564105724e-07, "loss": 0.0234, "reward": -0.5773191377520561, "reward_std": 0.5778478160500526, "rewards/cosine_scaled_reward": -0.33032623305916786, "rewards/format_reward": 0.0833333358168602, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 3195.7083740234375, "epoch": 0.21542857142857144, "grad_norm": 0.2103518396615982, "kl": 0.07080078125, "learning_rate": 2.583460445215911e-07, "loss": 0.071, "reward": 0.030612453818321228, "reward_std": 0.5441110543906689, "rewards/cosine_scaled_reward": -0.10969378054141998, "rewards/format_reward": 0.2500000111758709, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 3409.625, "epoch": 0.216, "grad_norm": 0.17528486251831055, "kl": 0.0775146484375, "learning_rate": 2.5596072820445254e-07, "loss": 0.043, "reward": -0.5896594896912575, "reward_std": 0.3001709654927254, "rewards/cosine_scaled_reward": -0.35732975602149963, "rewards/format_reward": 0.125, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 3403.875, "epoch": 0.21657142857142858, "grad_norm": 0.15156695246696472, "kl": 0.066162109375, "learning_rate": 2.5358974294659373e-07, "loss": 0.0245, "reward": -0.30837604124099016, "reward_std": 0.2952228505164385, "rewards/cosine_scaled_reward": -0.2583546922542155, "rewards/format_reward": 0.2083333432674408, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 3511.0000610351562, "epoch": 0.21714285714285714, "grad_norm": 0.14815667271614075, "kl": 0.06707763671875, "learning_rate": 2.512332043064913e-07, "loss": 0.036, "reward": -0.433868832886219, "reward_std": 0.5863572731614113, "rewards/cosine_scaled_reward": -0.2586010806262493, "rewards/format_reward": 0.0833333358168602, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 3383.7083740234375, "epoch": 0.21771428571428572, "grad_norm": 0.1917722523212433, "kl": 0.07476806640625, "learning_rate": 2.488912271385139e-07, "loss": 0.0918, "reward": -0.5000769086182117, "reward_std": 0.5215454287827015, "rewards/cosine_scaled_reward": -0.3125384636223316, "rewards/format_reward": 0.1250000037252903, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.21828571428571428, "grad_norm": 0.15971945226192474, "kl": 0.0699462890625, "learning_rate": 2.465639255873246e-07, "loss": 0.0003, "reward": -0.7036767303943634, "reward_std": 0.18040955439209938, "rewards/cosine_scaled_reward": -0.3518383651971817, "rewards/format_reward": 0.0, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 3277.2083740234375, "epoch": 0.21885714285714286, "grad_norm": 0.19748570024967194, "kl": 0.0924072265625, "learning_rate": 2.4425141308231765e-07, "loss": 0.0122, "reward": 0.2866915538907051, "reward_std": 0.7322729602456093, "rewards/cosine_scaled_reward": 0.018345749005675316, "rewards/format_reward": 0.2500000111758709, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.21942857142857142, "grad_norm": 0.14422692358493805, "kl": 0.058837890625, "learning_rate": 2.4195380233209006e-07, "loss": 0.0002, "reward": -0.4746512770652771, "reward_std": 0.40125712379813194, "rewards/cosine_scaled_reward": -0.25815898180007935, "rewards/format_reward": 0.0416666679084301, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 2883.25, "epoch": 0.22, "grad_norm": 0.20794402062892914, "kl": 0.06744384765625, "learning_rate": 2.3967120531894857e-07, "loss": 0.0445, "reward": -0.0814894586801529, "reward_std": 0.5334790125489235, "rewards/cosine_scaled_reward": -0.16574472934007645, "rewards/format_reward": 0.25, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 3516.2916870117188, "epoch": 0.22057142857142858, "grad_norm": 0.17907118797302246, "kl": 0.07537841796875, "learning_rate": 2.374037332934512e-07, "loss": 0.0077, "reward": -0.5339493155479431, "reward_std": 0.1955426223576069, "rewards/cosine_scaled_reward": -0.32947464287281036, "rewards/format_reward": 0.125, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 2310.0416870117188, "epoch": 0.22114285714285714, "grad_norm": 0.26615405082702637, "kl": 0.078369140625, "learning_rate": 2.3515149676898552e-07, "loss": 0.1631, "reward": 0.8421094790101051, "reward_std": 0.3642051964998245, "rewards/cosine_scaled_reward": 0.19188803806900978, "rewards/format_reward": 0.4583333432674408, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 2787.041717529297, "epoch": 0.22171428571428572, "grad_norm": 0.171456977725029, "kl": 0.06024169921875, "learning_rate": 2.3291460551638237e-07, "loss": 0.0421, "reward": 0.8226055353879929, "reward_std": 0.5110184028744698, "rewards/cosine_scaled_reward": 0.11963611841201782, "rewards/format_reward": 0.5833333358168602, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 2454.875045776367, "epoch": 0.22228571428571428, "grad_norm": 0.20929378271102905, "kl": 0.06768798828125, "learning_rate": 2.306931685585657e-07, "loss": 0.1073, "reward": 0.9306686669588089, "reward_std": 1.0127770230174065, "rewards/cosine_scaled_reward": 0.19450097158551216, "rewards/format_reward": 0.5416666716337204, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 3351.0416870117188, "epoch": 0.22285714285714286, "grad_norm": 0.28745704889297485, "kl": 0.06829833984375, "learning_rate": 2.2848729416523859e-07, "loss": 0.0874, "reward": -0.35296558355912566, "reward_std": 0.5742789804935455, "rewards/cosine_scaled_reward": -0.23898280411958694, "rewards/format_reward": 0.1250000037252903, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 3579.2083740234375, "epoch": 0.22342857142857142, "grad_norm": 0.1473505049943924, "kl": 0.06964111328125, "learning_rate": 2.2629708984760706e-07, "loss": 0.003, "reward": -0.3405249584466219, "reward_std": 0.20318820234388113, "rewards/cosine_scaled_reward": -0.19109581504017115, "rewards/format_reward": 0.0416666679084301, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 3410.5416870117188, "epoch": 0.224, "grad_norm": 0.16982409358024597, "kl": 0.06280517578125, "learning_rate": 2.2412266235313973e-07, "loss": 0.0332, "reward": -0.26890936493873596, "reward_std": 0.5837244279682636, "rewards/cosine_scaled_reward": -0.21778801828622818, "rewards/format_reward": 0.1666666716337204, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.22457142857142856, "grad_norm": 0.14082053303718567, "kl": 0.05859375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0002, "reward": -0.4981812983751297, "reward_std": 0.3227638751268387, "rewards/cosine_scaled_reward": -0.26992398500442505, "rewards/format_reward": 0.0416666679084301, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 2889.25, "epoch": 0.22514285714285714, "grad_norm": 0.20773468911647797, "kl": 0.07421875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0717, "reward": 0.11960902251303196, "reward_std": 0.47472497448325157, "rewards/cosine_scaled_reward": -0.06519548781216145, "rewards/format_reward": 0.25, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2257142857142857, "grad_norm": 0.18604017794132233, "kl": 0.05572509765625, "learning_rate": 2.1769509671835223e-07, "loss": 0.0002, "reward": -0.43610429018735886, "reward_std": 0.18078533560037613, "rewards/cosine_scaled_reward": -0.21805214136838913, "rewards/format_reward": 0.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 3554.6666870117188, "epoch": 0.22628571428571428, "grad_norm": 0.12925174832344055, "kl": 0.05133056640625, "learning_rate": 2.1558482853517253e-07, "loss": 0.0112, "reward": -0.23759066313505173, "reward_std": 0.5703822672367096, "rewards/cosine_scaled_reward": -0.22296201065182686, "rewards/format_reward": 0.2083333395421505, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 3202.0000610351562, "epoch": 0.22685714285714287, "grad_norm": 0.18819594383239746, "kl": 0.07476806640625, "learning_rate": 2.134908592756607e-07, "loss": 0.0672, "reward": -0.029868334531784058, "reward_std": 0.6436930447816849, "rewards/cosine_scaled_reward": -0.16076749563217163, "rewards/format_reward": 0.2916666716337204, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 3310.25, "epoch": 0.22742857142857142, "grad_norm": 0.2150135636329651, "kl": 0.0743408203125, "learning_rate": 2.1141329099692406e-07, "loss": 0.1091, "reward": -0.34711187332868576, "reward_std": 0.4893313031643629, "rewards/cosine_scaled_reward": -0.2568892817944288, "rewards/format_reward": 0.1666666716337204, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.228, "grad_norm": 0.1621653437614441, "kl": 0.072998046875, "learning_rate": 2.0935222495670968e-07, "loss": 0.0003, "reward": -0.6336401849985123, "reward_std": 0.2006698123877868, "rewards/cosine_scaled_reward": -0.31682008877396584, "rewards/format_reward": 0.0, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 2075.4583435058594, "epoch": 0.22857142857142856, "grad_norm": 0.3685378432273865, "kl": 0.0650634765625, "learning_rate": 2.0730776160846853e-07, "loss": 0.1218, "reward": 0.7177584320306778, "reward_std": 0.5401098467409611, "rewards/cosine_scaled_reward": 0.06721250712871552, "rewards/format_reward": 0.5833333358168602, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 3315.5, "epoch": 0.22914285714285715, "grad_norm": 0.15908077359199524, "kl": 0.07159423828125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0029, "reward": 0.04471557028591633, "reward_std": 0.7973760291934013, "rewards/cosine_scaled_reward": -0.12347555533051491, "rewards/format_reward": 0.291666679084301, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2788.4166717529297, "epoch": 0.2297142857142857, "grad_norm": 0.27960649132728577, "kl": 0.05523681640625, "learning_rate": 2.032690407508949e-07, "loss": -0.0295, "reward": -0.3924320638179779, "reward_std": 0.39799761585891247, "rewards/cosine_scaled_reward": -0.21704937238246202, "rewards/format_reward": 0.0416666679084301, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 2993.6666870117188, "epoch": 0.2302857142857143, "grad_norm": 0.3006753623485565, "kl": 0.066650390625, "learning_rate": 2.0127498008311922e-07, "loss": 0.1707, "reward": -0.2186871642479673, "reward_std": 0.7662175893783569, "rewards/cosine_scaled_reward": -0.27601024881005287, "rewards/format_reward": 0.3333333395421505, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 3025.5416870117188, "epoch": 0.23085714285714284, "grad_norm": 0.17790178954601288, "kl": 0.07391357421875, "learning_rate": 1.9929791578083655e-07, "loss": -0.0457, "reward": -0.3540494963526726, "reward_std": 0.2286781333386898, "rewards/cosine_scaled_reward": -0.3020247519016266, "rewards/format_reward": 0.25, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 3297.7083740234375, "epoch": 0.23142857142857143, "grad_norm": 0.21704569458961487, "kl": 0.06781005859375, "learning_rate": 1.9733794420337213e-07, "loss": 0.033, "reward": 0.39337925612926483, "reward_std": 0.5153894275426865, "rewards/cosine_scaled_reward": 0.050856344401836395, "rewards/format_reward": 0.2916666679084301, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.232, "grad_norm": 0.18073974549770355, "kl": 0.0775146484375, "learning_rate": 1.9539516087697517e-07, "loss": 0.0003, "reward": -0.72935850918293, "reward_std": 0.17922993749380112, "rewards/cosine_scaled_reward": -0.364679254591465, "rewards/format_reward": 0.0, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 3288.375, "epoch": 0.23257142857142857, "grad_norm": 0.16751199960708618, "kl": 0.05029296875, "learning_rate": 1.934696604901642e-07, "loss": 0.0725, "reward": -0.014177265577018261, "reward_std": 0.5347921415232122, "rewards/cosine_scaled_reward": -0.09042196115478873, "rewards/format_reward": 0.1666666716337204, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 3183.0833740234375, "epoch": 0.23314285714285715, "grad_norm": 0.289958655834198, "kl": 0.08087158203125, "learning_rate": 1.915615368891117e-07, "loss": 0.0824, "reward": -0.2237246111035347, "reward_std": 0.6425449755042791, "rewards/cosine_scaled_reward": -0.2368623074144125, "rewards/format_reward": 0.2500000111758709, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2337142857142857, "grad_norm": 0.15507179498672485, "kl": 0.0733642578125, "learning_rate": 1.8967088307307e-07, "loss": 0.0003, "reward": -0.8279737383127213, "reward_std": 0.19148441590368748, "rewards/cosine_scaled_reward": -0.4139868766069412, "rewards/format_reward": 0.0, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 3522.5833740234375, "epoch": 0.2342857142857143, "grad_norm": 0.15713585913181305, "kl": 0.0584716796875, "learning_rate": 1.8779779118983867e-07, "loss": 0.037, "reward": -0.6865072301588953, "reward_std": 0.4425274422392249, "rewards/cosine_scaled_reward": -0.3640869613736868, "rewards/format_reward": 0.0416666679084301, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.23485714285714285, "grad_norm": 0.14535216987133026, "kl": 0.05908203125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0002, "reward": -0.46431963704526424, "reward_std": 0.40429612435400486, "rewards/cosine_scaled_reward": -0.2529931543394923, "rewards/format_reward": 0.0416666679084301, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.23542857142857143, "grad_norm": 0.14524948596954346, "kl": 0.059814453125, "learning_rate": 1.8410465752883758e-07, "loss": 0.0002, "reward": -0.7351335138082504, "reward_std": 0.2500329352915287, "rewards/cosine_scaled_reward": -0.3675667643547058, "rewards/format_reward": 0.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 3240.375, "epoch": 0.236, "grad_norm": 0.1661958247423172, "kl": 0.04791259765625, "learning_rate": 1.822847957491922e-07, "loss": 0.049, "reward": -0.06640298664569855, "reward_std": 0.7313026990741491, "rewards/cosine_scaled_reward": -0.15820149332284927, "rewards/format_reward": 0.2500000111758709, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 3034.4166870117188, "epoch": 0.23657142857142857, "grad_norm": 0.1746898889541626, "kl": 0.05645751953125, "learning_rate": 1.804828558898332e-07, "loss": 0.1069, "reward": -0.08215373568236828, "reward_std": 0.4830262325704098, "rewards/cosine_scaled_reward": -0.1452435301616788, "rewards/format_reward": 0.2083333432674408, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 3484.5416870117188, "epoch": 0.23714285714285716, "grad_norm": 0.14372943341732025, "kl": 0.06488037109375, "learning_rate": 1.7869892577476722e-07, "loss": 0.0328, "reward": -0.33930344693362713, "reward_std": 0.47646061331033707, "rewards/cosine_scaled_reward": -0.27381839603185654, "rewards/format_reward": 0.2083333358168602, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 3189.4583435058594, "epoch": 0.2377142857142857, "grad_norm": 0.304644376039505, "kl": 0.05975341796875, "learning_rate": 1.7693309235023127e-07, "loss": 0.1251, "reward": -0.2368946084752679, "reward_std": 0.1720826616510749, "rewards/cosine_scaled_reward": -0.20178064471110702, "rewards/format_reward": 0.1666666716337204, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 3047.5, "epoch": 0.2382857142857143, "grad_norm": 0.13361294567584991, "kl": 0.061279296875, "learning_rate": 1.7518544168045524e-07, "loss": 0.06, "reward": 0.06008124351501465, "reward_std": 0.7015728913247585, "rewards/cosine_scaled_reward": -0.11579271033406258, "rewards/format_reward": 0.2916666679084301, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 3249.7083740234375, "epoch": 0.23885714285714285, "grad_norm": 0.2082819938659668, "kl": 0.0965576171875, "learning_rate": 1.7345605894346726e-07, "loss": 0.0679, "reward": -0.3713177442550659, "reward_std": 0.2855104599148035, "rewards/cosine_scaled_reward": -0.28982554003596306, "rewards/format_reward": 0.2083333432674408, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 3562.25, "epoch": 0.23942857142857144, "grad_norm": 0.2277977466583252, "kl": 0.060546875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0129, "reward": -0.4065123088657856, "reward_std": 0.28984224516898394, "rewards/cosine_scaled_reward": -0.2449228223413229, "rewards/format_reward": 0.0833333358168602, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 2630.916717529297, "epoch": 0.24, "grad_norm": 0.1967594474554062, "kl": 0.0904541015625, "learning_rate": 1.7005243352409333e-07, "loss": 0.0107, "reward": 0.7067751418799162, "reward_std": 0.7725067064166069, "rewards/cosine_scaled_reward": 0.040887586772441864, "rewards/format_reward": 0.6250000149011612, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 3460.7916870117188, "epoch": 0.24057142857142857, "grad_norm": 0.16090837121009827, "kl": 0.06695556640625, "learning_rate": 1.6837835672960831e-07, "loss": 0.066, "reward": -0.6657570600509644, "reward_std": 0.2832261845469475, "rewards/cosine_scaled_reward": -0.3537118658423424, "rewards/format_reward": 0.0416666679084301, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.24114285714285713, "grad_norm": 0.13034385442733765, "kl": 0.0517578125, "learning_rate": 1.6672287963562852e-07, "loss": 0.0002, "reward": -0.6389777138829231, "reward_std": 0.23646372184157372, "rewards/cosine_scaled_reward": -0.31948884204030037, "rewards/format_reward": 0.0, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 3240.7083740234375, "epoch": 0.24171428571428571, "grad_norm": 0.21142347157001495, "kl": 0.0682373046875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0498, "reward": 0.22329876571893692, "reward_std": 0.7350533455610275, "rewards/cosine_scaled_reward": -0.013350628316402435, "rewards/format_reward": 0.2500000074505806, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 3533.0416870117188, "epoch": 0.2422857142857143, "grad_norm": 0.15932375192642212, "kl": 0.07061767578125, "learning_rate": 1.6346804638120098e-07, "loss": 0.0236, "reward": -0.3861430063843727, "reward_std": 0.6368795782327652, "rewards/cosine_scaled_reward": -0.23473817482590675, "rewards/format_reward": 0.0833333358168602, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.24285714285714285, "grad_norm": 0.15948008000850677, "kl": 0.0645751953125, "learning_rate": 1.6186884885673413e-07, "loss": 0.0003, "reward": -0.6987170726060867, "reward_std": 0.15650003217160702, "rewards/cosine_scaled_reward": -0.34935855120420456, "rewards/format_reward": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 3515.0, "epoch": 0.24342857142857144, "grad_norm": 0.1669153869152069, "kl": 0.0499267578125, "learning_rate": 1.6028856829700258e-07, "loss": 0.0158, "reward": -0.10988223552703857, "reward_std": 0.5193835459649563, "rewards/cosine_scaled_reward": -0.13827446848154068, "rewards/format_reward": 0.1666666716337204, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.244, "grad_norm": 0.1475888043642044, "kl": 0.0557861328125, "learning_rate": 1.5872728172265146e-07, "loss": 0.0002, "reward": -0.35475102812051773, "reward_std": 0.5787421241402626, "rewards/cosine_scaled_reward": -0.23987551219761372, "rewards/format_reward": 0.1250000037252903, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 3454.8333740234375, "epoch": 0.24457142857142858, "grad_norm": 0.15619736909866333, "kl": 0.06072998046875, "learning_rate": 1.5718506522858572e-07, "loss": 0.0498, "reward": -0.4008786417543888, "reward_std": 0.5806032568216324, "rewards/cosine_scaled_reward": -0.26293932646512985, "rewards/format_reward": 0.1250000037252903, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 3573.5416870117188, "epoch": 0.24514285714285713, "grad_norm": 0.16329319775104523, "kl": 0.0654296875, "learning_rate": 1.5566199398026147e-07, "loss": 0.0062, "reward": -0.6754651218652725, "reward_std": 0.2705872841179371, "rewards/cosine_scaled_reward": -0.35856589674949646, "rewards/format_reward": 0.0416666679084301, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.24571428571428572, "grad_norm": 0.153706356883049, "kl": 0.06573486328125, "learning_rate": 1.5415814221002265e-07, "loss": 0.0003, "reward": -0.6211144402623177, "reward_std": 0.2846235502511263, "rewards/cosine_scaled_reward": -0.331390555948019, "rewards/format_reward": 0.0416666679084301, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.24628571428571427, "grad_norm": 0.14263786375522614, "kl": 0.05303955078125, "learning_rate": 1.5267358321348285e-07, "loss": 0.0002, "reward": -0.5838388651609421, "reward_std": 0.299509858712554, "rewards/cosine_scaled_reward": -0.29191943258047104, "rewards/format_reward": 0.0, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 3518.5, "epoch": 0.24685714285714286, "grad_norm": 0.1427145153284073, "kl": 0.06005859375, "learning_rate": 1.5120838934595337e-07, "loss": -0.0016, "reward": -0.22047018259763718, "reward_std": 0.6944977939128876, "rewards/cosine_scaled_reward": -0.1727350875735283, "rewards/format_reward": 0.1250000037252903, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 3022.4166870117188, "epoch": 0.24742857142857144, "grad_norm": 0.1529431790113449, "kl": 0.06256103515625, "learning_rate": 1.4976263201891613e-07, "loss": -0.0272, "reward": 0.18959258869290352, "reward_std": 0.38590487092733383, "rewards/cosine_scaled_reward": -0.030203720554709435, "rewards/format_reward": 0.25, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 3537.1666870117188, "epoch": 0.248, "grad_norm": 0.17672474682331085, "kl": 0.06597900390625, "learning_rate": 1.483363816965435e-07, "loss": 0.0283, "reward": -0.37754696048796177, "reward_std": 0.4972882941365242, "rewards/cosine_scaled_reward": -0.20960681792348623, "rewards/format_reward": 0.0416666679084301, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 3184.625, "epoch": 0.24857142857142858, "grad_norm": 0.1457490473985672, "kl": 0.05413818359375, "learning_rate": 1.469297078922642e-07, "loss": -0.0538, "reward": 0.18912622332572937, "reward_std": 0.17255538050085306, "rewards/cosine_scaled_reward": -0.03043687343597412, "rewards/format_reward": 0.25, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 3228.875, "epoch": 0.24914285714285714, "grad_norm": 0.26985064148902893, "kl": 0.05609130859375, "learning_rate": 1.4554267916537495e-07, "loss": 0.0741, "reward": -0.41794606298208237, "reward_std": 0.19913825299590826, "rewards/cosine_scaled_reward": -0.3131397217512131, "rewards/format_reward": 0.2083333395421505, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 2429.8333435058594, "epoch": 0.24971428571428572, "grad_norm": 0.33341628313064575, "kl": 0.0677490234375, "learning_rate": 1.4417536311769885e-07, "loss": 0.1697, "reward": 0.001805141568183899, "reward_std": 0.49792607966810465, "rewards/cosine_scaled_reward": -0.22826410830020905, "rewards/format_reward": 0.4583333432674408, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2502857142857143, "grad_norm": 0.14212451875209808, "kl": 0.0621337890625, "learning_rate": 1.4282782639029128e-07, "loss": 0.0002, "reward": -0.698416069149971, "reward_std": 0.15093004517257214, "rewards/cosine_scaled_reward": -0.3492080345749855, "rewards/format_reward": 0.0, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 3442.1250610351562, "epoch": 0.25085714285714283, "grad_norm": 0.16146516799926758, "kl": 0.0662841796875, "learning_rate": 1.4150013466019114e-07, "loss": 0.0241, "reward": -0.31549201533198357, "reward_std": 0.500336742028594, "rewards/cosine_scaled_reward": -0.22024601697921753, "rewards/format_reward": 0.1250000037252903, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 3325.9583740234375, "epoch": 0.25142857142857145, "grad_norm": 0.21158044040203094, "kl": 0.082275390625, "learning_rate": 1.4019235263722034e-07, "loss": 0.1078, "reward": -0.5876736007630825, "reward_std": 0.39598204009234905, "rewards/cosine_scaled_reward": -0.377170130610466, "rewards/format_reward": 0.1666666716337204, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.252, "grad_norm": 0.15019887685775757, "kl": 0.059326171875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0002, "reward": -0.6195696294307709, "reward_std": 0.2191491797566414, "rewards/cosine_scaled_reward": -0.30978482961654663, "rewards/format_reward": 0.0, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 3102.7083740234375, "epoch": 0.25257142857142856, "grad_norm": 0.1601240336894989, "kl": 0.05877685546875, "learning_rate": 1.3763677169699217e-07, "loss": 0.166, "reward": -0.11922668665647507, "reward_std": 0.5712239369750023, "rewards/cosine_scaled_reward": -0.18461336567997932, "rewards/format_reward": 0.2500000074505806, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 3339.2083740234375, "epoch": 0.25314285714285717, "grad_norm": 0.2830808758735657, "kl": 0.0550537109375, "learning_rate": 1.3638909733514452e-07, "loss": 0.1017, "reward": -0.5837467163801193, "reward_std": 0.38514454290270805, "rewards/cosine_scaled_reward": -0.35437335819005966, "rewards/format_reward": 0.1250000037252903, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2537142857142857, "grad_norm": 0.12788225710391998, "kl": 0.0516357421875, "learning_rate": 1.351615817851748e-07, "loss": 0.0002, "reward": -0.8638512194156647, "reward_std": 0.12708378583192825, "rewards/cosine_scaled_reward": -0.43192560970783234, "rewards/format_reward": 0.0, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 3210.6666870117188, "epoch": 0.2542857142857143, "grad_norm": 0.17025582492351532, "kl": 0.0570068359375, "learning_rate": 1.3395428487445914e-07, "loss": -0.0604, "reward": 0.14303337410092354, "reward_std": 0.35370171954855323, "rewards/cosine_scaled_reward": -0.053483348339796066, "rewards/format_reward": 0.25, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 2819.4166870117188, "epoch": 0.25485714285714284, "grad_norm": 0.522460401058197, "kl": 0.06072998046875, "learning_rate": 1.3276726544494571e-07, "loss": 0.1376, "reward": -0.20927497744560242, "reward_std": 0.5216442719101906, "rewards/cosine_scaled_reward": -0.3129708394408226, "rewards/format_reward": 0.4166666828095913, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 3188.0, "epoch": 0.25542857142857145, "grad_norm": 0.1497071087360382, "kl": 0.0562744140625, "learning_rate": 1.316005813502869e-07, "loss": 0.0099, "reward": 0.11079806089401245, "reward_std": 0.2190828826278448, "rewards/cosine_scaled_reward": -0.06960096955299377, "rewards/format_reward": 0.25, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.256, "grad_norm": 0.13108882308006287, "kl": 0.06268310546875, "learning_rate": 1.3045428945301953e-07, "loss": 0.0003, "reward": 0.10701127722859383, "reward_std": 0.24802839010953903, "rewards/cosine_scaled_reward": 0.03267230466008186, "rewards/format_reward": 0.0416666679084301, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 3524.125, "epoch": 0.25657142857142856, "grad_norm": 0.16846035420894623, "kl": 0.04693603515625, "learning_rate": 1.2932844562179352e-07, "loss": 0.0308, "reward": 0.05915266275405884, "reward_std": 0.8702057525515556, "rewards/cosine_scaled_reward": -0.012090334668755531, "rewards/format_reward": 0.0833333358168602, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 2342.3334045410156, "epoch": 0.2571428571428571, "grad_norm": 0.5184911489486694, "kl": 0.0618896484375, "learning_rate": 1.2822310472864885e-07, "loss": 0.2169, "reward": 0.7721665650606155, "reward_std": 0.7354639321565628, "rewards/cosine_scaled_reward": 0.11524996906518936, "rewards/format_reward": 0.5416666716337204, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 3561.3333740234375, "epoch": 0.25771428571428573, "grad_norm": 0.22711819410324097, "kl": 0.07025146484375, "learning_rate": 1.2713832064634125e-07, "loss": 0.0134, "reward": -0.7618950307369232, "reward_std": 0.3447403945028782, "rewards/cosine_scaled_reward": -0.4017808511853218, "rewards/format_reward": 0.0416666679084301, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 3037.6250610351562, "epoch": 0.2582857142857143, "grad_norm": 0.28949570655822754, "kl": 0.06170654296875, "learning_rate": 1.260741462457165e-07, "loss": 0.2172, "reward": -0.047480987675953656, "reward_std": 0.5772276148200035, "rewards/cosine_scaled_reward": -0.1279071494936943, "rewards/format_reward": 0.2083333358168602, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 3401.0000610351562, "epoch": 0.25885714285714284, "grad_norm": 0.25010257959365845, "kl": 0.06524658203125, "learning_rate": 1.2503063339313356e-07, "loss": 0.0653, "reward": -0.24748031795024872, "reward_std": 0.6849566511809826, "rewards/cosine_scaled_reward": -0.20707351714372635, "rewards/format_reward": 0.1666666679084301, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 3466.9166870117188, "epoch": 0.25942857142857145, "grad_norm": 0.209227055311203, "kl": 0.0743408203125, "learning_rate": 1.2400783294793668e-07, "loss": 0.0432, "reward": -0.3158434331417084, "reward_std": 0.6184379309415817, "rewards/cosine_scaled_reward": -0.1995883844792843, "rewards/format_reward": 0.0833333358168602, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.26, "grad_norm": 0.19111324846744537, "kl": 0.0718994140625, "learning_rate": 1.2300579475997657e-07, "loss": 0.0003, "reward": -0.541237011551857, "reward_std": 0.22257393412292004, "rewards/cosine_scaled_reward": -0.2706185057759285, "rewards/format_reward": 0.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.26057142857142856, "grad_norm": 0.15810757875442505, "kl": 0.06884765625, "learning_rate": 1.220245676671809e-07, "loss": 0.0003, "reward": -0.4421476610004902, "reward_std": 0.19191257283091545, "rewards/cosine_scaled_reward": -0.22107383236289024, "rewards/format_reward": 0.0, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2937.125, "epoch": 0.2611428571428571, "grad_norm": 0.2057061344385147, "kl": 0.140380859375, "learning_rate": 1.2106419949317388e-07, "loss": 0.0211, "reward": -0.03692680597305298, "reward_std": 0.3839663378894329, "rewards/cosine_scaled_reward": -0.1434633992612362, "rewards/format_reward": 0.25, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 3399.2083740234375, "epoch": 0.26171428571428573, "grad_norm": 0.20098821818828583, "kl": 0.066162109375, "learning_rate": 1.2012473704494537e-07, "loss": 0.0564, "reward": -0.3796830028295517, "reward_std": 0.35559918358922005, "rewards/cosine_scaled_reward": -0.2315081711858511, "rewards/format_reward": 0.0833333358168602, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2622857142857143, "grad_norm": 0.188381627202034, "kl": 0.05596923828125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0002, "reward": -0.4935343600809574, "reward_std": 0.35022899881005287, "rewards/cosine_scaled_reward": -0.2467671800404787, "rewards/format_reward": 0.0, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2985.9166870117188, "epoch": 0.26285714285714284, "grad_norm": 0.23509609699249268, "kl": 0.06756591796875, "learning_rate": 1.1830871145697412e-07, "loss": -0.0768, "reward": 0.22741861082613468, "reward_std": 0.20784308947622776, "rewards/cosine_scaled_reward": -0.011290664784610271, "rewards/format_reward": 0.25, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 2779.5417098999023, "epoch": 0.2634285714285714, "grad_norm": 0.452884316444397, "kl": 0.06634521484375, "learning_rate": 1.1743223682775649e-07, "loss": 0.1131, "reward": -0.07000309228897095, "reward_std": 0.4777501877397299, "rewards/cosine_scaled_reward": -0.20166821405291557, "rewards/format_reward": 0.3333333358168602, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 3556.5, "epoch": 0.264, "grad_norm": 0.13507026433944702, "kl": 0.04974365234375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0163, "reward": -0.47025562822818756, "reward_std": 0.5417722593992949, "rewards/cosine_scaled_reward": -0.2559611462056637, "rewards/format_reward": 0.0416666679084301, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.26457142857142857, "grad_norm": 0.139174684882164, "kl": 0.05755615234375, "learning_rate": 1.1574257748745986e-07, "loss": 0.0002, "reward": -0.5710950195789337, "reward_std": 0.528672531247139, "rewards/cosine_scaled_reward": -0.30638084560632706, "rewards/format_reward": 0.0416666679084301, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 3249.125, "epoch": 0.2651428571428571, "grad_norm": 0.16046997904777527, "kl": 0.05987548828125, "learning_rate": 1.1492947512799328e-07, "loss": -0.0441, "reward": -0.28501003235578537, "reward_std": 0.4225130509585142, "rewards/cosine_scaled_reward": -0.24667168036103249, "rewards/format_reward": 0.2083333432674408, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 3547.0, "epoch": 0.26571428571428574, "grad_norm": 0.1321614384651184, "kl": 0.04974365234375, "learning_rate": 1.1413757749211602e-07, "loss": 0.0219, "reward": -0.4046256057918072, "reward_std": 0.48913862090557814, "rewards/cosine_scaled_reward": -0.22314614709466696, "rewards/format_reward": 0.0416666679084301, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 3458.1250610351562, "epoch": 0.2662857142857143, "grad_norm": 0.14799264073371887, "kl": 0.0499267578125, "learning_rate": 1.1336692317580158e-07, "loss": 0.0464, "reward": -0.34675589203834534, "reward_std": 0.5809434130787849, "rewards/cosine_scaled_reward": -0.2983779553323984, "rewards/format_reward": 0.2500000074505806, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.26685714285714285, "grad_norm": 0.14886276423931122, "kl": 0.05316162109375, "learning_rate": 1.1261754973965422e-07, "loss": 0.0002, "reward": -0.4710244685411453, "reward_std": 0.23188624903559685, "rewards/cosine_scaled_reward": -0.23551223799586296, "rewards/format_reward": 0.0, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 3387.8333740234375, "epoch": 0.2674285714285714, "grad_norm": 0.1751989722251892, "kl": 0.05670166015625, "learning_rate": 1.1188949370707787e-07, "loss": -0.0164, "reward": -0.15125497430562973, "reward_std": 0.4892147481441498, "rewards/cosine_scaled_reward": -0.17979413457214832, "rewards/format_reward": 0.2083333432674408, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.268, "grad_norm": 0.14392061531543732, "kl": 0.06378173828125, "learning_rate": 1.1118279056249653e-07, "loss": 0.0003, "reward": -0.7428162097930908, "reward_std": 0.17959357798099518, "rewards/cosine_scaled_reward": -0.3714081048965454, "rewards/format_reward": 0.0, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 2713.6250610351562, "epoch": 0.26857142857142857, "grad_norm": 0.2141987532377243, "kl": 0.0821533203125, "learning_rate": 1.1049747474962444e-07, "loss": 0.0536, "reward": 0.39758430421352386, "reward_std": 0.6199628822505474, "rewards/cosine_scaled_reward": -0.009541166247799993, "rewards/format_reward": 0.4166666716337204, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 3557.75, "epoch": 0.26914285714285713, "grad_norm": 0.14714567363262177, "kl": 0.0565185546875, "learning_rate": 1.0983357966978745e-07, "loss": 0.0138, "reward": -0.365282267332077, "reward_std": 0.6022834666073322, "rewards/cosine_scaled_reward": -0.2451411336660385, "rewards/format_reward": 0.1250000037252903, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 3472.7916870117188, "epoch": 0.26971428571428574, "grad_norm": 0.179129496216774, "kl": 0.0621337890625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0498, "reward": -0.3794557135552168, "reward_std": 0.3991483077406883, "rewards/cosine_scaled_reward": -0.2522278605028987, "rewards/format_reward": 0.1250000037252903, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 3019.2916870117188, "epoch": 0.2702857142857143, "grad_norm": 0.22841279208660126, "kl": 0.06793212890625, "learning_rate": 1.0857018009286381e-07, "loss": 0.0973, "reward": -0.4580535739660263, "reward_std": 0.39184647984802723, "rewards/cosine_scaled_reward": -0.37486011534929276, "rewards/format_reward": 0.2916666679084301, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2870.916717529297, "epoch": 0.27085714285714285, "grad_norm": 0.1689484417438507, "kl": 0.0419921875, "learning_rate": 1.0797073717209013e-07, "loss": -0.0129, "reward": 0.28269072622060776, "reward_std": 0.2588655799627304, "rewards/cosine_scaled_reward": -0.08782129734754562, "rewards/format_reward": 0.4583333432674408, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 3473.5833740234375, "epoch": 0.2714285714285714, "grad_norm": 0.475265234708786, "kl": 0.16973876953125, "learning_rate": 1.0739283813397639e-07, "loss": 0.0462, "reward": -0.11720195040106773, "reward_std": 0.746030643582344, "rewards/cosine_scaled_reward": -0.14193430170416832, "rewards/format_reward": 0.1666666716337204, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 3468.9583740234375, "epoch": 0.272, "grad_norm": 0.22294382750988007, "kl": 0.06390380859375, "learning_rate": 1.068365111445064e-07, "loss": 0.0685, "reward": -0.8103623390197754, "reward_std": 0.16845603752881289, "rewards/cosine_scaled_reward": -0.4260144978761673, "rewards/format_reward": 0.0416666679084301, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2725714285714286, "grad_norm": 0.163096621632576, "kl": 0.07647705078125, "learning_rate": 1.063017833182728e-07, "loss": 0.0003, "reward": -0.7045374438166618, "reward_std": 0.3001190824434161, "rewards/cosine_scaled_reward": -0.393935389816761, "rewards/format_reward": 0.0833333358168602, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.27314285714285713, "grad_norm": 0.16238611936569214, "kl": 0.051025390625, "learning_rate": 1.0578868071715544e-07, "loss": 0.0002, "reward": -0.49170275777578354, "reward_std": 0.1920237122103572, "rewards/cosine_scaled_reward": -0.24585136957466602, "rewards/format_reward": 0.0, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 3314.25, "epoch": 0.2737142857142857, "grad_norm": 0.16748814284801483, "kl": 0.06201171875, "learning_rate": 1.0529722834905125e-07, "loss": 0.0871, "reward": 0.5320697575807571, "reward_std": 0.6121283564716578, "rewards/cosine_scaled_reward": 0.16186823695898056, "rewards/format_reward": 0.2083333358168602, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 3570.9166870117188, "epoch": 0.2742857142857143, "grad_norm": 0.15536968410015106, "kl": 0.0684814453125, "learning_rate": 1.0482745016665526e-07, "loss": 0.0078, "reward": -0.5202671885490417, "reward_std": 0.39154190942645073, "rewards/cosine_scaled_reward": -0.2809669245034456, "rewards/format_reward": 0.0416666679084301, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 3291.9583740234375, "epoch": 0.27485714285714286, "grad_norm": 0.26999354362487793, "kl": 0.0859375, "learning_rate": 1.0437936906629334e-07, "loss": 0.092, "reward": -0.4586787410080433, "reward_std": 0.26687505654990673, "rewards/cosine_scaled_reward": -0.291839387267828, "rewards/format_reward": 0.125, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 3199.75, "epoch": 0.2754285714285714, "grad_norm": 0.15752355754375458, "kl": 0.0611572265625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0365, "reward": -0.32594752311706543, "reward_std": 0.4160085953772068, "rewards/cosine_scaled_reward": -0.2671404145658016, "rewards/format_reward": 0.2083333432674408, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 3544.6666870117188, "epoch": 0.276, "grad_norm": 0.14844165742397308, "kl": 0.06427001953125, "learning_rate": 1.0354838440848501e-07, "loss": 0.0233, "reward": -0.6612615287303925, "reward_std": 0.4578050300478935, "rewards/cosine_scaled_reward": -0.3514641039073467, "rewards/format_reward": 0.0416666679084301, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 2735.25, "epoch": 0.2765714285714286, "grad_norm": 0.2591208815574646, "kl": 0.0555419921875, "learning_rate": 1.0316552135205837e-07, "loss": -0.024, "reward": -0.3376142382621765, "reward_std": 0.3799719735980034, "rewards/cosine_scaled_reward": -0.29380711913108826, "rewards/format_reward": 0.2500000111758709, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 3395.5833740234375, "epoch": 0.27714285714285714, "grad_norm": 0.1682201325893402, "kl": 0.060791015625, "learning_rate": 1.0280443637773163e-07, "loss": 0.0391, "reward": -0.04071862995624542, "reward_std": 0.600635758601129, "rewards/cosine_scaled_reward": -0.10369264334440231, "rewards/format_reward": 0.1666666716337204, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2664.500045776367, "epoch": 0.2777142857142857, "grad_norm": 0.3096904456615448, "kl": 0.05816650390625, "learning_rate": 1.0246514708427701e-07, "loss": 0.0887, "reward": -0.291459396481514, "reward_std": 0.21652375906705856, "rewards/cosine_scaled_reward": -0.29156303964555264, "rewards/format_reward": 0.2916666679084301, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 3101.9166870117188, "epoch": 0.2782857142857143, "grad_norm": 0.25412291288375854, "kl": 0.0621337890625, "learning_rate": 1.0214767000817596e-07, "loss": 0.1419, "reward": 0.23972990177571774, "reward_std": 0.9307698365300894, "rewards/cosine_scaled_reward": -0.00513505470007658, "rewards/format_reward": 0.2500000074505806, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 3562.4583740234375, "epoch": 0.27885714285714286, "grad_norm": 0.15202371776103973, "kl": 0.05609130859375, "learning_rate": 1.0185202062281336e-07, "loss": 0.0126, "reward": -0.47963531874120235, "reward_std": 0.48471274971961975, "rewards/cosine_scaled_reward": -0.26065099239349365, "rewards/format_reward": 0.0416666679084301, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.2794285714285714, "grad_norm": 0.16845981776714325, "kl": 0.0772705078125, "learning_rate": 1.0157821333772304e-07, "loss": 0.0003, "reward": -0.5912460908293724, "reward_std": 0.15500911697745323, "rewards/cosine_scaled_reward": -0.2956230528652668, "rewards/format_reward": 0.0, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 2680.6666870117188, "epoch": 0.28, "grad_norm": 0.2831243872642517, "kl": 0.05218505859375, "learning_rate": 1.013262614978859e-07, "loss": 0.1674, "reward": 0.24096882343292236, "reward_std": 0.7047604825347662, "rewards/cosine_scaled_reward": -0.08784890919923782, "rewards/format_reward": 0.4166666865348816, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 3334.5, "epoch": 0.2805714285714286, "grad_norm": 0.23872096836566925, "kl": 0.06268310546875, "learning_rate": 1.0109617738307911e-07, "loss": 0.1248, "reward": -0.6380931735038757, "reward_std": 0.5134330447763205, "rewards/cosine_scaled_reward": -0.36071325466036797, "rewards/format_reward": 0.0833333358168602, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 3078.5416870117188, "epoch": 0.28114285714285714, "grad_norm": 0.46638068556785583, "kl": 0.10198974609375, "learning_rate": 1.0088797220727779e-07, "loss": 0.0973, "reward": -0.29000576585531235, "reward_std": 0.3600935824215412, "rewards/cosine_scaled_reward": -0.2700028717517853, "rewards/format_reward": 0.2500000111758709, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 2935.5833435058594, "epoch": 0.2817142857142857, "grad_norm": 0.18331332504749298, "kl": 0.08758544921875, "learning_rate": 1.0070165611810855e-07, "loss": 0.0326, "reward": 0.09402071312069893, "reward_std": 0.43435685709118843, "rewards/cosine_scaled_reward": -0.09882297366857529, "rewards/format_reward": 0.2916666679084301, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 3129.5000610351562, "epoch": 0.2822857142857143, "grad_norm": 0.23339956998825073, "kl": 0.07867431640625, "learning_rate": 1.005372381963547e-07, "loss": 0.1134, "reward": 0.17649170011281967, "reward_std": 0.6835528574883938, "rewards/cosine_scaled_reward": -0.03675416484475136, "rewards/format_reward": 0.2500000074505806, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.28285714285714286, "grad_norm": 0.14486908912658691, "kl": 0.05389404296875, "learning_rate": 1.0039472645551372e-07, "loss": 0.0002, "reward": -0.5066226273775101, "reward_std": 0.25357529893517494, "rewards/cosine_scaled_reward": -0.27414464950561523, "rewards/format_reward": 0.0416666679084301, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 2462.3750610351562, "epoch": 0.2834285714285714, "grad_norm": 0.4829932749271393, "kl": 0.08642578125, "learning_rate": 1.002741278414069e-07, "loss": 0.1718, "reward": 0.3855774737894535, "reward_std": 0.946885883808136, "rewards/cosine_scaled_reward": -0.07804461070918478, "rewards/format_reward": 0.5416666753590107, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 3563.7916870117188, "epoch": 0.284, "grad_norm": 0.13713513314723969, "kl": 0.047088623046875, "learning_rate": 1.0017544823184055e-07, "loss": 0.0098, "reward": -0.4956064634025097, "reward_std": 0.2097947271540761, "rewards/cosine_scaled_reward": -0.26863656379282475, "rewards/format_reward": 0.0416666679084301, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 2874.5833435058594, "epoch": 0.2845714285714286, "grad_norm": 28.739835739135742, "kl": 14.1053466796875, "learning_rate": 1.0009869243631952e-07, "loss": 0.0113, "reward": -0.19248086214065552, "reward_std": 0.41352982819080353, "rewards/cosine_scaled_reward": -0.22124045342206955, "rewards/format_reward": 0.25, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 3546.5, "epoch": 0.28514285714285714, "grad_norm": 0.15727418661117554, "kl": 0.072509765625, "learning_rate": 1.000438641958131e-07, "loss": 0.0135, "reward": -0.3870402202010155, "reward_std": 0.4852425046265125, "rewards/cosine_scaled_reward": -0.2351867752149701, "rewards/format_reward": 0.0833333358168602, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 3363.375, "epoch": 0.2857142857142857, "grad_norm": 0.14489901065826416, "kl": 0.0672607421875, "learning_rate": 1.0001096618257236e-07, "loss": 0.0835, "reward": -0.4100040141493082, "reward_std": 0.22312426194548607, "rewards/cosine_scaled_reward": -0.24666867777705193, "rewards/format_reward": 0.0833333358168602, "step": 500 }, { "epoch": 0.2857142857142857, "step": 500, "total_flos": 0.0, "train_loss": 0.04149833003999083, "train_runtime": 27564.67, "train_samples_per_second": 0.435, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }