{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2857142857142857, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 3068.5000610351562, "epoch": 0.0005714285714285715, "grad_norm": 0.08316484093666077, "kl": 0.0204010009765625, "learning_rate": 0.0, "loss": -0.0234, "reward": 0.200983926653862, "reward_std": 0.24425111338496208, "rewards/cosine_scaled_reward": -0.0453413650393486, "rewards/format_reward": 0.2916666679084301, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2821.9166717529297, "epoch": 0.001142857142857143, "grad_norm": 0.08570546656847, "kl": 0.011383056640625, "learning_rate": 2.0000000000000003e-06, "loss": 0.0401, "reward": -0.22895785048604012, "reward_std": 0.31652447022497654, "rewards/cosine_scaled_reward": -0.3019789308309555, "rewards/format_reward": 0.375, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 3095.1250610351562, "epoch": 0.0017142857142857142, "grad_norm": 0.12569886445999146, "kl": 0.0147705078125, "learning_rate": 4.000000000000001e-06, "loss": 0.09, "reward": 0.7377238147892058, "reward_std": 0.8798377588391304, "rewards/cosine_scaled_reward": 0.1396951973438263, "rewards/format_reward": 0.4583333507180214, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2732.041748046875, "epoch": 0.002285714285714286, "grad_norm": 0.158451646566391, "kl": 0.014007568359375, "learning_rate": 6e-06, "loss": 0.1068, "reward": 0.12815280258655548, "reward_std": 0.5621042996644974, "rewards/cosine_scaled_reward": -0.22759027034044266, "rewards/format_reward": 0.5833333432674408, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 3108.0416870117188, "epoch": 0.002857142857142857, "grad_norm": 0.18162518739700317, "kl": 0.017303466796875, "learning_rate": 8.000000000000001e-06, "loss": 0.1695, "reward": -0.3482682505855337, "reward_std": 0.40073107928037643, "rewards/cosine_scaled_reward": -0.25746746733784676, "rewards/format_reward": 0.1666666716337204, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 2933.7916717529297, "epoch": 0.0034285714285714284, "grad_norm": 0.21090327203273773, "kl": 0.0165863037109375, "learning_rate": 1e-05, "loss": 0.1763, "reward": -0.33019445836544037, "reward_std": 0.4870590269565582, "rewards/cosine_scaled_reward": -0.26926389336586, "rewards/format_reward": 0.2083333432674408, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 2758.4584045410156, "epoch": 0.004, "grad_norm": 0.14720787107944489, "kl": 0.020599365234375, "learning_rate": 1.2e-05, "loss": 0.2279, "reward": -0.05960409715771675, "reward_std": 0.259865116328001, "rewards/cosine_scaled_reward": -0.19646872207522392, "rewards/format_reward": 0.3333333469927311, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 2996.2916870117188, "epoch": 0.004571428571428572, "grad_norm": 0.11497102677822113, "kl": 0.0175323486328125, "learning_rate": 1.4000000000000001e-05, "loss": 0.0293, "reward": -0.543822355568409, "reward_std": 0.21584158390760422, "rewards/cosine_scaled_reward": -0.4177445247769356, "rewards/format_reward": 0.2916666679084301, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2414.25, "epoch": 0.005142857142857143, "grad_norm": 0.0950588658452034, "kl": 0.0137939453125, "learning_rate": 1.6000000000000003e-05, "loss": 0.0196, "reward": 0.346224058419466, "reward_std": 0.2081431858241558, "rewards/cosine_scaled_reward": -0.07688797824084759, "rewards/format_reward": 0.5, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 3161.8333740234375, "epoch": 0.005714285714285714, "grad_norm": 0.06650526076555252, "kl": 0.011474609375, "learning_rate": 1.8e-05, "loss": -0.0217, "reward": 0.37098103761672974, "reward_std": 0.7834450677037239, "rewards/cosine_scaled_reward": -0.0020095184445381165, "rewards/format_reward": 0.3750000037252903, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2583.500045776367, "epoch": 0.006285714285714286, "grad_norm": 0.11317435652017593, "kl": 0.021820068359375, "learning_rate": 2e-05, "loss": 0.0819, "reward": -0.07037418521940708, "reward_std": 0.7221511900424957, "rewards/cosine_scaled_reward": -0.26435376331210136, "rewards/format_reward": 0.4583333395421505, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 3155.3333740234375, "epoch": 0.006857142857142857, "grad_norm": 0.09496507048606873, "kl": 0.0161285400390625, "learning_rate": 2.2000000000000003e-05, "loss": 0.0429, "reward": 0.3653342239558697, "reward_std": 0.8600304946303368, "rewards/cosine_scaled_reward": 0.016000449657440186, "rewards/format_reward": 0.3333333432674408, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 3130.2083740234375, "epoch": 0.0074285714285714285, "grad_norm": 0.10498952120542526, "kl": 0.01226806640625, "learning_rate": 2.4e-05, "loss": 0.1331, "reward": 0.41605053562670946, "reward_std": 0.8384798839688301, "rewards/cosine_scaled_reward": -0.0003080591559410095, "rewards/format_reward": 0.4166666828095913, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2837.9166870117188, "epoch": 0.008, "grad_norm": 0.07714565843343735, "kl": 0.0148468017578125, "learning_rate": 2.6000000000000002e-05, "loss": 0.037, "reward": 0.8426800966262817, "reward_std": 0.5940273888409138, "rewards/cosine_scaled_reward": 0.21300670504570007, "rewards/format_reward": 0.4166666716337204, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2945.0834045410156, "epoch": 0.008571428571428572, "grad_norm": 0.1212492287158966, "kl": 0.014190673828125, "learning_rate": 2.8000000000000003e-05, "loss": 0.2023, "reward": -0.19765784591436386, "reward_std": 0.4926959238946438, "rewards/cosine_scaled_reward": -0.2654956020414829, "rewards/format_reward": 0.3333333469927311, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 3162.2083740234375, "epoch": 0.009142857142857144, "grad_norm": 0.08731003850698471, "kl": 0.0145416259765625, "learning_rate": 3e-05, "loss": -0.0229, "reward": 0.32796957343816757, "reward_std": 0.6958065256476402, "rewards/cosine_scaled_reward": -0.044348541647195816, "rewards/format_reward": 0.4166666716337204, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.009714285714285713, "grad_norm": 0.07292164862155914, "kl": 0.0186614990234375, "learning_rate": 3.2000000000000005e-05, "loss": 0.0007, "reward": -0.43665362149477005, "reward_std": 0.42871900647878647, "rewards/cosine_scaled_reward": -0.2391601405106485, "rewards/format_reward": 0.0416666679084301, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 3454.8333740234375, "epoch": 0.010285714285714285, "grad_norm": 0.06652959436178207, "kl": 0.0173797607421875, "learning_rate": 3.4000000000000007e-05, "loss": 0.0322, "reward": -0.15206466615200043, "reward_std": 0.8579627200961113, "rewards/cosine_scaled_reward": -0.1801990047097206, "rewards/format_reward": 0.2083333358168602, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 3139.5, "epoch": 0.010857142857142857, "grad_norm": 0.07377547770738602, "kl": 0.021087646484375, "learning_rate": 3.6e-05, "loss": -0.0956, "reward": 0.27641918882727623, "reward_std": 0.3846270814538002, "rewards/cosine_scaled_reward": 0.013209596276283264, "rewards/format_reward": 0.2500000111758709, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 2240.791732788086, "epoch": 0.011428571428571429, "grad_norm": 0.11604765802621841, "kl": 0.01763916015625, "learning_rate": 3.8e-05, "loss": -0.0082, "reward": 0.5373580157756805, "reward_std": 0.6646340787410736, "rewards/cosine_scaled_reward": -0.04382099770009518, "rewards/format_reward": 0.625, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1933.7500610351562, "epoch": 0.012, "grad_norm": 0.17429925501346588, "kl": 0.0303955078125, "learning_rate": 4e-05, "loss": 0.2362, "reward": 0.5577291771769524, "reward_std": 0.8789636418223381, "rewards/cosine_scaled_reward": -0.07530209049582481, "rewards/format_reward": 0.7083333358168602, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 3554.666748046875, "epoch": 0.012571428571428572, "grad_norm": 0.05839058756828308, "kl": 0.0146636962890625, "learning_rate": 4.2e-05, "loss": 0.0079, "reward": 0.32253583520650864, "reward_std": 1.1349023096263409, "rewards/cosine_scaled_reward": 0.01543455570936203, "rewards/format_reward": 0.2916666753590107, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 2841.666717529297, "epoch": 0.013142857142857144, "grad_norm": 0.1242748275399208, "kl": 0.01934814453125, "learning_rate": 4.4000000000000006e-05, "loss": 0.0414, "reward": 0.6808896251022816, "reward_std": 0.757483784109354, "rewards/cosine_scaled_reward": 0.09044479578733444, "rewards/format_reward": 0.5000000074505806, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 2709.6666870117188, "epoch": 0.013714285714285714, "grad_norm": 0.08774807304143906, "kl": 0.018585205078125, "learning_rate": 4.600000000000001e-05, "loss": 0.0703, "reward": 0.7750062793493271, "reward_std": 0.5958304777741432, "rewards/cosine_scaled_reward": 0.11666978895664215, "rewards/format_reward": 0.5416666679084301, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 2390.125030517578, "epoch": 0.014285714285714285, "grad_norm": 0.08277406543493271, "kl": 0.015777587890625, "learning_rate": 4.8e-05, "loss": 0.0432, "reward": 1.0784958824515343, "reward_std": 0.4524005614221096, "rewards/cosine_scaled_reward": 0.18508130311965942, "rewards/format_reward": 0.7083333432674408, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 3135.791717529297, "epoch": 0.014857142857142857, "grad_norm": 0.0981830507516861, "kl": 0.0186614990234375, "learning_rate": 5e-05, "loss": -0.0303, "reward": -0.14081082493066788, "reward_std": 0.684042863547802, "rewards/cosine_scaled_reward": -0.23707208782434464, "rewards/format_reward": 0.3333333358168602, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3550.1666870117188, "epoch": 0.015428571428571429, "grad_norm": 0.08542604744434357, "kl": 0.0210418701171875, "learning_rate": 5.2000000000000004e-05, "loss": 0.0193, "reward": -0.24810760095715523, "reward_std": 0.5632593892514706, "rewards/cosine_scaled_reward": -0.16572047024965286, "rewards/format_reward": 0.0833333358168602, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 3004.2500610351562, "epoch": 0.016, "grad_norm": 0.1371905505657196, "kl": 0.024505615234375, "learning_rate": 5.4000000000000005e-05, "loss": 0.1257, "reward": 0.41203732788562775, "reward_std": 0.9210044294595718, "rewards/cosine_scaled_reward": -0.0023146718740463257, "rewards/format_reward": 0.416666679084301, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2946.9583740234375, "epoch": 0.01657142857142857, "grad_norm": 0.08096691220998764, "kl": 0.023193359375, "learning_rate": 5.6000000000000006e-05, "loss": -0.026, "reward": 0.13879438489675522, "reward_std": 0.6071850284934044, "rewards/cosine_scaled_reward": -0.07643614336848259, "rewards/format_reward": 0.2916666679084301, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 3571.5833740234375, "epoch": 0.017142857142857144, "grad_norm": 0.06950397789478302, "kl": 0.0203857421875, "learning_rate": 5.8e-05, "loss": 0.008, "reward": -0.10841021686792374, "reward_std": 1.061239955946803, "rewards/cosine_scaled_reward": -0.13753844052553177, "rewards/format_reward": 0.1666666716337204, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2661.9583740234375, "epoch": 0.017714285714285714, "grad_norm": 0.13690024614334106, "kl": 0.021331787109375, "learning_rate": 6e-05, "loss": 0.069, "reward": 0.4652084708213806, "reward_std": 0.5595494862645864, "rewards/cosine_scaled_reward": 0.024270888417959213, "rewards/format_reward": 0.4166666716337204, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2511.9583740234375, "epoch": 0.018285714285714287, "grad_norm": 0.07431714236736298, "kl": 0.020538330078125, "learning_rate": 6.2e-05, "loss": 0.0788, "reward": 0.8832313418388367, "reward_std": 0.708160936832428, "rewards/cosine_scaled_reward": 0.14994902536273003, "rewards/format_reward": 0.5833333432674408, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.018857142857142857, "grad_norm": 0.08026785403490067, "kl": 0.02886962890625, "learning_rate": 6.400000000000001e-05, "loss": 0.0012, "reward": -0.6131787896156311, "reward_std": 0.201310433447361, "rewards/cosine_scaled_reward": -0.30658938735723495, "rewards/format_reward": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2966.3333435058594, "epoch": 0.019428571428571427, "grad_norm": 0.10494574159383774, "kl": 0.024169921875, "learning_rate": 6.6e-05, "loss": -0.0606, "reward": -0.22858721017837524, "reward_std": 0.3569427113980055, "rewards/cosine_scaled_reward": -0.2601269483566284, "rewards/format_reward": 0.2916666679084301, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.02, "grad_norm": 0.07825391739606857, "kl": 0.027740478515625, "learning_rate": 6.800000000000001e-05, "loss": 0.0011, "reward": -0.7152878791093826, "reward_std": 0.29954793583601713, "rewards/cosine_scaled_reward": -0.3576439470052719, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.02057142857142857, "grad_norm": 0.07421068102121353, "kl": 0.029144287109375, "learning_rate": 7e-05, "loss": 0.0012, "reward": -0.4027569368481636, "reward_std": 0.21831603534519672, "rewards/cosine_scaled_reward": -0.20137847773730755, "rewards/format_reward": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3572.5416870117188, "epoch": 0.021142857142857144, "grad_norm": 0.057787228375673294, "kl": 0.022430419921875, "learning_rate": 7.2e-05, "loss": 0.0044, "reward": -0.1326567530632019, "reward_std": 0.5722145922482014, "rewards/cosine_scaled_reward": -0.1288283858448267, "rewards/format_reward": 0.125, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3340.375, "epoch": 0.021714285714285714, "grad_norm": 0.06777527928352356, "kl": 0.027099609375, "learning_rate": 7.4e-05, "loss": -0.021, "reward": -0.35966064035892487, "reward_std": 0.3422342501580715, "rewards/cosine_scaled_reward": -0.30483032763004303, "rewards/format_reward": 0.25, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2416.291717529297, "epoch": 0.022285714285714287, "grad_norm": 0.10955022275447845, "kl": 0.0400390625, "learning_rate": 7.6e-05, "loss": 0.0797, "reward": 0.43888016045093536, "reward_std": 0.6578193679451942, "rewards/cosine_scaled_reward": -0.05139327887445688, "rewards/format_reward": 0.541666679084301, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 3547.3333740234375, "epoch": 0.022857142857142857, "grad_norm": 0.07015793770551682, "kl": 0.0251922607421875, "learning_rate": 7.800000000000001e-05, "loss": 0.0054, "reward": -0.40920185297727585, "reward_std": 0.4526283470913768, "rewards/cosine_scaled_reward": -0.2671009246259928, "rewards/format_reward": 0.125, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2995.75, "epoch": 0.023428571428571427, "grad_norm": 0.07032545655965805, "kl": 0.0225830078125, "learning_rate": 8e-05, "loss": -0.0094, "reward": -0.155843585729599, "reward_std": 0.10809195134788752, "rewards/cosine_scaled_reward": -0.2029217779636383, "rewards/format_reward": 0.25, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2693.4166717529297, "epoch": 0.024, "grad_norm": 0.06467006355524063, "kl": 0.0238800048828125, "learning_rate": 8.2e-05, "loss": 0.0134, "reward": 0.43927521631121635, "reward_std": 0.6234664730727673, "rewards/cosine_scaled_reward": -0.009529059752821922, "rewards/format_reward": 0.4583333395421505, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.02457142857142857, "grad_norm": 0.06509877741336823, "kl": 0.02996826171875, "learning_rate": 8.4e-05, "loss": 0.0012, "reward": -0.6067558601498604, "reward_std": 0.1665392592549324, "rewards/cosine_scaled_reward": -0.3033779449760914, "rewards/format_reward": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 3574.125, "epoch": 0.025142857142857144, "grad_norm": 0.06305427849292755, "kl": 0.027435302734375, "learning_rate": 8.6e-05, "loss": 0.004, "reward": -0.23581353574991226, "reward_std": 0.6179232448339462, "rewards/cosine_scaled_reward": -0.18040677905082703, "rewards/format_reward": 0.125, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3155.125, "epoch": 0.025714285714285714, "grad_norm": 0.10693054646253586, "kl": 0.0302734375, "learning_rate": 8.800000000000001e-05, "loss": 0.1501, "reward": -0.355922631919384, "reward_std": 0.56600271910429, "rewards/cosine_scaled_reward": -0.2821279801428318, "rewards/format_reward": 0.2083333395421505, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 3348.875, "epoch": 0.026285714285714287, "grad_norm": 0.08973235636949539, "kl": 0.027587890625, "learning_rate": 9e-05, "loss": 0.087, "reward": -0.17564061796292663, "reward_std": 0.6632250510156155, "rewards/cosine_scaled_reward": -0.19198699295520782, "rewards/format_reward": 0.2083333358168602, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2860.0833740234375, "epoch": 0.026857142857142857, "grad_norm": 0.0798032283782959, "kl": 0.023529052734375, "learning_rate": 9.200000000000001e-05, "loss": 0.0161, "reward": 0.2136048525571823, "reward_std": 0.788389652967453, "rewards/cosine_scaled_reward": -0.05986428260803223, "rewards/format_reward": 0.3333333358168602, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2709.3333740234375, "epoch": 0.027428571428571427, "grad_norm": 0.10712946206331253, "kl": 0.0269775390625, "learning_rate": 9.4e-05, "loss": 0.2225, "reward": 0.7221578508615494, "reward_std": 0.7804624438285828, "rewards/cosine_scaled_reward": 0.04857892170548439, "rewards/format_reward": 0.6250000149011612, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2863.666717529297, "epoch": 0.028, "grad_norm": 0.10389462858438492, "kl": 0.03228759765625, "learning_rate": 9.6e-05, "loss": 0.1357, "reward": 0.5737984776496887, "reward_std": 0.6438678838312626, "rewards/cosine_scaled_reward": 0.07856592535972595, "rewards/format_reward": 0.4166666865348816, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 3154.916748046875, "epoch": 0.02857142857142857, "grad_norm": 0.1013183668255806, "kl": 0.0369873046875, "learning_rate": 9.8e-05, "loss": 0.1183, "reward": 0.13152291253209114, "reward_std": 0.5646936669945717, "rewards/cosine_scaled_reward": -0.10090522468090057, "rewards/format_reward": 0.3333333432674408, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 3158.9583435058594, "epoch": 0.029142857142857144, "grad_norm": 0.07180243730545044, "kl": 0.033721923828125, "learning_rate": 0.0001, "loss": -0.0368, "reward": 0.07173049449920654, "reward_std": 0.21982344426214695, "rewards/cosine_scaled_reward": -0.08913473784923553, "rewards/format_reward": 0.25, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3562.7083740234375, "epoch": 0.029714285714285714, "grad_norm": 0.06174299493432045, "kl": 0.02374267578125, "learning_rate": 9.999890338174276e-05, "loss": 0.0131, "reward": -0.344110494479537, "reward_std": 0.8078071549534798, "rewards/cosine_scaled_reward": -0.23455525189638138, "rewards/format_reward": 0.1250000037252903, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3081.625030517578, "epoch": 0.030285714285714287, "grad_norm": 0.073664590716362, "kl": 0.0404052734375, "learning_rate": 9.999561358041869e-05, "loss": 0.0109, "reward": 0.42618853598833084, "reward_std": 0.5039119943976402, "rewards/cosine_scaled_reward": 0.06726095359772444, "rewards/format_reward": 0.2916666679084301, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3388.2083740234375, "epoch": 0.030857142857142857, "grad_norm": 0.0838247761130333, "kl": 0.04852294921875, "learning_rate": 9.999013075636805e-05, "loss": 0.0601, "reward": -0.505183070898056, "reward_std": 0.41321277990937233, "rewards/cosine_scaled_reward": -0.3359248712658882, "rewards/format_reward": 0.1666666679084301, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3401.75, "epoch": 0.03142857142857143, "grad_norm": 0.07153363525867462, "kl": 0.048828125, "learning_rate": 9.998245517681595e-05, "loss": 0.0337, "reward": 0.6791047602891922, "reward_std": 0.6441808789968491, "rewards/cosine_scaled_reward": 0.1312190592288971, "rewards/format_reward": 0.4166666716337204, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3383.5416870117188, "epoch": 0.032, "grad_norm": 0.2566574811935425, "kl": 0.1033935546875, "learning_rate": 9.997258721585931e-05, "loss": 0.1083, "reward": -0.4387590363621712, "reward_std": 0.3598366603255272, "rewards/cosine_scaled_reward": -0.2610462047159672, "rewards/format_reward": 0.0833333358168602, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3268.0416870117188, "epoch": 0.03257142857142857, "grad_norm": 0.11019923537969589, "kl": 0.04718017578125, "learning_rate": 9.996052735444863e-05, "loss": 0.0445, "reward": 0.07900557294487953, "reward_std": 0.7729422375559807, "rewards/cosine_scaled_reward": -0.08549723774194717, "rewards/format_reward": 0.2500000111758709, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3465.5, "epoch": 0.03314285714285714, "grad_norm": 0.08362831175327301, "kl": 0.05291748046875, "learning_rate": 9.994627618036454e-05, "loss": 0.0591, "reward": -0.44299469888210297, "reward_std": 0.6330600045621395, "rewards/cosine_scaled_reward": -0.28399735875427723, "rewards/format_reward": 0.1250000037252903, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 3503.9583740234375, "epoch": 0.03371428571428572, "grad_norm": 0.08290518820285797, "kl": 0.0670166015625, "learning_rate": 9.992983438818914e-05, "loss": 0.01, "reward": -0.33126915991306305, "reward_std": 0.5302771776914597, "rewards/cosine_scaled_reward": -0.22813457623124123, "rewards/format_reward": 0.125, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.03428571428571429, "grad_norm": 0.06376607716083527, "kl": 0.055938720703125, "learning_rate": 9.991120277927223e-05, "loss": 0.0022, "reward": -0.5242063365876675, "reward_std": 0.24296396784484386, "rewards/cosine_scaled_reward": -0.2829365022480488, "rewards/format_reward": 0.0416666679084301, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 2896.500030517578, "epoch": 0.03485714285714286, "grad_norm": 0.10472023487091064, "kl": 0.0540771484375, "learning_rate": 9.989038226169209e-05, "loss": 0.0943, "reward": 0.12865129858255386, "reward_std": 1.051661066710949, "rewards/cosine_scaled_reward": -0.16484103631228209, "rewards/format_reward": 0.4583333395421505, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2667.5416717529297, "epoch": 0.03542857142857143, "grad_norm": 17.001340866088867, "kl": 1.73077392578125, "learning_rate": 9.986737385021142e-05, "loss": 0.1479, "reward": -0.36982037127017975, "reward_std": 0.38738980889320374, "rewards/cosine_scaled_reward": -0.3515768498182297, "rewards/format_reward": 0.3333333358168602, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3583.3333740234375, "epoch": 0.036, "grad_norm": 0.07674090564250946, "kl": 0.0684814453125, "learning_rate": 9.98421786662277e-05, "loss": 0.0031, "reward": 0.01588384434580803, "reward_std": 0.9027550332248211, "rewards/cosine_scaled_reward": -0.07539140060544014, "rewards/format_reward": 0.1666666716337204, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 3046.6666870117188, "epoch": 0.036571428571428574, "grad_norm": 0.25831571221351624, "kl": 0.06427001953125, "learning_rate": 9.981479793771866e-05, "loss": 0.212, "reward": -0.42657897621393204, "reward_std": 0.26655818335711956, "rewards/cosine_scaled_reward": -0.3174561560153961, "rewards/format_reward": 0.2083333358168602, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3522.0416870117188, "epoch": 0.037142857142857144, "grad_norm": 0.0953318402171135, "kl": 0.1075439453125, "learning_rate": 9.97852329991824e-05, "loss": 0.0295, "reward": -0.37634188309311867, "reward_std": 0.3436935096979141, "rewards/cosine_scaled_reward": -0.22983760759234428, "rewards/format_reward": 0.0833333358168602, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 3214.0000610351562, "epoch": 0.037714285714285714, "grad_norm": 0.2412433624267578, "kl": 0.08349609375, "learning_rate": 9.97534852915723e-05, "loss": 0.0985, "reward": 0.3821183741092682, "reward_std": 1.0826219320297241, "rewards/cosine_scaled_reward": -0.038107482716441154, "rewards/format_reward": 0.4583333432674408, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 3045.5000610351562, "epoch": 0.038285714285714284, "grad_norm": 0.11741996556520462, "kl": 0.1163330078125, "learning_rate": 9.971955636222684e-05, "loss": 0.0296, "reward": 0.2733103707432747, "reward_std": 1.0267937406897545, "rewards/cosine_scaled_reward": -0.05084482580423355, "rewards/format_reward": 0.3750000037252903, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.038857142857142854, "grad_norm": 0.1156265065073967, "kl": 0.1378173828125, "learning_rate": 9.968344786479416e-05, "loss": 0.0055, "reward": -0.5511204618960619, "reward_std": 0.14900160022079945, "rewards/cosine_scaled_reward": -0.27556023094803095, "rewards/format_reward": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 3256.9166870117188, "epoch": 0.03942857142857143, "grad_norm": 0.14248953759670258, "kl": 0.20166015625, "learning_rate": 9.964516155915151e-05, "loss": 0.0011, "reward": -0.09149269759654999, "reward_std": 0.4294120315462351, "rewards/cosine_scaled_reward": -0.1707463413476944, "rewards/format_reward": 0.25, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 3430.25, "epoch": 0.04, "grad_norm": 0.26481300592422485, "kl": 0.226318359375, "learning_rate": 9.960469931131939e-05, "loss": 0.061, "reward": -0.2762333448044956, "reward_std": 0.6956104636192322, "rewards/cosine_scaled_reward": -0.24228334799408913, "rewards/format_reward": 0.2083333358168602, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 2133.9583435058594, "epoch": 0.04057142857142857, "grad_norm": 0.55636066198349, "kl": 0.196533203125, "learning_rate": 9.956206309337068e-05, "loss": 0.1912, "reward": 0.6508506219834089, "reward_std": 0.7065669223666191, "rewards/cosine_scaled_reward": 0.012925267219543457, "rewards/format_reward": 0.6250000149011612, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 3280.0000610351562, "epoch": 0.04114285714285714, "grad_norm": 0.2733776569366455, "kl": 0.326416015625, "learning_rate": 9.951725498333448e-05, "loss": 0.089, "reward": 0.04937553819036111, "reward_std": 0.650901660323143, "rewards/cosine_scaled_reward": -0.10031222924590111, "rewards/format_reward": 0.2500000074505806, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 3007.9166870117188, "epoch": 0.04171428571428572, "grad_norm": 0.3959416151046753, "kl": 0.407958984375, "learning_rate": 9.947027716509488e-05, "loss": 0.1308, "reward": 0.21741341799497604, "reward_std": 0.7383521795272827, "rewards/cosine_scaled_reward": -0.16212662309408188, "rewards/format_reward": 0.5416666865348816, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 3514.2083740234375, "epoch": 0.04228571428571429, "grad_norm": 0.2642626464366913, "kl": 0.671875, "learning_rate": 9.942113192828445e-05, "loss": 0.0673, "reward": -0.7055738568305969, "reward_std": 0.5232572704553604, "rewards/cosine_scaled_reward": -0.39445359259843826, "rewards/format_reward": 0.0833333358168602, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 2784.0000610351562, "epoch": 0.04285714285714286, "grad_norm": 0.5180374383926392, "kl": 0.708984375, "learning_rate": 9.936982166817273e-05, "loss": 0.1735, "reward": 0.5747006963938475, "reward_std": 0.956249326467514, "rewards/cosine_scaled_reward": -0.06681633368134499, "rewards/format_reward": 0.7083333507180214, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3248.041748046875, "epoch": 0.04342857142857143, "grad_norm": 0.46085768938064575, "kl": 0.947265625, "learning_rate": 9.931634888554937e-05, "loss": 0.1319, "reward": 0.21069841086864471, "reward_std": 0.9020620584487915, "rewards/cosine_scaled_reward": -0.2071508066728711, "rewards/format_reward": 0.6250000149011612, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 2514.75, "epoch": 0.044, "grad_norm": 0.5324290990829468, "kl": 1.1328125, "learning_rate": 9.926071618660238e-05, "loss": 0.0618, "reward": 0.7419831641018391, "reward_std": 0.5870551839470863, "rewards/cosine_scaled_reward": -0.08734174817800522, "rewards/format_reward": 0.9166666716337204, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 2806.041748046875, "epoch": 0.044571428571428574, "grad_norm": 0.5151541233062744, "kl": 0.9423828125, "learning_rate": 9.920292628279099e-05, "loss": 0.2011, "reward": 1.1562666706740856, "reward_std": 0.9286830723285675, "rewards/cosine_scaled_reward": 0.14063331112265587, "rewards/format_reward": 0.8750000149011612, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 2392.416748046875, "epoch": 0.045142857142857144, "grad_norm": 0.38864853978157043, "kl": 0.93310546875, "learning_rate": 9.914298199071362e-05, "loss": 0.0511, "reward": 0.5092507172375917, "reward_std": 0.6603717654943466, "rewards/cosine_scaled_reward": -0.16204129718244076, "rewards/format_reward": 0.8333333432674408, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 2799.0001220703125, "epoch": 0.045714285714285714, "grad_norm": 0.33745047450065613, "kl": 1.154296875, "learning_rate": 9.908088623197048e-05, "loss": 0.1739, "reward": 0.49583832919597626, "reward_std": 0.5688673853874207, "rewards/cosine_scaled_reward": -0.21041417494416237, "rewards/format_reward": 0.9166666865348816, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 2742.9584350585938, "epoch": 0.046285714285714284, "grad_norm": 0.6719815135002136, "kl": 1.2353515625, "learning_rate": 9.901664203302126e-05, "loss": 0.0793, "reward": 0.8690863847732544, "reward_std": 0.67031354829669, "rewards/cosine_scaled_reward": -0.04462350904941559, "rewards/format_reward": 0.9583333432674408, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 2788.166748046875, "epoch": 0.046857142857142854, "grad_norm": 0.5158824920654297, "kl": 0.869140625, "learning_rate": 9.895025252503756e-05, "loss": 0.0128, "reward": 0.9047548621892929, "reward_std": 0.6626572608947754, "rewards/cosine_scaled_reward": -0.04762259125709534, "rewards/format_reward": 1.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1382.4583587646484, "epoch": 0.04742857142857143, "grad_norm": 0.27055126428604126, "kl": 0.299072265625, "learning_rate": 9.888172094375034e-05, "loss": 0.0848, "reward": 1.0674683526158333, "reward_std": 0.4278734102845192, "rewards/cosine_scaled_reward": 0.03373415768146515, "rewards/format_reward": 1.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2475.666717529297, "epoch": 0.048, "grad_norm": 0.20611289143562317, "kl": 0.50341796875, "learning_rate": 9.881105062929221e-05, "loss": 0.0433, "reward": 1.037320300936699, "reward_std": 1.0173222571611404, "rewards/cosine_scaled_reward": 0.08116012637037784, "rewards/format_reward": 0.8750000149011612, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 2356.3333740234375, "epoch": 0.04857142857142857, "grad_norm": 0.25637194514274597, "kl": 0.556884765625, "learning_rate": 9.87382450260346e-05, "loss": 0.0248, "reward": 1.1020738258957863, "reward_std": 0.6897303387522697, "rewards/cosine_scaled_reward": 0.05103694926947355, "rewards/format_reward": 1.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 3392.2501220703125, "epoch": 0.04914285714285714, "grad_norm": 0.18801869451999664, "kl": 0.587890625, "learning_rate": 9.866330768241984e-05, "loss": 0.0602, "reward": 0.8699261844158173, "reward_std": 1.160498969256878, "rewards/cosine_scaled_reward": 0.08079641312360764, "rewards/format_reward": 0.7083333507180214, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 3220.041748046875, "epoch": 0.04971428571428571, "grad_norm": 0.238076850771904, "kl": 0.57275390625, "learning_rate": 9.858624225078841e-05, "loss": 0.0654, "reward": 0.5753591060638428, "reward_std": 0.5094204191118479, "rewards/cosine_scaled_reward": -0.045653849840164185, "rewards/format_reward": 0.6666666865348816, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 2765.9583740234375, "epoch": 0.05028571428571429, "grad_norm": 0.1997767835855484, "kl": 0.48583984375, "learning_rate": 9.850705248720069e-05, "loss": 0.0971, "reward": 0.47976822033524513, "reward_std": 0.6232990622520447, "rewards/cosine_scaled_reward": -0.17678256519138813, "rewards/format_reward": 0.8333333358168602, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 3000.875030517578, "epoch": 0.05085714285714286, "grad_norm": 0.189803808927536, "kl": 0.403564453125, "learning_rate": 9.842574225125401e-05, "loss": 0.017, "reward": 0.1426578164100647, "reward_std": 0.8176284991204739, "rewards/cosine_scaled_reward": -0.13700442761182785, "rewards/format_reward": 0.4166666716337204, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2594.2084350585938, "epoch": 0.05142857142857143, "grad_norm": 0.44425904750823975, "kl": 0.220947265625, "learning_rate": 9.834231550589462e-05, "loss": 0.0654, "reward": 0.7554673850536346, "reward_std": 1.3941691219806671, "rewards/cosine_scaled_reward": 0.04440037161111832, "rewards/format_reward": 0.666666679084301, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 3254.916748046875, "epoch": 0.052, "grad_norm": 0.19816897809505463, "kl": 0.38525390625, "learning_rate": 9.825677631722435e-05, "loss": 0.0586, "reward": 0.5828766226768494, "reward_std": 0.8098243772983551, "rewards/cosine_scaled_reward": -0.06272834818810225, "rewards/format_reward": 0.7083333507180214, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 2826.291717529297, "epoch": 0.052571428571428575, "grad_norm": 0.21156969666481018, "kl": 0.36474609375, "learning_rate": 9.816912885430258e-05, "loss": 0.0932, "reward": 0.22245215624570847, "reward_std": 0.7063884437084198, "rewards/cosine_scaled_reward": -0.20127389580011368, "rewards/format_reward": 0.625, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 2395.6666870117188, "epoch": 0.053142857142857144, "grad_norm": 0.2171396166086197, "kl": 0.26220703125, "learning_rate": 9.807937738894303e-05, "loss": 0.0753, "reward": 0.8515213429927826, "reward_std": 0.9921551188454032, "rewards/cosine_scaled_reward": 0.11326067708432674, "rewards/format_reward": 0.625, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 3411.2500610351562, "epoch": 0.053714285714285714, "grad_norm": 0.27403607964515686, "kl": 0.39208984375, "learning_rate": 9.798752629550546e-05, "loss": 0.0915, "reward": -0.09905853308737278, "reward_std": 0.7048804685473442, "rewards/cosine_scaled_reward": -0.2578625986352563, "rewards/format_reward": 0.416666679084301, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 3139.2501220703125, "epoch": 0.054285714285714284, "grad_norm": 0.2699209153652191, "kl": 0.39892578125, "learning_rate": 9.789358005068262e-05, "loss": -0.0192, "reward": 0.33204662054777145, "reward_std": 0.8725630715489388, "rewards/cosine_scaled_reward": -0.16731002740561962, "rewards/format_reward": 0.666666679084301, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3406.0833740234375, "epoch": 0.054857142857142854, "grad_norm": 0.2241956889629364, "kl": 0.4150390625, "learning_rate": 9.779754323328192e-05, "loss": 0.0288, "reward": 0.29199653305113316, "reward_std": 0.6021534074097872, "rewards/cosine_scaled_reward": -0.10400174837559462, "rewards/format_reward": 0.5, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 2894.8333435058594, "epoch": 0.05542857142857143, "grad_norm": 0.20786413550376892, "kl": 0.424560546875, "learning_rate": 9.769942052400235e-05, "loss": 0.0021, "reward": 0.17632445320487022, "reward_std": 0.2286351751536131, "rewards/cosine_scaled_reward": -0.16183778084814548, "rewards/format_reward": 0.5, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 2208.1666870117188, "epoch": 0.056, "grad_norm": 0.20279790461063385, "kl": 0.3033447265625, "learning_rate": 9.759921670520634e-05, "loss": 0.0457, "reward": 1.092903109267354, "reward_std": 0.9870968163013458, "rewards/cosine_scaled_reward": 0.08811822533607483, "rewards/format_reward": 0.9166666716337204, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 2346.5833587646484, "epoch": 0.05657142857142857, "grad_norm": 0.3798772394657135, "kl": 0.329345703125, "learning_rate": 9.749693666068664e-05, "loss": 0.1025, "reward": 0.7225791215896606, "reward_std": 0.9258620589971542, "rewards/cosine_scaled_reward": -0.03454381600022316, "rewards/format_reward": 0.791666679084301, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1839.2500610351562, "epoch": 0.05714285714285714, "grad_norm": 0.2397909164428711, "kl": 0.238525390625, "learning_rate": 9.739258537542835e-05, "loss": 0.0492, "reward": 1.0250511392951012, "reward_std": 0.7990377843379974, "rewards/cosine_scaled_reward": 0.012525551952421665, "rewards/format_reward": 1.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 3216.58349609375, "epoch": 0.05771428571428571, "grad_norm": 0.22909773886203766, "kl": 0.4794921875, "learning_rate": 9.728616793536588e-05, "loss": 0.0818, "reward": 0.23036185838282108, "reward_std": 0.5441985353827477, "rewards/cosine_scaled_reward": -0.23898574337363243, "rewards/format_reward": 0.7083333358168602, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 3036.4583740234375, "epoch": 0.05828571428571429, "grad_norm": 0.22612375020980835, "kl": 0.576416015625, "learning_rate": 9.717768952713513e-05, "loss": 0.0668, "reward": 0.42826264537870884, "reward_std": 0.4419846907258034, "rewards/cosine_scaled_reward": -0.18170202150940895, "rewards/format_reward": 0.7916666716337204, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 3196.041748046875, "epoch": 0.05885714285714286, "grad_norm": 0.32880789041519165, "kl": 0.55517578125, "learning_rate": 9.706715543782064e-05, "loss": 0.0338, "reward": 0.6674360632896423, "reward_std": 0.6189832799136639, "rewards/cosine_scaled_reward": -0.12461531162261963, "rewards/format_reward": 0.9166666865348816, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 2930.5833740234375, "epoch": 0.05942857142857143, "grad_norm": 0.2633622884750366, "kl": 0.53759765625, "learning_rate": 9.695457105469806e-05, "loss": 0.066, "reward": 0.7318950295448303, "reward_std": 0.4617820382118225, "rewards/cosine_scaled_reward": -0.13405249640345573, "rewards/format_reward": 1.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2902.7084350585938, "epoch": 0.06, "grad_norm": 0.4247024953365326, "kl": 0.486328125, "learning_rate": 9.683994186497132e-05, "loss": 0.0147, "reward": 0.2280621938407421, "reward_std": 0.4256502091884613, "rewards/cosine_scaled_reward": -0.2401355840265751, "rewards/format_reward": 0.7083333544433117, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 2954.041717529297, "epoch": 0.060571428571428575, "grad_norm": 0.350850909948349, "kl": 0.587646484375, "learning_rate": 9.672327345550543e-05, "loss": 0.0095, "reward": 0.3893072069622576, "reward_std": 0.7964539304375648, "rewards/cosine_scaled_reward": -0.09701308235526085, "rewards/format_reward": 0.5833333395421505, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 2843.166717529297, "epoch": 0.061142857142857145, "grad_norm": 0.2774275839328766, "kl": 0.3834228515625, "learning_rate": 9.66045715125541e-05, "loss": 0.0528, "reward": 0.12887566909193993, "reward_std": 0.21474172547459602, "rewards/cosine_scaled_reward": -0.22722883895039558, "rewards/format_reward": 0.5833333358168602, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 2895.0000610351562, "epoch": 0.061714285714285715, "grad_norm": 0.3574961721897125, "kl": 0.34130859375, "learning_rate": 9.648384182148252e-05, "loss": -0.0363, "reward": 0.0623416006565094, "reward_std": 0.5782450139522552, "rewards/cosine_scaled_reward": -0.3021625205874443, "rewards/format_reward": 0.6666666679084301, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 3446.9583740234375, "epoch": 0.062285714285714285, "grad_norm": 0.3407225012779236, "kl": 0.269287109375, "learning_rate": 9.636109026648555e-05, "loss": 0.0503, "reward": 0.2119973637163639, "reward_std": 0.6676298193633556, "rewards/cosine_scaled_reward": -0.08150134235620499, "rewards/format_reward": 0.3750000111758709, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 3102.0, "epoch": 0.06285714285714286, "grad_norm": 0.36664795875549316, "kl": 0.21533203125, "learning_rate": 9.623632283030079e-05, "loss": -0.0268, "reward": -0.14389869943261147, "reward_std": 0.4256294723600149, "rewards/cosine_scaled_reward": -0.21778268064372241, "rewards/format_reward": 0.2916666679084301, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2798.1667098999023, "epoch": 0.06342857142857143, "grad_norm": 0.19732221961021423, "kl": 0.21051025390625, "learning_rate": 9.610954559391703e-05, "loss": 0.0356, "reward": -0.028341025114059448, "reward_std": 0.41592887230217457, "rewards/cosine_scaled_reward": -0.16000381857156754, "rewards/format_reward": 0.2916666679084301, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 3370.5416870117188, "epoch": 0.064, "grad_norm": 0.23629334568977356, "kl": 0.22021484375, "learning_rate": 9.598076473627798e-05, "loss": 0.0157, "reward": -0.1737063229084015, "reward_std": 0.5679741557687521, "rewards/cosine_scaled_reward": -0.27435317635536194, "rewards/format_reward": 0.375, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 3097.250030517578, "epoch": 0.06457142857142857, "grad_norm": 0.20911185443401337, "kl": 0.1754150390625, "learning_rate": 9.58499865339809e-05, "loss": -0.0049, "reward": -0.31149674439802766, "reward_std": 0.4890762511640787, "rewards/cosine_scaled_reward": -0.3432483784854412, "rewards/format_reward": 0.375, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 3010.5834045410156, "epoch": 0.06514285714285714, "grad_norm": 0.1976766437292099, "kl": 0.11474609375, "learning_rate": 9.571721736097089e-05, "loss": -0.0025, "reward": -0.04094894975423813, "reward_std": 0.6383696794509888, "rewards/cosine_scaled_reward": -0.22880780510604382, "rewards/format_reward": 0.4166666716337204, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2820.5834350585938, "epoch": 0.06571428571428571, "grad_norm": 0.13228124380111694, "kl": 0.1092529296875, "learning_rate": 9.558246368823013e-05, "loss": 0.0317, "reward": 0.16390416398644447, "reward_std": 0.8831267654895782, "rewards/cosine_scaled_reward": -0.23054791847243905, "rewards/format_reward": 0.6250000149011612, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 3487.75, "epoch": 0.06628571428571428, "grad_norm": 0.13215263187885284, "kl": 0.14013671875, "learning_rate": 9.544573208346253e-05, "loss": 0.0268, "reward": -0.4231237219646573, "reward_std": 0.5401189308613539, "rewards/cosine_scaled_reward": -0.2948951981961727, "rewards/format_reward": 0.1666666679084301, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 3039.0833435058594, "epoch": 0.06685714285714285, "grad_norm": 0.16131699085235596, "kl": 0.1080322265625, "learning_rate": 9.530702921077358e-05, "loss": -0.0214, "reward": 0.22934666275978088, "reward_std": 0.5350189581513405, "rewards/cosine_scaled_reward": -0.03115999698638916, "rewards/format_reward": 0.2916666679084301, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 2498.0000915527344, "epoch": 0.06742857142857143, "grad_norm": 0.3315119743347168, "kl": 0.0802001953125, "learning_rate": 9.516636183034565e-05, "loss": 0.0695, "reward": 0.9933711041230708, "reward_std": 1.1258303076028824, "rewards/cosine_scaled_reward": 0.14251888822764158, "rewards/format_reward": 0.7083333432674408, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2174.25, "epoch": 0.068, "grad_norm": 0.4764270484447479, "kl": 0.1552734375, "learning_rate": 9.50237367981084e-05, "loss": -0.011, "reward": 0.7270113378763199, "reward_std": 0.5169851435348392, "rewards/cosine_scaled_reward": 0.11350566893815994, "rewards/format_reward": 0.5, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 3331.7084350585938, "epoch": 0.06857142857142857, "grad_norm": 0.23218177258968353, "kl": 0.08642578125, "learning_rate": 9.487916106540466e-05, "loss": 0.0861, "reward": 0.3240978792309761, "reward_std": 0.4474998824298382, "rewards/cosine_scaled_reward": -0.004617743194103241, "rewards/format_reward": 0.3333333358168602, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2834.0416717529297, "epoch": 0.06914285714285714, "grad_norm": 0.11969159543514252, "kl": 0.096771240234375, "learning_rate": 9.473264167865173e-05, "loss": 0.0116, "reward": -0.09953830391168594, "reward_std": 0.4128492996096611, "rewards/cosine_scaled_reward": -0.17476914450526237, "rewards/format_reward": 0.25, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 2239.7084045410156, "epoch": 0.06971428571428571, "grad_norm": 0.13540911674499512, "kl": 0.0828094482421875, "learning_rate": 9.458418577899775e-05, "loss": -0.0462, "reward": 1.4352559298276901, "reward_std": 0.4657023213803768, "rewards/cosine_scaled_reward": 0.34262790158391, "rewards/format_reward": 0.75, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 3056.8750610351562, "epoch": 0.07028571428571428, "grad_norm": 0.3198010325431824, "kl": 0.1041259765625, "learning_rate": 9.443380060197387e-05, "loss": 0.0968, "reward": 0.48675229772925377, "reward_std": 0.8716996815055609, "rewards/cosine_scaled_reward": -0.0482905525714159, "rewards/format_reward": 0.5833333432674408, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 3280.5833740234375, "epoch": 0.07085714285714285, "grad_norm": 0.12906375527381897, "kl": 0.1143798828125, "learning_rate": 9.428149347714143e-05, "loss": 0.0243, "reward": 0.5198042392730713, "reward_std": 0.2846983075141907, "rewards/cosine_scaled_reward": 0.00990208238363266, "rewards/format_reward": 0.5, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 2674.8333740234375, "epoch": 0.07142857142857142, "grad_norm": 0.21717137098312378, "kl": 0.11065673828125, "learning_rate": 9.412727182773487e-05, "loss": -0.0629, "reward": 0.5749734938144684, "reward_std": 1.0196832083165646, "rewards/cosine_scaled_reward": 0.01665341481566429, "rewards/format_reward": 0.5416666716337204, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 2071.125045776367, "epoch": 0.072, "grad_norm": 0.1411234438419342, "kl": 0.0943756103515625, "learning_rate": 9.397114317029975e-05, "loss": 0.0888, "reward": 0.553262030123733, "reward_std": 0.7625375427305698, "rewards/cosine_scaled_reward": 0.005797676742076874, "rewards/format_reward": 0.5416666679084301, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 3094.000030517578, "epoch": 0.07257142857142856, "grad_norm": 0.41723886132240295, "kl": 0.1458740234375, "learning_rate": 9.381311511432659e-05, "loss": 0.1064, "reward": 0.19710233807563782, "reward_std": 0.8968651816248894, "rewards/cosine_scaled_reward": -0.06811549002304673, "rewards/format_reward": 0.3333333432674408, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3223.4583740234375, "epoch": 0.07314285714285715, "grad_norm": 0.4333489239215851, "kl": 0.151611328125, "learning_rate": 9.36531953618799e-05, "loss": 0.1332, "reward": 0.1836181916296482, "reward_std": 0.7244069799780846, "rewards/cosine_scaled_reward": -0.05402424931526184, "rewards/format_reward": 0.2916666753590107, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 2923.3334350585938, "epoch": 0.07371428571428572, "grad_norm": 0.40884310007095337, "kl": 0.17724609375, "learning_rate": 9.349139170722281e-05, "loss": 0.0865, "reward": 0.1987609639763832, "reward_std": 0.6052179206162691, "rewards/cosine_scaled_reward": -0.1506195142865181, "rewards/format_reward": 0.5000000111758709, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 3203.291748046875, "epoch": 0.07428571428571429, "grad_norm": 0.38833317160606384, "kl": 0.321533203125, "learning_rate": 9.332771203643715e-05, "loss": 0.1088, "reward": 0.03225439786911011, "reward_std": 0.9268941730260849, "rewards/cosine_scaled_reward": -0.15053946431726217, "rewards/format_reward": 0.3333333432674408, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 2999.166717529297, "epoch": 0.07485714285714286, "grad_norm": 0.22766365110874176, "kl": 0.297119140625, "learning_rate": 9.316216432703917e-05, "loss": 0.022, "reward": 0.0023300140164792538, "reward_std": 0.5885078124701977, "rewards/cosine_scaled_reward": -0.2488350011408329, "rewards/format_reward": 0.5000000223517418, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 2991.4584350585938, "epoch": 0.07542857142857143, "grad_norm": 0.22822511196136475, "kl": 0.53076171875, "learning_rate": 9.299475664759069e-05, "loss": 0.0249, "reward": 0.4181240275502205, "reward_std": 0.45995941013097763, "rewards/cosine_scaled_reward": -0.22843801230192184, "rewards/format_reward": 0.8750000149011612, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 2522.5000610351562, "epoch": 0.076, "grad_norm": 0.6402227282524109, "kl": 0.5380859375, "learning_rate": 9.28254971573058e-05, "loss": 0.2187, "reward": 1.112146245315671, "reward_std": 0.6289810538291931, "rewards/cosine_scaled_reward": 0.13940642774105072, "rewards/format_reward": 0.8333333432674408, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 2573.33349609375, "epoch": 0.07657142857142857, "grad_norm": 0.34855917096138, "kl": 0.744140625, "learning_rate": 9.265439410565329e-05, "loss": 0.1362, "reward": 0.5219381079077721, "reward_std": 0.6780424751341343, "rewards/cosine_scaled_reward": -0.13486428651958704, "rewards/format_reward": 0.7916666716337204, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2322.7500915527344, "epoch": 0.07714285714285714, "grad_norm": 0.3246491551399231, "kl": 0.55712890625, "learning_rate": 9.248145583195448e-05, "loss": 0.1347, "reward": 0.7710914388298988, "reward_std": 0.5349869206547737, "rewards/cosine_scaled_reward": -0.051954299211502075, "rewards/format_reward": 0.8750000149011612, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1618.375015258789, "epoch": 0.07771428571428571, "grad_norm": 0.2646270990371704, "kl": 0.3037109375, "learning_rate": 9.230669076497688e-05, "loss": 0.0348, "reward": 0.6177230253815651, "reward_std": 0.5647286213934422, "rewards/cosine_scaled_reward": -0.17030514776706696, "rewards/format_reward": 0.9583333432674408, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 2437.125045776367, "epoch": 0.07828571428571429, "grad_norm": 0.3581116199493408, "kl": 0.67254638671875, "learning_rate": 9.213010742252328e-05, "loss": 0.1788, "reward": 0.2514616549015045, "reward_std": 0.8865174166858196, "rewards/cosine_scaled_reward": -0.12426918093115091, "rewards/format_reward": 0.5000000074505806, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2302.041717529297, "epoch": 0.07885714285714286, "grad_norm": 0.5932723879814148, "kl": 0.683349609375, "learning_rate": 9.195171441101669e-05, "loss": 0.2266, "reward": 0.48782986029982567, "reward_std": 0.6536561995744705, "rewards/cosine_scaled_reward": -0.11025174707174301, "rewards/format_reward": 0.7083333432674408, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 2605.3334350585938, "epoch": 0.07942857142857143, "grad_norm": 0.3788131773471832, "kl": 1.0732421875, "learning_rate": 9.177152042508078e-05, "loss": 0.1714, "reward": 0.49786752462387085, "reward_std": 0.7835120111703873, "rewards/cosine_scaled_reward": -0.06356624886393547, "rewards/format_reward": 0.6250000223517418, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 2648.7084350585938, "epoch": 0.08, "grad_norm": 0.9030252695083618, "kl": 1.1220703125, "learning_rate": 9.158953424711625e-05, "loss": 0.1234, "reward": 0.28087351471185684, "reward_std": 0.7202268093824387, "rewards/cosine_scaled_reward": -0.13039657729677856, "rewards/format_reward": 0.541666679084301, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 2024.4167175292969, "epoch": 0.08057142857142857, "grad_norm": 1.2033867835998535, "kl": 0.7147216796875, "learning_rate": 9.140576474687264e-05, "loss": -0.0838, "reward": 0.22873285971581936, "reward_std": 0.2747483551502228, "rewards/cosine_scaled_reward": -0.17730024084448814, "rewards/format_reward": 0.5833333358168602, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1909.3334045410156, "epoch": 0.08114285714285714, "grad_norm": 0.3704441487789154, "kl": 0.80645751953125, "learning_rate": 9.122022088101614e-05, "loss": 0.0942, "reward": 0.4351644292473793, "reward_std": 0.7959753908216953, "rewards/cosine_scaled_reward": -0.1574177942238748, "rewards/format_reward": 0.7500000074505806, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2298.6250915527344, "epoch": 0.08171428571428571, "grad_norm": 0.2989516854286194, "kl": 0.9326171875, "learning_rate": 9.1032911692693e-05, "loss": 0.2218, "reward": 0.35630420781672, "reward_std": 0.6438803896307945, "rewards/cosine_scaled_reward": -0.15518124029040337, "rewards/format_reward": 0.666666679084301, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1725.2500305175781, "epoch": 0.08228571428571428, "grad_norm": 0.36004939675331116, "kl": 0.5740966796875, "learning_rate": 9.084384631108883e-05, "loss": 0.1278, "reward": 0.6938553377985954, "reward_std": 0.5497538670897484, "rewards/cosine_scaled_reward": -0.04890568554401398, "rewards/format_reward": 0.7916666716337204, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1852.3750610351562, "epoch": 0.08285714285714285, "grad_norm": 0.6046501398086548, "kl": 0.70703125, "learning_rate": 9.065303395098359e-05, "loss": 0.0028, "reward": 0.6369144171476364, "reward_std": 0.5210181921720505, "rewards/cosine_scaled_reward": -0.1398761412128806, "rewards/format_reward": 0.9166666865348816, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2484.916748046875, "epoch": 0.08342857142857144, "grad_norm": 0.2475971132516861, "kl": 0.73388671875, "learning_rate": 9.046048391230248e-05, "loss": 0.0756, "reward": 0.7248256802558899, "reward_std": 0.6479251198470592, "rewards/cosine_scaled_reward": -0.09592050686478615, "rewards/format_reward": 0.9166666865348816, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 2398.166748046875, "epoch": 0.084, "grad_norm": 0.6569065451622009, "kl": 0.87890625, "learning_rate": 9.02662055796628e-05, "loss": -0.0233, "reward": 0.8563184477388859, "reward_std": 0.6260966360569, "rewards/cosine_scaled_reward": 0.07399253733456135, "rewards/format_reward": 0.7083333507180214, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 2529.291748046875, "epoch": 0.08457142857142858, "grad_norm": 0.41076308488845825, "kl": 0.578125, "learning_rate": 9.007020842191635e-05, "loss": 0.0031, "reward": 0.7412142492830753, "reward_std": 0.47748080641031265, "rewards/cosine_scaled_reward": -0.10855956003069878, "rewards/format_reward": 0.9583333432674408, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 3347.4166870117188, "epoch": 0.08514285714285715, "grad_norm": 0.16827178001403809, "kl": 0.52197265625, "learning_rate": 8.987250199168808e-05, "loss": 0.0373, "reward": 0.504796092864126, "reward_std": 0.5495826080441475, "rewards/cosine_scaled_reward": -0.03926862310618162, "rewards/format_reward": 0.5833333358168602, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 2754.791717529297, "epoch": 0.08571428571428572, "grad_norm": 0.531985342502594, "kl": 0.2252197265625, "learning_rate": 8.967309592491052e-05, "loss": -0.098, "reward": 0.4745659134350717, "reward_std": 0.4885084852576256, "rewards/cosine_scaled_reward": -0.03355037793517113, "rewards/format_reward": 0.5416666679084301, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 2840.7500915527344, "epoch": 0.08628571428571429, "grad_norm": 0.1887764185667038, "kl": 0.278228759765625, "learning_rate": 8.947199994035401e-05, "loss": 0.0466, "reward": 0.10480327904224396, "reward_std": 0.890035405755043, "rewards/cosine_scaled_reward": -0.2600983753800392, "rewards/format_reward": 0.6250000037252903, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 3203.3750610351562, "epoch": 0.08685714285714285, "grad_norm": 0.111234150826931, "kl": 0.203857421875, "learning_rate": 8.926922383915316e-05, "loss": 0.0474, "reward": 0.10628402233123779, "reward_std": 0.44779110699892044, "rewards/cosine_scaled_reward": -0.11352465860545635, "rewards/format_reward": 0.3333333358168602, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 3050.9166870117188, "epoch": 0.08742857142857142, "grad_norm": 0.2246592491865158, "kl": 0.193359375, "learning_rate": 8.906477750432904e-05, "loss": -0.0286, "reward": 0.7106999894604087, "reward_std": 0.9839373230934143, "rewards/cosine_scaled_reward": 0.12618330994155258, "rewards/format_reward": 0.4583333395421505, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 3488.1250610351562, "epoch": 0.088, "grad_norm": 0.299188494682312, "kl": 0.29833984375, "learning_rate": 8.885867090030761e-05, "loss": 0.0391, "reward": -0.033790960907936096, "reward_std": 0.9646570086479187, "rewards/cosine_scaled_reward": -0.12106215953826904, "rewards/format_reward": 0.2083333395421505, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3418.541748046875, "epoch": 0.08857142857142856, "grad_norm": 0.15048454701900482, "kl": 0.1529541015625, "learning_rate": 8.865091407243394e-05, "loss": 0.0736, "reward": -0.3656083308160305, "reward_std": 0.6392169110476971, "rewards/cosine_scaled_reward": -0.2869708426296711, "rewards/format_reward": 0.2083333395421505, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 2060.1250610351562, "epoch": 0.08914285714285715, "grad_norm": 0.22759364545345306, "kl": 0.10736083984375, "learning_rate": 8.844151714648274e-05, "loss": 0.0749, "reward": 1.4114615470170975, "reward_std": 0.6891137436032295, "rewards/cosine_scaled_reward": 0.24739742651581764, "rewards/format_reward": 0.9166666716337204, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2715.7083740234375, "epoch": 0.08971428571428572, "grad_norm": 0.1746428906917572, "kl": 0.19403076171875, "learning_rate": 8.823049032816479e-05, "loss": 0.104, "reward": -0.2559436559677124, "reward_std": 0.42704229010269046, "rewards/cosine_scaled_reward": -0.3363051738124341, "rewards/format_reward": 0.4166666716337204, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 2920.7500610351562, "epoch": 0.09028571428571429, "grad_norm": 0.1731269210577011, "kl": 0.09033203125, "learning_rate": 8.801784390262944e-05, "loss": 0.0437, "reward": 0.4546380043029785, "reward_std": 0.8263214789330959, "rewards/cosine_scaled_reward": -0.08518100716173649, "rewards/format_reward": 0.6250000149011612, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3434.666748046875, "epoch": 0.09085714285714286, "grad_norm": 0.17278991639614105, "kl": 0.157958984375, "learning_rate": 8.780358823396352e-05, "loss": 0.0737, "reward": -0.6269425004720688, "reward_std": 0.387746115680784, "rewards/cosine_scaled_reward": -0.3968045935034752, "rewards/format_reward": 0.1666666716337204, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3551.75, "epoch": 0.09142857142857143, "grad_norm": 0.2092837542295456, "kl": 0.26220703125, "learning_rate": 8.758773376468606e-05, "loss": 0.0227, "reward": -0.5527655929327011, "reward_std": 0.5685544777661562, "rewards/cosine_scaled_reward": -0.31804945319890976, "rewards/format_reward": 0.0833333358168602, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 3314.8333740234375, "epoch": 0.092, "grad_norm": 0.18133918941020966, "kl": 0.1513671875, "learning_rate": 8.73702910152393e-05, "loss": 0.0928, "reward": 0.07757844589650631, "reward_std": 1.0933372657746077, "rewards/cosine_scaled_reward": -0.06537744263187051, "rewards/format_reward": 0.2083333395421505, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 3062.8750610351562, "epoch": 0.09257142857142857, "grad_norm": 0.18608345091342926, "kl": 0.1959228515625, "learning_rate": 8.715127058347615e-05, "loss": -0.0589, "reward": 0.19248175248503685, "reward_std": 0.7195432111620903, "rewards/cosine_scaled_reward": -0.09125912375748158, "rewards/format_reward": 0.3750000037252903, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 3412.6250610351562, "epoch": 0.09314285714285714, "grad_norm": 0.19522501528263092, "kl": 0.1484375, "learning_rate": 8.693068314414344e-05, "loss": 0.0603, "reward": -0.07296949997544289, "reward_std": 0.6726852059364319, "rewards/cosine_scaled_reward": -0.16148475895170122, "rewards/format_reward": 0.2500000037252903, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 3415.5833740234375, "epoch": 0.09371428571428571, "grad_norm": 0.16382084786891937, "kl": 0.23876953125, "learning_rate": 8.670853944836176e-05, "loss": 0.0688, "reward": 0.06261083483695984, "reward_std": 0.6757630333304405, "rewards/cosine_scaled_reward": -0.11452792584896088, "rewards/format_reward": 0.2916666716337204, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 3390.25, "epoch": 0.09428571428571429, "grad_norm": 0.13250745832920074, "kl": 0.18994140625, "learning_rate": 8.648485032310145e-05, "loss": 0.0562, "reward": -0.657595120370388, "reward_std": 0.1904697474092245, "rewards/cosine_scaled_reward": -0.4121309034526348, "rewards/format_reward": 0.1666666716337204, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 3296.9583740234375, "epoch": 0.09485714285714286, "grad_norm": 0.1458793729543686, "kl": 0.1710205078125, "learning_rate": 8.625962667065488e-05, "loss": 0.0482, "reward": 0.298754021525383, "reward_std": 0.43109990283846855, "rewards/cosine_scaled_reward": -0.0172896608710289, "rewards/format_reward": 0.3333333358168602, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 3031.4166870117188, "epoch": 0.09542857142857143, "grad_norm": 0.5354498624801636, "kl": 0.2393798828125, "learning_rate": 8.603287946810515e-05, "loss": 0.1521, "reward": 0.740845113992691, "reward_std": 1.2252501547336578, "rewards/cosine_scaled_reward": 0.12042254209518433, "rewards/format_reward": 0.5000000186264515, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 3194.541748046875, "epoch": 0.096, "grad_norm": 0.19526343047618866, "kl": 0.20654296875, "learning_rate": 8.5804619766791e-05, "loss": 0.0617, "reward": 0.026736490428447723, "reward_std": 0.7318253479897976, "rewards/cosine_scaled_reward": -0.1324650919996202, "rewards/format_reward": 0.2916666716337204, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 2889.2083435058594, "epoch": 0.09657142857142857, "grad_norm": 0.13189108669757843, "kl": 0.2628173828125, "learning_rate": 8.557485869176826e-05, "loss": 0.0467, "reward": 0.3621276989579201, "reward_std": 0.5554591603577137, "rewards/cosine_scaled_reward": -0.08976950496435165, "rewards/format_reward": 0.5416666679084301, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 3473.7916870117188, "epoch": 0.09714285714285714, "grad_norm": 0.17061442136764526, "kl": 0.42626953125, "learning_rate": 8.534360744126755e-05, "loss": 0.0345, "reward": 0.26488298177719116, "reward_std": 0.6278351005166769, "rewards/cosine_scaled_reward": -0.11755852587521076, "rewards/format_reward": 0.5000000149011612, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 3441.2918090820312, "epoch": 0.09771428571428571, "grad_norm": 0.2095242440700531, "kl": 0.5634765625, "learning_rate": 8.511087728614862e-05, "loss": 0.0491, "reward": 0.1459126800764352, "reward_std": 0.7411493808031082, "rewards/cosine_scaled_reward": -0.2603769972920418, "rewards/format_reward": 0.6666666865348816, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 3050.1250610351562, "epoch": 0.09828571428571428, "grad_norm": 0.3047703504562378, "kl": 0.354248046875, "learning_rate": 8.487667956935088e-05, "loss": 0.1551, "reward": 0.09439549967646599, "reward_std": 0.3907792381942272, "rewards/cosine_scaled_reward": -0.244468929246068, "rewards/format_reward": 0.5833333358168602, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 3115.4583740234375, "epoch": 0.09885714285714285, "grad_norm": 0.22464460134506226, "kl": 0.40625, "learning_rate": 8.464102570534061e-05, "loss": 0.0295, "reward": 0.10012460593134165, "reward_std": 0.3750727055594325, "rewards/cosine_scaled_reward": -0.17910437239333987, "rewards/format_reward": 0.4583333432674408, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 3400.25, "epoch": 0.09942857142857142, "grad_norm": 0.17063897848129272, "kl": 0.4248046875, "learning_rate": 8.440392717955476e-05, "loss": 0.0536, "reward": 0.10912856008508243, "reward_std": 0.19460038893157616, "rewards/cosine_scaled_reward": -0.11210238412604667, "rewards/format_reward": 0.3333333432674408, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 2890.0000610351562, "epoch": 0.1, "grad_norm": 0.2909056842327118, "kl": 0.73583984375, "learning_rate": 8.416539554784089e-05, "loss": 0.1191, "reward": 0.5186500549316406, "reward_std": 0.5995163694024086, "rewards/cosine_scaled_reward": -0.13650833815336227, "rewards/format_reward": 0.7916666865348816, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 3099.3333740234375, "epoch": 0.10057142857142858, "grad_norm": 0.38485434651374817, "kl": 0.67333984375, "learning_rate": 8.392544243589427e-05, "loss": 0.0273, "reward": 0.32382382079958916, "reward_std": 0.3924727290868759, "rewards/cosine_scaled_reward": -0.2339214440435171, "rewards/format_reward": 0.791666679084301, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 2768.291748046875, "epoch": 0.10114285714285715, "grad_norm": 0.3035656213760376, "kl": 0.539306640625, "learning_rate": 8.368407953869104e-05, "loss": 0.0805, "reward": 1.056127205491066, "reward_std": 0.9225019067525864, "rewards/cosine_scaled_reward": 0.02806359902024269, "rewards/format_reward": 1.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 3217.791748046875, "epoch": 0.10171428571428572, "grad_norm": 0.38410308957099915, "kl": 0.7138671875, "learning_rate": 8.34413186199183e-05, "loss": 0.0429, "reward": 0.2634657397866249, "reward_std": 0.4688725695014, "rewards/cosine_scaled_reward": -0.2432671394199133, "rewards/format_reward": 0.7500000149011612, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 2783.1250610351562, "epoch": 0.10228571428571429, "grad_norm": 0.2818540632724762, "kl": 0.532958984375, "learning_rate": 8.319717151140073e-05, "loss": 0.0169, "reward": 1.0860532075166702, "reward_std": 0.5036949962377548, "rewards/cosine_scaled_reward": 0.08469325304031372, "rewards/format_reward": 0.9166666865348816, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 3131.8333740234375, "epoch": 0.10285714285714286, "grad_norm": 0.1775483340024948, "kl": 0.7109375, "learning_rate": 8.295165011252397e-05, "loss": 0.0516, "reward": 0.6567038595676422, "reward_std": 0.8435460180044174, "rewards/cosine_scaled_reward": -0.10914808511734009, "rewards/format_reward": 0.8750000298023224, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 3148.1250610351562, "epoch": 0.10342857142857143, "grad_norm": 0.233117014169693, "kl": 0.7314453125, "learning_rate": 8.270476638965462e-05, "loss": 0.0481, "reward": 0.3657357878983021, "reward_std": 0.47509450232610106, "rewards/cosine_scaled_reward": -0.17129878513514996, "rewards/format_reward": 0.7083333432674408, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 3470.4166870117188, "epoch": 0.104, "grad_norm": 0.3096827268600464, "kl": 0.49169921875, "learning_rate": 8.245653237555706e-05, "loss": 0.0023, "reward": 1.0028938204050064, "reward_std": 0.3635401166975498, "rewards/cosine_scaled_reward": 0.10561357298865914, "rewards/format_reward": 0.7916666716337204, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 3170.916748046875, "epoch": 0.10457142857142857, "grad_norm": 0.4448356032371521, "kl": 0.68212890625, "learning_rate": 8.220696016880688e-05, "loss": 0.0353, "reward": 0.3080389183014631, "reward_std": 0.3628546576946974, "rewards/cosine_scaled_reward": -0.20014722365885973, "rewards/format_reward": 0.7083333432674408, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 2907.2500610351562, "epoch": 0.10514285714285715, "grad_norm": 0.2639898657798767, "kl": 0.619140625, "learning_rate": 8.195606193320136e-05, "loss": 0.112, "reward": 0.39144587703049183, "reward_std": 0.5591121315956116, "rewards/cosine_scaled_reward": -0.17927706986665726, "rewards/format_reward": 0.7500000111758709, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2645.5833740234375, "epoch": 0.10571428571428572, "grad_norm": 0.3939441442489624, "kl": 0.41021728515625, "learning_rate": 8.170384989716657e-05, "loss": 0.0815, "reward": 0.39359497651457787, "reward_std": 0.30915623903274536, "rewards/cosine_scaled_reward": -0.11570251337252557, "rewards/format_reward": 0.6250000111758709, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 3049.291748046875, "epoch": 0.10628571428571429, "grad_norm": 0.180860236287117, "kl": 0.39794921875, "learning_rate": 8.14503363531613e-05, "loss": 0.0304, "reward": 0.6096877008676529, "reward_std": 0.5758876949548721, "rewards/cosine_scaled_reward": -0.04932282119989395, "rewards/format_reward": 0.7083333544433117, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 3548.5833740234375, "epoch": 0.10685714285714286, "grad_norm": 0.1805480420589447, "kl": 0.51318359375, "learning_rate": 8.119553365707803e-05, "loss": 0.0357, "reward": -0.022652635350823402, "reward_std": 0.3080403096973896, "rewards/cosine_scaled_reward": -0.15715964138507843, "rewards/format_reward": 0.2916666753590107, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 3463.8750610351562, "epoch": 0.10742857142857143, "grad_norm": 0.19337642192840576, "kl": 0.5703125, "learning_rate": 8.09394542276407e-05, "loss": 0.0474, "reward": 0.056963276118040085, "reward_std": 0.4269789531826973, "rewards/cosine_scaled_reward": -0.26318503729999065, "rewards/format_reward": 0.5833333544433117, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 3522.1251220703125, "epoch": 0.108, "grad_norm": 0.17618711292743683, "kl": 0.47119140625, "learning_rate": 8.068211054579944e-05, "loss": 0.0353, "reward": 0.017235335893929005, "reward_std": 0.6464042738080025, "rewards/cosine_scaled_reward": -0.19971567392349243, "rewards/format_reward": 0.416666679084301, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 3556.5416870117188, "epoch": 0.10857142857142857, "grad_norm": 0.20488235354423523, "kl": 0.4287109375, "learning_rate": 8.042351515412221e-05, "loss": 0.0113, "reward": -0.17852318100631237, "reward_std": 0.226779380813241, "rewards/cosine_scaled_reward": -0.27676159143447876, "rewards/format_reward": 0.3750000037252903, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2325.791702270508, "epoch": 0.10914285714285714, "grad_norm": 0.19414982199668884, "kl": 0.22698974609375, "learning_rate": 8.016368065618361e-05, "loss": 0.029, "reward": 0.07766161113977432, "reward_std": 0.23689523618668318, "rewards/cosine_scaled_reward": -0.25283585488796234, "rewards/format_reward": 0.5833333358168602, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 3216.5416870117188, "epoch": 0.10971428571428571, "grad_norm": 0.12748228013515472, "kl": 0.3115234375, "learning_rate": 7.99026197159505e-05, "loss": 0.0574, "reward": 0.4967636591754854, "reward_std": 0.3701250050216913, "rewards/cosine_scaled_reward": 0.08171516214497387, "rewards/format_reward": 0.3333333358168602, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 3243.0416870117188, "epoch": 0.11028571428571429, "grad_norm": 0.3466835021972656, "kl": 0.357666015625, "learning_rate": 7.964034505716477e-05, "loss": 0.1503, "reward": -0.2960766963660717, "reward_std": 0.3008538093417883, "rewards/cosine_scaled_reward": -0.2313716821372509, "rewards/format_reward": 0.1666666679084301, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 2656.75, "epoch": 0.11085714285714286, "grad_norm": 0.26660433411598206, "kl": 0.1654052734375, "learning_rate": 7.93768694627233e-05, "loss": 0.1287, "reward": 0.832207377650775, "reward_std": 0.4687335812486708, "rewards/cosine_scaled_reward": 0.2286036813748069, "rewards/format_reward": 0.375, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 3484.2083740234375, "epoch": 0.11142857142857143, "grad_norm": 0.1703551560640335, "kl": 0.350830078125, "learning_rate": 7.911220577405484e-05, "loss": 0.0664, "reward": -0.10861442796885967, "reward_std": 0.48796410858631134, "rewards/cosine_scaled_reward": -0.22097388468682766, "rewards/format_reward": 0.3333333358168602, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2822.5833435058594, "epoch": 0.112, "grad_norm": 0.14060811698436737, "kl": 0.263519287109375, "learning_rate": 7.884636689049423e-05, "loss": 0.0503, "reward": 0.03914413973689079, "reward_std": 0.42792966961860657, "rewards/cosine_scaled_reward": -0.12626126781105995, "rewards/format_reward": 0.2916666679084301, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 3313.4583740234375, "epoch": 0.11257142857142857, "grad_norm": 0.266854852437973, "kl": 0.326171875, "learning_rate": 7.857936576865357e-05, "loss": 0.0504, "reward": 0.07123216986656189, "reward_std": 0.42342435102909803, "rewards/cosine_scaled_reward": -0.08938391506671906, "rewards/format_reward": 0.25, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2759.9583740234375, "epoch": 0.11314285714285714, "grad_norm": 0.3209151029586792, "kl": 0.31805419921875, "learning_rate": 7.831121542179087e-05, "loss": 0.1378, "reward": 0.6144078075885773, "reward_std": 0.735694158822298, "rewards/cosine_scaled_reward": 0.015537269413471222, "rewards/format_reward": 0.5833333432674408, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 3151.916748046875, "epoch": 0.11371428571428571, "grad_norm": 0.1781582534313202, "kl": 0.284912109375, "learning_rate": 7.804192891917572e-05, "loss": 0.0324, "reward": 0.5205724835395813, "reward_std": 0.6832853183150291, "rewards/cosine_scaled_reward": -0.13554711267352104, "rewards/format_reward": 0.7916666716337204, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 3056.8333740234375, "epoch": 0.11428571428571428, "grad_norm": 0.1799694299697876, "kl": 0.3466796875, "learning_rate": 7.777151938545237e-05, "loss": 0.0166, "reward": 0.2738894410431385, "reward_std": 0.6837660204619169, "rewards/cosine_scaled_reward": -0.11305528320372105, "rewards/format_reward": 0.5000000223517418, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 3472.1250610351562, "epoch": 0.11485714285714285, "grad_norm": 0.14525848627090454, "kl": 0.32275390625, "learning_rate": 7.75e-05, "loss": 0.0428, "reward": 0.08027667133137584, "reward_std": 0.3925677575170994, "rewards/cosine_scaled_reward": -0.08486166223883629, "rewards/format_reward": 0.2500000037252903, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 2908.4583740234375, "epoch": 0.11542857142857142, "grad_norm": 0.19700901210308075, "kl": 0.452392578125, "learning_rate": 7.72273839962904e-05, "loss": 0.0939, "reward": 0.23302962351590395, "reward_std": 0.5167482197284698, "rewards/cosine_scaled_reward": -0.19598520174622536, "rewards/format_reward": 0.6250000111758709, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 3359.9584350585938, "epoch": 0.116, "grad_norm": 0.2025863081216812, "kl": 0.5537109375, "learning_rate": 7.695368466124298e-05, "loss": 0.0524, "reward": 0.5999528877437115, "reward_std": 0.7707385122776031, "rewards/cosine_scaled_reward": -0.07502354681491852, "rewards/format_reward": 0.7500000223517418, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 3285.5416870117188, "epoch": 0.11657142857142858, "grad_norm": 0.19257992506027222, "kl": 0.386962890625, "learning_rate": 7.667891533457719e-05, "loss": 0.0554, "reward": 0.1710458118468523, "reward_std": 0.3957900758832693, "rewards/cosine_scaled_reward": -0.20614376943558455, "rewards/format_reward": 0.5833333432674408, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 3412.166748046875, "epoch": 0.11714285714285715, "grad_norm": 0.3564988076686859, "kl": 0.58837890625, "learning_rate": 7.64030894081624e-05, "loss": 0.0441, "reward": 0.17527301236987114, "reward_std": 0.34667395800352097, "rewards/cosine_scaled_reward": -0.2665301710367203, "rewards/format_reward": 0.7083333358168602, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 3419.9583740234375, "epoch": 0.11771428571428572, "grad_norm": 0.22991527616977692, "kl": 0.48876953125, "learning_rate": 7.612622032536509e-05, "loss": 0.0659, "reward": 0.006236948072910309, "reward_std": 0.43261654675006866, "rewards/cosine_scaled_reward": -0.26771486178040504, "rewards/format_reward": 0.5416666679084301, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 3140.3750610351562, "epoch": 0.11828571428571429, "grad_norm": 0.2047661691904068, "kl": 0.6416015625, "learning_rate": 7.58483215803938e-05, "loss": 0.0602, "reward": 0.8695274218916893, "reward_std": 0.3878786191344261, "rewards/cosine_scaled_reward": 0.05976368859410286, "rewards/format_reward": 0.7500000074505806, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 3045.5416870117188, "epoch": 0.11885714285714286, "grad_norm": 0.39017602801322937, "kl": 0.60546875, "learning_rate": 7.556940671764125e-05, "loss": 0.0101, "reward": 0.6794271823018789, "reward_std": 0.43701300024986267, "rewards/cosine_scaled_reward": -0.035286422818899155, "rewards/format_reward": 0.75, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2835.4583740234375, "epoch": 0.11942857142857143, "grad_norm": 0.24870344996452332, "kl": 0.4085693359375, "learning_rate": 7.52894893310244e-05, "loss": 0.1686, "reward": 0.6219545120256953, "reward_std": 0.4378160387277603, "rewards/cosine_scaled_reward": -0.001522757112979889, "rewards/format_reward": 0.6250000074505806, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 3224.666748046875, "epoch": 0.12, "grad_norm": 0.18834112584590912, "kl": 0.361572265625, "learning_rate": 7.500858306332173e-05, "loss": 0.0575, "reward": 0.139042385853827, "reward_std": 0.2909255549311638, "rewards/cosine_scaled_reward": -0.20131214708089828, "rewards/format_reward": 0.541666679084301, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 2521.500030517578, "epoch": 0.12057142857142857, "grad_norm": 0.1810249388217926, "kl": 0.42181396484375, "learning_rate": 7.472670160550849e-05, "loss": 0.0523, "reward": 0.6465329900383949, "reward_std": 0.5430318973958492, "rewards/cosine_scaled_reward": -0.11423350404947996, "rewards/format_reward": 0.875, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 3383.3750610351562, "epoch": 0.12114285714285715, "grad_norm": 0.17248566448688507, "kl": 0.30029296875, "learning_rate": 7.444385869608922e-05, "loss": 0.0381, "reward": 0.07134938985109329, "reward_std": 0.43245443142950535, "rewards/cosine_scaled_reward": -0.11015863250941038, "rewards/format_reward": 0.291666679084301, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2899.6251220703125, "epoch": 0.12171428571428572, "grad_norm": 0.16710422933101654, "kl": 0.6005859375, "learning_rate": 7.416006812042828e-05, "loss": 0.0639, "reward": 0.9749854728579521, "reward_std": 0.8126206174492836, "rewards/cosine_scaled_reward": 0.008326039183884859, "rewards/format_reward": 0.9583333432674408, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2762.0416870117188, "epoch": 0.12228571428571429, "grad_norm": 0.13040971755981445, "kl": 0.37322998046875, "learning_rate": 7.387534371007797e-05, "loss": 0.0808, "reward": 0.23875866644084454, "reward_std": 0.4626100994646549, "rewards/cosine_scaled_reward": -0.19312065839767456, "rewards/format_reward": 0.6250000037252903, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 3368.4166870117188, "epoch": 0.12285714285714286, "grad_norm": 0.22608883678913116, "kl": 0.642578125, "learning_rate": 7.358969934210438e-05, "loss": 0.0332, "reward": 0.07540189661085606, "reward_std": 0.31054434925317764, "rewards/cosine_scaled_reward": -0.31646573543548584, "rewards/format_reward": 0.7083333656191826, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1938.416748046875, "epoch": 0.12342857142857143, "grad_norm": 0.6437280178070068, "kl": 0.2008056640625, "learning_rate": 7.330314893841101e-05, "loss": -0.1161, "reward": 1.7220262214541435, "reward_std": 0.25383612513542175, "rewards/cosine_scaled_reward": 0.3818464130163193, "rewards/format_reward": 0.9583333432674408, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 3377.7501220703125, "epoch": 0.124, "grad_norm": 0.18521635234355927, "kl": 0.61376953125, "learning_rate": 7.301570646506028e-05, "loss": 0.0548, "reward": 0.4902285588905215, "reward_std": 0.473490871489048, "rewards/cosine_scaled_reward": -0.1507190652191639, "rewards/format_reward": 0.791666679084301, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2740.458465576172, "epoch": 0.12457142857142857, "grad_norm": 0.47241246700286865, "kl": 0.3935546875, "learning_rate": 7.27273859315928e-05, "loss": 0.1019, "reward": 0.30729155242443085, "reward_std": 0.7103235945105553, "rewards/cosine_scaled_reward": -0.20052088797092438, "rewards/format_reward": 0.7083333507180214, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 2876.7500610351562, "epoch": 0.12514285714285714, "grad_norm": 0.3306594789028168, "kl": 0.342529296875, "learning_rate": 7.243820139034464e-05, "loss": 0.0972, "reward": 0.5614618342369795, "reward_std": 0.49163829535245895, "rewards/cosine_scaled_reward": -0.11510240286588669, "rewards/format_reward": 0.7916666679084301, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 2676.0000610351562, "epoch": 0.12571428571428572, "grad_norm": 0.3193364441394806, "kl": 0.3392333984375, "learning_rate": 7.214816693576235e-05, "loss": 0.0709, "reward": 1.6334106158465147, "reward_std": 0.7052293419837952, "rewards/cosine_scaled_reward": 0.33753862231969833, "rewards/format_reward": 0.9583333432674408, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 2236.1666870117188, "epoch": 0.12628571428571428, "grad_norm": 0.2019508183002472, "kl": 0.35107421875, "learning_rate": 7.185729670371605e-05, "loss": -0.0156, "reward": 1.01932243257761, "reward_std": 0.6837839931249619, "rewards/cosine_scaled_reward": 0.009661169722676277, "rewards/format_reward": 1.0, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 2180.4583435058594, "epoch": 0.12685714285714286, "grad_norm": 1.0214191675186157, "kl": 0.2244873046875, "learning_rate": 7.156560487081053e-05, "loss": 0.2367, "reward": 0.6144461743533611, "reward_std": 0.5636948570609093, "rewards/cosine_scaled_reward": -0.04694357141852379, "rewards/format_reward": 0.7083333544433117, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 2938.791717529297, "epoch": 0.12742857142857142, "grad_norm": 0.20985785126686096, "kl": 0.47314453125, "learning_rate": 7.127310565369415e-05, "loss": 0.0899, "reward": 0.16650558728724718, "reward_std": 0.48911314830183983, "rewards/cosine_scaled_reward": -0.27091389521956444, "rewards/format_reward": 0.7083333432674408, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 3043.2916870117188, "epoch": 0.128, "grad_norm": 0.1921759396791458, "kl": 0.6591796875, "learning_rate": 7.097981330836617e-05, "loss": 0.055, "reward": 0.8815609216690063, "reward_std": 0.6539080664515495, "rewards/cosine_scaled_reward": 0.0032804161310195923, "rewards/format_reward": 0.8750000298023224, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 2789.3333740234375, "epoch": 0.12857142857142856, "grad_norm": 0.2785404324531555, "kl": 0.646484375, "learning_rate": 7.068574212948169e-05, "loss": 0.0309, "reward": 0.5741224549710751, "reward_std": 0.7463614344596863, "rewards/cosine_scaled_reward": -0.15043878043070436, "rewards/format_reward": 0.8750000149011612, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2685.666778564453, "epoch": 0.12914285714285714, "grad_norm": 0.594291090965271, "kl": 0.492431640625, "learning_rate": 7.03909064496551e-05, "loss": -0.0623, "reward": 1.1779837608337402, "reward_std": 0.5032703503966331, "rewards/cosine_scaled_reward": 0.08899188553914428, "rewards/format_reward": 1.0, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2405.416778564453, "epoch": 0.12971428571428573, "grad_norm": 0.2714992165565491, "kl": 0.485107421875, "learning_rate": 7.009532063876149e-05, "loss": 0.1552, "reward": 0.9733416438102722, "reward_std": 0.9007052779197693, "rewards/cosine_scaled_reward": 0.028337497264146805, "rewards/format_reward": 0.9166666865348816, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1738.8750305175781, "epoch": 0.13028571428571428, "grad_norm": 0.484445720911026, "kl": 0.4476318359375, "learning_rate": 6.979899910323624e-05, "loss": 0.2133, "reward": 1.399103358387947, "reward_std": 0.7133887782692909, "rewards/cosine_scaled_reward": 0.1995516661554575, "rewards/format_reward": 1.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2415.3334350585938, "epoch": 0.13085714285714287, "grad_norm": 0.2949579358100891, "kl": 0.6171875, "learning_rate": 6.9501956285373e-05, "loss": 0.0644, "reward": 1.2348989397287369, "reward_std": 0.8904998600482941, "rewards/cosine_scaled_reward": 0.13828279450535774, "rewards/format_reward": 0.9583333432674408, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 2715.791748046875, "epoch": 0.13142857142857142, "grad_norm": 0.43443575501441956, "kl": 0.775390625, "learning_rate": 6.920420666261962e-05, "loss": 0.0894, "reward": 0.7100101904943585, "reward_std": 0.5167043209075928, "rewards/cosine_scaled_reward": -0.10332825779914856, "rewards/format_reward": 0.9166666865348816, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 2503.2500610351562, "epoch": 0.132, "grad_norm": 0.21487928926944733, "kl": 0.7197265625, "learning_rate": 6.890576474687263e-05, "loss": 0.0538, "reward": 0.5765762068331242, "reward_std": 0.5030670911073685, "rewards/cosine_scaled_reward": -0.14921192079782486, "rewards/format_reward": 0.8750000149011612, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2205.666748046875, "epoch": 0.13257142857142856, "grad_norm": 0.2001338005065918, "kl": 0.562835693359375, "learning_rate": 6.860664508377001e-05, "loss": 0.1013, "reward": 0.8642504140734673, "reward_std": 0.5591230466961861, "rewards/cosine_scaled_reward": -0.026208162307739258, "rewards/format_reward": 0.9166666865348816, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 2631.7916870117188, "epoch": 0.13314285714285715, "grad_norm": 0.28648537397384644, "kl": 0.7060546875, "learning_rate": 6.83068622519821e-05, "loss": 0.0581, "reward": 0.6012575253844261, "reward_std": 0.369386401027441, "rewards/cosine_scaled_reward": -0.17853792011737823, "rewards/format_reward": 0.9583333432674408, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 2923.541748046875, "epoch": 0.1337142857142857, "grad_norm": 0.37634986639022827, "kl": 0.712890625, "learning_rate": 6.800643086250122e-05, "loss": 0.0713, "reward": 0.8183658458292484, "reward_std": 0.9408960342407227, "rewards/cosine_scaled_reward": 0.07584960013628006, "rewards/format_reward": 0.6666666939854622, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 2446.3334045410156, "epoch": 0.13428571428571429, "grad_norm": 0.3095211386680603, "kl": 0.564208984375, "learning_rate": 6.770536555792944e-05, "loss": 0.0103, "reward": 1.0213327407836914, "reward_std": 0.6788808181881905, "rewards/cosine_scaled_reward": 0.0314996664528735, "rewards/format_reward": 0.9583333432674408, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1727.4167175292969, "epoch": 0.13485714285714287, "grad_norm": 0.31641384959220886, "kl": 0.52777099609375, "learning_rate": 6.740368101176496e-05, "loss": 0.0146, "reward": 0.42938170582056046, "reward_std": 0.7883919924497604, "rewards/cosine_scaled_reward": -0.20197583828121424, "rewards/format_reward": 0.8333333432674408, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 2093.625045776367, "epoch": 0.13542857142857143, "grad_norm": 0.2643112540245056, "kl": 0.35101318359375, "learning_rate": 6.710139192768695e-05, "loss": 0.0559, "reward": 0.4819624274969101, "reward_std": 0.36040157824754715, "rewards/cosine_scaled_reward": -0.2173521351069212, "rewards/format_reward": 0.9166666865348816, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 2892.5834350585938, "epoch": 0.136, "grad_norm": 0.30492836236953735, "kl": 0.6455078125, "learning_rate": 6.679851303883892e-05, "loss": 0.0055, "reward": 0.8520525968633592, "reward_std": 1.034329280257225, "rewards/cosine_scaled_reward": -0.032307060435414314, "rewards/format_reward": 0.9166666865348816, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 3010.8750610351562, "epoch": 0.13657142857142857, "grad_norm": 0.2272806316614151, "kl": 0.548828125, "learning_rate": 6.649505910711058e-05, "loss": 0.0823, "reward": 0.9699130356311798, "reward_std": 0.7660344392061234, "rewards/cosine_scaled_reward": 0.0682898610830307, "rewards/format_reward": 0.8333333432674408, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 3037.5000610351562, "epoch": 0.13714285714285715, "grad_norm": 0.3186228573322296, "kl": 0.7724609375, "learning_rate": 6.619104492241848e-05, "loss": 0.0644, "reward": 0.2878115465864539, "reward_std": 0.41039496660232544, "rewards/cosine_scaled_reward": -0.2935942467302084, "rewards/format_reward": 0.8750000298023224, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2400.5834197998047, "epoch": 0.1377142857142857, "grad_norm": 0.2673218548297882, "kl": 0.363006591796875, "learning_rate": 6.588648530198504e-05, "loss": 0.0143, "reward": 1.3452494442462921, "reward_std": 0.922061562538147, "rewards/cosine_scaled_reward": 0.29762469977140427, "rewards/format_reward": 0.7500000223517418, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 3292.416748046875, "epoch": 0.1382857142857143, "grad_norm": 0.24196265637874603, "kl": 0.5322265625, "learning_rate": 6.558139508961655e-05, "loss": 0.0516, "reward": 0.5418386338278651, "reward_std": 0.6776624768972397, "rewards/cosine_scaled_reward": -0.0832473672926426, "rewards/format_reward": 0.708333358168602, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 3185.2084350585938, "epoch": 0.13885714285714285, "grad_norm": 0.23741839826107025, "kl": 0.439208984375, "learning_rate": 6.527578915497951e-05, "loss": 0.0376, "reward": 0.351685244590044, "reward_std": 0.5521262660622597, "rewards/cosine_scaled_reward": -0.1574907097965479, "rewards/format_reward": 0.6666666679084301, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2508.5417709350586, "epoch": 0.13942857142857143, "grad_norm": 0.31169164180755615, "kl": 0.28839111328125, "learning_rate": 6.496968239287605e-05, "loss": 0.1361, "reward": 0.7412183582782745, "reward_std": 0.305687353014946, "rewards/cosine_scaled_reward": -0.025224164128303528, "rewards/format_reward": 0.7916666679084301, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 3193.5416870117188, "epoch": 0.14, "grad_norm": 0.16886374354362488, "kl": 0.3564453125, "learning_rate": 6.466308972251785e-05, "loss": 0.0409, "reward": 1.4323917776346207, "reward_std": 0.8325834348797798, "rewards/cosine_scaled_reward": 0.27869584411382675, "rewards/format_reward": 0.8750000149011612, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2717.25, "epoch": 0.14057142857142857, "grad_norm": 0.4018855094909668, "kl": 0.23583984375, "learning_rate": 6.435602608679918e-05, "loss": 0.0438, "reward": 0.8349570259451866, "reward_std": 0.6186618842184544, "rewards/cosine_scaled_reward": 0.12581188417971134, "rewards/format_reward": 0.5833333358168602, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 3374.1250610351562, "epoch": 0.14114285714285715, "grad_norm": 0.25589483976364136, "kl": 0.42626953125, "learning_rate": 6.404850645156841e-05, "loss": 0.0487, "reward": 0.25039391964673996, "reward_std": 1.0135847851634026, "rewards/cosine_scaled_reward": -0.14563637599349022, "rewards/format_reward": 0.5416666828095913, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1883.6667175292969, "epoch": 0.1417142857142857, "grad_norm": 0.3505682349205017, "kl": 0.22479248046875, "learning_rate": 6.374054580489874e-05, "loss": -0.1151, "reward": 1.0633280351758003, "reward_std": 0.4256577081978321, "rewards/cosine_scaled_reward": 0.07333065941929817, "rewards/format_reward": 0.9166666716337204, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 3036.7501220703125, "epoch": 0.1422857142857143, "grad_norm": 0.19629524648189545, "kl": 0.340576171875, "learning_rate": 6.343215915635762e-05, "loss": 0.0384, "reward": 0.19172463938593864, "reward_std": 0.6912754252552986, "rewards/cosine_scaled_reward": -0.2999710142612457, "rewards/format_reward": 0.7916666716337204, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2983.166748046875, "epoch": 0.14285714285714285, "grad_norm": 0.3300791382789612, "kl": 0.298583984375, "learning_rate": 6.31233615362752e-05, "loss": 0.0773, "reward": 0.7938342844136059, "reward_std": 0.745312362909317, "rewards/cosine_scaled_reward": 0.021917149424552917, "rewards/format_reward": 0.7500000149011612, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2271.0833740234375, "epoch": 0.14342857142857143, "grad_norm": 0.17075654864311218, "kl": 0.1722412109375, "learning_rate": 6.281416799501188e-05, "loss": 0.0039, "reward": 0.8436714336276054, "reward_std": 0.580166794359684, "rewards/cosine_scaled_reward": -0.03649761341512203, "rewards/format_reward": 0.9166666865348816, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2436.0834197998047, "epoch": 0.144, "grad_norm": 0.4659833312034607, "kl": 0.2823486328125, "learning_rate": 6.250459360222461e-05, "loss": 0.1394, "reward": 0.4135246090590954, "reward_std": 0.6481917910277843, "rewards/cosine_scaled_reward": -0.14740438014268875, "rewards/format_reward": 0.7083333507180214, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2777.166732788086, "epoch": 0.14457142857142857, "grad_norm": 0.15978415310382843, "kl": 0.37872314453125, "learning_rate": 6.219465344613258e-05, "loss": 0.0411, "reward": 0.45254361629486084, "reward_std": 0.5954790785908699, "rewards/cosine_scaled_reward": -0.19039486348628998, "rewards/format_reward": 0.8333333432674408, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2861.5000915527344, "epoch": 0.14514285714285713, "grad_norm": 0.22069796919822693, "kl": 0.4195556640625, "learning_rate": 6.188436263278172e-05, "loss": -0.0014, "reward": 0.8079469501972198, "reward_std": 0.8555595129728317, "rewards/cosine_scaled_reward": -0.054359860718250275, "rewards/format_reward": 0.9166666865348816, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 3140.9166870117188, "epoch": 0.1457142857142857, "grad_norm": 0.3371999263763428, "kl": 0.51953125, "learning_rate": 6.157373628530852e-05, "loss": 0.0141, "reward": 0.10784337669610977, "reward_std": 0.4433470778167248, "rewards/cosine_scaled_reward": -0.2585783116519451, "rewards/format_reward": 0.6250000298023224, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 3364.416748046875, "epoch": 0.1462857142857143, "grad_norm": 0.2070639282464981, "kl": 0.48974609375, "learning_rate": 6.126278954320295e-05, "loss": 0.0582, "reward": 0.5357752754352987, "reward_std": 0.6060156896710396, "rewards/cosine_scaled_reward": -0.16961237788200378, "rewards/format_reward": 0.8750000149011612, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 2713.7918090820312, "epoch": 0.14685714285714285, "grad_norm": 0.3514373302459717, "kl": 0.355712890625, "learning_rate": 6.095153756157051e-05, "loss": 0.094, "reward": 1.262501284480095, "reward_std": 0.9425565153360367, "rewards/cosine_scaled_reward": 0.15208394452929497, "rewards/format_reward": 0.9583333432674408, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1865.4166870117188, "epoch": 0.14742857142857144, "grad_norm": 0.19031290709972382, "kl": 0.2869873046875, "learning_rate": 6.06399955103937e-05, "loss": 0.2014, "reward": 1.08034697920084, "reward_std": 0.29667292069643736, "rewards/cosine_scaled_reward": 0.10267347283661366, "rewards/format_reward": 0.8750000149011612, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2987.041748046875, "epoch": 0.148, "grad_norm": 0.1868191808462143, "kl": 0.50732421875, "learning_rate": 6.032817857379256e-05, "loss": 0.0605, "reward": 0.4671786054968834, "reward_std": 0.7795231863856316, "rewards/cosine_scaled_reward": -0.14141068421304226, "rewards/format_reward": 0.7500000298023224, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 3072.0001220703125, "epoch": 0.14857142857142858, "grad_norm": 0.19339028000831604, "kl": 0.48876953125, "learning_rate": 6.001610194928464e-05, "loss": 0.044, "reward": 0.5511476572137326, "reward_std": 0.8131291791796684, "rewards/cosine_scaled_reward": -0.14109284430742264, "rewards/format_reward": 0.8333333432674408, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 3000.041748046875, "epoch": 0.14914285714285713, "grad_norm": 0.24656662344932556, "kl": 0.526611328125, "learning_rate": 5.970378084704441e-05, "loss": 0.0471, "reward": 0.9492662008851767, "reward_std": 0.4657456800341606, "rewards/cosine_scaled_reward": 0.037133121863007545, "rewards/format_reward": 0.8750000149011612, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 2996.041717529297, "epoch": 0.14971428571428572, "grad_norm": 0.1437804251909256, "kl": 0.3951416015625, "learning_rate": 5.9391230489161734e-05, "loss": 0.0074, "reward": 0.6824960559606552, "reward_std": 0.5514045283198357, "rewards/cosine_scaled_reward": -0.0962519608438015, "rewards/format_reward": 0.8750000149011612, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 3299.916748046875, "epoch": 0.15028571428571427, "grad_norm": 0.860514760017395, "kl": 0.480712890625, "learning_rate": 5.907846610890012e-05, "loss": 0.0247, "reward": 0.5251449644565582, "reward_std": 0.5569327585399151, "rewards/cosine_scaled_reward": -0.17492752522230148, "rewards/format_reward": 0.8750000149011612, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 3207.291748046875, "epoch": 0.15085714285714286, "grad_norm": 0.3078582286834717, "kl": 0.57763671875, "learning_rate": 5.876550294995421e-05, "loss": 0.0128, "reward": 0.6213351637125015, "reward_std": 0.8778404965996742, "rewards/cosine_scaled_reward": -0.04349910840392113, "rewards/format_reward": 0.7083333507180214, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 2830.3333740234375, "epoch": 0.15142857142857144, "grad_norm": 0.17845019698143005, "kl": 0.364501953125, "learning_rate": 5.8452356265706845e-05, "loss": 0.0782, "reward": 0.6127370540052652, "reward_std": 0.8011298030614853, "rewards/cosine_scaled_reward": -0.068631486967206, "rewards/format_reward": 0.7500000149011612, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 2766.5416870117188, "epoch": 0.152, "grad_norm": 0.31444934010505676, "kl": 0.2955322265625, "learning_rate": 5.813904131848564e-05, "loss": -0.0466, "reward": 0.3219255795702338, "reward_std": 0.44407689198851585, "rewards/cosine_scaled_reward": -0.2765372171998024, "rewards/format_reward": 0.8750000149011612, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1446.833396911621, "epoch": 0.15257142857142858, "grad_norm": 0.6262673139572144, "kl": 0.2203369140625, "learning_rate": 5.782557337881911e-05, "loss": 0.293, "reward": 0.44304793514311314, "reward_std": 0.6326870061457157, "rewards/cosine_scaled_reward": -0.1743093803524971, "rewards/format_reward": 0.7916666716337204, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2241.50008392334, "epoch": 0.15314285714285714, "grad_norm": 0.28819912672042847, "kl": 0.436279296875, "learning_rate": 5.751196772469237e-05, "loss": 0.1259, "reward": 0.5492542944848537, "reward_std": 0.793809786438942, "rewards/cosine_scaled_reward": -0.017039529979228973, "rewards/format_reward": 0.5833333432674408, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 3128.0000610351562, "epoch": 0.15371428571428572, "grad_norm": 0.19022883474826813, "kl": 0.46630859375, "learning_rate": 5.719823964080261e-05, "loss": 0.0451, "reward": 0.4434543699026108, "reward_std": 0.4890909567475319, "rewards/cosine_scaled_reward": -0.17410616483539343, "rewards/format_reward": 0.7916667014360428, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2065.166778564453, "epoch": 0.15428571428571428, "grad_norm": 0.340876966714859, "kl": 0.368896484375, "learning_rate": 5.688440441781399e-05, "loss": -0.0992, "reward": 1.2790502207353711, "reward_std": 0.512369230389595, "rewards/cosine_scaled_reward": 0.18119176104664803, "rewards/format_reward": 0.9166666865348816, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 2220.791717529297, "epoch": 0.15485714285714286, "grad_norm": 0.4013032019138336, "kl": 0.3543701171875, "learning_rate": 5.657047735161256e-05, "loss": -0.0809, "reward": 0.34419402945786715, "reward_std": 0.34300512447953224, "rewards/cosine_scaled_reward": -0.20290300622582436, "rewards/format_reward": 0.7500000074505806, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2219.750030517578, "epoch": 0.15542857142857142, "grad_norm": 0.35917147994041443, "kl": 0.238037109375, "learning_rate": 5.6256473742560614e-05, "loss": 0.1033, "reward": 0.7890710458159447, "reward_std": 0.7226014733314514, "rewards/cosine_scaled_reward": -0.06379782781004906, "rewards/format_reward": 0.9166666865348816, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 2267.3334045410156, "epoch": 0.156, "grad_norm": 0.24575629830360413, "kl": 0.382080078125, "learning_rate": 5.594240889475107e-05, "loss": 0.0056, "reward": 0.7830603891052306, "reward_std": 0.9113038927316666, "rewards/cosine_scaled_reward": -0.025136479176580906, "rewards/format_reward": 0.8333333432674408, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 3100.916748046875, "epoch": 0.15657142857142858, "grad_norm": 0.1788811832666397, "kl": 0.3974609375, "learning_rate": 5.5628298115261545e-05, "loss": 0.0351, "reward": 0.46365073323249817, "reward_std": 0.6709984391927719, "rewards/cosine_scaled_reward": -0.12234130874276161, "rewards/format_reward": 0.708333358168602, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 2573.666778564453, "epoch": 0.15714285714285714, "grad_norm": 0.2195524126291275, "kl": 0.45074462890625, "learning_rate": 5.5314156713408275e-05, "loss": 0.0242, "reward": 0.33101664669811726, "reward_std": 0.6106998361647129, "rewards/cosine_scaled_reward": -0.18865835666656494, "rewards/format_reward": 0.7083333507180214, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1952.3750457763672, "epoch": 0.15771428571428572, "grad_norm": 0.4302261471748352, "kl": 0.19842529296875, "learning_rate": 5.500000000000001e-05, "loss": 0.1747, "reward": 1.8006355911493301, "reward_std": 0.5728400154039264, "rewards/cosine_scaled_reward": 0.42115106899291277, "rewards/format_reward": 0.9583333432674408, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 2859.4584350585938, "epoch": 0.15828571428571428, "grad_norm": 0.27882683277130127, "kl": 0.39794921875, "learning_rate": 5.468584328659173e-05, "loss": 0.0633, "reward": 0.9866037368774414, "reward_std": 0.9637529253959656, "rewards/cosine_scaled_reward": 0.09746852144598961, "rewards/format_reward": 0.7916666865348816, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2202.75, "epoch": 0.15885714285714286, "grad_norm": 0.16858187317848206, "kl": 0.26458740234375, "learning_rate": 5.4371701884738466e-05, "loss": 0.1202, "reward": 0.39508337527513504, "reward_std": 0.7221878357231617, "rewards/cosine_scaled_reward": -0.21912500262260437, "rewards/format_reward": 0.8333333432674408, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 2615.0000610351562, "epoch": 0.15942857142857142, "grad_norm": 0.6125884056091309, "kl": 0.30419921875, "learning_rate": 5.405759110524894e-05, "loss": 0.1486, "reward": 0.9705154225230217, "reward_std": 0.7704289853572845, "rewards/cosine_scaled_reward": 0.047757700085639954, "rewards/format_reward": 0.8750000298023224, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2305.1666870117188, "epoch": 0.16, "grad_norm": 0.25846433639526367, "kl": 0.323974609375, "learning_rate": 5.374352625743941e-05, "loss": -0.0256, "reward": 0.5761691424995661, "reward_std": 0.4631784576922655, "rewards/cosine_scaled_reward": -0.06608210876584053, "rewards/format_reward": 0.7083333358168602, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 2263.4584350585938, "epoch": 0.16057142857142856, "grad_norm": 0.6869896650314331, "kl": 0.30859375, "learning_rate": 5.342952264838747e-05, "loss": -0.1307, "reward": 1.1992193013429642, "reward_std": 0.8062855824828148, "rewards/cosine_scaled_reward": 0.16210963344201446, "rewards/format_reward": 0.8750000149011612, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 2695.5416870117188, "epoch": 0.16114285714285714, "grad_norm": 0.2958724796772003, "kl": 0.48028564453125, "learning_rate": 5.311559558218603e-05, "loss": 0.0435, "reward": 0.9084782637655735, "reward_std": 0.9329799860715866, "rewards/cosine_scaled_reward": 0.10007247515022755, "rewards/format_reward": 0.7083333507180214, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2283.6250610351562, "epoch": 0.16171428571428573, "grad_norm": 0.6330475807189941, "kl": 0.239501953125, "learning_rate": 5.28017603591974e-05, "loss": -0.1108, "reward": 0.7594865150749683, "reward_std": 0.6004514396190643, "rewards/cosine_scaled_reward": -0.09942345693707466, "rewards/format_reward": 0.9583333432674408, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2050.458396911621, "epoch": 0.16228571428571428, "grad_norm": 0.22488415241241455, "kl": 0.25628662109375, "learning_rate": 5.248803227530763e-05, "loss": 0.019, "reward": 2.079008385539055, "reward_std": 0.725292238406837, "rewards/cosine_scaled_reward": 0.6020041219890118, "rewards/format_reward": 0.875, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 3271.541748046875, "epoch": 0.16285714285714287, "grad_norm": 0.2648293673992157, "kl": 0.39990234375, "learning_rate": 5.2174426621180906e-05, "loss": 0.0162, "reward": 0.347307525575161, "reward_std": 0.5538838356733322, "rewards/cosine_scaled_reward": -0.2430129125714302, "rewards/format_reward": 0.8333333432674408, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 2571.7501220703125, "epoch": 0.16342857142857142, "grad_norm": 0.31099557876586914, "kl": 0.217041015625, "learning_rate": 5.186095868151436e-05, "loss": -0.0607, "reward": 1.441674392670393, "reward_std": 0.8816814571619034, "rewards/cosine_scaled_reward": 0.26250384002923965, "rewards/format_reward": 0.9166666716337204, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 2525.791748046875, "epoch": 0.164, "grad_norm": 0.21399115025997162, "kl": 0.212890625, "learning_rate": 5.154764373429316e-05, "loss": 0.0111, "reward": 1.264048159122467, "reward_std": 1.0165133327245712, "rewards/cosine_scaled_reward": 0.17369072884321213, "rewards/format_reward": 0.9166666865348816, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 3136.8750610351562, "epoch": 0.16457142857142856, "grad_norm": 0.2190590798854828, "kl": 0.3248291015625, "learning_rate": 5.1234497050045814e-05, "loss": -0.0187, "reward": 0.4037191644310951, "reward_std": 0.7680270224809647, "rewards/cosine_scaled_reward": -0.19397378712892532, "rewards/format_reward": 0.7916666865348816, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 3075.8333740234375, "epoch": 0.16514285714285715, "grad_norm": 0.18252302706241608, "kl": 0.40673828125, "learning_rate": 5.0921533891099905e-05, "loss": 0.0377, "reward": 0.6799687976017594, "reward_std": 0.6346831023693085, "rewards/cosine_scaled_reward": -0.07668228447437286, "rewards/format_reward": 0.8333333432674408, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 2687.8334350585938, "epoch": 0.1657142857142857, "grad_norm": 0.272157222032547, "kl": 0.2164306640625, "learning_rate": 5.0608769510838284e-05, "loss": -0.0162, "reward": 0.8302770014852285, "reward_std": 0.9631283730268478, "rewards/cosine_scaled_reward": -0.001528160646557808, "rewards/format_reward": 0.8333333432674408, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 3265.8751220703125, "epoch": 0.1662857142857143, "grad_norm": 0.23784101009368896, "kl": 0.3359375, "learning_rate": 5.0296219152955604e-05, "loss": 0.0305, "reward": 0.386111356317997, "reward_std": 0.39381321892142296, "rewards/cosine_scaled_reward": -0.20277767814695835, "rewards/format_reward": 0.7916666865348816, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 2972.5416870117188, "epoch": 0.16685714285714287, "grad_norm": 0.3053220510482788, "kl": 0.323974609375, "learning_rate": 4.998389805071536e-05, "loss": -0.0195, "reward": 0.5026585329324007, "reward_std": 0.9295567944645882, "rewards/cosine_scaled_reward": -0.12367073073983192, "rewards/format_reward": 0.7500000074505806, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 2035.3333740234375, "epoch": 0.16742857142857143, "grad_norm": 0.15959811210632324, "kl": 0.122650146484375, "learning_rate": 4.9671821426207455e-05, "loss": 0.0064, "reward": 1.4870387986302376, "reward_std": 0.46400389447808266, "rewards/cosine_scaled_reward": 0.3060194104909897, "rewards/format_reward": 0.875, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 2581.958465576172, "epoch": 0.168, "grad_norm": 0.2600279450416565, "kl": 0.1378173828125, "learning_rate": 4.936000448960631e-05, "loss": 0.0812, "reward": 0.6963506219908595, "reward_std": 0.8576374873518944, "rewards/cosine_scaled_reward": -0.047658056020736694, "rewards/format_reward": 0.7916666716337204, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 2707.750045776367, "epoch": 0.16857142857142857, "grad_norm": 0.3398168385028839, "kl": 0.26220703125, "learning_rate": 4.904846243842949e-05, "loss": 0.0073, "reward": 0.1822656556032598, "reward_std": 0.7573360428214073, "rewards/cosine_scaled_reward": -0.17970050126314163, "rewards/format_reward": 0.5416666716337204, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2680.1251220703125, "epoch": 0.16914285714285715, "grad_norm": 0.29461199045181274, "kl": 0.2176513671875, "learning_rate": 4.873721045679707e-05, "loss": 0.0926, "reward": 0.17297326400876045, "reward_std": 0.5826424770057201, "rewards/cosine_scaled_reward": -0.26768005080521107, "rewards/format_reward": 0.7083333507180214, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 2895.4584350585938, "epoch": 0.1697142857142857, "grad_norm": 0.14295834302902222, "kl": 0.1163330078125, "learning_rate": 4.842626371469149e-05, "loss": -0.0717, "reward": 1.922136515378952, "reward_std": 0.5652984231710434, "rewards/cosine_scaled_reward": 0.4819015748798847, "rewards/format_reward": 0.9583333432674408, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 2346.041748046875, "epoch": 0.1702857142857143, "grad_norm": 0.172696053981781, "kl": 0.2733154296875, "learning_rate": 4.811563736721829e-05, "loss": 0.046, "reward": 0.902316652238369, "reward_std": 0.9037366360425949, "rewards/cosine_scaled_reward": 0.034491658210754395, "rewards/format_reward": 0.8333333432674408, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 2209.250045776367, "epoch": 0.17085714285714285, "grad_norm": 0.44964268803596497, "kl": 0.1575927734375, "learning_rate": 4.780534655386744e-05, "loss": -0.0211, "reward": 0.48153945803642273, "reward_std": 0.6702739596366882, "rewards/cosine_scaled_reward": -0.15506362076848745, "rewards/format_reward": 0.791666679084301, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 2662.041748046875, "epoch": 0.17142857142857143, "grad_norm": 0.29695504903793335, "kl": 0.177978515625, "learning_rate": 4.74954063977754e-05, "loss": 0.0111, "reward": 0.5166540406644344, "reward_std": 1.0730202198028564, "rewards/cosine_scaled_reward": -0.11667298898100853, "rewards/format_reward": 0.7500000149011612, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 2761.3334350585938, "epoch": 0.172, "grad_norm": 0.21826083958148956, "kl": 0.321380615234375, "learning_rate": 4.718583200498814e-05, "loss": 0.0618, "reward": 0.18236689269542694, "reward_std": 0.7055843695998192, "rewards/cosine_scaled_reward": -0.2629832345992327, "rewards/format_reward": 0.708333358168602, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 2820.3750610351562, "epoch": 0.17257142857142857, "grad_norm": 0.1797807216644287, "kl": 0.25360107421875, "learning_rate": 4.687663846372481e-05, "loss": -0.0163, "reward": 0.9326446410268545, "reward_std": 0.8454302102327347, "rewards/cosine_scaled_reward": 0.09132230095565319, "rewards/format_reward": 0.7500000149011612, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 2805.791748046875, "epoch": 0.17314285714285715, "grad_norm": 0.20349080860614777, "kl": 0.25506591796875, "learning_rate": 4.6567840843642384e-05, "loss": 0.0379, "reward": 0.5512912534177303, "reward_std": 0.5454032495617867, "rewards/cosine_scaled_reward": -0.14102105796337128, "rewards/format_reward": 0.8333333432674408, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 2817.4584350585938, "epoch": 0.1737142857142857, "grad_norm": 0.37047722935676575, "kl": 0.28692626953125, "learning_rate": 4.6259454195101274e-05, "loss": -0.1089, "reward": 0.6676226779818535, "reward_std": 0.9479784294962883, "rewards/cosine_scaled_reward": -0.04118867497891188, "rewards/format_reward": 0.7500000223517418, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 2684.416748046875, "epoch": 0.1742857142857143, "grad_norm": 0.166043221950531, "kl": 0.2327880859375, "learning_rate": 4.5951493548431603e-05, "loss": 0.0119, "reward": 1.0854606181383133, "reward_std": 0.4668488036841154, "rewards/cosine_scaled_reward": 0.08439697325229645, "rewards/format_reward": 0.9166666865348816, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1833.25, "epoch": 0.17485714285714285, "grad_norm": 0.1900881677865982, "kl": 0.11285400390625, "learning_rate": 4.564397391320084e-05, "loss": 0.0551, "reward": 1.6040659546852112, "reward_std": 0.7503243815153837, "rewards/cosine_scaled_reward": 0.3645329400897026, "rewards/format_reward": 0.875, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 3245.9583740234375, "epoch": 0.17542857142857143, "grad_norm": 0.39895114302635193, "kl": 0.1572265625, "learning_rate": 4.5336910277482156e-05, "loss": 0.0809, "reward": 0.28692900389432907, "reward_std": 0.8957325369119644, "rewards/cosine_scaled_reward": -0.14820216968655586, "rewards/format_reward": 0.5833333469927311, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2018.666748046875, "epoch": 0.176, "grad_norm": 0.32068708539009094, "kl": 0.10467529296875, "learning_rate": 4.503031760712397e-05, "loss": 0.077, "reward": 1.5325582474470139, "reward_std": 0.9100236482918262, "rewards/cosine_scaled_reward": 0.34961244463920593, "rewards/format_reward": 0.8333333358168602, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 3323.8333740234375, "epoch": 0.17657142857142857, "grad_norm": 0.21846871078014374, "kl": 0.2919921875, "learning_rate": 4.47242108450205e-05, "loss": 0.0366, "reward": -0.005359284579753876, "reward_std": 0.415067620575428, "rewards/cosine_scaled_reward": -0.25267963111400604, "rewards/format_reward": 0.5000000223517418, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 3336.8333740234375, "epoch": 0.17714285714285713, "grad_norm": 0.27864164113998413, "kl": 0.333984375, "learning_rate": 4.4418604910383456e-05, "loss": 0.0695, "reward": 0.5436939476057887, "reward_std": 0.7889657467603683, "rewards/cosine_scaled_reward": -0.1656530387699604, "rewards/format_reward": 0.8750000149011612, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 3064.5000610351562, "epoch": 0.1777142857142857, "grad_norm": 0.14427867531776428, "kl": 0.224365234375, "learning_rate": 4.411351469801496e-05, "loss": 0.0368, "reward": 0.4646228328347206, "reward_std": 0.5353215932846069, "rewards/cosine_scaled_reward": -0.1010219119489193, "rewards/format_reward": 0.6666666679084301, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 3088.2500610351562, "epoch": 0.1782857142857143, "grad_norm": 0.20969220995903015, "kl": 0.238525390625, "learning_rate": 4.380895507758155e-05, "loss": 0.0171, "reward": 0.33924252539873123, "reward_std": 0.3376295939087868, "rewards/cosine_scaled_reward": -0.2678787410259247, "rewards/format_reward": 0.8750000149011612, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2415.5000228881836, "epoch": 0.17885714285714285, "grad_norm": 1.5862038135528564, "kl": 0.308349609375, "learning_rate": 4.3504940892889434e-05, "loss": 0.0216, "reward": 0.808872826397419, "reward_std": 0.7256791153922677, "rewards/cosine_scaled_reward": -0.012230251293658512, "rewards/format_reward": 0.8333333358168602, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2261.12508392334, "epoch": 0.17942857142857144, "grad_norm": 0.28129690885543823, "kl": 0.21649169921875, "learning_rate": 4.3201486961161094e-05, "loss": 0.1205, "reward": 1.4821401089429855, "reward_std": 0.9167400598526001, "rewards/cosine_scaled_reward": 0.26190340146422386, "rewards/format_reward": 0.9583333432674408, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 2525.500045776367, "epoch": 0.18, "grad_norm": 0.17051948606967926, "kl": 0.2293701171875, "learning_rate": 4.289860807231305e-05, "loss": 0.09, "reward": 0.16041448712348938, "reward_std": 0.3049400746822357, "rewards/cosine_scaled_reward": -0.294792752712965, "rewards/format_reward": 0.7500000074505806, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 3395.791748046875, "epoch": 0.18057142857142858, "grad_norm": 0.17187951505184174, "kl": 0.312255859375, "learning_rate": 4.259631898823504e-05, "loss": 0.0457, "reward": 0.3425696883350611, "reward_std": 0.9086843878030777, "rewards/cosine_scaled_reward": -0.18288182839751244, "rewards/format_reward": 0.708333358168602, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2967.2916870117188, "epoch": 0.18114285714285713, "grad_norm": 0.6705901622772217, "kl": 0.21875, "learning_rate": 4.229463444207056e-05, "loss": 0.0771, "reward": 0.533269502222538, "reward_std": 0.7537828385829926, "rewards/cosine_scaled_reward": -0.08753193635493517, "rewards/format_reward": 0.7083333507180214, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 2270.041748046875, "epoch": 0.18171428571428572, "grad_norm": 0.26364800333976746, "kl": 0.19378662109375, "learning_rate": 4.1993569137498776e-05, "loss": 0.0659, "reward": 1.1275322251021862, "reward_std": 0.8576941788196564, "rewards/cosine_scaled_reward": 0.1262660927604884, "rewards/format_reward": 0.8750000149011612, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2608.541748046875, "epoch": 0.18228571428571427, "grad_norm": 0.3073921203613281, "kl": 0.302001953125, "learning_rate": 4.1693137748017916e-05, "loss": 0.0071, "reward": 0.6904770843684673, "reward_std": 0.8109993487596512, "rewards/cosine_scaled_reward": -0.09226147457957268, "rewards/format_reward": 0.8750000149011612, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2851.1666870117188, "epoch": 0.18285714285714286, "grad_norm": 0.21277567744255066, "kl": 0.29742431640625, "learning_rate": 4.1393354916230006e-05, "loss": 0.0273, "reward": 0.687724407762289, "reward_std": 1.0126019269227982, "rewards/cosine_scaled_reward": 0.031362203881144524, "rewards/format_reward": 0.6250000074505806, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 3186.9583740234375, "epoch": 0.18342857142857144, "grad_norm": 0.26291391253471375, "kl": 0.435791015625, "learning_rate": 4.109423525312738e-05, "loss": 0.0092, "reward": 0.3795064650475979, "reward_std": 0.5648243799805641, "rewards/cosine_scaled_reward": -0.2477467879652977, "rewards/format_reward": 0.8750000149011612, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 3135.6668090820312, "epoch": 0.184, "grad_norm": 0.16939568519592285, "kl": 0.41015625, "learning_rate": 4.079579333738039e-05, "loss": 0.0569, "reward": 0.5801227353513241, "reward_std": 0.6866341158747673, "rewards/cosine_scaled_reward": -0.06410535424947739, "rewards/format_reward": 0.7083333656191826, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 2883.9584350585938, "epoch": 0.18457142857142858, "grad_norm": 0.3012012243270874, "kl": 0.3544921875, "learning_rate": 4.049804371462701e-05, "loss": 0.0825, "reward": 0.8750679045915604, "reward_std": 0.5257211327552795, "rewards/cosine_scaled_reward": 0.020867276936769485, "rewards/format_reward": 0.8333333432674408, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 2720.7083435058594, "epoch": 0.18514285714285714, "grad_norm": 0.16761939227581024, "kl": 0.320556640625, "learning_rate": 4.0201000896763766e-05, "loss": 0.0441, "reward": 0.7150210291147232, "reward_std": 0.9162970930337906, "rewards/cosine_scaled_reward": -0.05915616638958454, "rewards/format_reward": 0.8333333432674408, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2600.541748046875, "epoch": 0.18571428571428572, "grad_norm": 0.28921419382095337, "kl": 0.536865234375, "learning_rate": 3.9904679361238525e-05, "loss": 0.0768, "reward": 0.5437839552760124, "reward_std": 0.8164877891540527, "rewards/cosine_scaled_reward": -0.04060804285109043, "rewards/format_reward": 0.625, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2133.5000762939453, "epoch": 0.18628571428571428, "grad_norm": 0.28011372685432434, "kl": 0.3050537109375, "learning_rate": 3.960909355034491e-05, "loss": -0.0025, "reward": 0.789381206035614, "reward_std": 0.8102166727185249, "rewards/cosine_scaled_reward": -0.021976079791784286, "rewards/format_reward": 0.8333333358168602, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2548.2083740234375, "epoch": 0.18685714285714286, "grad_norm": 0.44690772891044617, "kl": 0.39697265625, "learning_rate": 3.9314257870518325e-05, "loss": 0.2676, "reward": 0.8154665417969227, "reward_std": 0.568376112729311, "rewards/cosine_scaled_reward": -0.00893339142203331, "rewards/format_reward": 0.833333358168602, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2082.916748046875, "epoch": 0.18742857142857142, "grad_norm": 0.39973941445350647, "kl": 0.22576904296875, "learning_rate": 3.902018669163384e-05, "loss": -0.0253, "reward": 0.7669844925403595, "reward_std": 0.7145446315407753, "rewards/cosine_scaled_reward": -0.0956744309514761, "rewards/format_reward": 0.9583333432674408, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 2922.6666870117188, "epoch": 0.188, "grad_norm": 0.33824387192726135, "kl": 0.66064453125, "learning_rate": 3.872689434630585e-05, "loss": 0.1195, "reward": 0.8472144799306989, "reward_std": 0.8882746696472168, "rewards/cosine_scaled_reward": 0.1111072227358818, "rewards/format_reward": 0.6250000260770321, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2951.3333740234375, "epoch": 0.18857142857142858, "grad_norm": 0.22268566489219666, "kl": 0.62255859375, "learning_rate": 3.843439512918949e-05, "loss": 0.1063, "reward": 0.6393527542240918, "reward_std": 0.9115183800458908, "rewards/cosine_scaled_reward": 0.028009682893753052, "rewards/format_reward": 0.5833333507180214, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1935.0000762939453, "epoch": 0.18914285714285714, "grad_norm": 0.4056748151779175, "kl": 0.3223876953125, "learning_rate": 3.814270329628396e-05, "loss": -0.0437, "reward": 0.987700991332531, "reward_std": 0.7533665373921394, "rewards/cosine_scaled_reward": 0.07718382868915796, "rewards/format_reward": 0.8333333432674408, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 2499.125030517578, "epoch": 0.18971428571428572, "grad_norm": 0.40982890129089355, "kl": 0.327392578125, "learning_rate": 3.785183306423768e-05, "loss": -0.0806, "reward": 0.9079742878675461, "reward_std": 0.9995783120393753, "rewards/cosine_scaled_reward": -0.025179538875818253, "rewards/format_reward": 0.9583333432674408, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2600.2084197998047, "epoch": 0.19028571428571428, "grad_norm": 0.22521264851093292, "kl": 0.434326171875, "learning_rate": 3.756179860965538e-05, "loss": 0.1256, "reward": 0.21467669680714607, "reward_std": 0.7439640909433365, "rewards/cosine_scaled_reward": -0.2259949930012226, "rewards/format_reward": 0.666666679084301, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 3179.3333740234375, "epoch": 0.19085714285714286, "grad_norm": 0.3684881627559662, "kl": 0.6171875, "learning_rate": 3.7272614068407205e-05, "loss": 0.044, "reward": 0.2664009025320411, "reward_std": 0.7427940741181374, "rewards/cosine_scaled_reward": -0.15846621617674828, "rewards/format_reward": 0.5833333395421505, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1974.791732788086, "epoch": 0.19142857142857142, "grad_norm": 0.19373755156993866, "kl": 0.152557373046875, "learning_rate": 3.698429353493974e-05, "loss": 0.0285, "reward": 0.8958619683980942, "reward_std": 0.7756932191550732, "rewards/cosine_scaled_reward": -0.052069032564759254, "rewards/format_reward": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 2503.416732788086, "epoch": 0.192, "grad_norm": 0.4300239682197571, "kl": 0.178192138671875, "learning_rate": 3.6696851061589e-05, "loss": 0.0114, "reward": 1.0004391744732857, "reward_std": 0.7523290365934372, "rewards/cosine_scaled_reward": 0.08355289697647095, "rewards/format_reward": 0.8333333432674408, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2300.5833587646484, "epoch": 0.19257142857142856, "grad_norm": 0.21802906692028046, "kl": 0.3260498046875, "learning_rate": 3.6410300657895626e-05, "loss": 0.1048, "reward": 1.2021107599139214, "reward_std": 0.6737323552370071, "rewards/cosine_scaled_reward": 0.18438871018588543, "rewards/format_reward": 0.833333358168602, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2349.083396911621, "epoch": 0.19314285714285714, "grad_norm": 0.23494723439216614, "kl": 0.295654296875, "learning_rate": 3.6124656289922034e-05, "loss": 0.0219, "reward": 1.5479250699281693, "reward_std": 1.1081312000751495, "rewards/cosine_scaled_reward": 0.3364625433459878, "rewards/format_reward": 0.8750000149011612, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 3243.4584350585938, "epoch": 0.19371428571428573, "grad_norm": 0.22037623822689056, "kl": 0.5400390625, "learning_rate": 3.583993187957173e-05, "loss": 0.0615, "reward": 0.39422329515218735, "reward_std": 0.7927843853831291, "rewards/cosine_scaled_reward": -0.13622168637812138, "rewards/format_reward": 0.666666679084301, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2087.750030517578, "epoch": 0.19428571428571428, "grad_norm": 0.2113656848669052, "kl": 0.368560791015625, "learning_rate": 3.5556141303910795e-05, "loss": 0.1216, "reward": 0.583352442830801, "reward_std": 1.0455666035413742, "rewards/cosine_scaled_reward": -0.062490461859852076, "rewards/format_reward": 0.7083333358168602, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1610.5416717529297, "epoch": 0.19485714285714287, "grad_norm": 0.13782458007335663, "kl": 0.204315185546875, "learning_rate": 3.5273298394491515e-05, "loss": 0.0774, "reward": 0.5869630854576826, "reward_std": 0.42929424345493317, "rewards/cosine_scaled_reward": -0.1648518219590187, "rewards/format_reward": 0.9166666716337204, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 2041.5417175292969, "epoch": 0.19542857142857142, "grad_norm": 0.2814924716949463, "kl": 0.20819091796875, "learning_rate": 3.499141693667828e-05, "loss": 0.1013, "reward": 0.8442784734070301, "reward_std": 0.7920543104410172, "rewards/cosine_scaled_reward": 0.026305895298719406, "rewards/format_reward": 0.7916666716337204, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 2508.666748046875, "epoch": 0.196, "grad_norm": 0.24862973392009735, "kl": 0.31884765625, "learning_rate": 3.4710510668975624e-05, "loss": 0.0598, "reward": 0.3385371249169111, "reward_std": 0.6598921939730644, "rewards/cosine_scaled_reward": -0.2473981073126197, "rewards/format_reward": 0.8333333432674408, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2231.833396911621, "epoch": 0.19657142857142856, "grad_norm": 0.22046416997909546, "kl": 0.376220703125, "learning_rate": 3.443059328235878e-05, "loss": -0.0093, "reward": 1.2481789495795965, "reward_std": 0.9312818646430969, "rewards/cosine_scaled_reward": 0.18658944219350815, "rewards/format_reward": 0.875, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1617.5417022705078, "epoch": 0.19714285714285715, "grad_norm": 0.3377382159233093, "kl": 0.17791748046875, "learning_rate": 3.415167841960624e-05, "loss": 0.1082, "reward": 0.9055377095937729, "reward_std": 0.7036742344498634, "rewards/cosine_scaled_reward": 0.015268810093402863, "rewards/format_reward": 0.8750000149011612, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 3182.4584350585938, "epoch": 0.1977142857142857, "grad_norm": 0.24967238306999207, "kl": 0.5107421875, "learning_rate": 3.387377967463493e-05, "loss": 0.0664, "reward": 1.011141985654831, "reward_std": 0.9729516059160233, "rewards/cosine_scaled_reward": 0.10973765794187784, "rewards/format_reward": 0.7916666865348816, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2438.166748046875, "epoch": 0.1982857142857143, "grad_norm": 0.35565564036369324, "kl": 0.45068359375, "learning_rate": 3.359691059183761e-05, "loss": -0.0418, "reward": 0.5317260958254337, "reward_std": 0.744279682636261, "rewards/cosine_scaled_reward": -0.08830362930893898, "rewards/format_reward": 0.7083333432674408, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 2034.1250915527344, "epoch": 0.19885714285714284, "grad_norm": 0.5289275050163269, "kl": 0.27899169921875, "learning_rate": 3.3321084665422807e-05, "loss": -0.0943, "reward": 0.5254748985171318, "reward_std": 0.6338090598583221, "rewards/cosine_scaled_reward": -0.17476258054375648, "rewards/format_reward": 0.8750000149011612, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 3002.916748046875, "epoch": 0.19942857142857143, "grad_norm": 0.23819538950920105, "kl": 0.419189453125, "learning_rate": 3.304631533875703e-05, "loss": 0.0293, "reward": 0.3799813613295555, "reward_std": 0.8570699989795685, "rewards/cosine_scaled_reward": -0.16417600587010384, "rewards/format_reward": 0.708333358168602, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1831.2500305175781, "epoch": 0.2, "grad_norm": 0.12972407042980194, "kl": 0.311248779296875, "learning_rate": 3.2772616003709614e-05, "loss": 0.0679, "reward": 1.1549129895865917, "reward_std": 0.5767290014773607, "rewards/cosine_scaled_reward": 0.22328981384634972, "rewards/format_reward": 0.7083333358168602, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 2462.041717529297, "epoch": 0.20057142857142857, "grad_norm": 0.3680292069911957, "kl": 0.3271484375, "learning_rate": 3.250000000000001e-05, "loss": -0.0844, "reward": 0.6425135992467403, "reward_std": 0.7020438965409994, "rewards/cosine_scaled_reward": -0.13707654364407063, "rewards/format_reward": 0.9166666716337204, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2185.2084197998047, "epoch": 0.20114285714285715, "grad_norm": 0.2795995771884918, "kl": 0.379791259765625, "learning_rate": 3.222848061454764e-05, "loss": 0.1466, "reward": 1.2067798674106598, "reward_std": 0.8647864162921906, "rewards/cosine_scaled_reward": 0.22838991414755583, "rewards/format_reward": 0.7500000149011612, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 2554.1250610351562, "epoch": 0.2017142857142857, "grad_norm": 0.22771836817264557, "kl": 0.226806640625, "learning_rate": 3.195807108082429e-05, "loss": 0.0443, "reward": 0.6497006267309189, "reward_std": 0.7212181687355042, "rewards/cosine_scaled_reward": -0.1334830243140459, "rewards/format_reward": 0.9166666865348816, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 2555.833465576172, "epoch": 0.2022857142857143, "grad_norm": 0.27769502997398376, "kl": 0.439697265625, "learning_rate": 3.168878457820915e-05, "loss": 0.0557, "reward": 0.36892162170261145, "reward_std": 0.725491639226675, "rewards/cosine_scaled_reward": -0.16970586776733398, "rewards/format_reward": 0.708333358168602, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2070.666748046875, "epoch": 0.20285714285714285, "grad_norm": 0.17403961718082428, "kl": 0.232147216796875, "learning_rate": 3.1420634231346445e-05, "loss": 0.0253, "reward": 0.6142217069864273, "reward_std": 0.37743623182177544, "rewards/cosine_scaled_reward": -0.13038915395736694, "rewards/format_reward": 0.8750000149011612, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 2257.250030517578, "epoch": 0.20342857142857143, "grad_norm": 0.3374931812286377, "kl": 0.2642822265625, "learning_rate": 3.1153633109505784e-05, "loss": -0.1064, "reward": 1.8636004030704498, "reward_std": 0.7428180351853371, "rewards/cosine_scaled_reward": 0.4526335150003433, "rewards/format_reward": 0.9583333432674408, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 2539.291717529297, "epoch": 0.204, "grad_norm": 0.2252262383699417, "kl": 0.33685302734375, "learning_rate": 3.088779422594514e-05, "loss": 0.0651, "reward": 0.2009668005630374, "reward_std": 0.5703963618725538, "rewards/cosine_scaled_reward": -0.29534994065761566, "rewards/format_reward": 0.7916666865348816, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 2775.916748046875, "epoch": 0.20457142857142857, "grad_norm": 0.17728424072265625, "kl": 0.303619384765625, "learning_rate": 3.062313053727671e-05, "loss": 0.0615, "reward": 1.3498004898428917, "reward_std": 0.699789387639612, "rewards/cosine_scaled_reward": 0.19573353230953217, "rewards/format_reward": 0.9583333432674408, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1551.7500457763672, "epoch": 0.20514285714285715, "grad_norm": 0.23342669010162354, "kl": 0.131744384765625, "learning_rate": 3.0359654942835248e-05, "loss": 0.1226, "reward": 0.8648534566164017, "reward_std": 0.41398559510707855, "rewards/cosine_scaled_reward": -0.06757331639528275, "rewards/format_reward": 1.0, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 2911.0834350585938, "epoch": 0.2057142857142857, "grad_norm": 0.20153023302555084, "kl": 0.409423828125, "learning_rate": 3.0097380284049527e-05, "loss": 0.0591, "reward": 0.5484894886612892, "reward_std": 0.8311030864715576, "rewards/cosine_scaled_reward": -0.12158861197531223, "rewards/format_reward": 0.7916666716337204, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 2764.166748046875, "epoch": 0.2062857142857143, "grad_norm": 0.2474966049194336, "kl": 0.27166748046875, "learning_rate": 2.98363193438164e-05, "loss": 0.0633, "reward": 1.1824394315481186, "reward_std": 0.5549486838281155, "rewards/cosine_scaled_reward": 0.13288633804768324, "rewards/format_reward": 0.9166666865348816, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 2818.0416870117188, "epoch": 0.20685714285714285, "grad_norm": 0.27774032950401306, "kl": 0.40234375, "learning_rate": 2.9576484845877794e-05, "loss": 0.1074, "reward": 0.7827701717615128, "reward_std": 0.8135464563965797, "rewards/cosine_scaled_reward": 0.016385079594329, "rewards/format_reward": 0.7500000223517418, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1643.9583740234375, "epoch": 0.20742857142857143, "grad_norm": 0.21469183266162872, "kl": 0.1004638671875, "learning_rate": 2.931788945420058e-05, "loss": 0.0814, "reward": 1.152499184012413, "reward_std": 0.48752832412719727, "rewards/cosine_scaled_reward": 0.07624955475330353, "rewards/format_reward": 1.0, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 2685.2501068115234, "epoch": 0.208, "grad_norm": 0.228445902466774, "kl": 0.30401611328125, "learning_rate": 2.906054577235931e-05, "loss": 0.0817, "reward": 1.0254673808813095, "reward_std": 0.894573763012886, "rewards/cosine_scaled_reward": 0.09606703370809555, "rewards/format_reward": 0.833333358168602, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 2010.1666870117188, "epoch": 0.20857142857142857, "grad_norm": 0.2819450795650482, "kl": 0.1759033203125, "learning_rate": 2.880446634292199e-05, "loss": -0.0103, "reward": 0.446915403008461, "reward_std": 0.32444334402680397, "rewards/cosine_scaled_reward": -0.2765423096716404, "rewards/format_reward": 1.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 3228.9168090820312, "epoch": 0.20914285714285713, "grad_norm": 0.27710995078086853, "kl": 0.38671875, "learning_rate": 2.854966364683872e-05, "loss": -0.0126, "reward": 0.35312827420420945, "reward_std": 0.4395810291171074, "rewards/cosine_scaled_reward": -0.17760252580046654, "rewards/format_reward": 0.7083333507180214, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2320.5001068115234, "epoch": 0.20971428571428571, "grad_norm": 0.20172470808029175, "kl": 0.324066162109375, "learning_rate": 2.829615010283344e-05, "loss": 0.0833, "reward": 1.231175735592842, "reward_std": 1.0247896611690521, "rewards/cosine_scaled_reward": 0.2614212017506361, "rewards/format_reward": 0.7083333358168602, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 2057.875030517578, "epoch": 0.2102857142857143, "grad_norm": 0.3990497589111328, "kl": 0.275390625, "learning_rate": 2.8043938066798646e-05, "loss": 0.0937, "reward": 1.4740150086581707, "reward_std": 0.6022031959146261, "rewards/cosine_scaled_reward": 0.2995075099170208, "rewards/format_reward": 0.8750000149011612, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 2476.750045776367, "epoch": 0.21085714285714285, "grad_norm": 0.22831466794013977, "kl": 0.36370849609375, "learning_rate": 2.7793039831193136e-05, "loss": -0.0094, "reward": 0.7754583209753036, "reward_std": 0.6475037336349487, "rewards/cosine_scaled_reward": -0.008104167878627777, "rewards/format_reward": 0.791666679084301, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 2979.3334350585938, "epoch": 0.21142857142857144, "grad_norm": 0.2895065248012543, "kl": 0.324951171875, "learning_rate": 2.754346762444296e-05, "loss": 0.0505, "reward": 0.7149831403512508, "reward_std": 0.612546693533659, "rewards/cosine_scaled_reward": -0.08000845834612846, "rewards/format_reward": 0.8750000298023224, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1844.6666870117188, "epoch": 0.212, "grad_norm": 0.2284790277481079, "kl": 0.42205810546875, "learning_rate": 2.729523361034538e-05, "loss": 0.1355, "reward": 0.6728616314940155, "reward_std": 0.2811877764761448, "rewards/cosine_scaled_reward": 0.044764142483472824, "rewards/format_reward": 0.5833333358168602, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 2974.4583740234375, "epoch": 0.21257142857142858, "grad_norm": 0.3309856951236725, "kl": 0.2061767578125, "learning_rate": 2.7048349887476037e-05, "loss": 0.0268, "reward": 0.7733481526374817, "reward_std": 0.7097474634647369, "rewards/cosine_scaled_reward": -0.009159276261925697, "rewards/format_reward": 0.7916666716337204, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 2476.291748046875, "epoch": 0.21314285714285713, "grad_norm": 0.19564294815063477, "kl": 0.3516845703125, "learning_rate": 2.6802828488599297e-05, "loss": 0.0353, "reward": 1.6214977502822876, "reward_std": 0.9677339103072882, "rewards/cosine_scaled_reward": 0.3940822258591652, "rewards/format_reward": 0.8333333432674408, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 3004.2083740234375, "epoch": 0.21371428571428572, "grad_norm": 0.2780701518058777, "kl": 0.34326171875, "learning_rate": 2.6558681380081713e-05, "loss": 0.0384, "reward": 0.8554385527968407, "reward_std": 0.8751993477344513, "rewards/cosine_scaled_reward": 0.03188592568039894, "rewards/format_reward": 0.7916667014360428, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 2040.041748046875, "epoch": 0.21428571428571427, "grad_norm": 0.2669806480407715, "kl": 0.29327392578125, "learning_rate": 2.6315920461308964e-05, "loss": 0.039, "reward": 1.2006212025880814, "reward_std": 0.6928756944835186, "rewards/cosine_scaled_reward": 0.18364391289651394, "rewards/format_reward": 0.8333333432674408, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 2124.416717529297, "epoch": 0.21485714285714286, "grad_norm": 0.7000332474708557, "kl": 0.2510986328125, "learning_rate": 2.6074557564105727e-05, "loss": -0.1579, "reward": 1.4331469386816025, "reward_std": 0.778608538210392, "rewards/cosine_scaled_reward": 0.23740678373724222, "rewards/format_reward": 0.9583333432674408, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 2067.5834045410156, "epoch": 0.21542857142857144, "grad_norm": 0.19451673328876495, "kl": 0.312744140625, "learning_rate": 2.5834604452159112e-05, "loss": 0.1316, "reward": 0.983905091881752, "reward_std": 0.9569895938038826, "rewards/cosine_scaled_reward": 0.09611918777227402, "rewards/format_reward": 0.7916666716337204, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 2343.1250915527344, "epoch": 0.216, "grad_norm": 0.4488096535205841, "kl": 0.331787109375, "learning_rate": 2.5596072820445254e-05, "loss": 0.0512, "reward": 0.4893091805279255, "reward_std": 0.574474148452282, "rewards/cosine_scaled_reward": -0.15117877395823598, "rewards/format_reward": 0.7916666716337204, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 2728.0000915527344, "epoch": 0.21657142857142858, "grad_norm": 0.21385972201824188, "kl": 0.462890625, "learning_rate": 2.5358974294659375e-05, "loss": 0.0806, "reward": 0.12568058911710978, "reward_std": 0.8110382407903671, "rewards/cosine_scaled_reward": -0.2079930566251278, "rewards/format_reward": 0.5416666753590107, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 2148.8750915527344, "epoch": 0.21714285714285714, "grad_norm": 0.21321555972099304, "kl": 0.2987060546875, "learning_rate": 2.5123320430649133e-05, "loss": 0.0423, "reward": 1.0129856215789914, "reward_std": 0.6403012797236443, "rewards/cosine_scaled_reward": 0.08982611820101738, "rewards/format_reward": 0.8333333358168602, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1976.8750915527344, "epoch": 0.21771428571428572, "grad_norm": 0.2554902136325836, "kl": 0.26251220703125, "learning_rate": 2.4889122713851394e-05, "loss": 0.058, "reward": 0.7505160495638847, "reward_std": 0.803048387169838, "rewards/cosine_scaled_reward": 0.0002580210566520691, "rewards/format_reward": 0.7500000111758709, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 2567.9584197998047, "epoch": 0.21828571428571428, "grad_norm": 0.23561526834964752, "kl": 0.4429931640625, "learning_rate": 2.4656392558732464e-05, "loss": 0.1233, "reward": 0.5255677588284016, "reward_std": 0.7546349912881851, "rewards/cosine_scaled_reward": -0.02888280153274536, "rewards/format_reward": 0.5833333432674408, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1854.666748046875, "epoch": 0.21885714285714286, "grad_norm": 0.41661813855171204, "kl": 0.24273681640625, "learning_rate": 2.442514130823177e-05, "loss": 0.1697, "reward": 0.5279544293880463, "reward_std": 0.7327054888010025, "rewards/cosine_scaled_reward": -0.1526894560083747, "rewards/format_reward": 0.8333333432674408, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 2985.416748046875, "epoch": 0.21942857142857142, "grad_norm": 0.47805020213127136, "kl": 0.41455078125, "learning_rate": 2.4195380233209008e-05, "loss": 0.0078, "reward": 0.6774251461029053, "reward_std": 0.5871247202157974, "rewards/cosine_scaled_reward": -0.07795410230755806, "rewards/format_reward": 0.8333333730697632, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1744.916732788086, "epoch": 0.22, "grad_norm": 0.22037595510482788, "kl": 0.257568359375, "learning_rate": 2.396712053189486e-05, "loss": 0.1639, "reward": 0.9186278469860554, "reward_std": 0.33239728957414627, "rewards/cosine_scaled_reward": 0.021813903003931046, "rewards/format_reward": 0.8750000149011612, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 2539.1666870117188, "epoch": 0.22057142857142858, "grad_norm": 0.24289961159229279, "kl": 0.399169921875, "learning_rate": 2.374037332934512e-05, "loss": 0.0472, "reward": 0.2605516407638788, "reward_std": 0.49868740141391754, "rewards/cosine_scaled_reward": -0.20305750891566277, "rewards/format_reward": 0.666666679084301, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1674.8750228881836, "epoch": 0.22114285714285714, "grad_norm": 0.2652975022792816, "kl": 0.3177490234375, "learning_rate": 2.3515149676898555e-05, "loss": 0.0461, "reward": 1.29573517665267, "reward_std": 0.6526281535625458, "rewards/cosine_scaled_reward": 0.2520342655479908, "rewards/format_reward": 0.791666679084301, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1416.9166870117188, "epoch": 0.22171428571428572, "grad_norm": 0.22367416322231293, "kl": 0.12982177734375, "learning_rate": 2.329146055163824e-05, "loss": 0.0651, "reward": 1.4672380983829498, "reward_std": 0.9607173055410385, "rewards/cosine_scaled_reward": 0.27528570708818734, "rewards/format_reward": 0.9166666716337204, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1106.2500457763672, "epoch": 0.22228571428571428, "grad_norm": 0.16189609467983246, "kl": 0.0909576416015625, "learning_rate": 2.306931685585657e-05, "loss": 0.0612, "reward": 1.3787225484848022, "reward_std": 0.8911682516336441, "rewards/cosine_scaled_reward": 0.23102791607379913, "rewards/format_reward": 0.9166666716337204, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 2349.7500915527344, "epoch": 0.22285714285714286, "grad_norm": 0.24648283421993256, "kl": 0.2828369140625, "learning_rate": 2.284872941652386e-05, "loss": 0.0402, "reward": 0.3927423320710659, "reward_std": 0.49121467769145966, "rewards/cosine_scaled_reward": -0.26196216978132725, "rewards/format_reward": 0.9166666716337204, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 2174.7083740234375, "epoch": 0.22342857142857142, "grad_norm": 0.2675975263118744, "kl": 0.111419677734375, "learning_rate": 2.2629708984760708e-05, "loss": 0.0141, "reward": 0.9644523113965988, "reward_std": 0.7060663215816021, "rewards/cosine_scaled_reward": -0.017773881554603577, "rewards/format_reward": 1.0, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 2664.3334045410156, "epoch": 0.224, "grad_norm": 0.2830732762813568, "kl": 0.30059814453125, "learning_rate": 2.2412266235313975e-05, "loss": -0.0216, "reward": 0.530030932277441, "reward_std": 0.6014800369739532, "rewards/cosine_scaled_reward": -0.17248454061336815, "rewards/format_reward": 0.875, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 2592.7084197998047, "epoch": 0.22457142857142856, "grad_norm": 0.18396273255348206, "kl": 0.33929443359375, "learning_rate": 2.219641176603649e-05, "loss": 0.083, "reward": 0.20931893214583397, "reward_std": 0.35438287258148193, "rewards/cosine_scaled_reward": -0.29117387905716896, "rewards/format_reward": 0.7916666716337204, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1201.6667175292969, "epoch": 0.22514285714285714, "grad_norm": 0.24478864669799805, "kl": 0.10565185546875, "learning_rate": 2.198215609737056e-05, "loss": 0.026, "reward": 0.9410552708432078, "reward_std": 0.8450669944286346, "rewards/cosine_scaled_reward": -0.008639028761535883, "rewards/format_reward": 0.9583333432674408, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 2590.3750610351562, "epoch": 0.2257142857142857, "grad_norm": 0.24432548880577087, "kl": 0.191162109375, "learning_rate": 2.1769509671835224e-05, "loss": -0.0402, "reward": 1.1612588688731194, "reward_std": 0.8244020491838455, "rewards/cosine_scaled_reward": 0.08062941022217274, "rewards/format_reward": 1.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 2692.1250610351562, "epoch": 0.22628571428571428, "grad_norm": 0.2695596218109131, "kl": 0.24951171875, "learning_rate": 2.1558482853517257e-05, "loss": 0.0775, "reward": 0.7021965757012367, "reward_std": 0.6225183010101318, "rewards/cosine_scaled_reward": -0.06556838750839233, "rewards/format_reward": 0.8333333432674408, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 2493.2084197998047, "epoch": 0.22685714285714287, "grad_norm": 0.8182937502861023, "kl": 0.2171630859375, "learning_rate": 2.1349085927566073e-05, "loss": 0.0479, "reward": 1.2701049419119954, "reward_std": 0.675561960786581, "rewards/cosine_scaled_reward": 0.28088577929884195, "rewards/format_reward": 0.7083333358168602, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1911.6250305175781, "epoch": 0.22742857142857142, "grad_norm": 0.2450830191373825, "kl": 0.1436767578125, "learning_rate": 2.114132909969241e-05, "loss": -0.0067, "reward": 0.8501160591840744, "reward_std": 0.6082466319203377, "rewards/cosine_scaled_reward": -0.03327532671391964, "rewards/format_reward": 0.9166666865348816, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 2610.50008392334, "epoch": 0.228, "grad_norm": 0.3445512056350708, "kl": 0.45556640625, "learning_rate": 2.093522249567097e-05, "loss": 0.0639, "reward": 0.44864194467663765, "reward_std": 0.6613021939992905, "rewards/cosine_scaled_reward": -0.15067904442548752, "rewards/format_reward": 0.75, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 957.1666793823242, "epoch": 0.22857142857142856, "grad_norm": 0.19021636247634888, "kl": 0.042205810546875, "learning_rate": 2.0730776160846853e-05, "loss": 0.0533, "reward": 1.58597931265831, "reward_std": 0.6685393303632736, "rewards/cosine_scaled_reward": 0.2929896265268326, "rewards/format_reward": 1.0, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 1739.2500457763672, "epoch": 0.22914285714285715, "grad_norm": 0.2134786993265152, "kl": 0.13818359375, "learning_rate": 2.0528000059645997e-05, "loss": 0.0305, "reward": 0.6482241563498974, "reward_std": 0.4021666403859854, "rewards/cosine_scaled_reward": -0.15505461394786835, "rewards/format_reward": 0.9583333432674408, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2346.7084197998047, "epoch": 0.2297142857142857, "grad_norm": 0.2189859002828598, "kl": 0.27001953125, "learning_rate": 2.0326904075089492e-05, "loss": 0.0286, "reward": 0.9544563218951225, "reward_std": 0.9867835119366646, "rewards/cosine_scaled_reward": 0.03972811624407768, "rewards/format_reward": 0.8750000149011612, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1908.1250305175781, "epoch": 0.2302857142857143, "grad_norm": 0.2624204158782959, "kl": 0.134124755859375, "learning_rate": 2.0127498008311922e-05, "loss": 0.0638, "reward": 1.8549927771091461, "reward_std": 0.986580029129982, "rewards/cosine_scaled_reward": 0.44832973554730415, "rewards/format_reward": 0.9583333432674408, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 2161.166732788086, "epoch": 0.23085714285714284, "grad_norm": 0.25534167885780334, "kl": 0.202880859375, "learning_rate": 1.9929791578083658e-05, "loss": 0.0734, "reward": 0.6427417621016502, "reward_std": 0.7742570489645004, "rewards/cosine_scaled_reward": -0.11612912639975548, "rewards/format_reward": 0.875, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 2189.333396911621, "epoch": 0.23142857142857143, "grad_norm": 0.2308729588985443, "kl": 0.1011962890625, "learning_rate": 1.9733794420337214e-05, "loss": 0.0567, "reward": 1.7219876870512962, "reward_std": 0.4820314012467861, "rewards/cosine_scaled_reward": 0.40266046952456236, "rewards/format_reward": 0.9166666865348816, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 2541.2916870117188, "epoch": 0.232, "grad_norm": 0.33757075667381287, "kl": 0.1832275390625, "learning_rate": 1.9539516087697518e-05, "loss": 0.0846, "reward": 0.8487504161894321, "reward_std": 0.7889838367700577, "rewards/cosine_scaled_reward": 0.007708512246608734, "rewards/format_reward": 0.833333358168602, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1872.5000457763672, "epoch": 0.23257142857142857, "grad_norm": 0.1827789694070816, "kl": 0.206634521484375, "learning_rate": 1.9346966049016424e-05, "loss": 0.1478, "reward": 1.3279360756278038, "reward_std": 0.7395913898944855, "rewards/cosine_scaled_reward": 0.2056347131729126, "rewards/format_reward": 0.9166666716337204, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 1630.0834045410156, "epoch": 0.23314285714285715, "grad_norm": 0.2044801563024521, "kl": 0.2037353515625, "learning_rate": 1.915615368891117e-05, "loss": 0.064, "reward": 1.0793461948633194, "reward_std": 0.6011250354349613, "rewards/cosine_scaled_reward": 0.0813397541642189, "rewards/format_reward": 0.9166666716337204, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 2164.000030517578, "epoch": 0.2337142857142857, "grad_norm": 0.1898968368768692, "kl": 0.2315673828125, "learning_rate": 1.8967088307307003e-05, "loss": 0.0842, "reward": 0.5218268632888794, "reward_std": 0.8983562588691711, "rewards/cosine_scaled_reward": -0.13491991348564625, "rewards/format_reward": 0.7916666716337204, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 2933.916748046875, "epoch": 0.2342857142857143, "grad_norm": 0.18983033299446106, "kl": 0.38818359375, "learning_rate": 1.877977911898387e-05, "loss": 0.0487, "reward": 0.6665925020352006, "reward_std": 0.4128606617450714, "rewards/cosine_scaled_reward": 0.020796246826648712, "rewards/format_reward": 0.625, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 2553.1250610351562, "epoch": 0.23485714285714285, "grad_norm": 0.31600725650787354, "kl": 0.187255859375, "learning_rate": 1.8594235253127375e-05, "loss": 0.0502, "reward": 0.7985327839851379, "reward_std": 0.7022458389401436, "rewards/cosine_scaled_reward": -0.07990030199289322, "rewards/format_reward": 0.9583333432674408, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 2516.3334350585938, "epoch": 0.23542857142857143, "grad_norm": 0.48707082867622375, "kl": 0.3245849609375, "learning_rate": 1.8410465752883758e-05, "loss": -0.0511, "reward": 0.08879928779788315, "reward_std": 0.36526790633797646, "rewards/cosine_scaled_reward": -0.3306003734469414, "rewards/format_reward": 0.7500000074505806, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 2167.3751068115234, "epoch": 0.236, "grad_norm": 0.20420606434345245, "kl": 0.11083984375, "learning_rate": 1.822847957491922e-05, "loss": 0.028, "reward": 1.2763259708881378, "reward_std": 0.8057107403874397, "rewards/cosine_scaled_reward": 0.13816297985613346, "rewards/format_reward": 1.0, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 2270.7500915527344, "epoch": 0.23657142857142857, "grad_norm": 0.30266261100769043, "kl": 0.17315673828125, "learning_rate": 1.804828558898332e-05, "loss": -0.0628, "reward": 0.9188649505376816, "reward_std": 0.6045625507831573, "rewards/cosine_scaled_reward": 0.042765818536281586, "rewards/format_reward": 0.8333333432674408, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2395.0000762939453, "epoch": 0.23714285714285716, "grad_norm": 0.19137543439865112, "kl": 0.248870849609375, "learning_rate": 1.7869892577476724e-05, "loss": 0.0178, "reward": 0.5630744565278292, "reward_std": 0.5253113936632872, "rewards/cosine_scaled_reward": -0.13512944988906384, "rewards/format_reward": 0.8333333432674408, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 2648.6250610351562, "epoch": 0.2377142857142857, "grad_norm": 0.3866080641746521, "kl": 0.272674560546875, "learning_rate": 1.769330923502313e-05, "loss": 0.0603, "reward": 0.532728798687458, "reward_std": 0.5860278196632862, "rewards/cosine_scaled_reward": -0.17113562487065792, "rewards/format_reward": 0.8750000149011612, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1926.833396911621, "epoch": 0.2382857142857143, "grad_norm": 0.28291311860084534, "kl": 0.1495361328125, "learning_rate": 1.7518544168045525e-05, "loss": 0.2172, "reward": 0.8828543275594711, "reward_std": 0.4056566394865513, "rewards/cosine_scaled_reward": -0.016906175762414932, "rewards/format_reward": 0.9166666865348816, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 2516.6250610351562, "epoch": 0.23885714285714285, "grad_norm": 0.26306572556495667, "kl": 0.2476806640625, "learning_rate": 1.734560589434673e-05, "loss": 0.0767, "reward": 0.2381377201527357, "reward_std": 0.3217194452881813, "rewards/cosine_scaled_reward": -0.33926449716091156, "rewards/format_reward": 0.9166666865348816, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 2267.3750915527344, "epoch": 0.23942857142857144, "grad_norm": 0.16968634724617004, "kl": 0.29937744140625, "learning_rate": 1.7174502842694213e-05, "loss": -0.0294, "reward": 0.15903769060969353, "reward_std": 0.6953174918889999, "rewards/cosine_scaled_reward": -0.2954811677336693, "rewards/format_reward": 0.7500000074505806, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1356.208366394043, "epoch": 0.24, "grad_norm": 0.16105955839157104, "kl": 0.1409912109375, "learning_rate": 1.7005243352409334e-05, "loss": 0.0785, "reward": 1.5586809143424034, "reward_std": 0.6286041433922946, "rewards/cosine_scaled_reward": 0.3210071250796318, "rewards/format_reward": 0.9166666716337204, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 2299.0833740234375, "epoch": 0.24057142857142857, "grad_norm": 0.24983569979667664, "kl": 0.207275390625, "learning_rate": 1.6837835672960835e-05, "loss": 0.0504, "reward": 0.802627682685852, "reward_std": 0.7565935924649239, "rewards/cosine_scaled_reward": -0.057019513100385666, "rewards/format_reward": 0.9166666865348816, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 3433.166748046875, "epoch": 0.24114285714285713, "grad_norm": 0.26617732644081116, "kl": 0.4814453125, "learning_rate": 1.6672287963562855e-05, "loss": 0.029, "reward": 0.4502560719847679, "reward_std": 0.8340122252702713, "rewards/cosine_scaled_reward": -0.14987197145819664, "rewards/format_reward": 0.7500000149011612, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1953.916732788086, "epoch": 0.24171428571428571, "grad_norm": 0.23757286369800568, "kl": 0.172607421875, "learning_rate": 1.6508608292777204e-05, "loss": -0.0152, "reward": 0.899025060236454, "reward_std": 0.9901894629001617, "rewards/cosine_scaled_reward": -0.008820809423923492, "rewards/format_reward": 0.9166666865348816, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 2103.3334045410156, "epoch": 0.2422857142857143, "grad_norm": 0.7382405400276184, "kl": 0.1934814453125, "learning_rate": 1.63468046381201e-05, "loss": 0.0604, "reward": 0.9285087138414383, "reward_std": 0.8037227541208267, "rewards/cosine_scaled_reward": 0.026754358783364296, "rewards/format_reward": 0.8750000298023224, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 3309.3334350585938, "epoch": 0.24285714285714285, "grad_norm": 0.3841143250465393, "kl": 0.486083984375, "learning_rate": 1.6186884885673413e-05, "loss": 0.0301, "reward": 0.1549401180818677, "reward_std": 0.38981814309954643, "rewards/cosine_scaled_reward": -0.2766966111958027, "rewards/format_reward": 0.7083333507180214, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 2825.0000610351562, "epoch": 0.24342857142857144, "grad_norm": 0.22417770326137543, "kl": 0.32391357421875, "learning_rate": 1.602885682970026e-05, "loss": 0.0358, "reward": 0.6782936668023467, "reward_std": 0.5605045408010483, "rewards/cosine_scaled_reward": -0.03585319593548775, "rewards/format_reward": 0.7500000149011612, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 3096.291748046875, "epoch": 0.244, "grad_norm": 0.21399366855621338, "kl": 0.3536376953125, "learning_rate": 1.5872728172265147e-05, "loss": 0.0608, "reward": 0.7031284123659134, "reward_std": 0.7477234750986099, "rewards/cosine_scaled_reward": -0.04426916316151619, "rewards/format_reward": 0.7916666865348816, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1948.2916870117188, "epoch": 0.24457142857142858, "grad_norm": 0.27849727869033813, "kl": 0.208099365234375, "learning_rate": 1.5718506522858573e-05, "loss": 0.0685, "reward": 1.0386689975857735, "reward_std": 0.9889701455831528, "rewards/cosine_scaled_reward": 0.1026678173802793, "rewards/format_reward": 0.8333333432674408, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 2656.875030517578, "epoch": 0.24514285714285713, "grad_norm": 0.2026260942220688, "kl": 0.36163330078125, "learning_rate": 1.556619939802615e-05, "loss": 0.0058, "reward": 0.3719893768429756, "reward_std": 0.768018901348114, "rewards/cosine_scaled_reward": -0.1890053153038025, "rewards/format_reward": 0.7500000074505806, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 3101.416748046875, "epoch": 0.24571428571428572, "grad_norm": 0.3154773414134979, "kl": 0.255615234375, "learning_rate": 1.5415814221002267e-05, "loss": 0.0772, "reward": 0.6607606410980225, "reward_std": 0.9749536961317062, "rewards/cosine_scaled_reward": -0.10711969062685966, "rewards/format_reward": 0.8750000298023224, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 3123.3750610351562, "epoch": 0.24628571428571427, "grad_norm": 0.22464747726917267, "kl": 0.48779296875, "learning_rate": 1.526735832134829e-05, "loss": 0.088, "reward": 0.6580713596194983, "reward_std": 0.521630696952343, "rewards/cosine_scaled_reward": -0.06679766997694969, "rewards/format_reward": 0.7916666716337204, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 2067.7083892822266, "epoch": 0.24685714285714286, "grad_norm": 0.22717218101024628, "kl": 0.16131591796875, "learning_rate": 1.5120838934595339e-05, "loss": -0.0427, "reward": 0.5478418692946434, "reward_std": 0.8848011568188667, "rewards/cosine_scaled_reward": -0.1427457444369793, "rewards/format_reward": 0.833333358168602, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1642.3334045410156, "epoch": 0.24742857142857144, "grad_norm": 0.24554996192455292, "kl": 0.128936767578125, "learning_rate": 1.4976263201891614e-05, "loss": -0.0574, "reward": 1.2519729286432266, "reward_std": 0.6352694146335125, "rewards/cosine_scaled_reward": 0.1468198113143444, "rewards/format_reward": 0.9583333432674408, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 3017.0834350585938, "epoch": 0.248, "grad_norm": 0.23111708462238312, "kl": 0.3212890625, "learning_rate": 1.4833638169654352e-05, "loss": 0.0242, "reward": 0.4378589540719986, "reward_std": 0.47337810695171356, "rewards/cosine_scaled_reward": -0.1977371945977211, "rewards/format_reward": 0.8333333432674408, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 2320.666732788086, "epoch": 0.24857142857142858, "grad_norm": 0.18372684717178345, "kl": 0.31683349609375, "learning_rate": 1.469297078922642e-05, "loss": 0.122, "reward": 0.9090068517252803, "reward_std": 0.4904613792896271, "rewards/cosine_scaled_reward": 0.10033675655722618, "rewards/format_reward": 0.7083333507180214, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 2480.87508392334, "epoch": 0.24914285714285714, "grad_norm": 0.36884191632270813, "kl": 0.284912109375, "learning_rate": 1.4554267916537495e-05, "loss": 0.1089, "reward": 0.9375562369823456, "reward_std": 1.2034636735916138, "rewards/cosine_scaled_reward": 0.07294480130076408, "rewards/format_reward": 0.7916666865348816, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1246.0000381469727, "epoch": 0.24971428571428572, "grad_norm": 0.5064862966537476, "kl": 0.128814697265625, "learning_rate": 1.4417536311769886e-05, "loss": 0.0032, "reward": 1.3162876069545746, "reward_std": 0.5200040265917778, "rewards/cosine_scaled_reward": 0.199810478836298, "rewards/format_reward": 0.9166666716337204, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 2604.0834045410156, "epoch": 0.2502857142857143, "grad_norm": 0.15203051269054413, "kl": 0.22100830078125, "learning_rate": 1.428278263902913e-05, "loss": 0.0638, "reward": 0.6659852899610996, "reward_std": 0.7454147860407829, "rewards/cosine_scaled_reward": -0.10450738109648228, "rewards/format_reward": 0.8750000149011612, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1844.3750915527344, "epoch": 0.25085714285714283, "grad_norm": 0.16812385618686676, "kl": 0.21405029296875, "learning_rate": 1.4150013466019115e-05, "loss": 0.0654, "reward": 0.451425364241004, "reward_std": 0.5447552353143692, "rewards/cosine_scaled_reward": -0.21178732067346573, "rewards/format_reward": 0.8750000149011612, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1738.7500305175781, "epoch": 0.25142857142857145, "grad_norm": 0.25210797786712646, "kl": 0.2158203125, "learning_rate": 1.4019235263722036e-05, "loss": -0.0197, "reward": 0.3830821365118027, "reward_std": 0.4169432930648327, "rewards/cosine_scaled_reward": -0.24595895409584045, "rewards/format_reward": 0.8750000149011612, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 2879.2501220703125, "epoch": 0.252, "grad_norm": 0.4387795925140381, "kl": 0.404296875, "learning_rate": 1.389045440608296e-05, "loss": 0.0095, "reward": 0.17910403199493885, "reward_std": 0.450714360922575, "rewards/cosine_scaled_reward": -0.28544800728559494, "rewards/format_reward": 0.7500000149011612, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 2282.2501068115234, "epoch": 0.25257142857142856, "grad_norm": 0.2824661433696747, "kl": 0.2359619140625, "learning_rate": 1.3763677169699218e-05, "loss": 0.016, "reward": 1.4356295093894005, "reward_std": 0.3898382596671581, "rewards/cosine_scaled_reward": 0.28031472861766815, "rewards/format_reward": 0.875, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 2792.0416870117188, "epoch": 0.25314285714285717, "grad_norm": 0.2786317765712738, "kl": 0.376953125, "learning_rate": 1.3638909733514454e-05, "loss": -0.0293, "reward": 0.19937769044190645, "reward_std": 0.5115105733275414, "rewards/cosine_scaled_reward": -0.2336445041000843, "rewards/format_reward": 0.6666666828095913, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 2881.7500610351562, "epoch": 0.2537142857142857, "grad_norm": 0.19312676787376404, "kl": 0.377685546875, "learning_rate": 1.3516158178517482e-05, "loss": 0.0486, "reward": 0.3308720774948597, "reward_std": 0.581496886909008, "rewards/cosine_scaled_reward": -0.20956396497786045, "rewards/format_reward": 0.7500000149011612, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 2190.1251068115234, "epoch": 0.2542857142857143, "grad_norm": 0.3452683091163635, "kl": 0.161651611328125, "learning_rate": 1.3395428487445916e-05, "loss": -0.0681, "reward": 1.0236865878105164, "reward_std": 0.8273026198148727, "rewards/cosine_scaled_reward": 0.07434329763054848, "rewards/format_reward": 0.875, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1811.2500305175781, "epoch": 0.25485714285714284, "grad_norm": 0.2189359962940216, "kl": 0.183319091796875, "learning_rate": 1.3276726544494572e-05, "loss": 0.0868, "reward": 0.420175077393651, "reward_std": 0.2947616521269083, "rewards/cosine_scaled_reward": -0.24824582412838936, "rewards/format_reward": 0.9166666865348816, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 2670.6250610351562, "epoch": 0.25542857142857145, "grad_norm": 0.21349620819091797, "kl": 0.27740478515625, "learning_rate": 1.3160058135028691e-05, "loss": 0.0593, "reward": 0.64484803378582, "reward_std": 0.8606602028012276, "rewards/cosine_scaled_reward": -0.11507599544711411, "rewards/format_reward": 0.8750000149011612, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 2424.8334350585938, "epoch": 0.256, "grad_norm": 0.23509125411510468, "kl": 0.2618408203125, "learning_rate": 1.3045428945301954e-05, "loss": 0.0366, "reward": 1.0526692494750023, "reward_std": 0.46373073756694794, "rewards/cosine_scaled_reward": 0.13050128147006035, "rewards/format_reward": 0.7916666865348816, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 2667.2084350585938, "epoch": 0.25657142857142856, "grad_norm": 0.39572006464004517, "kl": 0.258056640625, "learning_rate": 1.2932844562179353e-05, "loss": 0.0822, "reward": 1.0360109135508537, "reward_std": 0.7895313426852226, "rewards/cosine_scaled_reward": 0.18467211723327637, "rewards/format_reward": 0.6666666828095913, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1222.3750762939453, "epoch": 0.2571428571428571, "grad_norm": 0.18777267634868622, "kl": 0.132415771484375, "learning_rate": 1.2822310472864884e-05, "loss": 0.1176, "reward": 1.2537522641941905, "reward_std": 0.40848940052092075, "rewards/cosine_scaled_reward": 0.1893761195242405, "rewards/format_reward": 0.875, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 2296.0000915527344, "epoch": 0.25771428571428573, "grad_norm": 0.3156144618988037, "kl": 0.206787109375, "learning_rate": 1.2713832064634126e-05, "loss": 0.1219, "reward": 0.5749104283750057, "reward_std": 0.6167091354727745, "rewards/cosine_scaled_reward": -0.21254480443894863, "rewards/format_reward": 1.0, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1653.2500686645508, "epoch": 0.2582857142857143, "grad_norm": 0.4651844799518585, "kl": 0.131317138671875, "learning_rate": 1.260741462457165e-05, "loss": 0.133, "reward": 1.0920193493366241, "reward_std": 0.3826703876256943, "rewards/cosine_scaled_reward": 0.04600968584418297, "rewards/format_reward": 1.0, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 2377.666717529297, "epoch": 0.25885714285714284, "grad_norm": 0.2226954698562622, "kl": 0.1348876953125, "learning_rate": 1.2503063339313356e-05, "loss": 0.0658, "reward": 0.6897746287286282, "reward_std": 0.6120940484106541, "rewards/cosine_scaled_reward": -0.07177936565130949, "rewards/format_reward": 0.8333333358168602, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 2628.2500610351562, "epoch": 0.25942857142857145, "grad_norm": 0.19401276111602783, "kl": 0.25439453125, "learning_rate": 1.240078329479367e-05, "loss": 0.0921, "reward": 1.017133679240942, "reward_std": 0.8067431151866913, "rewards/cosine_scaled_reward": 0.1335668321698904, "rewards/format_reward": 0.7500000149011612, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 2419.7083740234375, "epoch": 0.26, "grad_norm": 0.4960789680480957, "kl": 0.21307373046875, "learning_rate": 1.2300579475997657e-05, "loss": 0.1634, "reward": 0.6709480434656143, "reward_std": 0.7460784912109375, "rewards/cosine_scaled_reward": -0.08119264617562294, "rewards/format_reward": 0.833333358168602, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 3323.3751220703125, "epoch": 0.26057142857142856, "grad_norm": 0.33247068524360657, "kl": 0.4267578125, "learning_rate": 1.2202456766718093e-05, "loss": 0.0369, "reward": 0.37625838071107864, "reward_std": 0.7378395646810532, "rewards/cosine_scaled_reward": -0.16603748872876167, "rewards/format_reward": 0.7083333507180214, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2653.0833740234375, "epoch": 0.2611428571428571, "grad_norm": 0.27788954973220825, "kl": 0.3778076171875, "learning_rate": 1.210641994931739e-05, "loss": 0.0299, "reward": 0.7535701543092728, "reward_std": 1.0977338403463364, "rewards/cosine_scaled_reward": 0.0017850752919912338, "rewards/format_reward": 0.7500000149011612, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 2128.0000610351562, "epoch": 0.26171428571428573, "grad_norm": 0.23413938283920288, "kl": 0.14739990234375, "learning_rate": 1.2012473704494538e-05, "loss": 0.0325, "reward": 0.5362795293331146, "reward_std": 0.3089091796427965, "rewards/cosine_scaled_reward": -0.21102692931890488, "rewards/format_reward": 0.9583333432674408, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 2767.3750610351562, "epoch": 0.2622857142857143, "grad_norm": 0.22002078592777252, "kl": 0.2928466796875, "learning_rate": 1.1920622611056975e-05, "loss": 0.0267, "reward": 1.019486054778099, "reward_std": 0.9020187258720398, "rewards/cosine_scaled_reward": 0.05140970088541508, "rewards/format_reward": 0.9166666716337204, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2332.416778564453, "epoch": 0.26285714285714284, "grad_norm": 0.33746153116226196, "kl": 0.23980712890625, "learning_rate": 1.1830871145697413e-05, "loss": 0.1162, "reward": 1.126957267522812, "reward_std": 0.5669834688305855, "rewards/cosine_scaled_reward": 0.08431193931028247, "rewards/format_reward": 0.9583333432674408, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 2162.8333740234375, "epoch": 0.2634285714285714, "grad_norm": 0.5436040759086609, "kl": 0.21539306640625, "learning_rate": 1.174322368277565e-05, "loss": 0.1728, "reward": 0.7292355694808066, "reward_std": 0.5752151645720005, "rewards/cosine_scaled_reward": -0.09371556341648102, "rewards/format_reward": 0.9166666716337204, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 3241.666748046875, "epoch": 0.264, "grad_norm": 0.27894335985183716, "kl": 0.4532470703125, "learning_rate": 1.1657684494105387e-05, "loss": 0.0877, "reward": 0.5649700947105885, "reward_std": 0.8659732490777969, "rewards/cosine_scaled_reward": -0.05084826331585646, "rewards/format_reward": 0.6666666865348816, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 3078.8750610351562, "epoch": 0.26457142857142857, "grad_norm": 0.2687225043773651, "kl": 0.2301025390625, "learning_rate": 1.1574257748745986e-05, "loss": 0.0225, "reward": 1.2971481904387474, "reward_std": 1.1507280617952347, "rewards/cosine_scaled_reward": 0.2110740765929222, "rewards/format_reward": 0.8750000298023224, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 2286.7500915527344, "epoch": 0.2651428571428571, "grad_norm": 0.17512497305870056, "kl": 0.220703125, "learning_rate": 1.149294751279933e-05, "loss": -0.0102, "reward": 0.6485597789287567, "reward_std": 0.6553373290225863, "rewards/cosine_scaled_reward": -0.11322011891752481, "rewards/format_reward": 0.8750000149011612, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 2979.9584350585938, "epoch": 0.26571428571428574, "grad_norm": 0.2867068648338318, "kl": 0.391845703125, "learning_rate": 1.1413757749211602e-05, "loss": 0.0628, "reward": 0.7631555460393429, "reward_std": 0.9646207839250565, "rewards/cosine_scaled_reward": 0.027411112561821938, "rewards/format_reward": 0.7083333432674408, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 2141.0834350585938, "epoch": 0.2662857142857143, "grad_norm": 0.3626898527145386, "kl": 0.17266845703125, "learning_rate": 1.133669231758016e-05, "loss": 0.0811, "reward": 1.0105885118246078, "reward_std": 0.7124912440776825, "rewards/cosine_scaled_reward": 0.026127580553293228, "rewards/format_reward": 0.9583333432674408, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 2783.666748046875, "epoch": 0.26685714285714285, "grad_norm": 0.4113839566707611, "kl": 0.309814453125, "learning_rate": 1.1261754973965422e-05, "loss": -0.0117, "reward": 0.3410836011171341, "reward_std": 0.5675521939992905, "rewards/cosine_scaled_reward": -0.22529152780771255, "rewards/format_reward": 0.7916666679084301, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 2199.7084045410156, "epoch": 0.2674285714285714, "grad_norm": 0.2993246614933014, "kl": 0.20440673828125, "learning_rate": 1.1188949370707787e-05, "loss": 0.0531, "reward": 0.8006970658898354, "reward_std": 0.7448753714561462, "rewards/cosine_scaled_reward": -0.057984789134934545, "rewards/format_reward": 0.9166666865348816, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 2270.166717529297, "epoch": 0.268, "grad_norm": 0.2324548065662384, "kl": 0.23199462890625, "learning_rate": 1.1118279056249655e-05, "loss": -0.0361, "reward": 0.4024841138161719, "reward_std": 0.4483271986246109, "rewards/cosine_scaled_reward": -0.23625795356929302, "rewards/format_reward": 0.875, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 1424.6666946411133, "epoch": 0.26857142857142857, "grad_norm": 0.21465709805488586, "kl": 0.157196044921875, "learning_rate": 1.1049747474962445e-05, "loss": 0.1015, "reward": 1.568716924637556, "reward_std": 0.24283705279231071, "rewards/cosine_scaled_reward": 0.32602518051862717, "rewards/format_reward": 0.9166666865348816, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 2525.666732788086, "epoch": 0.26914285714285713, "grad_norm": 0.22672057151794434, "kl": 0.36737060546875, "learning_rate": 1.0983357966978745e-05, "loss": 0.0491, "reward": 0.396379379555583, "reward_std": 0.629085049033165, "rewards/cosine_scaled_reward": -0.21847698092460632, "rewards/format_reward": 0.8333333432674408, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 2505.666748046875, "epoch": 0.26971428571428574, "grad_norm": 0.2537076771259308, "kl": 0.24371337890625, "learning_rate": 1.0919113768029518e-05, "loss": -0.0105, "reward": 0.5954728499054909, "reward_std": 0.3647001665085554, "rewards/cosine_scaled_reward": -0.16059689968824387, "rewards/format_reward": 0.9166666716337204, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 2161.3334350585938, "epoch": 0.2702857142857143, "grad_norm": 0.14275974035263062, "kl": 0.173126220703125, "learning_rate": 1.0857018009286382e-05, "loss": 0.0868, "reward": 0.9977487437427044, "reward_std": 0.6313546672463417, "rewards/cosine_scaled_reward": 0.06137435883283615, "rewards/format_reward": 0.875, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2200.2916870117188, "epoch": 0.27085714285714285, "grad_norm": 0.2012917548418045, "kl": 0.238983154296875, "learning_rate": 1.0797073717209014e-05, "loss": 0.0855, "reward": 0.5041792467236519, "reward_std": 0.5631431899964809, "rewards/cosine_scaled_reward": -0.16457706969231367, "rewards/format_reward": 0.833333358168602, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 2275.916732788086, "epoch": 0.2714285714285714, "grad_norm": 0.39984893798828125, "kl": 0.262451171875, "learning_rate": 1.0739283813397639e-05, "loss": 0.1526, "reward": 1.0086217746138573, "reward_std": 1.0287954062223434, "rewards/cosine_scaled_reward": 0.08764421939849854, "rewards/format_reward": 0.833333358168602, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 2965.416748046875, "epoch": 0.272, "grad_norm": 0.24678374826908112, "kl": 0.3681640625, "learning_rate": 1.0683651114450641e-05, "loss": 0.0372, "reward": 0.4287208868190646, "reward_std": 0.7793765217065811, "rewards/cosine_scaled_reward": -0.20230622496455908, "rewards/format_reward": 0.833333358168602, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 2816.041748046875, "epoch": 0.2725714285714286, "grad_norm": 0.23802922666072845, "kl": 0.2159423828125, "learning_rate": 1.0630178331827282e-05, "loss": 0.0086, "reward": 0.5022896975278854, "reward_std": 0.682855635881424, "rewards/cosine_scaled_reward": -0.14468851312994957, "rewards/format_reward": 0.7916666865348816, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 3076.6251220703125, "epoch": 0.27314285714285713, "grad_norm": 0.48291072249412537, "kl": 0.40087890625, "learning_rate": 1.0578868071715544e-05, "loss": 0.0037, "reward": 0.7402574494481087, "reward_std": 0.9319310411810875, "rewards/cosine_scaled_reward": -0.004871279001235962, "rewards/format_reward": 0.7500000149011612, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 2252.5417098999023, "epoch": 0.2737142857142857, "grad_norm": 0.42745697498321533, "kl": 0.185760498046875, "learning_rate": 1.0529722834905126e-05, "loss": 0.059, "reward": 1.4727585576474667, "reward_std": 0.44060441479086876, "rewards/cosine_scaled_reward": 0.3197126239538193, "rewards/format_reward": 0.8333333432674408, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 3045.291748046875, "epoch": 0.2742857142857143, "grad_norm": 0.2685858905315399, "kl": 0.316650390625, "learning_rate": 1.0482745016665526e-05, "loss": 0.0343, "reward": 1.3959271758794785, "reward_std": 0.8586903437972069, "rewards/cosine_scaled_reward": 0.21879691816866398, "rewards/format_reward": 0.9583333432674408, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 2271.4167098999023, "epoch": 0.27485714285714286, "grad_norm": 0.26521894335746765, "kl": 0.2750244140625, "learning_rate": 1.0437936906629336e-05, "loss": 0.0158, "reward": 0.7311921007931232, "reward_std": 0.8896900117397308, "rewards/cosine_scaled_reward": -0.05107062542811036, "rewards/format_reward": 0.8333333432674408, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 2437.5000610351562, "epoch": 0.2754285714285714, "grad_norm": 0.19190503656864166, "kl": 0.275604248046875, "learning_rate": 1.0395300688680626e-05, "loss": 0.045, "reward": 0.9152816236019135, "reward_std": 0.8774067535996437, "rewards/cosine_scaled_reward": 0.061807457357645035, "rewards/format_reward": 0.7916666865348816, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 1939.6250610351562, "epoch": 0.276, "grad_norm": 0.29819318652153015, "kl": 0.2987060546875, "learning_rate": 1.0354838440848503e-05, "loss": -0.0064, "reward": 0.9426087737083435, "reward_std": 0.949207216501236, "rewards/cosine_scaled_reward": 0.07547104358673096, "rewards/format_reward": 0.7916666865348816, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 2167.9583892822266, "epoch": 0.2765714285714286, "grad_norm": 0.37711989879608154, "kl": 0.29034423828125, "learning_rate": 1.0316552135205838e-05, "loss": 0.0293, "reward": 0.6746074706315994, "reward_std": 0.6280665933154523, "rewards/cosine_scaled_reward": -0.07936295960098505, "rewards/format_reward": 0.8333333432674408, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 2680.0000610351562, "epoch": 0.27714285714285714, "grad_norm": 0.35504573583602905, "kl": 0.446533203125, "learning_rate": 1.0280443637773165e-05, "loss": 0.0954, "reward": 0.5523308105766773, "reward_std": 0.5918973311781883, "rewards/cosine_scaled_reward": -0.05716794729232788, "rewards/format_reward": 0.6666666865348816, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2262.5416870117188, "epoch": 0.2777142857142857, "grad_norm": 0.18296080827713013, "kl": 0.1949462890625, "learning_rate": 1.0246514708427702e-05, "loss": 0.0443, "reward": 0.9830817077308893, "reward_std": 0.859332574531436, "rewards/cosine_scaled_reward": 0.05404083710163832, "rewards/format_reward": 0.8750000149011612, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1231.5833587646484, "epoch": 0.2782857142857143, "grad_norm": 0.18730592727661133, "kl": 0.1104736328125, "learning_rate": 1.0214767000817597e-05, "loss": 0.0454, "reward": 1.0873636417090893, "reward_std": 0.7174102757126093, "rewards/cosine_scaled_reward": 0.06451515853404999, "rewards/format_reward": 0.9583333432674408, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 3112.8750610351562, "epoch": 0.27885714285714286, "grad_norm": 0.17851535975933075, "kl": 0.427734375, "learning_rate": 1.0185202062281336e-05, "loss": 0.0646, "reward": 0.607562929391861, "reward_std": 0.9592312499880791, "rewards/cosine_scaled_reward": -0.008718553930521011, "rewards/format_reward": 0.6250000260770321, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 2498.3333435058594, "epoch": 0.2794285714285714, "grad_norm": 0.3319559395313263, "kl": 0.241943359375, "learning_rate": 1.0157821333772305e-05, "loss": -0.0705, "reward": 0.541482325643301, "reward_std": 0.6799080520868301, "rewards/cosine_scaled_reward": -0.16675885394215584, "rewards/format_reward": 0.8750000149011612, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1838.8750915527344, "epoch": 0.28, "grad_norm": 0.2813575565814972, "kl": 0.18414306640625, "learning_rate": 1.0132626149788591e-05, "loss": -0.0205, "reward": 0.9313252754509449, "reward_std": 0.4187740869820118, "rewards/cosine_scaled_reward": -0.013504065573215485, "rewards/format_reward": 0.9583333432674408, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1959.1250762939453, "epoch": 0.2805714285714286, "grad_norm": 1.0054746866226196, "kl": 0.123046875, "learning_rate": 1.0109617738307912e-05, "loss": 0.3442, "reward": 1.3521376699209213, "reward_std": 0.4419573098421097, "rewards/cosine_scaled_reward": 0.17606881260871887, "rewards/format_reward": 1.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 2286.2084350585938, "epoch": 0.28114285714285714, "grad_norm": 0.35966256260871887, "kl": 0.2607421875, "learning_rate": 1.008879722072778e-05, "loss": 0.0295, "reward": 0.5943646021187305, "reward_std": 0.751454122364521, "rewards/cosine_scaled_reward": -0.07781771570444107, "rewards/format_reward": 0.7500000111758709, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 2124.9583587646484, "epoch": 0.2817142857142857, "grad_norm": 0.27436164021492004, "kl": 0.241302490234375, "learning_rate": 1.0070165611810856e-05, "loss": -0.019, "reward": 1.310558546334505, "reward_std": 1.1386958360671997, "rewards/cosine_scaled_reward": 0.2386126071214676, "rewards/format_reward": 0.8333333432674408, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1135.4166870117188, "epoch": 0.2822857142857143, "grad_norm": 0.217626690864563, "kl": 0.07623291015625, "learning_rate": 1.0053723819635471e-05, "loss": 0.0985, "reward": 1.053057461977005, "reward_std": 0.5034293830394745, "rewards/cosine_scaled_reward": 0.0473620742559433, "rewards/format_reward": 0.9583333432674408, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 2913.5000610351562, "epoch": 0.28285714285714286, "grad_norm": 0.16135992109775543, "kl": 0.2882080078125, "learning_rate": 1.0039472645551373e-05, "loss": 0.0153, "reward": 0.7073401757515967, "reward_std": 0.8274732977151871, "rewards/cosine_scaled_reward": -0.06299658864736557, "rewards/format_reward": 0.833333358168602, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1558.0834045410156, "epoch": 0.2834285714285714, "grad_norm": 0.17040328681468964, "kl": 0.181396484375, "learning_rate": 1.0027412784140691e-05, "loss": 0.0125, "reward": 1.1920581981539726, "reward_std": 0.9239856712520123, "rewards/cosine_scaled_reward": 0.13769576186314225, "rewards/format_reward": 0.9166666716337204, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 2690.166748046875, "epoch": 0.284, "grad_norm": 0.2827388346195221, "kl": 0.3397216796875, "learning_rate": 1.0017544823184056e-05, "loss": 0.0234, "reward": 0.4144871234893799, "reward_std": 0.8535003513097763, "rewards/cosine_scaled_reward": -0.16775644943118095, "rewards/format_reward": 0.7500000149011612, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 2351.541717529297, "epoch": 0.2845714285714286, "grad_norm": 0.233955517411232, "kl": 0.33514404296875, "learning_rate": 1.0009869243631953e-05, "loss": 0.0613, "reward": 1.0902096033096313, "reward_std": 1.2109316736459732, "rewards/cosine_scaled_reward": 0.17010477557778358, "rewards/format_reward": 0.7500000149011612, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 2419.2084350585938, "epoch": 0.28514285714285714, "grad_norm": 0.31629398465156555, "kl": 0.232330322265625, "learning_rate": 1.000438641958131e-05, "loss": 0.1232, "reward": 1.0896388813853264, "reward_std": 1.0943061411380768, "rewards/cosine_scaled_reward": 0.1073194369673729, "rewards/format_reward": 0.8750000149011612, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1832.9167022705078, "epoch": 0.2857142857142857, "grad_norm": 0.27818962931632996, "kl": 0.27166748046875, "learning_rate": 1.0001096618257236e-05, "loss": 0.0069, "reward": 0.625803031027317, "reward_std": 0.7328432351350784, "rewards/cosine_scaled_reward": -0.08293185429647565, "rewards/format_reward": 0.7916666679084301, "step": 500 }, { "epoch": 0.2857142857142857, "step": 500, "total_flos": 0.0, "train_loss": 0.051498750954982825, "train_runtime": 24099.439, "train_samples_per_second": 0.498, "train_steps_per_second": 0.021 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }