{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2857142857142857, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 3068.5000610351562, "epoch": 0.0005714285714285715, "grad_norm": 0.002309690695255995, "kl": 6.03795051574707e-05, "learning_rate": 0.0, "loss": -0.0242, "reward": 0.200983926653862, "reward_std": 0.24425111338496208, "rewards/cosine_scaled_reward": -0.0453413650393486, "rewards/format_reward": 0.2916666679084301, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2642.7083435058594, "epoch": 0.001142857142857143, "grad_norm": 0.002800055081024766, "kl": 3.174692392349243e-05, "learning_rate": 2e-08, "loss": 0.0082, "reward": -0.12178973853588104, "reward_std": 0.1931382417678833, "rewards/cosine_scaled_reward": -0.31089488230645657, "rewards/format_reward": 0.5, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2316.041732788086, "epoch": 0.0017142857142857142, "grad_norm": 0.005990335252135992, "kl": 3.5703182220458984e-05, "learning_rate": 4e-08, "loss": 0.1839, "reward": 0.9445240348577499, "reward_std": 0.8087804093956947, "rewards/cosine_scaled_reward": 0.15976200997829437, "rewards/format_reward": 0.6250000111758709, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2640.166748046875, "epoch": 0.002285714285714286, "grad_norm": 0.0028529397677630186, "kl": 8.153915405273438e-05, "learning_rate": 6e-08, "loss": -0.0074, "reward": 0.2509598582983017, "reward_std": 0.6914642155170441, "rewards/cosine_scaled_reward": -0.14535339921712875, "rewards/format_reward": 0.541666679084301, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2986.6666870117188, "epoch": 0.002857142857142857, "grad_norm": 0.0053066289983689785, "kl": 6.085634231567383e-05, "learning_rate": 8e-08, "loss": -0.1281, "reward": -0.3166642114520073, "reward_std": 0.23417249508202076, "rewards/cosine_scaled_reward": -0.28333210945129395, "rewards/format_reward": 0.25, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 2739.7083740234375, "epoch": 0.0034285714285714284, "grad_norm": 0.002914381679147482, "kl": 5.0008296966552734e-05, "learning_rate": 1e-07, "loss": 0.085, "reward": -0.19622442871332169, "reward_std": 0.22156326659023762, "rewards/cosine_scaled_reward": -0.24394555389881134, "rewards/format_reward": 0.2916666679084301, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 3269.916748046875, "epoch": 0.004, "grad_norm": 0.0024341391399502754, "kl": 4.7534704208374023e-05, "learning_rate": 1.2e-07, "loss": 0.0508, "reward": -0.08005068078637123, "reward_std": 0.6786237582564354, "rewards/cosine_scaled_reward": -0.16502534411847591, "rewards/format_reward": 0.2500000074505806, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 2942.7083740234375, "epoch": 0.004571428571428572, "grad_norm": 0.0027040301356464624, "kl": 4.982948303222656e-05, "learning_rate": 1.4e-07, "loss": 0.0469, "reward": -0.5336742624640465, "reward_std": 0.1747309099882841, "rewards/cosine_scaled_reward": -0.41267047822475433, "rewards/format_reward": 0.2916666679084301, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2260.8333435058594, "epoch": 0.005142857142857143, "grad_norm": 0.0029155886732041836, "kl": 4.762411117553711e-05, "learning_rate": 1.6e-07, "loss": 0.0213, "reward": 0.25530365481972694, "reward_std": 0.3043131195008755, "rewards/cosine_scaled_reward": -0.12234819680452347, "rewards/format_reward": 0.5, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 2507.8333740234375, "epoch": 0.005714285714285714, "grad_norm": 0.004195820074528456, "kl": 4.690885543823242e-05, "learning_rate": 1.8e-07, "loss": 0.1213, "reward": 0.4200296476483345, "reward_std": 0.6165902353823185, "rewards/cosine_scaled_reward": -0.03998521342873573, "rewards/format_reward": 0.5000000074505806, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2455.2083587646484, "epoch": 0.006285714285714286, "grad_norm": 0.006622746586799622, "kl": 6.347894668579102e-05, "learning_rate": 2e-07, "loss": 0.166, "reward": 0.05761030316352844, "reward_std": 0.7302033603191376, "rewards/cosine_scaled_reward": -0.17952818423509598, "rewards/format_reward": 0.4166666679084301, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2738.916717529297, "epoch": 0.006857142857142857, "grad_norm": 0.002917631296440959, "kl": 5.716085433959961e-05, "learning_rate": 2.1999999999999998e-07, "loss": -0.0065, "reward": 0.3864743858575821, "reward_std": 0.814457044005394, "rewards/cosine_scaled_reward": -0.07759615452960134, "rewards/format_reward": 0.5416666753590107, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 2718.6666870117188, "epoch": 0.0074285714285714285, "grad_norm": 0.003016383619979024, "kl": 2.8625130653381348e-05, "learning_rate": 2.4e-07, "loss": 0.1066, "reward": 0.4783855639398098, "reward_std": 0.49341488629579544, "rewards/cosine_scaled_reward": 0.0100261140614748, "rewards/format_reward": 0.4583333432674408, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2307.9166717529297, "epoch": 0.008, "grad_norm": 0.0034118546172976494, "kl": 4.00543212890625e-05, "learning_rate": 2.6e-07, "loss": 0.1058, "reward": 0.9889725893735886, "reward_std": 0.6486281417310238, "rewards/cosine_scaled_reward": 0.18198630958795547, "rewards/format_reward": 0.6250000111758709, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2213.4583740234375, "epoch": 0.008571428571428572, "grad_norm": 0.007290677633136511, "kl": 3.065168857574463e-05, "learning_rate": 2.8e-07, "loss": 0.0989, "reward": 0.12133636325597763, "reward_std": 0.508549964055419, "rewards/cosine_scaled_reward": -0.2518318174406886, "rewards/format_reward": 0.625, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 3010.8333740234375, "epoch": 0.009142857142857144, "grad_norm": 0.002742619952186942, "kl": 7.230043411254883e-05, "learning_rate": 3e-07, "loss": 0.0118, "reward": 0.06766366213560104, "reward_std": 0.7642434202134609, "rewards/cosine_scaled_reward": -0.1120015112683177, "rewards/format_reward": 0.291666679084301, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 2615.5416870117188, "epoch": 0.009714285714285713, "grad_norm": 0.012783374637365341, "kl": 6.562471389770508e-05, "learning_rate": 3.2e-07, "loss": 0.3258, "reward": -0.00655742920935154, "reward_std": 0.6741594485938549, "rewards/cosine_scaled_reward": -0.16994539904408157, "rewards/format_reward": 0.3333333432674408, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 3220.875, "epoch": 0.010285714285714285, "grad_norm": 0.0021120209712535143, "kl": 4.678964614868164e-05, "learning_rate": 3.4000000000000003e-07, "loss": -0.0358, "reward": -0.21572577953338623, "reward_std": 0.4029254298657179, "rewards/cosine_scaled_reward": -0.23286291398108006, "rewards/format_reward": 0.25, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 2728.166732788086, "epoch": 0.010857142857142857, "grad_norm": 0.003262892598286271, "kl": 5.4895877838134766e-05, "learning_rate": 3.6e-07, "loss": 0.0436, "reward": 0.46039681136608124, "reward_std": 0.38068958930671215, "rewards/cosine_scaled_reward": -0.01980163250118494, "rewards/format_reward": 0.5000000111758709, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1603.0833740234375, "epoch": 0.011428571428571429, "grad_norm": 0.004830970894545317, "kl": 3.6716461181640625e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0854, "reward": 0.548977192491293, "reward_std": 0.6819327622652054, "rewards/cosine_scaled_reward": -0.10051142424345016, "rewards/format_reward": 0.75, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1871.000015258789, "epoch": 0.012, "grad_norm": 0.006882881745696068, "kl": 8.362531661987305e-05, "learning_rate": 4e-07, "loss": 0.1629, "reward": 0.18009768426418304, "reward_std": 0.42535306327044964, "rewards/cosine_scaled_reward": -0.20161783043295145, "rewards/format_reward": 0.5833333358168602, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 2794.7083740234375, "epoch": 0.012571428571428572, "grad_norm": 0.0036584294866770506, "kl": 4.118680953979492e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.1264, "reward": 1.3178257942199707, "reward_std": 1.4490605294704437, "rewards/cosine_scaled_reward": 0.3464129101485014, "rewards/format_reward": 0.6250000149011612, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 2632.875, "epoch": 0.013142857142857144, "grad_norm": 0.0034264870919287205, "kl": 3.62396240234375e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0292, "reward": 0.4209946542978287, "reward_std": 0.8962592929601669, "rewards/cosine_scaled_reward": -0.10200268402695656, "rewards/format_reward": 0.6250000260770321, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 2834.4166870117188, "epoch": 0.013714285714285714, "grad_norm": 0.00343229528516531, "kl": 6.428360939025879e-05, "learning_rate": 4.6e-07, "loss": 0.055, "reward": 0.36936939135193825, "reward_std": 0.6851825267076492, "rewards/cosine_scaled_reward": -0.08614865690469742, "rewards/format_reward": 0.5416666865348816, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1640.7916717529297, "epoch": 0.014285714285714285, "grad_norm": 0.004301222041249275, "kl": 1.9932747818529606e-05, "learning_rate": 4.8e-07, "loss": -0.0284, "reward": 0.52116459608078, "reward_std": 0.5130745191127062, "rewards/cosine_scaled_reward": -0.11441771686077118, "rewards/format_reward": 0.75, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 2640.3333740234375, "epoch": 0.014857142857142857, "grad_norm": 0.004840579349547625, "kl": 5.5164098739624023e-05, "learning_rate": 5e-07, "loss": 0.0157, "reward": 0.4319348633289337, "reward_std": 0.7223718725144863, "rewards/cosine_scaled_reward": -0.013199232518672943, "rewards/format_reward": 0.4583333432674408, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3343.875, "epoch": 0.015428571428571429, "grad_norm": 0.0025182762183248997, "kl": 5.891919136047363e-05, "learning_rate": 5.2e-07, "loss": 0.0399, "reward": -0.08342526666820049, "reward_std": 0.6825180053710938, "rewards/cosine_scaled_reward": -0.14587929844856262, "rewards/format_reward": 0.2083333395421505, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 2606.0, "epoch": 0.016, "grad_norm": 0.00435933331027627, "kl": 7.039308547973633e-05, "learning_rate": 5.4e-07, "loss": 0.2051, "reward": 0.12169324839487672, "reward_std": 0.7347929701209068, "rewards/cosine_scaled_reward": -0.16832004487514496, "rewards/format_reward": 0.4583333544433117, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2855.375, "epoch": 0.01657142857142857, "grad_norm": 0.003918589558452368, "kl": 5.6862831115722656e-05, "learning_rate": 5.6e-07, "loss": 0.0375, "reward": 0.3272378444671631, "reward_std": 0.3536018393933773, "rewards/cosine_scaled_reward": 0.038618892431259155, "rewards/format_reward": 0.25, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 3577.875, "epoch": 0.017142857142857144, "grad_norm": 0.0020369745325297117, "kl": 5.513429641723633e-05, "learning_rate": 5.8e-07, "loss": 0.0024, "reward": -0.30357279628515244, "reward_std": 0.5767568703740835, "rewards/cosine_scaled_reward": -0.19345306744799018, "rewards/format_reward": 0.0833333358168602, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2464.7083587646484, "epoch": 0.017714285714285714, "grad_norm": 0.01027754694223404, "kl": 7.18832015991211e-05, "learning_rate": 6e-07, "loss": 0.2232, "reward": 0.24775437265634537, "reward_std": 1.024050772190094, "rewards/cosine_scaled_reward": -0.12612281111069024, "rewards/format_reward": 0.5000000037252903, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1781.875015258789, "epoch": 0.018285714285714287, "grad_norm": 0.005226707551628351, "kl": 4.528462886810303e-05, "learning_rate": 6.2e-07, "loss": 0.0542, "reward": 1.1365013420581818, "reward_std": 0.9176329895853996, "rewards/cosine_scaled_reward": 0.21408401941880584, "rewards/format_reward": 0.7083333432674408, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.018857142857142857, "grad_norm": 0.0027663451619446278, "kl": 7.009506225585938e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.4670066684484482, "reward_std": 0.516772277187556, "rewards/cosine_scaled_reward": -0.2543366737663746, "rewards/format_reward": 0.0416666679084301, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2894.75, "epoch": 0.019428571428571427, "grad_norm": 0.003632154082879424, "kl": 4.598498344421387e-05, "learning_rate": 6.6e-07, "loss": -0.0177, "reward": 0.08219683915376663, "reward_std": 0.5962511524558067, "rewards/cosine_scaled_reward": -0.16723492369055748, "rewards/format_reward": 0.4166666679084301, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3348.6666870117188, "epoch": 0.02, "grad_norm": 0.004805452656000853, "kl": 6.568431854248047e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0231, "reward": -0.5678123719990253, "reward_std": 0.38157590106129646, "rewards/cosine_scaled_reward": -0.3880728632211685, "rewards/format_reward": 0.2083333395421505, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2657.8750915527344, "epoch": 0.02057142857142857, "grad_norm": 0.002950070658698678, "kl": 6.586313247680664e-05, "learning_rate": 7e-07, "loss": 0.0769, "reward": 0.04810567945241928, "reward_std": 0.7001688666641712, "rewards/cosine_scaled_reward": -0.18428050074726343, "rewards/format_reward": 0.4166666716337204, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3413.5416870117188, "epoch": 0.021142857142857144, "grad_norm": 0.0033406990114599466, "kl": 4.89354133605957e-05, "learning_rate": 7.2e-07, "loss": 0.0165, "reward": 0.08298752084374428, "reward_std": 1.008618526160717, "rewards/cosine_scaled_reward": -0.06267290934920311, "rewards/format_reward": 0.2083333395421505, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3148.166748046875, "epoch": 0.021714285714285714, "grad_norm": 0.002555250423029065, "kl": 6.23464584350586e-05, "learning_rate": 7.4e-07, "loss": 0.0043, "reward": 0.3247833289206028, "reward_std": 1.3051073253154755, "rewards/cosine_scaled_reward": -0.0459416788071394, "rewards/format_reward": 0.4166666716337204, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2121.041717529297, "epoch": 0.022285714285714287, "grad_norm": 0.0035025400575250387, "kl": 4.270672798156738e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0671, "reward": 0.9847291931509972, "reward_std": 0.6829237621277571, "rewards/cosine_scaled_reward": 0.11736459657549858, "rewards/format_reward": 0.7500000149011612, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 3410.3333740234375, "epoch": 0.022857142857142857, "grad_norm": 0.003162966575473547, "kl": 8.064508438110352e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0593, "reward": -0.2106223925948143, "reward_std": 0.4903811328113079, "rewards/cosine_scaled_reward": -0.20947785605676472, "rewards/format_reward": 0.2083333395421505, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2980.8333435058594, "epoch": 0.023428571428571427, "grad_norm": 0.0031273975037038326, "kl": 5.4717063903808594e-05, "learning_rate": 8e-07, "loss": -0.0476, "reward": -0.07270145416259766, "reward_std": 0.3931320020928979, "rewards/cosine_scaled_reward": -0.182184100151062, "rewards/format_reward": 0.2916666679084301, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2246.875, "epoch": 0.024, "grad_norm": 0.005238016601651907, "kl": 4.6819448471069336e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.1029, "reward": 0.573455267585814, "reward_std": 0.8384359991177917, "rewards/cosine_scaled_reward": -0.004939058795571327, "rewards/format_reward": 0.5833333432674408, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 3387.4584350585938, "epoch": 0.02457142857142857, "grad_norm": 0.0028164831455796957, "kl": 5.517899990081787e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0556, "reward": -0.028015973512083292, "reward_std": 0.9041736237704754, "rewards/cosine_scaled_reward": -0.15984132140874863, "rewards/format_reward": 0.2916666753590107, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 3518.375, "epoch": 0.025142857142857144, "grad_norm": 0.0023141219280660152, "kl": 4.163384437561035e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0182, "reward": -0.04804127663373947, "reward_std": 0.6431730538606644, "rewards/cosine_scaled_reward": -0.08652063831686974, "rewards/format_reward": 0.125, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3325.9583740234375, "epoch": 0.025714285714285714, "grad_norm": 0.0020613158121705055, "kl": 6.204843521118164e-05, "learning_rate": 8.799999999999999e-07, "loss": 0.0469, "reward": -0.3371434025466442, "reward_std": 0.5085054785013199, "rewards/cosine_scaled_reward": -0.31440503895282745, "rewards/format_reward": 0.2916666716337204, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 3019.5833740234375, "epoch": 0.026285714285714287, "grad_norm": 0.002535178791731596, "kl": 6.377696990966797e-05, "learning_rate": 9e-07, "loss": 0.037, "reward": 0.6408770428970456, "reward_std": 1.0627424120903015, "rewards/cosine_scaled_reward": 0.07043852843344212, "rewards/format_reward": 0.5000000223517418, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2560.958335876465, "epoch": 0.026857142857142857, "grad_norm": 0.003359941067174077, "kl": 3.9011240005493164e-05, "learning_rate": 9.2e-07, "loss": 0.0141, "reward": 0.5696116760373116, "reward_std": 0.4324403740465641, "rewards/cosine_scaled_reward": 0.09730582777410746, "rewards/format_reward": 0.375, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1821.2500305175781, "epoch": 0.027428571428571427, "grad_norm": 0.0042999619618058205, "kl": 5.5670738220214844e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.1166, "reward": 0.7101763039827347, "reward_std": 0.8398408368229866, "rewards/cosine_scaled_reward": 0.0009214803576469421, "rewards/format_reward": 0.7083333432674408, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2289.7083587646484, "epoch": 0.028, "grad_norm": 0.006429169327020645, "kl": 4.604458808898926e-05, "learning_rate": 9.6e-07, "loss": -0.0494, "reward": 0.46904078125953674, "reward_std": 0.5444277357310057, "rewards/cosine_scaled_reward": -0.01547963172197342, "rewards/format_reward": 0.5, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 3266.4583740234375, "epoch": 0.02857142857142857, "grad_norm": 0.0039102076552808285, "kl": 4.738569259643555e-05, "learning_rate": 9.8e-07, "loss": 0.0761, "reward": 0.35403240751475096, "reward_std": 0.7540874853730202, "rewards/cosine_scaled_reward": 0.031182889826595783, "rewards/format_reward": 0.291666679084301, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2815.2083435058594, "epoch": 0.029142857142857144, "grad_norm": 0.003096741857007146, "kl": 3.65525484085083e-05, "learning_rate": 1e-06, "loss": 0.0413, "reward": 0.23980098962783813, "reward_std": 0.4307633563876152, "rewards/cosine_scaled_reward": -0.025932814925909042, "rewards/format_reward": 0.2916666679084301, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3531.0833740234375, "epoch": 0.029714285714285714, "grad_norm": 0.0018911019433289766, "kl": 4.410743713378906e-05, "learning_rate": 9.999890338174275e-07, "loss": 0.0299, "reward": -0.3362464178353548, "reward_std": 0.8673944771289825, "rewards/cosine_scaled_reward": -0.25145654007792473, "rewards/format_reward": 0.1666666716337204, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3025.625, "epoch": 0.030285714285714287, "grad_norm": 0.010085882619023323, "kl": 4.3392181396484375e-05, "learning_rate": 9.999561358041868e-07, "loss": 0.1691, "reward": 0.1282375380396843, "reward_std": 0.34603655710816383, "rewards/cosine_scaled_reward": -0.04004790261387825, "rewards/format_reward": 0.2083333432674408, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3095.625, "epoch": 0.030857142857142857, "grad_norm": 0.003278875257819891, "kl": 5.125999450683594e-05, "learning_rate": 9.999013075636804e-07, "loss": 0.1083, "reward": -0.4596647098660469, "reward_std": 0.27485764399170876, "rewards/cosine_scaled_reward": -0.33399902656674385, "rewards/format_reward": 0.2083333432674408, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3319.4166870117188, "epoch": 0.03142857142857143, "grad_norm": 0.002991720335558057, "kl": 5.054473876953125e-05, "learning_rate": 9.998245517681593e-07, "loss": 0.0301, "reward": -0.15757111459970474, "reward_std": 0.8355813696980476, "rewards/cosine_scaled_reward": -0.18295222707092762, "rewards/format_reward": 0.2083333395421505, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3054.416717529297, "epoch": 0.032, "grad_norm": 0.0036454068031162024, "kl": 8.291006088256836e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.1389, "reward": 0.0915512889623642, "reward_std": 0.8793400973081589, "rewards/cosine_scaled_reward": -0.12089102528989315, "rewards/format_reward": 0.3333333469927311, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3091.875030517578, "epoch": 0.03257142857142857, "grad_norm": 0.0049927146174013615, "kl": 4.51505184173584e-05, "learning_rate": 9.996052735444862e-07, "loss": 0.0756, "reward": 0.15427808463573456, "reward_std": 0.8372934907674789, "rewards/cosine_scaled_reward": -0.06869429349899292, "rewards/format_reward": 0.291666679084301, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3194.2083740234375, "epoch": 0.03314285714285714, "grad_norm": 0.003153211669996381, "kl": 4.935264587402344e-05, "learning_rate": 9.994627618036452e-07, "loss": 0.1653, "reward": -0.17860857397317886, "reward_std": 1.024783294647932, "rewards/cosine_scaled_reward": -0.19347094930708408, "rewards/format_reward": 0.2083333395421505, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 2552.875, "epoch": 0.03371428571428572, "grad_norm": 0.006331880576908588, "kl": 5.84721565246582e-05, "learning_rate": 9.992983438818915e-07, "loss": 0.1837, "reward": 0.39516815543174744, "reward_std": 0.8250751569867134, "rewards/cosine_scaled_reward": -0.11491593718528748, "rewards/format_reward": 0.6250000260770321, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3348.291748046875, "epoch": 0.03428571428571429, "grad_norm": 0.005531872622668743, "kl": 5.1721930503845215e-05, "learning_rate": 9.991120277927223e-07, "loss": 0.0567, "reward": 0.19023261219263077, "reward_std": 1.0187619477510452, "rewards/cosine_scaled_reward": -0.09238370601087809, "rewards/format_reward": 0.3750000111758709, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 2207.5833587646484, "epoch": 0.03485714285714286, "grad_norm": 0.004937394987791777, "kl": 3.698468208312988e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0115, "reward": 0.5068478137254715, "reward_std": 0.3499330710619688, "rewards/cosine_scaled_reward": 0.003423880785703659, "rewards/format_reward": 0.5, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2679.1666870117188, "epoch": 0.03542857142857143, "grad_norm": 0.0032471788581460714, "kl": 9.340047836303711e-05, "learning_rate": 9.98673738502114e-07, "loss": -0.0314, "reward": -0.25477635860443115, "reward_std": 0.3404678776860237, "rewards/cosine_scaled_reward": -0.33572152256965637, "rewards/format_reward": 0.4166666716337204, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.036, "grad_norm": 0.0029119590763002634, "kl": 6.48200511932373e-05, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": -0.304746150970459, "reward_std": 0.45586051046848297, "rewards/cosine_scaled_reward": -0.1732064108364284, "rewards/format_reward": 0.0416666679084301, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 3530.375, "epoch": 0.036571428571428574, "grad_norm": 0.002201510826125741, "kl": 4.76837158203125e-05, "learning_rate": 9.981479793771866e-07, "loss": 0.0187, "reward": -0.45341508835554123, "reward_std": 0.3314024079591036, "rewards/cosine_scaled_reward": -0.2683742120862007, "rewards/format_reward": 0.0833333358168602, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3165.5833740234375, "epoch": 0.037142857142857144, "grad_norm": 0.003183267079293728, "kl": 6.142258644104004e-05, "learning_rate": 9.97852329991824e-07, "loss": 0.1159, "reward": 0.009997284680139273, "reward_std": 0.6268695276230574, "rewards/cosine_scaled_reward": -0.14083468075841665, "rewards/format_reward": 0.2916666716337204, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 2571.9584045410156, "epoch": 0.037714285714285714, "grad_norm": 0.003915028180927038, "kl": 3.510713577270508e-05, "learning_rate": 9.975348529157229e-07, "loss": 0.0958, "reward": 0.49450133740901947, "reward_std": 0.8202755898237228, "rewards/cosine_scaled_reward": -0.04441600292921066, "rewards/format_reward": 0.5833333469927311, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 2554.5416870117188, "epoch": 0.038285714285714284, "grad_norm": 0.004811212420463562, "kl": 4.373490810394287e-05, "learning_rate": 9.971955636222684e-07, "loss": 0.0782, "reward": 0.17955105006694794, "reward_std": 1.0079337358474731, "rewards/cosine_scaled_reward": -0.11855782195925713, "rewards/format_reward": 0.4166666716337204, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 3486.3333740234375, "epoch": 0.038857142857142854, "grad_norm": 0.0016735662939026952, "kl": 3.8117170333862305e-05, "learning_rate": 9.968344786479415e-07, "loss": 0.0235, "reward": -0.48610935895703733, "reward_std": 0.1278753315564245, "rewards/cosine_scaled_reward": -0.26388800516724586, "rewards/format_reward": 0.0416666679084301, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 2936.7083740234375, "epoch": 0.03942857142857143, "grad_norm": 0.002471222309395671, "kl": 5.6177377700805664e-05, "learning_rate": 9.964516155915151e-07, "loss": -0.0343, "reward": 0.06483094394207001, "reward_std": 0.5898599550127983, "rewards/cosine_scaled_reward": -0.175917848944664, "rewards/format_reward": 0.4166666716337204, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 2804.9583740234375, "epoch": 0.04, "grad_norm": 0.0033950440119951963, "kl": 5.501508712768555e-05, "learning_rate": 9.960469931131936e-07, "loss": 0.0223, "reward": 0.08960220217704773, "reward_std": 0.6047345735132694, "rewards/cosine_scaled_reward": -0.14269892871379852, "rewards/format_reward": 0.3750000037252903, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 2147.2916870117188, "epoch": 0.04057142857142857, "grad_norm": 0.004546794109046459, "kl": 4.620850086212158e-05, "learning_rate": 9.956206309337066e-07, "loss": 0.0531, "reward": 1.0326433405280113, "reward_std": 0.32297929376363754, "rewards/cosine_scaled_reward": 0.20382165908813477, "rewards/format_reward": 0.625, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 3090.0, "epoch": 0.04114285714285714, "grad_norm": 0.002502832328900695, "kl": 5.453824996948242e-05, "learning_rate": 9.951725498333448e-07, "loss": -0.0368, "reward": 0.1497725248336792, "reward_std": 0.5293796770274639, "rewards/cosine_scaled_reward": -0.07094704359769821, "rewards/format_reward": 0.2916666679084301, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 2512.375030517578, "epoch": 0.04171428571428572, "grad_norm": 0.00826625619083643, "kl": 5.322694778442383e-05, "learning_rate": 9.947027716509488e-07, "loss": 0.1536, "reward": 0.23887160420417786, "reward_std": 0.5898401029407978, "rewards/cosine_scaled_reward": -0.17223086208105087, "rewards/format_reward": 0.5833333432674408, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 3139.7500610351562, "epoch": 0.04228571428571429, "grad_norm": 0.004434363450855017, "kl": 4.267692565917969e-05, "learning_rate": 9.942113192828444e-07, "loss": 0.0913, "reward": 0.04066075012087822, "reward_std": 0.7316096499562263, "rewards/cosine_scaled_reward": -0.12550296634435654, "rewards/format_reward": 0.2916666716337204, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 2486.916748046875, "epoch": 0.04285714285714286, "grad_norm": 0.01011582650244236, "kl": 5.6117773056030273e-05, "learning_rate": 9.93698216681727e-07, "loss": 0.2466, "reward": 0.2905881777405739, "reward_std": 0.928753674030304, "rewards/cosine_scaled_reward": -0.16720592603087425, "rewards/format_reward": 0.6250000149011612, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3307.5834350585938, "epoch": 0.04342857142857143, "grad_norm": 0.0020555469673126936, "kl": 5.266070365905762e-05, "learning_rate": 9.931634888554935e-07, "loss": 0.0847, "reward": -0.34680964797735214, "reward_std": 0.5232329443097115, "rewards/cosine_scaled_reward": -0.27757149329409003, "rewards/format_reward": 0.2083333358168602, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 2871.6666870117188, "epoch": 0.044, "grad_norm": 0.004709742031991482, "kl": 6.121397018432617e-05, "learning_rate": 9.926071618660237e-07, "loss": 0.1709, "reward": 0.14738586451858282, "reward_std": 0.7037345245480537, "rewards/cosine_scaled_reward": -0.13464040122926235, "rewards/format_reward": 0.416666679084301, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 2764.791717529297, "epoch": 0.044571428571428574, "grad_norm": 0.003112459322437644, "kl": 2.4959444999694824e-05, "learning_rate": 9.9202926282791e-07, "loss": 0.0661, "reward": 0.4672761410474777, "reward_std": 1.1672374829649925, "rewards/cosine_scaled_reward": 0.004471390275284648, "rewards/format_reward": 0.4583333358168602, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 2620.2500610351562, "epoch": 0.045142857142857144, "grad_norm": 0.005684725008904934, "kl": 2.711266279220581e-05, "learning_rate": 9.91429819907136e-07, "loss": 0.0631, "reward": 0.33977673947811127, "reward_std": 0.5383468419313431, "rewards/cosine_scaled_reward": -0.10094496980309486, "rewards/format_reward": 0.5416666716337204, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 3402.7083740234375, "epoch": 0.045714285714285714, "grad_norm": 0.0022668626625090837, "kl": 5.435943603515625e-05, "learning_rate": 9.908088623197048e-07, "loss": 0.036, "reward": -0.05569327995181084, "reward_std": 0.6252028122544289, "rewards/cosine_scaled_reward": -0.17367996647953987, "rewards/format_reward": 0.2916666679084301, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 3181.3333740234375, "epoch": 0.046285714285714284, "grad_norm": 0.003710041055455804, "kl": 6.085634231567383e-05, "learning_rate": 9.901664203302124e-07, "loss": 0.073, "reward": 0.6848106384277344, "reward_std": 0.3867860995233059, "rewards/cosine_scaled_reward": 0.09240532107651234, "rewards/format_reward": 0.5, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 3158.125, "epoch": 0.046857142857142854, "grad_norm": 0.0020377952605485916, "kl": 3.549456596374512e-05, "learning_rate": 9.895025252503755e-07, "loss": -0.0579, "reward": 0.1932678446173668, "reward_std": 0.55375961586833, "rewards/cosine_scaled_reward": -0.0491994172334671, "rewards/format_reward": 0.2916666679084301, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1307.8750610351562, "epoch": 0.04742857142857143, "grad_norm": 0.0060053481720387936, "kl": 6.428360939025879e-05, "learning_rate": 9.888172094375033e-07, "loss": 0.1859, "reward": 0.883403018116951, "reward_std": 0.5653196312487125, "rewards/cosine_scaled_reward": -0.016631828621029854, "rewards/format_reward": 0.9166666716337204, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2560.4583740234375, "epoch": 0.048, "grad_norm": 0.002554214559495449, "kl": 4.4226646423339844e-05, "learning_rate": 9.881105062929221e-07, "loss": -0.0665, "reward": 0.5398676842451096, "reward_std": 1.2668142020702362, "rewards/cosine_scaled_reward": -0.04256616160273552, "rewards/format_reward": 0.6250000111758709, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 2928.0416717529297, "epoch": 0.04857142857142857, "grad_norm": 0.002872791839763522, "kl": 7.063150405883789e-05, "learning_rate": 9.873824502603459e-07, "loss": 0.0088, "reward": -0.2354523427784443, "reward_std": 0.20928797079250216, "rewards/cosine_scaled_reward": -0.24272617511451244, "rewards/format_reward": 0.25, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 2823.0416870117188, "epoch": 0.04914285714285714, "grad_norm": 0.005900247022509575, "kl": 3.603100776672363e-05, "learning_rate": 9.866330768241983e-07, "loss": 0.1228, "reward": 0.37013984844088554, "reward_std": 1.4434142112731934, "rewards/cosine_scaled_reward": -0.04409674555063248, "rewards/format_reward": 0.4583333469927311, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 3121.6666870117188, "epoch": 0.04971428571428571, "grad_norm": 0.005350765772163868, "kl": 6.332993507385254e-05, "learning_rate": 9.85862422507884e-07, "loss": 0.0598, "reward": 0.06190880388021469, "reward_std": 0.630573745816946, "rewards/cosine_scaled_reward": -0.11487893387675285, "rewards/format_reward": 0.291666679084301, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 2513.2083435058594, "epoch": 0.05028571428571429, "grad_norm": 0.005598131101578474, "kl": 8.33272933959961e-05, "learning_rate": 9.850705248720068e-07, "loss": -0.111, "reward": 0.03702998161315918, "reward_std": 0.6165482252836227, "rewards/cosine_scaled_reward": -0.27315169339999557, "rewards/format_reward": 0.5833333358168602, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 2963.250030517578, "epoch": 0.05085714285714286, "grad_norm": 0.003585414495319128, "kl": 4.559755325317383e-05, "learning_rate": 9.8425742251254e-07, "loss": 0.1119, "reward": 0.23719044029712677, "reward_std": 0.6234142333269119, "rewards/cosine_scaled_reward": -0.08973810821771622, "rewards/format_reward": 0.4166666865348816, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2556.2084045410156, "epoch": 0.05142857142857143, "grad_norm": 0.002330746967345476, "kl": 3.5482458770275116e-05, "learning_rate": 9.83423155058946e-07, "loss": 0.0454, "reward": 1.2368909418582916, "reward_std": 1.3607692942023277, "rewards/cosine_scaled_reward": 0.2851121202111244, "rewards/format_reward": 0.6666666865348816, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 3098.7500610351562, "epoch": 0.052, "grad_norm": 0.002479028655216098, "kl": 5.143880844116211e-05, "learning_rate": 9.825677631722435e-07, "loss": 0.0296, "reward": 0.31459467858076096, "reward_std": 0.7795293852686882, "rewards/cosine_scaled_reward": -0.07186933234333992, "rewards/format_reward": 0.4583333544433117, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 2800.875, "epoch": 0.052571428571428575, "grad_norm": 0.0062688072212040424, "kl": 3.552436828613281e-05, "learning_rate": 9.816912885430258e-07, "loss": -0.0037, "reward": 0.24378880113363266, "reward_std": 0.371351920068264, "rewards/cosine_scaled_reward": -0.0031055696308612823, "rewards/format_reward": 0.25, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 2191.4583892822266, "epoch": 0.053142857142857144, "grad_norm": 0.006531110964715481, "kl": 4.729628562927246e-05, "learning_rate": 9.807937738894303e-07, "loss": 0.0307, "reward": 0.47441989928483963, "reward_std": 0.9149939436465502, "rewards/cosine_scaled_reward": -0.03362339362502098, "rewards/format_reward": 0.5416666679084301, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 2829.1666870117188, "epoch": 0.053714285714285714, "grad_norm": 0.004531018901616335, "kl": 2.703070640563965e-05, "learning_rate": 9.798752629550546e-07, "loss": 0.1184, "reward": 0.3197888396680355, "reward_std": 0.5005435179919004, "rewards/cosine_scaled_reward": -0.1526055708527565, "rewards/format_reward": 0.6250000149011612, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 2329.9583740234375, "epoch": 0.054285714285714284, "grad_norm": 0.004706867970526218, "kl": 5.316734313964844e-05, "learning_rate": 9.78935800506826e-07, "loss": 0.1106, "reward": 0.7271681129932404, "reward_std": 1.1773381531238556, "rewards/cosine_scaled_reward": 0.030250702053308487, "rewards/format_reward": 0.6666666828095913, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3340.3750610351562, "epoch": 0.054857142857142854, "grad_norm": 0.00459945248439908, "kl": 5.906820297241211e-05, "learning_rate": 9.779754323328192e-07, "loss": 0.1029, "reward": -0.2224051170051098, "reward_std": 0.543691985309124, "rewards/cosine_scaled_reward": -0.1945358868688345, "rewards/format_reward": 0.1666666716337204, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 2480.500030517578, "epoch": 0.05542857142857143, "grad_norm": 0.0036497104447335005, "kl": 5.65648078918457e-05, "learning_rate": 9.769942052400235e-07, "loss": -0.0602, "reward": -0.057941749691963196, "reward_std": 0.37048453744500875, "rewards/cosine_scaled_reward": -0.27897088788449764, "rewards/format_reward": 0.5, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1999.0000762939453, "epoch": 0.056, "grad_norm": 0.004622583277523518, "kl": 4.1544437408447266e-05, "learning_rate": 9.759921670520634e-07, "loss": 0.0892, "reward": 0.9070358909666538, "reward_std": 1.3653730154037476, "rewards/cosine_scaled_reward": 0.12018460407853127, "rewards/format_reward": 0.6666666679084301, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 2504.62508392334, "epoch": 0.05657142857142857, "grad_norm": 0.004686880856752396, "kl": 3.5181641578674316e-05, "learning_rate": 9.749693666068663e-07, "loss": 0.18, "reward": 0.7037244886159897, "reward_std": 0.5616102423518896, "rewards/cosine_scaled_reward": 0.14352891594171524, "rewards/format_reward": 0.4166666679084301, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1970.7084045410156, "epoch": 0.05714285714285714, "grad_norm": 0.008396320044994354, "kl": 6.771087646484375e-05, "learning_rate": 9.739258537542835e-07, "loss": 0.1601, "reward": 0.39771436899900436, "reward_std": 0.7990332767367363, "rewards/cosine_scaled_reward": -0.1136428159661591, "rewards/format_reward": 0.6250000037252903, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 3299.4583740234375, "epoch": 0.05771428571428571, "grad_norm": 0.0018837860552594066, "kl": 4.062056541442871e-05, "learning_rate": 9.728616793536587e-07, "loss": 0.0552, "reward": -0.269857388921082, "reward_std": 0.6310795210301876, "rewards/cosine_scaled_reward": -0.25992869585752487, "rewards/format_reward": 0.2500000074505806, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 2745.7500610351562, "epoch": 0.05828571428571429, "grad_norm": 0.008048228919506073, "kl": 3.936886787414551e-05, "learning_rate": 9.717768952713511e-07, "loss": 0.2157, "reward": 0.2698771320283413, "reward_std": 1.0683788657188416, "rewards/cosine_scaled_reward": -0.0942281186580658, "rewards/format_reward": 0.4583333469927311, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 2834.1666870117188, "epoch": 0.05885714285714286, "grad_norm": 0.003009202191606164, "kl": 5.984306335449219e-05, "learning_rate": 9.706715543782064e-07, "loss": 0.177, "reward": -0.20766256004571915, "reward_std": 0.48913512006402016, "rewards/cosine_scaled_reward": -0.3121646121144295, "rewards/format_reward": 0.4166666828095913, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3153.2083435058594, "epoch": 0.05942857142857143, "grad_norm": 0.002378274453803897, "kl": 5.8770179748535156e-05, "learning_rate": 9.695457105469804e-07, "loss": -0.0125, "reward": -0.01313498243689537, "reward_std": 0.43787195160984993, "rewards/cosine_scaled_reward": -0.13156749121844769, "rewards/format_reward": 0.25, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2838.8333740234375, "epoch": 0.06, "grad_norm": 0.003965969663113356, "kl": 4.845857620239258e-05, "learning_rate": 9.683994186497132e-07, "loss": -0.056, "reward": -0.06605833396315575, "reward_std": 0.5113831609487534, "rewards/cosine_scaled_reward": -0.24136251024901867, "rewards/format_reward": 0.4166666716337204, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 2724.2916717529297, "epoch": 0.060571428571428575, "grad_norm": 0.004145693965256214, "kl": 8.720159530639648e-05, "learning_rate": 9.672327345550543e-07, "loss": -0.0158, "reward": 0.17488502897322178, "reward_std": 0.6664385069161654, "rewards/cosine_scaled_reward": -0.07922414783388376, "rewards/format_reward": 0.3333333358168602, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 2233.3333740234375, "epoch": 0.061142857142857145, "grad_norm": 0.0037315809167921543, "kl": 6.401538848876953e-05, "learning_rate": 9.66045715125541e-07, "loss": 0.1749, "reward": -0.0024418197572231293, "reward_std": 0.3194875009357929, "rewards/cosine_scaled_reward": -0.2928875833749771, "rewards/format_reward": 0.5833333432674408, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 2153.6666717529297, "epoch": 0.061714285714285715, "grad_norm": 0.003744048299267888, "kl": 3.629177808761597e-05, "learning_rate": 9.648384182148252e-07, "loss": 0.0802, "reward": 0.13083535805344582, "reward_std": 0.30223673209547997, "rewards/cosine_scaled_reward": -0.20541565865278244, "rewards/format_reward": 0.5416666679084301, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 3034.0000610351562, "epoch": 0.062285714285714285, "grad_norm": 0.003513565519824624, "kl": 5.9098005294799805e-05, "learning_rate": 9.636109026648554e-07, "loss": 0.0522, "reward": -0.25997395999729633, "reward_std": 0.6268702074885368, "rewards/cosine_scaled_reward": -0.25498698092997074, "rewards/format_reward": 0.2500000111758709, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 3113.6250610351562, "epoch": 0.06285714285714286, "grad_norm": 0.00211512902751565, "kl": 5.9485435485839844e-05, "learning_rate": 9.623632283030077e-07, "loss": 0.0688, "reward": -0.1261254847049713, "reward_std": 0.711979441344738, "rewards/cosine_scaled_reward": -0.20889607770368457, "rewards/format_reward": 0.2916666679084301, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2797.375, "epoch": 0.06342857142857143, "grad_norm": 0.0038311269599944353, "kl": 4.416704177856445e-05, "learning_rate": 9.610954559391704e-07, "loss": 0.0742, "reward": 0.007988154888153076, "reward_std": 0.60484404489398, "rewards/cosine_scaled_reward": -0.14183926954865456, "rewards/format_reward": 0.2916666679084301, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 3285.0416870117188, "epoch": 0.064, "grad_norm": 0.0023358033504337072, "kl": 3.8295984268188477e-05, "learning_rate": 9.598076473627796e-07, "loss": 0.0271, "reward": 0.20066367089748383, "reward_std": 0.9480449110269547, "rewards/cosine_scaled_reward": -0.04550149664282799, "rewards/format_reward": 0.2916666753590107, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 2677.5833740234375, "epoch": 0.06457142857142857, "grad_norm": 0.006738745141774416, "kl": 6.052851676940918e-05, "learning_rate": 9.58499865339809e-07, "loss": 0.1729, "reward": 0.022471264004707336, "reward_std": 0.726767435669899, "rewards/cosine_scaled_reward": -0.19709771126508713, "rewards/format_reward": 0.4166666679084301, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 3187.1666870117188, "epoch": 0.06514285714285714, "grad_norm": 0.0027834682259708643, "kl": 6.449222564697266e-05, "learning_rate": 9.571721736097088e-07, "loss": -0.0693, "reward": -0.18754486367106438, "reward_std": 0.5675012841820717, "rewards/cosine_scaled_reward": -0.23960577324032784, "rewards/format_reward": 0.2916666679084301, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2998.7916870117188, "epoch": 0.06571428571428571, "grad_norm": 0.0029923291876912117, "kl": 5.370378494262695e-05, "learning_rate": 9.55824636882301e-07, "loss": 0.0364, "reward": -0.22411979362368584, "reward_std": 0.4881392642855644, "rewards/cosine_scaled_reward": -0.25789323868229985, "rewards/format_reward": 0.2916666679084301, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 3095.75, "epoch": 0.06628571428571428, "grad_norm": 0.0036450796760618687, "kl": 5.882978439331055e-05, "learning_rate": 9.54457320834625e-07, "loss": 0.1253, "reward": -0.5881903991103172, "reward_std": 0.25955382362008095, "rewards/cosine_scaled_reward": -0.3982618674635887, "rewards/format_reward": 0.2083333432674408, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 3137.4166870117188, "epoch": 0.06685714285714285, "grad_norm": 0.0022206513676792383, "kl": 5.7816505432128906e-05, "learning_rate": 9.530702921077358e-07, "loss": -0.0599, "reward": 0.1665901243686676, "reward_std": 0.2177308164536953, "rewards/cosine_scaled_reward": -0.0417049303650856, "rewards/format_reward": 0.25, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 2813.2084350585938, "epoch": 0.06742857142857143, "grad_norm": 0.0027043092995882034, "kl": 4.6372413635253906e-05, "learning_rate": 9.516636183034564e-07, "loss": -0.0197, "reward": 0.41955848410725594, "reward_std": 0.7885990515351295, "rewards/cosine_scaled_reward": -0.01938742771744728, "rewards/format_reward": 0.4583333395421505, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2253.750015258789, "epoch": 0.068, "grad_norm": 0.010293739847838879, "kl": 7.727742195129395e-05, "learning_rate": 9.502373679810839e-07, "loss": 0.1629, "reward": 0.7499631196260452, "reward_std": 0.7833771929144859, "rewards/cosine_scaled_reward": 0.10414820536971092, "rewards/format_reward": 0.541666679084301, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 3251.4583740234375, "epoch": 0.06857142857142857, "grad_norm": 0.004974659066647291, "kl": 5.0008296966552734e-05, "learning_rate": 9.487916106540465e-07, "loss": 0.1055, "reward": -0.36817148327827454, "reward_std": 0.6743681197986007, "rewards/cosine_scaled_reward": -0.28825241327285767, "rewards/format_reward": 0.2083333358168602, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 3107.416717529297, "epoch": 0.06914285714285714, "grad_norm": 0.008310997858643532, "kl": 5.3435564041137695e-05, "learning_rate": 9.473264167865171e-07, "loss": 0.1645, "reward": -0.06399485468864441, "reward_std": 0.7036623954772949, "rewards/cosine_scaled_reward": -0.1361641176044941, "rewards/format_reward": 0.2083333395421505, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 2101.3333740234375, "epoch": 0.06971428571428571, "grad_norm": 0.002272537676617503, "kl": 2.7000904083251953e-05, "learning_rate": 9.458418577899774e-07, "loss": -0.0245, "reward": 1.6105660200119019, "reward_std": 0.47269606590270996, "rewards/cosine_scaled_reward": 0.4094496890902519, "rewards/format_reward": 0.7916666679084301, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2997.2083740234375, "epoch": 0.07028571428571428, "grad_norm": 0.0041992985643446445, "kl": 5.6624412536621094e-05, "learning_rate": 9.443380060197385e-07, "loss": 0.0503, "reward": 0.04324941337108612, "reward_std": 0.919912327080965, "rewards/cosine_scaled_reward": -0.12420864496380091, "rewards/format_reward": 0.2916666753590107, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 3418.5833740234375, "epoch": 0.07085714285714285, "grad_norm": 0.001762260915711522, "kl": 4.285573959350586e-05, "learning_rate": 9.428149347714143e-07, "loss": -0.0293, "reward": 0.38678684271872044, "reward_std": 0.22143647260963917, "rewards/cosine_scaled_reward": 0.02672677580267191, "rewards/format_reward": 0.3333333358168602, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 2346.416717529297, "epoch": 0.07142857142857142, "grad_norm": 0.005217802710831165, "kl": 6.586313247680664e-05, "learning_rate": 9.412727182773486e-07, "loss": 0.0739, "reward": 0.4534882754087448, "reward_std": 0.790243930183351, "rewards/cosine_scaled_reward": -0.0857558511197567, "rewards/format_reward": 0.625, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1726.6666870117188, "epoch": 0.072, "grad_norm": 0.0042092944495379925, "kl": 3.6656856536865234e-05, "learning_rate": 9.397114317029974e-07, "loss": 0.1451, "reward": 0.15183740481734276, "reward_std": 0.7399844340980053, "rewards/cosine_scaled_reward": -0.27824796736240387, "rewards/format_reward": 0.7083333395421505, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2628.375, "epoch": 0.07257142857142856, "grad_norm": 0.002517557702958584, "kl": 3.1128525733947754e-05, "learning_rate": 9.381311511432658e-07, "loss": 0.0234, "reward": 0.6651469371281564, "reward_std": 0.8857209756970406, "rewards/cosine_scaled_reward": 0.10340677137719467, "rewards/format_reward": 0.4583333395421505, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3332.791748046875, "epoch": 0.07314285714285715, "grad_norm": 0.0019706180319190025, "kl": 5.9664249420166016e-05, "learning_rate": 9.36531953618799e-07, "loss": -0.0308, "reward": 0.427790991961956, "reward_std": 0.5538795441389084, "rewards/cosine_scaled_reward": 0.02639550156891346, "rewards/format_reward": 0.3750000037252903, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 2611.375030517578, "epoch": 0.07371428571428572, "grad_norm": 0.00927409902215004, "kl": 4.404783248901367e-05, "learning_rate": 9.34913917072228e-07, "loss": 0.1887, "reward": 0.09853430464863777, "reward_std": 0.5585183463990688, "rewards/cosine_scaled_reward": -0.13823286071419716, "rewards/format_reward": 0.3750000149011612, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 2141.2500610351562, "epoch": 0.07428571428571429, "grad_norm": 0.003805689048022032, "kl": 2.8681010007858276e-05, "learning_rate": 9.332771203643714e-07, "loss": 0.166, "reward": 0.4730747821740806, "reward_std": 0.7538301292806864, "rewards/cosine_scaled_reward": -0.07596261263825, "rewards/format_reward": 0.6250000149011612, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 3099.75, "epoch": 0.07485714285714286, "grad_norm": 0.002263197209686041, "kl": 5.2928924560546875e-05, "learning_rate": 9.316216432703916e-07, "loss": -0.0219, "reward": -0.5468751192092896, "reward_std": 0.2845967300236225, "rewards/cosine_scaled_reward": -0.4192708879709244, "rewards/format_reward": 0.2916666679084301, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 3003.9583740234375, "epoch": 0.07542857142857143, "grad_norm": 0.004270457662642002, "kl": 4.225969314575195e-05, "learning_rate": 9.299475664759068e-07, "loss": 0.084, "reward": -0.05889533646404743, "reward_std": 0.4773993450216949, "rewards/cosine_scaled_reward": -0.279447671957314, "rewards/format_reward": 0.5000000149011612, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 3064.5000610351562, "epoch": 0.076, "grad_norm": 0.002178435679525137, "kl": 3.853440284729004e-05, "learning_rate": 9.282549715730579e-07, "loss": 0.0365, "reward": 0.43633100390434265, "reward_std": 0.5200366117060184, "rewards/cosine_scaled_reward": 0.009832175448536873, "rewards/format_reward": 0.4166666679084301, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 3227.7500610351562, "epoch": 0.07657142857142857, "grad_norm": 0.0021323668770492077, "kl": 3.3289194107055664e-05, "learning_rate": 9.265439410565328e-07, "loss": 0.0074, "reward": 0.5697381664067507, "reward_std": 1.1259863451123238, "rewards/cosine_scaled_reward": 0.09736906737089157, "rewards/format_reward": 0.3750000111758709, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2854.6666717529297, "epoch": 0.07714285714285714, "grad_norm": 0.0032251900993287563, "kl": 6.496906280517578e-05, "learning_rate": 9.248145583195447e-07, "loss": 0.0404, "reward": -0.5585590153932571, "reward_std": 0.11863808473572135, "rewards/cosine_scaled_reward": -0.40427953749895096, "rewards/format_reward": 0.25, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 2020.2083740234375, "epoch": 0.07771428571428571, "grad_norm": 0.00535603566095233, "kl": 9.846687316894531e-05, "learning_rate": 9.230669076497687e-07, "loss": 0.0991, "reward": 0.09188777673989534, "reward_std": 0.21595470421016216, "rewards/cosine_scaled_reward": -0.24572279304265976, "rewards/format_reward": 0.5833333358168602, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 2658.1666870117188, "epoch": 0.07828571428571429, "grad_norm": 0.004129425156861544, "kl": 3.1501054763793945e-05, "learning_rate": 9.213010742252327e-07, "loss": -0.1181, "reward": 0.5355297178030014, "reward_std": 0.9262653067708015, "rewards/cosine_scaled_reward": -0.0447351336479187, "rewards/format_reward": 0.625, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2134.7916717529297, "epoch": 0.07885714285714286, "grad_norm": 0.003129597520455718, "kl": 3.406405448913574e-05, "learning_rate": 9.195171441101668e-07, "loss": 0.0232, "reward": 0.3976396471261978, "reward_std": 0.38821550738066435, "rewards/cosine_scaled_reward": -0.07201351597905159, "rewards/format_reward": 0.5416666679084301, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 3495.2500610351562, "epoch": 0.07942857142857143, "grad_norm": 0.00239741918630898, "kl": 4.2051076889038086e-05, "learning_rate": 9.177152042508077e-07, "loss": 0.0339, "reward": -0.06825350597500801, "reward_std": 0.9805519096553326, "rewards/cosine_scaled_reward": -0.13829341903328896, "rewards/format_reward": 0.2083333395421505, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 2812.5833740234375, "epoch": 0.08, "grad_norm": 0.002151095075532794, "kl": 2.119317650794983e-05, "learning_rate": 9.158953424711624e-07, "loss": 0.0512, "reward": 0.5533006391488016, "reward_std": 0.592644490301609, "rewards/cosine_scaled_reward": 0.06831696536391973, "rewards/format_reward": 0.4166666679084301, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 2509.625, "epoch": 0.08057142857142857, "grad_norm": 0.007386465091258287, "kl": 6.115436553955078e-05, "learning_rate": 9.140576474687263e-07, "loss": 0.1382, "reward": -0.16108264029026031, "reward_std": 0.6070685498416424, "rewards/cosine_scaled_reward": -0.30970799177885056, "rewards/format_reward": 0.4583333358168602, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 2520.333465576172, "epoch": 0.08114285714285714, "grad_norm": 0.005152600817382336, "kl": 4.8957765102386475e-05, "learning_rate": 9.122022088101613e-07, "loss": 0.1394, "reward": 0.6436279788613319, "reward_std": 0.6478906013071537, "rewards/cosine_scaled_reward": 0.07181395869702101, "rewards/format_reward": 0.5000000074505806, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2752.0833740234375, "epoch": 0.08171428571428571, "grad_norm": 0.0027891474310308695, "kl": 4.0277838706970215e-05, "learning_rate": 9.103291169269299e-07, "loss": 0.0234, "reward": 0.6115199327468872, "reward_std": 0.6348966094665229, "rewards/cosine_scaled_reward": 0.11825995147228241, "rewards/format_reward": 0.375, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 2618.666778564453, "epoch": 0.08228571428571428, "grad_norm": 0.003518847282975912, "kl": 3.820657730102539e-05, "learning_rate": 9.084384631108882e-07, "loss": 0.1381, "reward": 0.6802586987614632, "reward_std": 0.8089336268603802, "rewards/cosine_scaled_reward": 0.006796026602387428, "rewards/format_reward": 0.666666679084301, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1646.5416717529297, "epoch": 0.08285714285714285, "grad_norm": 0.010752597823739052, "kl": 6.574392318725586e-05, "learning_rate": 9.065303395098358e-07, "loss": 0.2614, "reward": 0.37815138790756464, "reward_std": 0.4954442046582699, "rewards/cosine_scaled_reward": -0.18592432886362076, "rewards/format_reward": 0.7500000074505806, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2114.2083740234375, "epoch": 0.08342857142857144, "grad_norm": 0.007812983356416225, "kl": 4.094839096069336e-05, "learning_rate": 9.046048391230247e-07, "loss": 0.2425, "reward": 0.7894833981990814, "reward_std": 0.9341420978307724, "rewards/cosine_scaled_reward": 0.040575042366981506, "rewards/format_reward": 0.7083333544433117, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 2698.625045776367, "epoch": 0.084, "grad_norm": 0.006019408814609051, "kl": 3.282725811004639e-05, "learning_rate": 9.026620557966279e-07, "loss": 0.1254, "reward": 0.5333442576229572, "reward_std": 0.9185099750757217, "rewards/cosine_scaled_reward": 0.03750544972717762, "rewards/format_reward": 0.4583333358168602, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 2151.625, "epoch": 0.08457142857142858, "grad_norm": 0.012440349906682968, "kl": 4.747509956359863e-05, "learning_rate": 9.007020842191634e-07, "loss": 0.2389, "reward": 0.48029588419012725, "reward_std": 0.5528060272336006, "rewards/cosine_scaled_reward": -0.1140187568962574, "rewards/format_reward": 0.7083333544433117, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 3049.4583740234375, "epoch": 0.08514285714285715, "grad_norm": 0.011064182966947556, "kl": 6.586313247680664e-05, "learning_rate": 8.987250199168808e-07, "loss": 0.1671, "reward": 0.06769676133990288, "reward_std": 0.833286034874618, "rewards/cosine_scaled_reward": -0.15365162305533886, "rewards/format_reward": 0.375, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 2719.791717529297, "epoch": 0.08571428571428572, "grad_norm": 0.0031947626266628504, "kl": 4.769861698150635e-05, "learning_rate": 8.967309592491052e-07, "loss": 0.05, "reward": -0.15981899201869965, "reward_std": 0.42918937653303146, "rewards/cosine_scaled_reward": -0.2674095034599304, "rewards/format_reward": 0.375, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 2954.4583740234375, "epoch": 0.08628571428571429, "grad_norm": 0.003440645756199956, "kl": 4.601478576660156e-05, "learning_rate": 8.9471999940354e-07, "loss": 0.071, "reward": 0.6965780556201935, "reward_std": 0.8623233735561371, "rewards/cosine_scaled_reward": 0.056622354313731194, "rewards/format_reward": 0.5833333544433117, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 3026.7916870117188, "epoch": 0.08685714285714285, "grad_norm": 0.00399381248280406, "kl": 4.413723945617676e-05, "learning_rate": 8.926922383915315e-07, "loss": 0.1389, "reward": -0.24755356460809708, "reward_std": 0.41591826826334, "rewards/cosine_scaled_reward": -0.31127678602933884, "rewards/format_reward": 0.3750000149011612, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 2853.0, "epoch": 0.08742857142857142, "grad_norm": 0.004272238351404667, "kl": 1.5154480934143066e-05, "learning_rate": 8.906477750432903e-07, "loss": 0.0141, "reward": 0.25616830214858055, "reward_std": 0.7309020422399044, "rewards/cosine_scaled_reward": -0.038582539185881615, "rewards/format_reward": 0.3333333358168602, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 3381.875, "epoch": 0.088, "grad_norm": 0.0025632160250097513, "kl": 5.59687614440918e-05, "learning_rate": 8.88586709003076e-07, "loss": 0.0141, "reward": -0.10425251722335815, "reward_std": 0.612097816541791, "rewards/cosine_scaled_reward": -0.1771262725815177, "rewards/format_reward": 0.2500000111758709, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3436.9583740234375, "epoch": 0.08857142857142856, "grad_norm": 0.0018446771427989006, "kl": 4.914402961730957e-05, "learning_rate": 8.865091407243394e-07, "loss": 0.0577, "reward": -0.2496664710342884, "reward_std": 0.7900647670030594, "rewards/cosine_scaled_reward": -0.18733322760090232, "rewards/format_reward": 0.1250000037252903, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1783.4167175292969, "epoch": 0.08914285714285715, "grad_norm": 0.005924923811107874, "kl": 4.07099723815918e-05, "learning_rate": 8.844151714648274e-07, "loss": 0.1441, "reward": 1.2347888499498367, "reward_std": 1.1125836223363876, "rewards/cosine_scaled_reward": 0.20072776451706886, "rewards/format_reward": 0.8333333432674408, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2373.2083435058594, "epoch": 0.08971428571428572, "grad_norm": 0.0033626053482294083, "kl": 4.8726797103881836e-05, "learning_rate": 8.823049032816478e-07, "loss": 0.1118, "reward": 0.0559084489941597, "reward_std": 0.4580029286444187, "rewards/cosine_scaled_reward": -0.222045773640275, "rewards/format_reward": 0.5, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 2975.4584350585938, "epoch": 0.09028571428571429, "grad_norm": 0.0029625447932630777, "kl": 4.2498111724853516e-05, "learning_rate": 8.801784390262943e-07, "loss": 0.0524, "reward": 0.6341586112976074, "reward_std": 0.9859010055661201, "rewards/cosine_scaled_reward": 0.02541262749582529, "rewards/format_reward": 0.5833333469927311, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.09085714285714286, "grad_norm": 0.0021222420036792755, "kl": 6.073713302612305e-05, "learning_rate": 8.780358823396352e-07, "loss": 0.0, "reward": -0.5768651217222214, "reward_std": 0.28308454528450966, "rewards/cosine_scaled_reward": -0.3092659078538418, "rewards/format_reward": 0.0416666679084301, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3393.041748046875, "epoch": 0.09142857142857143, "grad_norm": 0.002315351041033864, "kl": 4.673004150390625e-05, "learning_rate": 8.758773376468604e-07, "loss": 0.0833, "reward": -0.030030936002731323, "reward_std": 0.5910896249115467, "rewards/cosine_scaled_reward": -0.16084881778806448, "rewards/format_reward": 0.2916666753590107, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 2971.5834045410156, "epoch": 0.092, "grad_norm": 0.009864565916359425, "kl": 2.9772520065307617e-05, "learning_rate": 8.737029101523929e-07, "loss": 0.1475, "reward": 0.20956206321716309, "reward_std": 0.9482175186276436, "rewards/cosine_scaled_reward": -0.08271897211670876, "rewards/format_reward": 0.3750000111758709, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2947.5416870117188, "epoch": 0.09257142857142857, "grad_norm": 0.002943541621789336, "kl": 4.106760025024414e-05, "learning_rate": 8.715127058347614e-07, "loss": -0.0038, "reward": 0.012634404003620148, "reward_std": 0.4564446955919266, "rewards/cosine_scaled_reward": -0.11868278682231903, "rewards/format_reward": 0.25, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 3184.0, "epoch": 0.09314285714285714, "grad_norm": 0.004738081246614456, "kl": 5.429983139038086e-05, "learning_rate": 8.693068314414344e-07, "loss": 0.124, "reward": 0.031208477914333344, "reward_std": 0.5236095748841763, "rewards/cosine_scaled_reward": -0.10939577221870422, "rewards/format_reward": 0.2500000074505806, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 2846.291748046875, "epoch": 0.09371428571428571, "grad_norm": 0.002329840324819088, "kl": 3.78340482711792e-05, "learning_rate": 8.670853944836176e-07, "loss": 0.0326, "reward": 0.2723292261362076, "reward_std": 0.7624285668134689, "rewards/cosine_scaled_reward": -0.1555020585656166, "rewards/format_reward": 0.5833333358168602, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 3456.7500610351562, "epoch": 0.09428571428571429, "grad_norm": 0.0023597863037139177, "kl": 4.699826240539551e-05, "learning_rate": 8.648485032310144e-07, "loss": 0.0488, "reward": -0.1419277684763074, "reward_std": 0.9228034242987633, "rewards/cosine_scaled_reward": -0.1959638874977827, "rewards/format_reward": 0.2500000074505806, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 3447.6666870117188, "epoch": 0.09485714285714286, "grad_norm": 0.0023803836666047573, "kl": 5.8710575103759766e-05, "learning_rate": 8.625962667065487e-07, "loss": 0.0395, "reward": -0.3713258057832718, "reward_std": 0.6153203658759594, "rewards/cosine_scaled_reward": -0.2481629028916359, "rewards/format_reward": 0.125, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 2209.5834350585938, "epoch": 0.09542857142857143, "grad_norm": 0.003827621228992939, "kl": 3.835558891296387e-05, "learning_rate": 8.603287946810513e-07, "loss": 0.079, "reward": 1.3686834275722504, "reward_std": 1.1783415526151657, "rewards/cosine_scaled_reward": 0.26767505099996924, "rewards/format_reward": 0.833333358168602, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 3020.4583740234375, "epoch": 0.096, "grad_norm": 0.0026927669532597065, "kl": 4.649162292480469e-05, "learning_rate": 8.580461976679099e-07, "loss": 0.1193, "reward": -0.01824396848678589, "reward_std": 0.643525879830122, "rewards/cosine_scaled_reward": -0.19662199914455414, "rewards/format_reward": 0.3750000111758709, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 2622.500030517578, "epoch": 0.09657142857142857, "grad_norm": 0.00335469632409513, "kl": 3.059208393096924e-05, "learning_rate": 8.557485869176825e-07, "loss": -0.1168, "reward": 0.6284488886594772, "reward_std": 1.0370341651141644, "rewards/cosine_scaled_reward": 0.0433911457657814, "rewards/format_reward": 0.5416666716337204, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 3326.5000610351562, "epoch": 0.09714285714285714, "grad_norm": 0.002699315780773759, "kl": 3.189593553543091e-05, "learning_rate": 8.534360744126753e-07, "loss": 0.085, "reward": -0.09205850353464484, "reward_std": 0.20292617194354534, "rewards/cosine_scaled_reward": -0.12936258222907782, "rewards/format_reward": 0.1666666716337204, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 2685.7916870117188, "epoch": 0.09771428571428571, "grad_norm": 0.002784872427582741, "kl": 3.439188003540039e-05, "learning_rate": 8.511087728614862e-07, "loss": -0.007, "reward": 0.30891015753149986, "reward_std": 0.47979120910167694, "rewards/cosine_scaled_reward": -0.07471158355474472, "rewards/format_reward": 0.4583333432674408, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 2777.2083740234375, "epoch": 0.09828571428571428, "grad_norm": 0.006983851548284292, "kl": 3.954768180847168e-05, "learning_rate": 8.487667956935087e-07, "loss": 0.2782, "reward": -0.25468173064291477, "reward_std": 0.3752941247075796, "rewards/cosine_scaled_reward": -0.2731742076575756, "rewards/format_reward": 0.2916666716337204, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 2515.8334045410156, "epoch": 0.09885714285714285, "grad_norm": 0.003232243238016963, "kl": 4.437565803527832e-05, "learning_rate": 8.464102570534061e-07, "loss": 0.095, "reward": 0.05222426541149616, "reward_std": 0.7281403876841068, "rewards/cosine_scaled_reward": -0.22388787381350994, "rewards/format_reward": 0.5000000111758709, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 3384.75, "epoch": 0.09942857142857142, "grad_norm": 0.0032190983183681965, "kl": 5.0693750381469727e-05, "learning_rate": 8.440392717955475e-07, "loss": 0.0857, "reward": -0.5572768598794937, "reward_std": 0.342021893709898, "rewards/cosine_scaled_reward": -0.32030509412288666, "rewards/format_reward": 0.0833333358168602, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 2464.166717529297, "epoch": 0.1, "grad_norm": 0.004571603611111641, "kl": 2.2508203983306885e-05, "learning_rate": 8.416539554784089e-07, "loss": 0.1657, "reward": 1.121598768979311, "reward_std": 0.9176075011491776, "rewards/cosine_scaled_reward": 0.24829940125346184, "rewards/format_reward": 0.6250000149011612, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 2779.25, "epoch": 0.10057142857142858, "grad_norm": 0.0036453050561249256, "kl": 3.826618194580078e-05, "learning_rate": 8.392544243589427e-07, "loss": -0.002, "reward": 0.3596850214526057, "reward_std": 0.5097733177244663, "rewards/cosine_scaled_reward": -0.049324167892336845, "rewards/format_reward": 0.4583333432674408, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 2708.2916870117188, "epoch": 0.10114285714285715, "grad_norm": 0.0027700336650013924, "kl": 5.1975250244140625e-05, "learning_rate": 8.368407953869103e-07, "loss": -0.0087, "reward": 0.6223908215761185, "reward_std": 0.35710109770298004, "rewards/cosine_scaled_reward": 0.061195455491542816, "rewards/format_reward": 0.5, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 3560.7083740234375, "epoch": 0.10171428571428572, "grad_norm": 0.002747470512986183, "kl": 4.374980926513672e-05, "learning_rate": 8.344131861991828e-07, "loss": 0.0118, "reward": -0.2404082715511322, "reward_std": 0.7911849692463875, "rewards/cosine_scaled_reward": -0.1827041357755661, "rewards/format_reward": 0.1250000037252903, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 3293.125, "epoch": 0.10228571428571429, "grad_norm": 0.0029437618795782328, "kl": 5.97834587097168e-05, "learning_rate": 8.319717151140072e-07, "loss": 0.0357, "reward": 0.07692883908748627, "reward_std": 0.6668560411781073, "rewards/cosine_scaled_reward": -0.08653558790683746, "rewards/format_reward": 0.2500000111758709, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 3137.7916870117188, "epoch": 0.10285714285714286, "grad_norm": 0.003096961649134755, "kl": 6.0886144638061523e-05, "learning_rate": 8.295165011252396e-07, "loss": -0.0323, "reward": 0.0634104385972023, "reward_std": 0.9417376779019833, "rewards/cosine_scaled_reward": -0.11412811902118847, "rewards/format_reward": 0.2916666679084301, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 2885.2083435058594, "epoch": 0.10342857142857143, "grad_norm": 0.0021336576901376247, "kl": 3.346055746078491e-05, "learning_rate": 8.270476638965461e-07, "loss": -0.01, "reward": 0.2920467872172594, "reward_std": 0.8627648167312145, "rewards/cosine_scaled_reward": -0.08314326778054237, "rewards/format_reward": 0.4583333358168602, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 2997.7916870117188, "epoch": 0.104, "grad_norm": 0.002693953225389123, "kl": 5.555152893066406e-05, "learning_rate": 8.245653237555705e-07, "loss": 0.0071, "reward": -0.47716905176639557, "reward_std": 0.1475867610424757, "rewards/cosine_scaled_reward": -0.3635845109820366, "rewards/format_reward": 0.25, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 2524.9166870117188, "epoch": 0.10457142857142857, "grad_norm": 0.005251576192677021, "kl": 4.9054622650146484e-05, "learning_rate": 8.220696016880687e-07, "loss": 0.0654, "reward": -0.22561275959014893, "reward_std": 0.26068686321377754, "rewards/cosine_scaled_reward": -0.34197305142879486, "rewards/format_reward": 0.4583333432674408, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 2383.916717529297, "epoch": 0.10514285714285715, "grad_norm": 0.003508098889142275, "kl": 3.7997961044311523e-05, "learning_rate": 8.195606193320136e-07, "loss": -0.0559, "reward": 0.29369629733264446, "reward_std": 0.801459439098835, "rewards/cosine_scaled_reward": -0.16565186344087124, "rewards/format_reward": 0.625, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2556.0416870117188, "epoch": 0.10571428571428572, "grad_norm": 0.004692578222602606, "kl": 4.807114601135254e-05, "learning_rate": 8.170384989716657e-07, "loss": 0.2196, "reward": -0.19284344464540482, "reward_std": 0.522798104211688, "rewards/cosine_scaled_reward": -0.28392172837629914, "rewards/format_reward": 0.375, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2721.416717529297, "epoch": 0.10628571428571429, "grad_norm": 0.0025106044486165047, "kl": 4.979968070983887e-05, "learning_rate": 8.145033635316128e-07, "loss": -0.0084, "reward": 0.5346466451883316, "reward_std": 0.8034709915518761, "rewards/cosine_scaled_reward": 0.03815661370754242, "rewards/format_reward": 0.4583333358168602, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 3545.2916870117188, "epoch": 0.10685714285714286, "grad_norm": 0.0022053788416087627, "kl": 5.8710575103759766e-05, "learning_rate": 8.119553365707802e-07, "loss": 0.0149, "reward": -0.4619976691901684, "reward_std": 0.21103957667946815, "rewards/cosine_scaled_reward": -0.25183216109871864, "rewards/format_reward": 0.0416666679084301, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 3367.3333740234375, "epoch": 0.10742857142857143, "grad_norm": 0.003088901750743389, "kl": 4.4286251068115234e-05, "learning_rate": 8.093945422764069e-07, "loss": 0.0415, "reward": -0.23367027938365936, "reward_std": 0.8442450277507305, "rewards/cosine_scaled_reward": -0.20016847737133503, "rewards/format_reward": 0.1666666679084301, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 3421.791748046875, "epoch": 0.108, "grad_norm": 0.0021817106753587723, "kl": 3.2335519790649414e-05, "learning_rate": 8.068211054579943e-07, "loss": 0.0515, "reward": -0.10547615587711334, "reward_std": 0.7077377215027809, "rewards/cosine_scaled_reward": -0.17773808538913727, "rewards/format_reward": 0.2500000074505806, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 3176.2083435058594, "epoch": 0.10857142857142857, "grad_norm": 0.0037218970246613026, "kl": 2.396106719970703e-05, "learning_rate": 8.04235151541222e-07, "loss": 0.135, "reward": -0.30605872720479965, "reward_std": 0.30986810475587845, "rewards/cosine_scaled_reward": -0.2571960296481848, "rewards/format_reward": 0.2083333395421505, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2959.1250610351562, "epoch": 0.10914285714285714, "grad_norm": 0.003466800320893526, "kl": 4.982948303222656e-05, "learning_rate": 8.01636806561836e-07, "loss": 0.1414, "reward": -0.01989271119236946, "reward_std": 0.5551414601504803, "rewards/cosine_scaled_reward": -0.2182796848937869, "rewards/format_reward": 0.4166666828095913, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 2735.666717529297, "epoch": 0.10971428571428571, "grad_norm": 0.0031787888146936893, "kl": 5.136430263519287e-05, "learning_rate": 7.990261971595048e-07, "loss": 0.0411, "reward": 0.14356477558612823, "reward_std": 0.4512869492173195, "rewards/cosine_scaled_reward": -0.09488428384065628, "rewards/format_reward": 0.3333333358168602, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.11028571428571429, "grad_norm": 0.0021374195348471403, "kl": 3.606081008911133e-05, "learning_rate": 7.964034505716476e-07, "loss": 0.0, "reward": -0.4283226765692234, "reward_std": 0.22863994538784027, "rewards/cosine_scaled_reward": -0.2349946592003107, "rewards/format_reward": 0.0416666679084301, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 2577.208396911621, "epoch": 0.11085714285714286, "grad_norm": 0.008714930154383183, "kl": 5.8710575103759766e-05, "learning_rate": 7.93768694627233e-07, "loss": 0.1469, "reward": 0.379447802901268, "reward_std": 1.0763712525367737, "rewards/cosine_scaled_reward": -0.0394427664577961, "rewards/format_reward": 0.4583333395421505, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 3056.0416870117188, "epoch": 0.11142857142857143, "grad_norm": 0.004441999830305576, "kl": 4.8041343688964844e-05, "learning_rate": 7.911220577405484e-07, "loss": 0.0, "reward": 0.07377637922763824, "reward_std": 0.7862755209207535, "rewards/cosine_scaled_reward": -0.12977851927280426, "rewards/format_reward": 0.3333333358168602, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2460.8333587646484, "epoch": 0.112, "grad_norm": 0.0038764234632253647, "kl": 1.858919858932495e-05, "learning_rate": 7.884636689049422e-07, "loss": 0.1277, "reward": 0.37988660484552383, "reward_std": 0.2439479697495699, "rewards/cosine_scaled_reward": -0.018390029668807983, "rewards/format_reward": 0.4166666716337204, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 3163.625, "epoch": 0.11257142857142857, "grad_norm": 0.0023682203609496355, "kl": 4.455447196960449e-05, "learning_rate": 7.857936576865356e-07, "loss": 0.0384, "reward": -0.3652522414922714, "reward_std": 0.497545950114727, "rewards/cosine_scaled_reward": -0.28679277934134007, "rewards/format_reward": 0.2083333432674408, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2555.4583435058594, "epoch": 0.11314285714285714, "grad_norm": 0.0035334392450749874, "kl": 6.371736526489258e-05, "learning_rate": 7.831121542179086e-07, "loss": 0.0551, "reward": 0.3976600244641304, "reward_std": 0.406425304710865, "rewards/cosine_scaled_reward": -0.009503308683633804, "rewards/format_reward": 0.4166666716337204, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 3012.6666870117188, "epoch": 0.11371428571428571, "grad_norm": 0.003297847229987383, "kl": 3.933906555175781e-05, "learning_rate": 7.804192891917571e-07, "loss": 0.051, "reward": 0.16553892940282822, "reward_std": 0.8705131523311138, "rewards/cosine_scaled_reward": -0.10473055252805352, "rewards/format_reward": 0.3750000074505806, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 2619.2916870117188, "epoch": 0.11428571428571428, "grad_norm": 0.010652230121195316, "kl": 3.664195537567139e-05, "learning_rate": 7.777151938545235e-07, "loss": 0.102, "reward": 0.27913960814476013, "reward_std": 0.9184211455285549, "rewards/cosine_scaled_reward": -0.08959684334695339, "rewards/format_reward": 0.4583333395421505, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 2651.0000610351562, "epoch": 0.11485714285714285, "grad_norm": 0.0043051885440945625, "kl": 4.982948303222656e-05, "learning_rate": 7.75e-07, "loss": -0.0637, "reward": 0.09083989635109901, "reward_std": 0.5603456795215607, "rewards/cosine_scaled_reward": -0.22541339695453644, "rewards/format_reward": 0.541666679084301, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 2754.4584045410156, "epoch": 0.11542857142857142, "grad_norm": 0.0033791528549045324, "kl": 8.746795356273651e-05, "learning_rate": 7.72273839962904e-07, "loss": 0.0539, "reward": 0.402818888425827, "reward_std": 0.8441142663359642, "rewards/cosine_scaled_reward": 0.013909453991800547, "rewards/format_reward": 0.3750000037252903, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 3077.666748046875, "epoch": 0.116, "grad_norm": 0.0044442457146942616, "kl": 4.76837158203125e-05, "learning_rate": 7.695368466124296e-07, "loss": 0.1061, "reward": 0.513311724178493, "reward_std": 1.119561430066824, "rewards/cosine_scaled_reward": 0.02748920302838087, "rewards/format_reward": 0.4583333469927311, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 2742.8333435058594, "epoch": 0.11657142857142858, "grad_norm": 0.0030542558524757624, "kl": 3.9070844650268555e-05, "learning_rate": 7.667891533457718e-07, "loss": 0.0672, "reward": -0.33509062230587006, "reward_std": 0.20147894229739904, "rewards/cosine_scaled_reward": -0.35504530370235443, "rewards/format_reward": 0.3750000037252903, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 2805.0416870117188, "epoch": 0.11714285714285715, "grad_norm": 0.003254874609410763, "kl": 5.1274895668029785e-05, "learning_rate": 7.640308940816239e-07, "loss": 0.0829, "reward": 0.09149940311908722, "reward_std": 0.7408989146351814, "rewards/cosine_scaled_reward": -0.14175030961632729, "rewards/format_reward": 0.375, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 3140.5833435058594, "epoch": 0.11771428571428572, "grad_norm": 0.003935704939067364, "kl": 4.225969314575195e-05, "learning_rate": 7.612622032536507e-07, "loss": -0.0897, "reward": -0.06805887818336487, "reward_std": 0.5183359235525131, "rewards/cosine_scaled_reward": -0.15902943164110184, "rewards/format_reward": 0.25, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 3012.6666870117188, "epoch": 0.11828571428571429, "grad_norm": 0.010786812752485275, "kl": 3.5256147384643555e-05, "learning_rate": 7.584832158039378e-07, "loss": 0.1521, "reward": 0.40019524469971657, "reward_std": 0.7886368874460459, "rewards/cosine_scaled_reward": 0.03343097306787968, "rewards/format_reward": 0.3333333432674408, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 2307.250030517578, "epoch": 0.11885714285714286, "grad_norm": 0.00365254282951355, "kl": 2.9072165489196777e-05, "learning_rate": 7.556940671764124e-07, "loss": -0.1084, "reward": 1.1552911549806595, "reward_std": 0.9773512110114098, "rewards/cosine_scaled_reward": 0.28597887605428696, "rewards/format_reward": 0.5833333432674408, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2685.416732788086, "epoch": 0.11942857142857143, "grad_norm": 0.003647660603746772, "kl": 2.574920654296875e-05, "learning_rate": 7.528948933102438e-07, "loss": 0.066, "reward": 0.5394881032407284, "reward_std": 0.5921271587722003, "rewards/cosine_scaled_reward": 0.04057743027806282, "rewards/format_reward": 0.4583333358168602, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 2998.666748046875, "epoch": 0.12, "grad_norm": 0.003377731656655669, "kl": 4.7713518142700195e-05, "learning_rate": 7.500858306332172e-07, "loss": -0.0007, "reward": -0.04719238355755806, "reward_std": 0.5942197516560555, "rewards/cosine_scaled_reward": -0.2110961927101016, "rewards/format_reward": 0.3750000037252903, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 2438.25, "epoch": 0.12057142857142857, "grad_norm": 0.00782945565879345, "kl": 2.682209014892578e-05, "learning_rate": 7.472670160550848e-07, "loss": 0.166, "reward": 0.7385347038507462, "reward_std": 0.6991745624691248, "rewards/cosine_scaled_reward": 0.14010068029165268, "rewards/format_reward": 0.4583333432674408, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 3452.0000610351562, "epoch": 0.12114285714285715, "grad_norm": 0.0024741115048527718, "kl": 7.659196853637695e-05, "learning_rate": 7.444385869608921e-07, "loss": 0.05, "reward": 0.18024883948964998, "reward_std": 0.8185827434062958, "rewards/cosine_scaled_reward": -0.014042245224118233, "rewards/format_reward": 0.2083333395421505, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2006.291732788086, "epoch": 0.12171428571428572, "grad_norm": 0.003888034960255027, "kl": 1.5139579772949219e-05, "learning_rate": 7.416006812042827e-07, "loss": 0.0612, "reward": 0.6797242658212781, "reward_std": 0.9185204580426216, "rewards/cosine_scaled_reward": 0.006528797559440136, "rewards/format_reward": 0.6666666679084301, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2501.625, "epoch": 0.12228571428571429, "grad_norm": 0.0037861778400838375, "kl": 2.767890691757202e-05, "learning_rate": 7.387534371007797e-07, "loss": 0.112, "reward": -0.16776859015226364, "reward_std": 0.30463359877467155, "rewards/cosine_scaled_reward": -0.3130509778857231, "rewards/format_reward": 0.4583333432674408, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 2972.7083435058594, "epoch": 0.12285714285714286, "grad_norm": 0.0025469064712524414, "kl": 3.832578659057617e-05, "learning_rate": 7.358969934210438e-07, "loss": -0.042, "reward": -0.3024084270000458, "reward_std": 0.5482605956494808, "rewards/cosine_scaled_reward": -0.31787090189754963, "rewards/format_reward": 0.3333333358168602, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1278.7916870117188, "epoch": 0.12342857142857143, "grad_norm": 0.006723387632519007, "kl": 4.062056541442871e-05, "learning_rate": 7.330314893841101e-07, "loss": 0.1709, "reward": 2.010191023349762, "reward_std": 0.7538085486739874, "rewards/cosine_scaled_reward": 0.5259288102388382, "rewards/format_reward": 0.9583333432674408, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 3498.25, "epoch": 0.124, "grad_norm": 0.0020758018363267183, "kl": 3.9011240005493164e-05, "learning_rate": 7.301570646506027e-07, "loss": 0.0412, "reward": -0.15263769030570984, "reward_std": 0.4959714636206627, "rewards/cosine_scaled_reward": -0.11798550933599472, "rewards/format_reward": 0.0833333358168602, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2252.2083435058594, "epoch": 0.12457142857142857, "grad_norm": 0.005996921099722385, "kl": 5.181506276130676e-05, "learning_rate": 7.27273859315928e-07, "loss": 0.0957, "reward": 0.7515158951282501, "reward_std": 0.5205066092312336, "rewards/cosine_scaled_reward": 0.02159126102924347, "rewards/format_reward": 0.7083333432674408, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 2268.291748046875, "epoch": 0.12514285714285714, "grad_norm": 0.005396787542849779, "kl": 5.3822994232177734e-05, "learning_rate": 7.243820139034464e-07, "loss": -0.0202, "reward": 1.0924356114119291, "reward_std": 0.6144313961267471, "rewards/cosine_scaled_reward": 0.10871778428554535, "rewards/format_reward": 0.875, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 2791.6250610351562, "epoch": 0.12571428571428572, "grad_norm": 0.0036442929413169622, "kl": 4.908442497253418e-05, "learning_rate": 7.214816693576234e-07, "loss": 0.1078, "reward": 0.797138144262135, "reward_std": 0.7596875503659248, "rewards/cosine_scaled_reward": 0.08606908097863197, "rewards/format_reward": 0.6250000111758709, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1810.6250305175781, "epoch": 0.12628571428571428, "grad_norm": 0.0073392982594668865, "kl": 2.9087066650390625e-05, "learning_rate": 7.185729670371604e-07, "loss": 0.1733, "reward": 0.6059142500162125, "reward_std": 1.0285449177026749, "rewards/cosine_scaled_reward": -0.0720429141074419, "rewards/format_reward": 0.7500000111758709, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1707.75, "epoch": 0.12685714285714286, "grad_norm": 0.0058254050090909, "kl": 6.22868537902832e-05, "learning_rate": 7.156560487081051e-07, "loss": -0.0217, "reward": 0.5610026121139526, "reward_std": 0.4320370554924011, "rewards/cosine_scaled_reward": -0.09449871256947517, "rewards/format_reward": 0.75, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 3217.8750610351562, "epoch": 0.12742857142857142, "grad_norm": 0.002638309495523572, "kl": 4.83095645904541e-05, "learning_rate": 7.127310565369415e-07, "loss": 0.0311, "reward": -0.034965626895427704, "reward_std": 0.6273398473858833, "rewards/cosine_scaled_reward": -0.16331613808870316, "rewards/format_reward": 0.2916666716337204, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 3119.166748046875, "epoch": 0.128, "grad_norm": 0.0033599366433918476, "kl": 3.606081008911133e-05, "learning_rate": 7.097981330836616e-07, "loss": 0.0746, "reward": 0.8190377280116081, "reward_std": 0.7048506755381823, "rewards/cosine_scaled_reward": 0.18035220727324486, "rewards/format_reward": 0.4583333358168602, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 3055.125, "epoch": 0.12857142857142856, "grad_norm": 0.0045721810311079025, "kl": 2.7857720851898193e-05, "learning_rate": 7.068574212948169e-07, "loss": -0.0466, "reward": 0.21222206205129623, "reward_std": 0.30652017891407013, "rewards/cosine_scaled_reward": -0.01888899877667427, "rewards/format_reward": 0.25, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2263.9583587646484, "epoch": 0.12914285714285714, "grad_norm": 0.004758474417030811, "kl": 3.94284725189209e-05, "learning_rate": 7.039090644965509e-07, "loss": -0.0502, "reward": 0.7423373945057392, "reward_std": 0.3515051454305649, "rewards/cosine_scaled_reward": 0.12116867303848267, "rewards/format_reward": 0.5, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2196.125, "epoch": 0.12971428571428573, "grad_norm": 0.0028872666880488396, "kl": 2.041459083557129e-05, "learning_rate": 7.009532063876148e-07, "loss": -0.0022, "reward": 0.5498324036598206, "reward_std": 0.7948595993220806, "rewards/cosine_scaled_reward": -0.03758380934596062, "rewards/format_reward": 0.625, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1336.0417022705078, "epoch": 0.13028571428571428, "grad_norm": 0.00446860259398818, "kl": 2.1146610379219055e-05, "learning_rate": 6.979899910323624e-07, "loss": 0.0718, "reward": 1.0034500658512115, "reward_std": 0.3992225453257561, "rewards/cosine_scaled_reward": 0.08505835384130478, "rewards/format_reward": 0.8333333358168602, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2710.041717529297, "epoch": 0.13085714285714287, "grad_norm": 0.0038390173576772213, "kl": 2.504885196685791e-05, "learning_rate": 6.950195628537299e-07, "loss": 0.0697, "reward": 0.3567547835409641, "reward_std": 0.2776902988553047, "rewards/cosine_scaled_reward": -0.029955971986055374, "rewards/format_reward": 0.4166666716337204, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 3496.875, "epoch": 0.13142857142857142, "grad_norm": 0.002959838602691889, "kl": 4.273653030395508e-05, "learning_rate": 6.920420666261961e-07, "loss": 0.0246, "reward": 0.05579942651093006, "reward_std": 0.8793547209352255, "rewards/cosine_scaled_reward": -0.05543362256139517, "rewards/format_reward": 0.1666666679084301, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 3178.6666870117188, "epoch": 0.132, "grad_norm": 0.0028935428708791733, "kl": 6.222724914550781e-05, "learning_rate": 6.890576474687263e-07, "loss": -0.0444, "reward": 0.24354754388332367, "reward_std": 0.7453293558210135, "rewards/cosine_scaled_reward": -0.04489289969205856, "rewards/format_reward": 0.3333333358168602, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2722.5, "epoch": 0.13257142857142856, "grad_norm": 0.0036100989673286676, "kl": 3.808736801147461e-05, "learning_rate": 6.860664508377001e-07, "loss": 0.1031, "reward": -0.28667350858449936, "reward_std": 0.312796700745821, "rewards/cosine_scaled_reward": -0.2891700938344002, "rewards/format_reward": 0.2916666679084301, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 3563.9166870117188, "epoch": 0.13314285714285715, "grad_norm": 0.001670429832302034, "kl": 2.8133392333984375e-05, "learning_rate": 6.83068622519821e-07, "loss": 0.0072, "reward": -0.26926689222455025, "reward_std": 0.3057979866862297, "rewards/cosine_scaled_reward": -0.17630011402070522, "rewards/format_reward": 0.0833333358168602, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 3420.3750610351562, "epoch": 0.1337142857142857, "grad_norm": 0.0019689537584781647, "kl": 5.710124969482422e-05, "learning_rate": 6.800643086250121e-07, "loss": 0.0649, "reward": -0.11857819184660912, "reward_std": 0.79299321398139, "rewards/cosine_scaled_reward": -0.12178908661007881, "rewards/format_reward": 0.1250000037252903, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 2734.7083587646484, "epoch": 0.13428571428571429, "grad_norm": 0.0032859258353710175, "kl": 3.275275230407715e-05, "learning_rate": 6.770536555792944e-07, "loss": 0.0852, "reward": 0.5404793471097946, "reward_std": 0.5098964013159275, "rewards/cosine_scaled_reward": 0.10357297211885452, "rewards/format_reward": 0.3333333358168602, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1997.6250305175781, "epoch": 0.13485714285714287, "grad_norm": 0.0030274989549070597, "kl": 4.328787326812744e-05, "learning_rate": 6.740368101176495e-07, "loss": 0.0324, "reward": 0.9046305678784847, "reward_std": 0.4162073917686939, "rewards/cosine_scaled_reward": -0.026851389557123184, "rewards/format_reward": 0.9583333432674408, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1991.0833435058594, "epoch": 0.13542857142857143, "grad_norm": 0.012578634545207024, "kl": 0.00019246339797973633, "learning_rate": 6.710139192768694e-07, "loss": 0.2968, "reward": 0.06364882737398148, "reward_std": 0.33432740345597267, "rewards/cosine_scaled_reward": -0.28067560493946075, "rewards/format_reward": 0.6250000149011612, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 2756.5416870117188, "epoch": 0.136, "grad_norm": 0.003025411395356059, "kl": 4.3779611587524414e-05, "learning_rate": 6.679851303883891e-07, "loss": 0.0886, "reward": -0.061539724469184875, "reward_std": 0.5319161303341389, "rewards/cosine_scaled_reward": -0.23910319432616234, "rewards/format_reward": 0.4166666716337204, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 3125.0000610351562, "epoch": 0.13657142857142857, "grad_norm": 0.0038780400063842535, "kl": 3.1828880310058594e-05, "learning_rate": 6.649505910711058e-07, "loss": 0.126, "reward": 0.24956950050545856, "reward_std": 1.2599260210990906, "rewards/cosine_scaled_reward": -0.06271525658667088, "rewards/format_reward": 0.3750000037252903, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 2713.2084045410156, "epoch": 0.13714285714285715, "grad_norm": 0.0067334724590182304, "kl": 3.93986701965332e-05, "learning_rate": 6.619104492241847e-07, "loss": 0.2597, "reward": 0.16051556961610913, "reward_std": 0.598676634952426, "rewards/cosine_scaled_reward": -0.16974221915006638, "rewards/format_reward": 0.5000000149011612, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2769.2083587646484, "epoch": 0.1377142857142857, "grad_norm": 0.005129523109644651, "kl": 5.561113357543945e-05, "learning_rate": 6.588648530198504e-07, "loss": 0.0109, "reward": 0.7743917256593704, "reward_std": 1.1010991334915161, "rewards/cosine_scaled_reward": 0.15802916581742465, "rewards/format_reward": 0.4583333358168602, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 3484.0, "epoch": 0.1382857142857143, "grad_norm": 0.0032152337953448296, "kl": 6.651878356933594e-05, "learning_rate": 6.558139508961654e-07, "loss": 0.0386, "reward": -0.27925232984125614, "reward_std": 0.15112961642444134, "rewards/cosine_scaled_reward": -0.16045950073748827, "rewards/format_reward": 0.0416666679084301, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 3083.6666870117188, "epoch": 0.13885714285714285, "grad_norm": 0.005801417864859104, "kl": 4.985928535461426e-05, "learning_rate": 6.527578915497951e-07, "loss": 0.0691, "reward": -0.3468597615137696, "reward_std": 0.36454798001796007, "rewards/cosine_scaled_reward": -0.2775965635664761, "rewards/format_reward": 0.2083333432674408, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 3152.291717529297, "epoch": 0.13942857142857143, "grad_norm": 0.0028560124337673187, "kl": 7.191300392150879e-05, "learning_rate": 6.496968239287603e-07, "loss": -0.069, "reward": -0.14723733812570572, "reward_std": 0.5063724052160978, "rewards/cosine_scaled_reward": -0.26111866161227226, "rewards/format_reward": 0.375, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 2605.1666870117188, "epoch": 0.14, "grad_norm": 0.0030706387478858232, "kl": 5.2422285079956055e-05, "learning_rate": 6.466308972251785e-07, "loss": 0.3315, "reward": 0.6067185699939728, "reward_std": 1.3671210408210754, "rewards/cosine_scaled_reward": 0.07419260777533054, "rewards/format_reward": 0.4583333469927311, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2902.4583587646484, "epoch": 0.14057142857142857, "grad_norm": 0.002993271453306079, "kl": 3.580749034881592e-05, "learning_rate": 6.435602608679916e-07, "loss": 0.0463, "reward": -0.022408947348594666, "reward_std": 0.5483135208487511, "rewards/cosine_scaled_reward": -0.17787115648388863, "rewards/format_reward": 0.3333333358168602, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 3522.4583740234375, "epoch": 0.14114285714285715, "grad_norm": 0.0018838891992345452, "kl": 3.56137752532959e-05, "learning_rate": 6.404850645156841e-07, "loss": 0.0187, "reward": 0.30432749539613724, "reward_std": 1.2303246967494488, "rewards/cosine_scaled_reward": 0.0063303932547569275, "rewards/format_reward": 0.2916666679084301, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1874.2917175292969, "epoch": 0.1417142857142857, "grad_norm": 0.00652493629604578, "kl": 3.209710121154785e-05, "learning_rate": 6.374054580489873e-07, "loss": 0.1189, "reward": 0.7739015333354473, "reward_std": 0.9217725694179535, "rewards/cosine_scaled_reward": -0.008882574737071991, "rewards/format_reward": 0.791666679084301, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 3049.0833740234375, "epoch": 0.1422857142857143, "grad_norm": 0.0035696683917194605, "kl": 4.778057336807251e-05, "learning_rate": 6.343215915635761e-07, "loss": 0.1231, "reward": -0.17236267449334264, "reward_std": 0.655427098274231, "rewards/cosine_scaled_reward": -0.2736813314259052, "rewards/format_reward": 0.3750000111758709, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2750.5833587646484, "epoch": 0.14285714285714285, "grad_norm": 0.002898057224228978, "kl": 3.740191459655762e-05, "learning_rate": 6.31233615362752e-07, "loss": 0.0582, "reward": 0.08264456689357758, "reward_std": 0.500743918120861, "rewards/cosine_scaled_reward": -0.1253443881869316, "rewards/format_reward": 0.3333333358168602, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2088.2084045410156, "epoch": 0.14342857142857143, "grad_norm": 0.003718048334121704, "kl": 2.4572014808654785e-05, "learning_rate": 6.281416799501187e-07, "loss": 0.0679, "reward": 0.39472272619605064, "reward_std": 0.7821769379079342, "rewards/cosine_scaled_reward": -0.15680530993267894, "rewards/format_reward": 0.7083333432674408, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2919.2084045410156, "epoch": 0.144, "grad_norm": 0.0034788185730576515, "kl": 5.519390106201172e-05, "learning_rate": 6.25045936022246e-07, "loss": 0.1207, "reward": 0.07164974510669708, "reward_std": 0.7563342545181513, "rewards/cosine_scaled_reward": -0.17250845208764076, "rewards/format_reward": 0.416666679084301, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2955.4166870117188, "epoch": 0.14457142857142857, "grad_norm": 0.0036410244647413492, "kl": 4.571676254272461e-05, "learning_rate": 6.219465344613258e-07, "loss": -0.1062, "reward": -0.2852509953081608, "reward_std": 0.222552802413702, "rewards/cosine_scaled_reward": -0.267625505104661, "rewards/format_reward": 0.25, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2840.0833435058594, "epoch": 0.14514285714285713, "grad_norm": 0.002139769494533539, "kl": 1.9311904907226562e-05, "learning_rate": 6.188436263278172e-07, "loss": 0.0966, "reward": 0.7411958947777748, "reward_std": 0.8247161880135536, "rewards/cosine_scaled_reward": 0.0997646115720272, "rewards/format_reward": 0.541666679084301, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2588.0833587646484, "epoch": 0.1457142857142857, "grad_norm": 0.0029731092508882284, "kl": 4.3958425521850586e-05, "learning_rate": 6.157373628530852e-07, "loss": 0.0872, "reward": 0.7755952775478363, "reward_std": 0.4564969390630722, "rewards/cosine_scaled_reward": 0.15863101184368134, "rewards/format_reward": 0.4583333432674408, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 3512.5833740234375, "epoch": 0.1462857142857143, "grad_norm": 0.0030253275763243437, "kl": 3.644824028015137e-05, "learning_rate": 6.126278954320294e-07, "loss": 0.0273, "reward": -0.5299574732780457, "reward_std": 0.33321886509656906, "rewards/cosine_scaled_reward": -0.30664539337158203, "rewards/format_reward": 0.0833333358168602, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 2930.3750610351562, "epoch": 0.14685714285714285, "grad_norm": 0.0032464053947478533, "kl": 4.4345855712890625e-05, "learning_rate": 6.095153756157051e-07, "loss": 0.0163, "reward": -0.03792719542980194, "reward_std": 0.5811977386474609, "rewards/cosine_scaled_reward": -0.20646359771490097, "rewards/format_reward": 0.3750000037252903, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 2009.0833587646484, "epoch": 0.14742857142857144, "grad_norm": 0.003291090251877904, "kl": 2.537667751312256e-05, "learning_rate": 6.06399955103937e-07, "loss": 0.0857, "reward": 0.5619839504361153, "reward_std": 0.5102794338017702, "rewards/cosine_scaled_reward": -0.05234135687351227, "rewards/format_reward": 0.6666666716337204, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 3262.2500610351562, "epoch": 0.148, "grad_norm": 0.0030783037655055523, "kl": 3.460049629211426e-05, "learning_rate": 6.032817857379256e-07, "loss": 0.0153, "reward": -0.26981719583272934, "reward_std": 0.7145614288747311, "rewards/cosine_scaled_reward": -0.23907526023685932, "rewards/format_reward": 0.2083333358168602, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 3462.5, "epoch": 0.14857142857142858, "grad_norm": 0.00235256040468812, "kl": 3.841519355773926e-05, "learning_rate": 6.001610194928464e-07, "loss": 0.0175, "reward": 0.3264825101941824, "reward_std": 0.32108007185161114, "rewards/cosine_scaled_reward": 0.03824122529476881, "rewards/format_reward": 0.25, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 3391.875, "epoch": 0.14914285714285713, "grad_norm": 0.002372028538957238, "kl": 4.51207160949707e-05, "learning_rate": 5.97037808470444e-07, "loss": 0.0627, "reward": -0.5741367600858212, "reward_std": 0.21517318114638329, "rewards/cosine_scaled_reward": -0.32873503863811493, "rewards/format_reward": 0.0833333358168602, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 3266.0833740234375, "epoch": 0.14971428571428572, "grad_norm": 0.0019727584440261126, "kl": 3.857165575027466e-05, "learning_rate": 5.939123048916173e-07, "loss": 0.0249, "reward": 0.3536590598523617, "reward_std": 0.2609690725803375, "rewards/cosine_scaled_reward": 0.010162822902202606, "rewards/format_reward": 0.3333333358168602, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 2608.5, "epoch": 0.15028571428571427, "grad_norm": 0.003275679424405098, "kl": 5.266070365905762e-05, "learning_rate": 5.907846610890011e-07, "loss": 0.1018, "reward": 0.10551428329199553, "reward_std": 0.33940607495605946, "rewards/cosine_scaled_reward": -0.13474285730626434, "rewards/format_reward": 0.375, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.15085714285714286, "grad_norm": 0.0020819255150854588, "kl": 6.270408630371094e-05, "learning_rate": 5.87655029499542e-07, "loss": 0.0, "reward": -0.5984752140939236, "reward_std": 0.16026042588055134, "rewards/cosine_scaled_reward": -0.2992375921458006, "rewards/format_reward": 0.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 3450.1250610351562, "epoch": 0.15142857142857144, "grad_norm": 0.0027288312558084726, "kl": 6.195902824401855e-05, "learning_rate": 5.845235626570683e-07, "loss": 0.0173, "reward": -0.1949300542473793, "reward_std": 0.5684019215404987, "rewards/cosine_scaled_reward": -0.20163171365857124, "rewards/format_reward": 0.2083333395421505, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 3069.2916870117188, "epoch": 0.152, "grad_norm": 0.005662751849740744, "kl": 4.8786401748657227e-05, "learning_rate": 5.813904131848564e-07, "loss": 0.0767, "reward": 0.1254543773829937, "reward_std": 0.4721978511661291, "rewards/cosine_scaled_reward": -0.1247728019952774, "rewards/format_reward": 0.375, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1895.125015258789, "epoch": 0.15257142857142858, "grad_norm": 0.008434503339231014, "kl": 2.736598253250122e-05, "learning_rate": 5.78255733788191e-07, "loss": 0.1131, "reward": 0.2025701403617859, "reward_std": 0.4612788297235966, "rewards/cosine_scaled_reward": -0.25288160145282745, "rewards/format_reward": 0.7083333432674408, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2430.5834045410156, "epoch": 0.15314285714285714, "grad_norm": 0.003727820934727788, "kl": 3.916025161743164e-05, "learning_rate": 5.751196772469237e-07, "loss": 0.0008, "reward": 0.14945873618125916, "reward_std": 0.7373831644654274, "rewards/cosine_scaled_reward": -0.17527063190937042, "rewards/format_reward": 0.5000000074505806, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 3454.2916870117188, "epoch": 0.15371428571428572, "grad_norm": 0.0024431291967630386, "kl": 1.817941665649414e-05, "learning_rate": 5.71982396408026e-07, "loss": 0.0202, "reward": -0.24634560383856297, "reward_std": 0.46645686961710453, "rewards/cosine_scaled_reward": -0.164839468896389, "rewards/format_reward": 0.0833333358168602, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2094.4583435058594, "epoch": 0.15428571428571428, "grad_norm": 0.003973724320530891, "kl": 4.7266483306884766e-05, "learning_rate": 5.688440441781398e-07, "loss": 0.0566, "reward": 0.7501662075519562, "reward_std": 0.36367307882755995, "rewards/cosine_scaled_reward": 0.1250830963253975, "rewards/format_reward": 0.5, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 2104.5833740234375, "epoch": 0.15485714285714286, "grad_norm": 0.008249267004430294, "kl": 4.731118679046631e-05, "learning_rate": 5.657047735161255e-07, "loss": 0.1897, "reward": 0.6790582984685898, "reward_std": 0.6757728382945061, "rewards/cosine_scaled_reward": -0.05630417726933956, "rewards/format_reward": 0.7916666716337204, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2825.3750915527344, "epoch": 0.15542857142857142, "grad_norm": 0.0036491211503744125, "kl": 4.8786401748657227e-05, "learning_rate": 5.625647374256061e-07, "loss": -0.0303, "reward": 0.4601898267865181, "reward_std": 1.0635973513126373, "rewards/cosine_scaled_reward": -0.08240509033203125, "rewards/format_reward": 0.6250000111758709, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 3119.6666870117188, "epoch": 0.156, "grad_norm": 0.0034709065221250057, "kl": 5.245208740234375e-05, "learning_rate": 5.594240889475106e-07, "loss": 0.0636, "reward": -0.15936421230435371, "reward_std": 0.6982405669987202, "rewards/cosine_scaled_reward": -0.22551544196903706, "rewards/format_reward": 0.2916666753590107, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 3552.0, "epoch": 0.15657142857142858, "grad_norm": 0.0021109154913574457, "kl": 3.0353665351867676e-05, "learning_rate": 5.562829811526154e-07, "loss": 0.0187, "reward": -0.49512597266584635, "reward_std": 0.2467654086649418, "rewards/cosine_scaled_reward": -0.2683963133022189, "rewards/format_reward": 0.0416666679084301, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 2172.916702270508, "epoch": 0.15714285714285714, "grad_norm": 0.003394890343770385, "kl": 6.458163261413574e-05, "learning_rate": 5.531415671340826e-07, "loss": 0.0659, "reward": -0.0462833046913147, "reward_std": 0.41944438125938177, "rewards/cosine_scaled_reward": -0.293974993750453, "rewards/format_reward": 0.5416666679084301, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 2155.291702270508, "epoch": 0.15771428571428572, "grad_norm": 0.0036149960942566395, "kl": 3.6716461181640625e-05, "learning_rate": 5.5e-07, "loss": 0.0182, "reward": 1.0795062081888318, "reward_std": 1.133161500096321, "rewards/cosine_scaled_reward": 0.20641976967453957, "rewards/format_reward": 0.6666666679084301, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 2870.5833740234375, "epoch": 0.15828571428571428, "grad_norm": 0.0024542666506022215, "kl": 4.220008850097656e-05, "learning_rate": 5.468584328659172e-07, "loss": -0.0311, "reward": 0.4436817020177841, "reward_std": 0.6852018758654594, "rewards/cosine_scaled_reward": -0.007325820624828339, "rewards/format_reward": 0.4583333432674408, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2459.7916870117188, "epoch": 0.15885714285714286, "grad_norm": 0.003170615527778864, "kl": 4.8100948333740234e-05, "learning_rate": 5.437170188473847e-07, "loss": 0.1285, "reward": 0.18449506163597107, "reward_std": 0.7764367498457432, "rewards/cosine_scaled_reward": -0.15775247290730476, "rewards/format_reward": 0.5000000111758709, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 2768.125030517578, "epoch": 0.15942857142857142, "grad_norm": 0.009930440224707127, "kl": 5.936622619628906e-05, "learning_rate": 5.405759110524894e-07, "loss": 0.2261, "reward": 0.20033748634159565, "reward_std": 0.8129779398441315, "rewards/cosine_scaled_reward": -0.14983126148581505, "rewards/format_reward": 0.5000000111758709, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2207.0000610351562, "epoch": 0.16, "grad_norm": 0.003168596886098385, "kl": 2.8252601623535156e-05, "learning_rate": 5.37435262574394e-07, "loss": -0.0925, "reward": 0.47836675494909286, "reward_std": 0.42086486145853996, "rewards/cosine_scaled_reward": -0.07331661134958267, "rewards/format_reward": 0.625, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 2235.500030517578, "epoch": 0.16057142857142856, "grad_norm": 0.012314674444496632, "kl": 3.674626350402832e-05, "learning_rate": 5.342952264838747e-07, "loss": 0.2402, "reward": 0.5244650598615408, "reward_std": 0.3065570890903473, "rewards/cosine_scaled_reward": 0.03306586295366287, "rewards/format_reward": 0.4583333395421505, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 3482.9583740234375, "epoch": 0.16114285714285714, "grad_norm": 0.003394032595679164, "kl": 6.920099258422852e-05, "learning_rate": 5.311559558218603e-07, "loss": 0.0195, "reward": 0.18921626545488834, "reward_std": 0.8379529621452093, "rewards/cosine_scaled_reward": -0.00955851562321186, "rewards/format_reward": 0.2083333395421505, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2518.250045776367, "epoch": 0.16171428571428573, "grad_norm": 0.005581878591328859, "kl": 5.644559860229492e-05, "learning_rate": 5.28017603591974e-07, "loss": 0.2005, "reward": -0.21906772628426552, "reward_std": 0.344997763633728, "rewards/cosine_scaled_reward": -0.29703386686742306, "rewards/format_reward": 0.375, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2490.0417098999023, "epoch": 0.16228571428571428, "grad_norm": 0.005271017085760832, "kl": 6.723403930664062e-05, "learning_rate": 5.248803227530763e-07, "loss": 0.1337, "reward": 0.8023579549044371, "reward_std": 1.264666199684143, "rewards/cosine_scaled_reward": 0.15117894113063812, "rewards/format_reward": 0.5000000074505806, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 3333.7083740234375, "epoch": 0.16285714285714287, "grad_norm": 0.0038196779787540436, "kl": 6.0558319091796875e-05, "learning_rate": 5.21744266211809e-07, "loss": 0.052, "reward": -0.23526735417544842, "reward_std": 0.6071600466966629, "rewards/cosine_scaled_reward": -0.20096702128648758, "rewards/format_reward": 0.1666666679084301, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 2804.4584045410156, "epoch": 0.16342857142857142, "grad_norm": 0.0022388980723917484, "kl": 2.637505531311035e-05, "learning_rate": 5.186095868151436e-07, "loss": 0.0184, "reward": 1.0767283365130424, "reward_std": 0.7745907232165337, "rewards/cosine_scaled_reward": 0.22586413100361824, "rewards/format_reward": 0.625, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 2746.4584045410156, "epoch": 0.164, "grad_norm": 0.003931526560336351, "kl": 3.160536289215088e-05, "learning_rate": 5.154764373429315e-07, "loss": 0.0282, "reward": 0.3201872259378433, "reward_std": 1.2375166565179825, "rewards/cosine_scaled_reward": -0.08990639168769121, "rewards/format_reward": 0.5000000074505806, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 3171.0833435058594, "epoch": 0.16457142857142856, "grad_norm": 0.0046902066096663475, "kl": 5.02467155456543e-05, "learning_rate": 5.123449705004581e-07, "loss": 0.1096, "reward": -0.5258437097072601, "reward_std": 0.3504077633842826, "rewards/cosine_scaled_reward": -0.34625519067049026, "rewards/format_reward": 0.1666666716337204, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 3037.500030517578, "epoch": 0.16514285714285715, "grad_norm": 0.006959085818380117, "kl": 3.802776336669922e-05, "learning_rate": 5.09215338910999e-07, "loss": 0.1499, "reward": 0.05442573130130768, "reward_std": 0.7501186542212963, "rewards/cosine_scaled_reward": -0.13945381715893745, "rewards/format_reward": 0.3333333395421505, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 2573.166748046875, "epoch": 0.1657142857142857, "grad_norm": 0.006533089559525251, "kl": 7.301568984985352e-05, "learning_rate": 5.060876951083828e-07, "loss": 0.1377, "reward": 0.2523855767212808, "reward_std": 0.8958756141364574, "rewards/cosine_scaled_reward": -0.14464055188000202, "rewards/format_reward": 0.5416666865348816, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 3398.8333740234375, "epoch": 0.1662857142857143, "grad_norm": 0.002478619571775198, "kl": 5.59389591217041e-05, "learning_rate": 5.02962191529556e-07, "loss": 0.0546, "reward": -0.15824139676988125, "reward_std": 0.4572010412812233, "rewards/cosine_scaled_reward": -0.18328737560659647, "rewards/format_reward": 0.2083333395421505, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 2954.9583740234375, "epoch": 0.16685714285714287, "grad_norm": 0.006710364017635584, "kl": 4.022568464279175e-05, "learning_rate": 4.998389805071536e-07, "loss": 0.1662, "reward": 0.037495965138077736, "reward_std": 0.7841918021440506, "rewards/cosine_scaled_reward": -0.1895853504538536, "rewards/format_reward": 0.416666679084301, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1611.9583587646484, "epoch": 0.16742857142857143, "grad_norm": 0.0037447651848196983, "kl": 2.1070241928100586e-05, "learning_rate": 4.967182142620745e-07, "loss": 0.1026, "reward": 1.0151976346969604, "reward_std": 0.6685754917562008, "rewards/cosine_scaled_reward": 0.13259880617260933, "rewards/format_reward": 0.75, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 2959.9584045410156, "epoch": 0.168, "grad_norm": 0.0037328009493649006, "kl": 7.581710815429688e-05, "learning_rate": 4.93600044896063e-07, "loss": 0.1844, "reward": -0.3398366719484329, "reward_std": 0.5668137073516846, "rewards/cosine_scaled_reward": -0.33658500388264656, "rewards/format_reward": 0.3333333469927311, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 3345.2083740234375, "epoch": 0.16857142857142857, "grad_norm": 0.002446865662932396, "kl": 5.8785080909729004e-05, "learning_rate": 4.904846243842949e-07, "loss": 0.0775, "reward": -0.07560757733881474, "reward_std": 0.46546874195337296, "rewards/cosine_scaled_reward": -0.183637123554945, "rewards/format_reward": 0.2916666716337204, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2831.7084350585938, "epoch": 0.16914285714285715, "grad_norm": 0.0058394684456288815, "kl": 4.4792890548706055e-05, "learning_rate": 4.873721045679706e-07, "loss": 0.1853, "reward": -0.2766757644712925, "reward_std": 0.6155375838279724, "rewards/cosine_scaled_reward": -0.3258378803730011, "rewards/format_reward": 0.3750000111758709, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 3156.25, "epoch": 0.1697142857142857, "grad_norm": 0.002862437628209591, "kl": 5.173683166503906e-05, "learning_rate": 4.842626371469149e-07, "loss": 0.0856, "reward": 0.6619877014309168, "reward_std": 0.7456456199288368, "rewards/cosine_scaled_reward": 0.143493820913136, "rewards/format_reward": 0.3750000149011612, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 2581.166717529297, "epoch": 0.1702857142857143, "grad_norm": 0.005228836555033922, "kl": 5.8770179748535156e-05, "learning_rate": 4.811563736721829e-07, "loss": 0.1347, "reward": 0.17597029358148575, "reward_std": 0.8140305206179619, "rewards/cosine_scaled_reward": -0.12034818809479475, "rewards/format_reward": 0.4166666679084301, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 3064.416748046875, "epoch": 0.17085714285714285, "grad_norm": 0.0032789120450615883, "kl": 6.365776062011719e-05, "learning_rate": 4.780534655386743e-07, "loss": 0.0055, "reward": 0.05702040530741215, "reward_std": 0.5979477316141129, "rewards/cosine_scaled_reward": -0.15898980293422937, "rewards/format_reward": 0.3750000037252903, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 2909.125, "epoch": 0.17142857142857143, "grad_norm": 0.0025979981292039156, "kl": 3.316998481750488e-05, "learning_rate": 4.749540639777539e-07, "loss": 0.0169, "reward": -0.11416986212134361, "reward_std": 0.22534791752696037, "rewards/cosine_scaled_reward": -0.2862515989691019, "rewards/format_reward": 0.4583333432674408, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 2805.8333740234375, "epoch": 0.172, "grad_norm": 0.005956125445663929, "kl": 2.4738721549510956e-05, "learning_rate": 4.7185832004988133e-07, "loss": 0.1411, "reward": 0.08245547860860825, "reward_std": 0.6094719283282757, "rewards/cosine_scaled_reward": -0.14627227559685707, "rewards/format_reward": 0.3750000149011612, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 2464.3333740234375, "epoch": 0.17257142857142857, "grad_norm": 0.005091224797070026, "kl": 3.0701979994773865e-05, "learning_rate": 4.68766384637248e-07, "loss": 0.1768, "reward": 0.4501515403389931, "reward_std": 0.1912960773333907, "rewards/cosine_scaled_reward": 0.037575824186205864, "rewards/format_reward": 0.375, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 3012.4166870117188, "epoch": 0.17314285714285715, "grad_norm": 0.003126407042145729, "kl": 3.9011240005493164e-05, "learning_rate": 4.656784084364238e-07, "loss": -0.064, "reward": -0.01822870969772339, "reward_std": 0.38900607265532017, "rewards/cosine_scaled_reward": -0.13411437720060349, "rewards/format_reward": 0.25, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 3566.4166870117188, "epoch": 0.1737142857142857, "grad_norm": 0.0021087222266942263, "kl": 4.363059997558594e-05, "learning_rate": 4.6259454195101267e-07, "loss": 0.0066, "reward": -0.237492136657238, "reward_std": 0.5864962637424469, "rewards/cosine_scaled_reward": -0.1604127213358879, "rewards/format_reward": 0.0833333358168602, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 3038.0834350585938, "epoch": 0.1742857142857143, "grad_norm": 0.0027497108094394207, "kl": 2.065300941467285e-05, "learning_rate": 4.59514935484316e-07, "loss": 0.0722, "reward": 0.29027049988508224, "reward_std": 0.7373021356761456, "rewards/cosine_scaled_reward": -0.08403141051530838, "rewards/format_reward": 0.4583333358168602, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 2394.2500915527344, "epoch": 0.17485714285714285, "grad_norm": 0.0035033971071243286, "kl": 5.817413330078125e-05, "learning_rate": 4.5643973913200837e-07, "loss": 0.0839, "reward": 0.5357905328273773, "reward_std": 0.7523738704621792, "rewards/cosine_scaled_reward": -0.0029380805790424347, "rewards/format_reward": 0.5416666716337204, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 3449.0416870117188, "epoch": 0.17542857142857143, "grad_norm": 0.002159115858376026, "kl": 3.272294998168945e-05, "learning_rate": 4.5336910277482155e-07, "loss": 0.0549, "reward": -0.4404924660921097, "reward_std": 0.5054446496069431, "rewards/cosine_scaled_reward": -0.32441290095448494, "rewards/format_reward": 0.2083333395421505, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2162.250015258789, "epoch": 0.176, "grad_norm": 0.0042375498451292515, "kl": 4.4405460357666016e-05, "learning_rate": 4.503031760712397e-07, "loss": 0.0273, "reward": 0.514749251306057, "reward_std": 0.7152674570679665, "rewards/cosine_scaled_reward": -0.013458725064992905, "rewards/format_reward": 0.5416666679084301, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 3354.7083740234375, "epoch": 0.17657142857142857, "grad_norm": 0.004238435998558998, "kl": 6.529688835144043e-05, "learning_rate": 4.4724210845020494e-07, "loss": 0.1021, "reward": -0.39251967146992683, "reward_std": 0.31250322982668877, "rewards/cosine_scaled_reward": -0.2587598394602537, "rewards/format_reward": 0.1250000037252903, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.17714285714285713, "grad_norm": 0.0018692758167162538, "kl": 3.419816493988037e-05, "learning_rate": 4.441860491038345e-07, "loss": 0.0, "reward": -0.0591370090842247, "reward_std": 0.9635532647371292, "rewards/cosine_scaled_reward": -0.11290184780955315, "rewards/format_reward": 0.1666666716337204, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 2797.416717529297, "epoch": 0.1777142857142857, "grad_norm": 0.0027630694676190615, "kl": 4.808604717254639e-05, "learning_rate": 4.4113514698014953e-07, "loss": 0.0389, "reward": -0.030886530876159668, "reward_std": 0.43456877768039703, "rewards/cosine_scaled_reward": -0.16127660498023033, "rewards/format_reward": 0.2916666679084301, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 3442.5833740234375, "epoch": 0.1782857142857143, "grad_norm": 0.003056673565879464, "kl": 6.717443466186523e-05, "learning_rate": 4.3808955077581546e-07, "loss": 0.0002, "reward": -0.37724767066538334, "reward_std": 0.2869449257850647, "rewards/cosine_scaled_reward": -0.2511238418519497, "rewards/format_reward": 0.125, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2744.9166717529297, "epoch": 0.17885714285714285, "grad_norm": 0.0052271317690610886, "kl": 3.325939178466797e-05, "learning_rate": 4.350494089288943e-07, "loss": 0.102, "reward": 0.18764016032218933, "reward_std": 0.7255005538463593, "rewards/cosine_scaled_reward": -0.09367992915213108, "rewards/format_reward": 0.3750000037252903, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2524.875045776367, "epoch": 0.17942857142857144, "grad_norm": 0.005840318743139505, "kl": 4.601478576660156e-05, "learning_rate": 4.3201486961161093e-07, "loss": 0.1531, "reward": 0.9655874371528625, "reward_std": 1.4103786647319794, "rewards/cosine_scaled_reward": 0.19112704833969474, "rewards/format_reward": 0.5833333469927311, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 3008.7916870117188, "epoch": 0.18, "grad_norm": 0.0035126220900565386, "kl": 4.088878631591797e-05, "learning_rate": 4.2898608072313045e-07, "loss": 0.1804, "reward": -0.45178741589188576, "reward_std": 0.3258051462471485, "rewards/cosine_scaled_reward": -0.3508937135338783, "rewards/format_reward": 0.2500000111758709, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 3477.875, "epoch": 0.18057142857142858, "grad_norm": 0.0028681934345513582, "kl": 5.558133125305176e-05, "learning_rate": 4.2596318988235037e-07, "loss": 0.0293, "reward": -0.27217016741633415, "reward_std": 0.5886598266661167, "rewards/cosine_scaled_reward": -0.19858508126344532, "rewards/format_reward": 0.1250000037252903, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2689.2083435058594, "epoch": 0.18114285714285713, "grad_norm": 0.0031018939334899187, "kl": 3.853440284729004e-05, "learning_rate": 4.2294634442070553e-07, "loss": 0.007, "reward": 0.6098862756043673, "reward_std": 0.6624674461781979, "rewards/cosine_scaled_reward": 0.07577648013830185, "rewards/format_reward": 0.4583333432674408, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 3037.3751220703125, "epoch": 0.18171428571428572, "grad_norm": 0.003414622973650694, "kl": 4.416704177856445e-05, "learning_rate": 4.1993569137498776e-07, "loss": 0.0584, "reward": 0.6276337634772062, "reward_std": 0.6273336000740528, "rewards/cosine_scaled_reward": 0.02215018612332642, "rewards/format_reward": 0.5833333432674408, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2576.4583740234375, "epoch": 0.18228571428571427, "grad_norm": 0.003611760213971138, "kl": 3.325939178466797e-05, "learning_rate": 4.1693137748017915e-07, "loss": 0.0869, "reward": 0.6472079530358315, "reward_std": 0.8605827018618584, "rewards/cosine_scaled_reward": 0.07360398769378662, "rewards/format_reward": 0.5000000074505806, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2887.3333587646484, "epoch": 0.18285714285714286, "grad_norm": 0.0037816159892827272, "kl": 3.982335329055786e-05, "learning_rate": 4.1393354916230005e-07, "loss": 0.0372, "reward": -0.11431579291820526, "reward_std": 0.44235752895474434, "rewards/cosine_scaled_reward": -0.22382456809282303, "rewards/format_reward": 0.3333333358168602, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 3309.625, "epoch": 0.18342857142857144, "grad_norm": 0.0026822034269571304, "kl": 5.558133125305176e-05, "learning_rate": 4.1094235253127374e-07, "loss": 0.0109, "reward": -0.19009164534509182, "reward_std": 0.5137807168066502, "rewards/cosine_scaled_reward": -0.1783791354391724, "rewards/format_reward": 0.1666666716337204, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.184, "grad_norm": 0.0035175138618797064, "kl": 3.904104232788086e-05, "learning_rate": 4.079579333738039e-07, "loss": 0.0, "reward": -0.5746848918497562, "reward_std": 0.17760023847222328, "rewards/cosine_scaled_reward": -0.2873424384742975, "rewards/format_reward": 0.0, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 3073.0416870117188, "epoch": 0.18457142857142858, "grad_norm": 0.003574939211830497, "kl": 2.9087066650390625e-05, "learning_rate": 4.0498043714627006e-07, "loss": -0.0051, "reward": 0.22183078713715076, "reward_std": 0.40956074371933937, "rewards/cosine_scaled_reward": -0.05575130879878998, "rewards/format_reward": 0.3333333358168602, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 3110.5833740234375, "epoch": 0.18514285714285714, "grad_norm": 0.0031830561347305775, "kl": 2.187490463256836e-05, "learning_rate": 4.020100089676376e-07, "loss": 0.0894, "reward": 0.039558641612529755, "reward_std": 0.9110538437962532, "rewards/cosine_scaled_reward": -0.1468873475678265, "rewards/format_reward": 0.3333333432674408, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2941.0416717529297, "epoch": 0.18571428571428572, "grad_norm": 0.002910768846049905, "kl": 3.853440284729004e-05, "learning_rate": 3.9904679361238526e-07, "loss": 0.0187, "reward": -0.24609478563070297, "reward_std": 0.3466242253780365, "rewards/cosine_scaled_reward": -0.24804740026593208, "rewards/format_reward": 0.25, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2301.875045776367, "epoch": 0.18628571428571428, "grad_norm": 0.004634602461010218, "kl": 5.936622619628906e-05, "learning_rate": 3.9609093550344907e-07, "loss": -0.0703, "reward": 0.8200063109397888, "reward_std": 0.8190762605518103, "rewards/cosine_scaled_reward": 0.09750313125550747, "rewards/format_reward": 0.625, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2883.875, "epoch": 0.18685714285714286, "grad_norm": 0.002490414073690772, "kl": 4.684925079345703e-05, "learning_rate": 3.931425787051832e-07, "loss": 0.0574, "reward": -0.21707230806350708, "reward_std": 0.17424386367201805, "rewards/cosine_scaled_reward": -0.23353616148233414, "rewards/format_reward": 0.25, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2835.2083435058594, "epoch": 0.18742857142857142, "grad_norm": 0.0026330244727432728, "kl": 5.1856040954589844e-05, "learning_rate": 3.902018669163384e-07, "loss": 0.0051, "reward": 0.017457574605941772, "reward_std": 0.4332781806588173, "rewards/cosine_scaled_reward": -0.11627121269702911, "rewards/format_reward": 0.25, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 3061.5833740234375, "epoch": 0.188, "grad_norm": 0.0022818713914602995, "kl": 4.366040229797363e-05, "learning_rate": 3.872689434630585e-07, "loss": 0.1125, "reward": -0.19408408179879189, "reward_std": 0.7231174632906914, "rewards/cosine_scaled_reward": -0.24287537176860496, "rewards/format_reward": 0.291666679084301, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2826.2083740234375, "epoch": 0.18857142857142858, "grad_norm": 0.005971286445856094, "kl": 2.6516616344451904e-05, "learning_rate": 3.843439512918949e-07, "loss": 0.2055, "reward": -0.07049742341041565, "reward_std": 0.9241986498236656, "rewards/cosine_scaled_reward": -0.222748720087111, "rewards/format_reward": 0.3750000149011612, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 2521.7500228881836, "epoch": 0.18914285714285714, "grad_norm": 0.010425534099340439, "kl": 4.607439041137695e-05, "learning_rate": 3.8142703296283953e-07, "loss": 0.1671, "reward": 0.5187118984758854, "reward_std": 0.4546149205416441, "rewards/cosine_scaled_reward": 0.030189277604222298, "rewards/format_reward": 0.4583333358168602, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 2098.5416870117188, "epoch": 0.18971428571428572, "grad_norm": 0.009883593767881393, "kl": 2.4850480258464813e-05, "learning_rate": 3.785183306423767e-07, "loss": 0.433, "reward": 0.26787397265434265, "reward_std": 0.5567906722426414, "rewards/cosine_scaled_reward": -0.15772969648241997, "rewards/format_reward": 0.583333358168602, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2901.5833435058594, "epoch": 0.19028571428571428, "grad_norm": 0.008169352076947689, "kl": 4.477798938751221e-05, "learning_rate": 3.7561798609655373e-07, "loss": -0.0809, "reward": 0.23628418147563934, "reward_std": 0.9392815940082073, "rewards/cosine_scaled_reward": -0.13185792788863182, "rewards/format_reward": 0.5000000074505806, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 3053.5, "epoch": 0.19085714285714286, "grad_norm": 0.0031820854637771845, "kl": 3.3795833587646484e-05, "learning_rate": 3.72726140684072e-07, "loss": 0.0662, "reward": -0.1781381368637085, "reward_std": 0.46754560247063637, "rewards/cosine_scaled_reward": -0.2140690665692091, "rewards/format_reward": 0.2500000111758709, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 2258.625030517578, "epoch": 0.19142857142857142, "grad_norm": 0.005283020436763763, "kl": 3.001093864440918e-05, "learning_rate": 3.6984293534939737e-07, "loss": 0.1296, "reward": 0.2328640166670084, "reward_std": 0.7446381561458111, "rewards/cosine_scaled_reward": -0.1960679953917861, "rewards/format_reward": 0.6250000149011612, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 2621.166748046875, "epoch": 0.192, "grad_norm": 0.005859015975147486, "kl": 4.1604042053222656e-05, "learning_rate": 3.6696851061588994e-07, "loss": 0.1638, "reward": 0.26473490661010146, "reward_std": 0.8232553470879793, "rewards/cosine_scaled_reward": -0.15929922461509705, "rewards/format_reward": 0.5833333358168602, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2745.0, "epoch": 0.19257142857142856, "grad_norm": 0.011254413053393364, "kl": 2.7865171432495117e-05, "learning_rate": 3.641030065789562e-07, "loss": 0.1323, "reward": 0.8917782939970493, "reward_std": 0.5826251674443483, "rewards/cosine_scaled_reward": 0.21672249399125576, "rewards/format_reward": 0.4583333395421505, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2623.2083740234375, "epoch": 0.19314285714285714, "grad_norm": 0.004178225062787533, "kl": 5.581974983215332e-05, "learning_rate": 3.612465628992203e-07, "loss": 0.0503, "reward": 0.8658206164836884, "reward_std": 1.1784981563687325, "rewards/cosine_scaled_reward": 0.14124363288283348, "rewards/format_reward": 0.5833333432674408, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 3375.6666870117188, "epoch": 0.19371428571428573, "grad_norm": 0.002414165763184428, "kl": 3.361701965332031e-05, "learning_rate": 3.5839931879571725e-07, "loss": 0.0357, "reward": -0.19526179134845734, "reward_std": 0.5450187101960182, "rewards/cosine_scaled_reward": -0.20179756730794907, "rewards/format_reward": 0.2083333395421505, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2224.375015258789, "epoch": 0.19428571428571428, "grad_norm": 0.004184546414762735, "kl": 3.948807716369629e-05, "learning_rate": 3.555614130391079e-07, "loss": 0.0351, "reward": 0.26722824200987816, "reward_std": 0.36019703559577465, "rewards/cosine_scaled_reward": -0.11638587713241577, "rewards/format_reward": 0.5, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1793.6250610351562, "epoch": 0.19485714285714287, "grad_norm": 0.005891189444810152, "kl": 2.275407314300537e-05, "learning_rate": 3.5273298394491515e-07, "loss": 0.061, "reward": 0.7466001734137535, "reward_std": 0.8340809307992458, "rewards/cosine_scaled_reward": -0.06419992819428444, "rewards/format_reward": 0.8750000149011612, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 2577.416748046875, "epoch": 0.19542857142857142, "grad_norm": 0.0038896689657121897, "kl": 4.8220157623291016e-05, "learning_rate": 3.4991416936678276e-07, "loss": 0.096, "reward": 0.10229167528450489, "reward_std": 0.8370918184518814, "rewards/cosine_scaled_reward": -0.15718749351799488, "rewards/format_reward": 0.4166666716337204, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 2472.000030517578, "epoch": 0.196, "grad_norm": 0.004021757282316685, "kl": 4.1544437408447266e-05, "learning_rate": 3.471051066897562e-07, "loss": 0.0913, "reward": 0.43446559202857316, "reward_std": 0.7998727262020111, "rewards/cosine_scaled_reward": -0.07443386688828468, "rewards/format_reward": 0.5833333432674408, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2510.916717529297, "epoch": 0.19657142857142856, "grad_norm": 0.009949182160198689, "kl": 3.5375356674194336e-05, "learning_rate": 3.4430593282358777e-07, "loss": 0.1237, "reward": 0.2777495412155986, "reward_std": 0.7787264138460159, "rewards/cosine_scaled_reward": -0.06945853307843208, "rewards/format_reward": 0.4166666679084301, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1706.1667022705078, "epoch": 0.19714285714285715, "grad_norm": 0.006244292948395014, "kl": 0.00010317564010620117, "learning_rate": 3.4151678419606233e-07, "loss": -0.1257, "reward": 0.8028604239225388, "reward_std": 0.537322573363781, "rewards/cosine_scaled_reward": 0.06809689849615097, "rewards/format_reward": 0.6666666716337204, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 3549.5416870117188, "epoch": 0.1977142857142857, "grad_norm": 0.0019481972558423877, "kl": 5.0514936447143555e-05, "learning_rate": 3.387377967463493e-07, "loss": 0.0141, "reward": -0.24506978318095207, "reward_std": 0.30526506528258324, "rewards/cosine_scaled_reward": -0.18503489159047604, "rewards/format_reward": 0.125, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2970.0833435058594, "epoch": 0.1982857142857143, "grad_norm": 0.0030003669671714306, "kl": 4.032254219055176e-05, "learning_rate": 3.359691059183761e-07, "loss": 0.024, "reward": 0.09095068741589785, "reward_std": 0.4444047249853611, "rewards/cosine_scaled_reward": -0.10035799816250801, "rewards/format_reward": 0.2916666679084301, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 2805.2083435058594, "epoch": 0.19885714285714284, "grad_norm": 0.003840967547148466, "kl": 5.9276819229125977e-05, "learning_rate": 3.3321084665422803e-07, "loss": -0.1135, "reward": 0.12965786457061768, "reward_std": 0.44081803783774376, "rewards/cosine_scaled_reward": -0.18517107143998146, "rewards/format_reward": 0.5, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 3485.125, "epoch": 0.19942857142857143, "grad_norm": 0.0025248327292501926, "kl": 5.136430263519287e-05, "learning_rate": 3.3046315338757026e-07, "loss": 0.0063, "reward": -0.14613626152276993, "reward_std": 0.7561026737093925, "rewards/cosine_scaled_reward": -0.17723480612039566, "rewards/format_reward": 0.2083333395421505, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 2097.666717529297, "epoch": 0.2, "grad_norm": 0.008205823600292206, "kl": 5.570054054260254e-05, "learning_rate": 3.2772616003709616e-07, "loss": 0.1793, "reward": 1.2111198753118515, "reward_std": 0.5373520702123642, "rewards/cosine_scaled_reward": 0.23055994510650635, "rewards/format_reward": 0.7500000111758709, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 2420.5000915527344, "epoch": 0.20057142857142857, "grad_norm": 0.003912824671715498, "kl": 4.1365623474121094e-05, "learning_rate": 3.250000000000001e-07, "loss": 0.1109, "reward": 0.8816900439560413, "reward_std": 0.8446491956710815, "rewards/cosine_scaled_reward": 0.06584502384066582, "rewards/format_reward": 0.7500000111758709, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2333.2916870117188, "epoch": 0.20114285714285715, "grad_norm": 0.004581141751259565, "kl": 3.167986869812012e-05, "learning_rate": 3.222848061454764e-07, "loss": -0.1101, "reward": 0.17848296463489532, "reward_std": 0.5651724115014076, "rewards/cosine_scaled_reward": -0.16075852699577808, "rewards/format_reward": 0.5, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 2585.0416870117188, "epoch": 0.2017142857142857, "grad_norm": 0.004388903267681599, "kl": 6.29425048828125e-05, "learning_rate": 3.195807108082429e-07, "loss": 0.1621, "reward": -0.0497121661901474, "reward_std": 0.4621069021522999, "rewards/cosine_scaled_reward": -0.23318939935415983, "rewards/format_reward": 0.4166666865348816, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 2551.625, "epoch": 0.2022857142857143, "grad_norm": 0.0042444500140845776, "kl": 1.9704923033714294e-05, "learning_rate": 3.168878457820915e-07, "loss": 0.1427, "reward": -0.17303968127816916, "reward_std": 0.31680453941226006, "rewards/cosine_scaled_reward": -0.33651985973119736, "rewards/format_reward": 0.5000000111758709, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2298.8334045410156, "epoch": 0.20285714285714285, "grad_norm": 0.004028092138469219, "kl": 4.780292510986328e-05, "learning_rate": 3.142063423134644e-07, "loss": -0.0405, "reward": 0.4047674387693405, "reward_std": 0.37886786088347435, "rewards/cosine_scaled_reward": -0.06844958942383528, "rewards/format_reward": 0.5416666679084301, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 2414.375, "epoch": 0.20342857142857143, "grad_norm": 0.007681987714022398, "kl": 2.088025212287903e-05, "learning_rate": 3.115363310950578e-07, "loss": 0.1011, "reward": 0.8521680235862732, "reward_std": 1.1125583052635193, "rewards/cosine_scaled_reward": 0.15525068156421185, "rewards/format_reward": 0.541666679084301, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 2435.125030517578, "epoch": 0.204, "grad_norm": 0.005125285126268864, "kl": 4.522502422332764e-05, "learning_rate": 3.0887794225945143e-07, "loss": -0.093, "reward": -0.15626798570156097, "reward_std": 0.4681190177798271, "rewards/cosine_scaled_reward": -0.3281340152025223, "rewards/format_reward": 0.5, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 2843.1666717529297, "epoch": 0.20457142857142857, "grad_norm": 0.003724311012774706, "kl": 4.9330294132232666e-05, "learning_rate": 3.062313053727671e-07, "loss": 0.0538, "reward": 0.3197288252413273, "reward_std": 1.0811421573162079, "rewards/cosine_scaled_reward": -0.006802256219089031, "rewards/format_reward": 0.3333333469927311, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 2669.3750915527344, "epoch": 0.20514285714285715, "grad_norm": 0.002664777683094144, "kl": 5.123019218444824e-05, "learning_rate": 3.0359654942835247e-07, "loss": 0.143, "reward": 0.2532944455742836, "reward_std": 0.5981452465057373, "rewards/cosine_scaled_reward": -0.1233527883887291, "rewards/format_reward": 0.5, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 3300.875, "epoch": 0.2057142857142857, "grad_norm": 0.0023720276076346636, "kl": 3.3035874366760254e-05, "learning_rate": 3.0097380284049523e-07, "loss": 0.0463, "reward": 0.14571953378617764, "reward_std": 0.6399435251951218, "rewards/cosine_scaled_reward": -0.0521402433514595, "rewards/format_reward": 0.2500000074505806, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 2695.000030517578, "epoch": 0.2062857142857143, "grad_norm": 0.0025475353468209505, "kl": 3.8504600524902344e-05, "learning_rate": 2.9836319343816397e-07, "loss": -0.0669, "reward": 0.438812792301178, "reward_std": 0.6623473539948463, "rewards/cosine_scaled_reward": -0.03059360943734646, "rewards/format_reward": 0.5, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 3471.8333740234375, "epoch": 0.20685714285714285, "grad_norm": 0.001852922374382615, "kl": 2.7894973754882812e-05, "learning_rate": 2.9576484845877793e-07, "loss": 0.0418, "reward": 0.08933638781309128, "reward_std": 0.9598925188183784, "rewards/cosine_scaled_reward": -0.10116515308618546, "rewards/format_reward": 0.2916666716337204, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 2374.416748046875, "epoch": 0.20742857142857143, "grad_norm": 0.004277784377336502, "kl": 5.036592483520508e-05, "learning_rate": 2.931788945420058e-07, "loss": 0.128, "reward": 0.312676377594471, "reward_std": 0.7018604502081871, "rewards/cosine_scaled_reward": -0.17699514888226986, "rewards/format_reward": 0.666666679084301, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 3520.125, "epoch": 0.208, "grad_norm": 0.0021903044544160366, "kl": 3.7550926208496094e-05, "learning_rate": 2.9060545772359305e-07, "loss": 0.0229, "reward": -0.4128142520785332, "reward_std": 0.3390595354139805, "rewards/cosine_scaled_reward": -0.2480737864971161, "rewards/format_reward": 0.0833333358168602, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 2336.291717529297, "epoch": 0.20857142857142857, "grad_norm": 0.003934715874493122, "kl": 3.5643577575683594e-05, "learning_rate": 2.8804466342921987e-07, "loss": 0.0392, "reward": 0.14100591838359833, "reward_std": 0.5171593204140663, "rewards/cosine_scaled_reward": -0.26283038035035133, "rewards/format_reward": 0.6666666828095913, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 3460.916748046875, "epoch": 0.20914285714285713, "grad_norm": 0.0021239020861685276, "kl": 4.3392181396484375e-05, "learning_rate": 2.854966364683872e-07, "loss": 0.0492, "reward": -0.2543212100863457, "reward_std": 0.5400286912918091, "rewards/cosine_scaled_reward": -0.25216061156243086, "rewards/format_reward": 0.2500000037252903, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2608.500045776367, "epoch": 0.20971428571428571, "grad_norm": 0.0037426890339702368, "kl": 3.7550926208496094e-05, "learning_rate": 2.829615010283344e-07, "loss": 0.0536, "reward": 0.16127640008926392, "reward_std": 0.5796531792730093, "rewards/cosine_scaled_reward": -0.14852848649024963, "rewards/format_reward": 0.4583333358168602, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 2690.375, "epoch": 0.2102857142857143, "grad_norm": 0.005546187050640583, "kl": 3.867223858833313e-05, "learning_rate": 2.8043938066798645e-07, "loss": 0.1011, "reward": 0.5565584152936935, "reward_std": 1.263109251856804, "rewards/cosine_scaled_reward": -0.013387463986873627, "rewards/format_reward": 0.5833333395421505, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 3270.0416870117188, "epoch": 0.21085714285714285, "grad_norm": 0.0026215489488095045, "kl": 4.2319297790527344e-05, "learning_rate": 2.7793039831193133e-07, "loss": 0.0808, "reward": 0.05823414772748947, "reward_std": 1.084249071776867, "rewards/cosine_scaled_reward": -0.09588292986154556, "rewards/format_reward": 0.2500000074505806, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 3360.7083740234375, "epoch": 0.21142857142857144, "grad_norm": 0.002352589275687933, "kl": 5.1647424697875977e-05, "learning_rate": 2.7543467624442956e-07, "loss": 0.0204, "reward": -0.05687224864959717, "reward_std": 0.5668267272412777, "rewards/cosine_scaled_reward": -0.17426948994398117, "rewards/format_reward": 0.291666679084301, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 2706.4583740234375, "epoch": 0.212, "grad_norm": 0.0027068681083619595, "kl": 1.6629695892333984e-05, "learning_rate": 2.729523361034538e-07, "loss": 0.0365, "reward": 0.5900436838855967, "reward_std": 0.5758216064423323, "rewards/cosine_scaled_reward": 0.08668848196975887, "rewards/format_reward": 0.4166666716337204, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 3270.666748046875, "epoch": 0.21257142857142858, "grad_norm": 0.0024408355820924044, "kl": 4.2945146560668945e-05, "learning_rate": 2.7048349887476037e-07, "loss": 0.1014, "reward": 0.12777501717209816, "reward_std": 0.7682351693511009, "rewards/cosine_scaled_reward": -0.06111249979585409, "rewards/format_reward": 0.2500000074505806, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 2897.5416870117188, "epoch": 0.21314285714285713, "grad_norm": 0.0025841612368822098, "kl": 3.30805778503418e-05, "learning_rate": 2.6802828488599294e-07, "loss": 0.0017, "reward": 0.4733143337070942, "reward_std": 0.7977562882006168, "rewards/cosine_scaled_reward": 0.04915717989206314, "rewards/format_reward": 0.3750000149011612, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 3447.8750610351562, "epoch": 0.21371428571428572, "grad_norm": 0.002725493861362338, "kl": 5.811452865600586e-05, "learning_rate": 2.655868138008171e-07, "loss": 0.0677, "reward": -0.6090673804283142, "reward_std": 0.31959998421370983, "rewards/cosine_scaled_reward": -0.3462003655731678, "rewards/format_reward": 0.0833333358168602, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 2453.375015258789, "epoch": 0.21428571428571427, "grad_norm": 0.0034551313146948814, "kl": 4.348158836364746e-05, "learning_rate": 2.631592046130896e-07, "loss": 0.0238, "reward": 0.4879840016365051, "reward_std": 0.501481868326664, "rewards/cosine_scaled_reward": 0.014825336635112762, "rewards/format_reward": 0.4583333432674408, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 2982.1250610351562, "epoch": 0.21485714285714286, "grad_norm": 0.006414256524294615, "kl": 3.3020973205566406e-05, "learning_rate": 2.6074557564105724e-07, "loss": 0.1209, "reward": 0.7191503159701824, "reward_std": 0.8925209678709507, "rewards/cosine_scaled_reward": 0.08874182403087616, "rewards/format_reward": 0.5416666865348816, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 2582.75, "epoch": 0.21542857142857144, "grad_norm": 0.00291492254473269, "kl": 4.399195313453674e-05, "learning_rate": 2.583460445215911e-07, "loss": 0.0845, "reward": 0.2575896345078945, "reward_std": 0.8453133478760719, "rewards/cosine_scaled_reward": -0.1212051862385124, "rewards/format_reward": 0.5000000111758709, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 3118.0833740234375, "epoch": 0.216, "grad_norm": 0.0031876382417976856, "kl": 5.3435564041137695e-05, "learning_rate": 2.5596072820445254e-07, "loss": 0.081, "reward": -0.24646225478500128, "reward_std": 0.366558026522398, "rewards/cosine_scaled_reward": -0.24823113158345222, "rewards/format_reward": 0.2500000074505806, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 3033.25, "epoch": 0.21657142857142858, "grad_norm": 0.013751584105193615, "kl": 4.1425228118896484e-05, "learning_rate": 2.5358974294659373e-07, "loss": 0.1621, "reward": -0.4311616560444236, "reward_std": 0.22450479492545128, "rewards/cosine_scaled_reward": -0.3197474963963032, "rewards/format_reward": 0.2083333432674408, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 3090.7083740234375, "epoch": 0.21714285714285714, "grad_norm": 0.0037409733049571514, "kl": 4.1604042053222656e-05, "learning_rate": 2.512332043064913e-07, "loss": 0.1298, "reward": 0.2103189006447792, "reward_std": 1.1751992926001549, "rewards/cosine_scaled_reward": -0.06150721479207277, "rewards/format_reward": 0.3333333395421505, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 3297.75, "epoch": 0.21771428571428572, "grad_norm": 0.005465133115649223, "kl": 6.252527236938477e-05, "learning_rate": 2.488912271385139e-07, "loss": 0.0672, "reward": -0.20230305823497474, "reward_std": 0.7160279415547848, "rewards/cosine_scaled_reward": -0.22615152969956398, "rewards/format_reward": 0.25, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 3113.4583740234375, "epoch": 0.21828571428571428, "grad_norm": 0.0027221147902309895, "kl": 3.9190053939819336e-05, "learning_rate": 2.465639255873246e-07, "loss": 0.026, "reward": -0.23569109290838242, "reward_std": 0.788255587220192, "rewards/cosine_scaled_reward": -0.2428455650806427, "rewards/format_reward": 0.2500000074505806, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 2698.5, "epoch": 0.21885714285714286, "grad_norm": 0.0034194085747003555, "kl": 4.178285598754883e-05, "learning_rate": 2.4425141308231765e-07, "loss": 0.0576, "reward": 0.34038370475172997, "reward_std": 0.4640974849462509, "rewards/cosine_scaled_reward": -0.058974847197532654, "rewards/format_reward": 0.4583333395421505, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 3184.25, "epoch": 0.21942857142857142, "grad_norm": 0.0023127743043005466, "kl": 3.9324164390563965e-05, "learning_rate": 2.4195380233209006e-07, "loss": 0.0563, "reward": 0.2806257251650095, "reward_std": 0.5856044944375753, "rewards/cosine_scaled_reward": -0.04718713369220495, "rewards/format_reward": 0.3750000149011612, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 2448.5416870117188, "epoch": 0.22, "grad_norm": 0.007362040225416422, "kl": 3.7044286727905273e-05, "learning_rate": 2.3967120531894857e-07, "loss": 0.0483, "reward": 0.034520357847213745, "reward_std": 0.5006875544786453, "rewards/cosine_scaled_reward": -0.21190649271011353, "rewards/format_reward": 0.4583333432674408, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 3112.8750610351562, "epoch": 0.22057142857142858, "grad_norm": 0.0029328162781894207, "kl": 4.7013163566589355e-05, "learning_rate": 2.374037332934512e-07, "loss": 0.0589, "reward": -0.08694405108690262, "reward_std": 0.4349938966333866, "rewards/cosine_scaled_reward": -0.2518053762614727, "rewards/format_reward": 0.4166666716337204, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1948.8333740234375, "epoch": 0.22114285714285714, "grad_norm": 0.006329999305307865, "kl": 4.0590763092041016e-05, "learning_rate": 2.3515149676898552e-07, "loss": 0.0709, "reward": 1.2103230450302362, "reward_std": 0.21112675499171019, "rewards/cosine_scaled_reward": 0.3343281866982579, "rewards/format_reward": 0.5416666679084301, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1849.166748046875, "epoch": 0.22171428571428572, "grad_norm": 0.0032311498653143644, "kl": 2.5041401386260986e-05, "learning_rate": 2.3291460551638237e-07, "loss": -0.005, "reward": 1.4248562157154083, "reward_std": 0.7988525703549385, "rewards/cosine_scaled_reward": 0.23326139152050018, "rewards/format_reward": 0.9583333432674408, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1795.3750228881836, "epoch": 0.22228571428571428, "grad_norm": 0.003434552112594247, "kl": 3.1463801860809326e-05, "learning_rate": 2.306931685585657e-07, "loss": 0.0059, "reward": 1.272888496518135, "reward_std": 0.551943626254797, "rewards/cosine_scaled_reward": 0.26144424360245466, "rewards/format_reward": 0.75, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 3369.9583740234375, "epoch": 0.22285714285714286, "grad_norm": 0.005511069670319557, "kl": 5.412101745605469e-05, "learning_rate": 2.2848729416523859e-07, "loss": 0.1354, "reward": -0.35971175134181976, "reward_std": 0.7988480478525162, "rewards/cosine_scaled_reward": -0.24235588498413563, "rewards/format_reward": 0.1250000037252903, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 3314.0833740234375, "epoch": 0.22342857142857142, "grad_norm": 0.0026866502594202757, "kl": 7.051229476928711e-05, "learning_rate": 2.2629708984760706e-07, "loss": 0.0533, "reward": -0.10006466833874583, "reward_std": 0.4289187639951706, "rewards/cosine_scaled_reward": -0.2166990041732788, "rewards/format_reward": 0.3333333395421505, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 3187.3333435058594, "epoch": 0.224, "grad_norm": 0.002154660178348422, "kl": 4.4792890548706055e-05, "learning_rate": 2.2412266235313973e-07, "loss": -0.0091, "reward": -0.3848835378885269, "reward_std": 0.362817719578743, "rewards/cosine_scaled_reward": -0.3174417628906667, "rewards/format_reward": 0.25, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 3271.7916870117188, "epoch": 0.22457142857142856, "grad_norm": 0.0026353721041232347, "kl": 4.786252975463867e-05, "learning_rate": 2.2196411766036487e-07, "loss": -0.0589, "reward": -0.12252432107925415, "reward_std": 0.59483284316957, "rewards/cosine_scaled_reward": -0.24876217544078827, "rewards/format_reward": 0.3750000037252903, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 2235.625030517578, "epoch": 0.22514285714285714, "grad_norm": 0.006691074930131435, "kl": 4.331767559051514e-05, "learning_rate": 2.1982156097370557e-07, "loss": 0.1726, "reward": 0.19804169610142708, "reward_std": 0.6304092928767204, "rewards/cosine_scaled_reward": -0.13014581729657948, "rewards/format_reward": 0.4583333395421505, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 3415.5, "epoch": 0.2257142857142857, "grad_norm": 0.003077984321862459, "kl": 4.565715789794922e-05, "learning_rate": 2.1769509671835223e-07, "loss": -0.0169, "reward": -0.333173505961895, "reward_std": 0.48199817538261414, "rewards/cosine_scaled_reward": -0.2290867615956813, "rewards/format_reward": 0.125, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 2832.4166870117188, "epoch": 0.22628571428571428, "grad_norm": 0.002271835459396243, "kl": 2.4664215743541718e-05, "learning_rate": 2.1558482853517253e-07, "loss": -0.0068, "reward": 0.34013938158750534, "reward_std": 0.7048006877303123, "rewards/cosine_scaled_reward": -0.05909697711467743, "rewards/format_reward": 0.4583333395421505, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 2771.2083587646484, "epoch": 0.22685714285714287, "grad_norm": 0.003106387099251151, "kl": 3.978610038757324e-05, "learning_rate": 2.134908592756607e-07, "loss": 0.0138, "reward": 0.7925433763302863, "reward_std": 0.43546339869499207, "rewards/cosine_scaled_reward": 0.14627171796746552, "rewards/format_reward": 0.5, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 3034.791748046875, "epoch": 0.22742857142857142, "grad_norm": 0.002252863487228751, "kl": 3.74913215637207e-05, "learning_rate": 2.1141329099692406e-07, "loss": -0.001, "reward": 0.6877913624048233, "reward_std": 0.8415800631046295, "rewards/cosine_scaled_reward": 0.07306232675909996, "rewards/format_reward": 0.5416666753590107, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 3157.4166870117188, "epoch": 0.228, "grad_norm": 0.00475455354899168, "kl": 5.08427619934082e-05, "learning_rate": 2.0935222495670968e-07, "loss": 0.0311, "reward": -0.38170846924185753, "reward_std": 0.21903511695563793, "rewards/cosine_scaled_reward": -0.29502090252935886, "rewards/format_reward": 0.2083333432674408, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1429.3750381469727, "epoch": 0.22857142857142856, "grad_norm": 0.0053851972334086895, "kl": 3.5315752029418945e-05, "learning_rate": 2.0730776160846853e-07, "loss": 0.1143, "reward": 1.1556461527943611, "reward_std": 0.6506059970706701, "rewards/cosine_scaled_reward": 0.16115637868642807, "rewards/format_reward": 0.8333333432674408, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 2713.416748046875, "epoch": 0.22914285714285715, "grad_norm": 0.0030983807519078255, "kl": 3.769993782043457e-05, "learning_rate": 2.0528000059645995e-07, "loss": -0.0085, "reward": 0.1042788103222847, "reward_std": 0.6133656948804855, "rewards/cosine_scaled_reward": -0.21869393438100815, "rewards/format_reward": 0.541666679084301, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 3103.2083435058594, "epoch": 0.2297142857142857, "grad_norm": 0.005909958854317665, "kl": 3.388524055480957e-05, "learning_rate": 2.032690407508949e-07, "loss": 0.1854, "reward": -0.26185158151201904, "reward_std": 0.644702635705471, "rewards/cosine_scaled_reward": -0.23509246483445168, "rewards/format_reward": 0.2083333395421505, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1939.1250457763672, "epoch": 0.2302857142857143, "grad_norm": 0.0047539700753986835, "kl": 3.196299076080322e-05, "learning_rate": 2.0127498008311922e-07, "loss": 0.1266, "reward": 0.8025200664997101, "reward_std": 0.9597503207623959, "rewards/cosine_scaled_reward": 0.06792667228728533, "rewards/format_reward": 0.6666666716337204, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 2750.1666870117188, "epoch": 0.23085714285714284, "grad_norm": 0.006730088964104652, "kl": 2.4259090423583984e-05, "learning_rate": 1.9929791578083655e-07, "loss": 0.2723, "reward": -0.20311221294105053, "reward_std": 0.48295126110315323, "rewards/cosine_scaled_reward": -0.2682227957993746, "rewards/format_reward": 0.3333333432674408, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 2676.166732788086, "epoch": 0.23142857142857143, "grad_norm": 0.0026496213395148516, "kl": 3.395974636077881e-05, "learning_rate": 1.9733794420337213e-07, "loss": 0.048, "reward": 0.7930244952440262, "reward_std": 0.5977701349183917, "rewards/cosine_scaled_reward": 0.1673455461859703, "rewards/format_reward": 0.4583333358168602, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 3550.0416870117188, "epoch": 0.232, "grad_norm": 0.0027270494028925896, "kl": 7.337331771850586e-05, "learning_rate": 1.9539516087697517e-07, "loss": 0.0118, "reward": -0.5849849805235863, "reward_std": 0.5089729800820351, "rewards/cosine_scaled_reward": -0.35499248653650284, "rewards/format_reward": 0.1250000037252903, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 2377.750030517578, "epoch": 0.23257142857142857, "grad_norm": 0.0031844163313508034, "kl": 3.319978713989258e-05, "learning_rate": 1.934696604901642e-07, "loss": 0.1288, "reward": 0.729004230350256, "reward_std": 0.6163272336125374, "rewards/cosine_scaled_reward": 0.09366881102323532, "rewards/format_reward": 0.5416666753590107, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 2253.625015258789, "epoch": 0.23314285714285715, "grad_norm": 0.00557373184710741, "kl": 4.908442497253418e-05, "learning_rate": 1.915615368891117e-07, "loss": 0.2264, "reward": 0.11264412850141525, "reward_std": 0.8959826305508614, "rewards/cosine_scaled_reward": -0.19367793574929237, "rewards/format_reward": 0.5000000074505806, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 3210.3334350585938, "epoch": 0.2337142857142857, "grad_norm": 0.0058773658238351345, "kl": 3.1948089599609375e-05, "learning_rate": 1.8967088307307e-07, "loss": 0.1015, "reward": 0.6319226892665029, "reward_std": 0.5924031846225262, "rewards/cosine_scaled_reward": 0.0867946445941925, "rewards/format_reward": 0.4583333358168602, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 3284.125, "epoch": 0.2342857142857143, "grad_norm": 0.0043516517616808414, "kl": 4.166364669799805e-05, "learning_rate": 1.8779779118983867e-07, "loss": 0.1053, "reward": -0.193182073533535, "reward_std": 0.6506270952522755, "rewards/cosine_scaled_reward": -0.2007577307522297, "rewards/format_reward": 0.2083333358168602, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 3337.6250610351562, "epoch": 0.23485714285714285, "grad_norm": 0.0032587519381195307, "kl": 5.266070365905762e-05, "learning_rate": 1.8594235253127372e-07, "loss": 0.0497, "reward": 0.11934100929647684, "reward_std": 0.9782011471688747, "rewards/cosine_scaled_reward": -0.06532951164990664, "rewards/format_reward": 0.2500000074505806, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 3385.8333740234375, "epoch": 0.23542857142857143, "grad_norm": 0.0037332435604184866, "kl": 3.007054328918457e-05, "learning_rate": 1.8410465752883758e-07, "loss": 0.0998, "reward": -0.6088740974664688, "reward_std": 0.32181330770254135, "rewards/cosine_scaled_reward": -0.3669370636343956, "rewards/format_reward": 0.1250000037252903, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 2754.166748046875, "epoch": 0.236, "grad_norm": 0.0028077580500394106, "kl": 2.6650726795196533e-05, "learning_rate": 1.822847957491922e-07, "loss": 0.0206, "reward": 0.5454868003726006, "reward_std": 1.2364729642868042, "rewards/cosine_scaled_reward": -0.03975661285221577, "rewards/format_reward": 0.6250000074505806, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 2816.5000610351562, "epoch": 0.23657142857142857, "grad_norm": 0.0044348943047225475, "kl": 5.1140785217285156e-05, "learning_rate": 1.804828558898332e-07, "loss": 0.1393, "reward": 0.24837438017129898, "reward_std": 0.8838984072208405, "rewards/cosine_scaled_reward": -0.0841461569070816, "rewards/format_reward": 0.4166666716337204, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2987.8333740234375, "epoch": 0.23714285714285716, "grad_norm": 0.0039975885301828384, "kl": 4.7534704208374023e-05, "learning_rate": 1.7869892577476722e-07, "loss": -0.0535, "reward": -0.18539035817229887, "reward_std": 0.3096881993114948, "rewards/cosine_scaled_reward": -0.23852851800620556, "rewards/format_reward": 0.2916666679084301, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 3379.1666870117188, "epoch": 0.2377142857142857, "grad_norm": 0.0028565311804413795, "kl": 5.0961971282958984e-05, "learning_rate": 1.7693309235023127e-07, "loss": 0.0565, "reward": -0.2679970972239971, "reward_std": 0.22912515699863434, "rewards/cosine_scaled_reward": -0.19649854861199856, "rewards/format_reward": 0.125, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 2294.875, "epoch": 0.2382857142857143, "grad_norm": 0.0042418851517140865, "kl": 3.103911876678467e-05, "learning_rate": 1.7518544168045524e-07, "loss": 0.0347, "reward": 0.46424780786037445, "reward_std": 0.5792650878429413, "rewards/cosine_scaled_reward": -0.017876043915748596, "rewards/format_reward": 0.5, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 2581.0833740234375, "epoch": 0.23885714285714285, "grad_norm": 0.004319500178098679, "kl": 7.051229476928711e-05, "learning_rate": 1.7345605894346726e-07, "loss": 0.0441, "reward": -0.3086647242307663, "reward_std": 0.33590472862124443, "rewards/cosine_scaled_reward": -0.38349905237555504, "rewards/format_reward": 0.4583333432674408, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 2913.7083740234375, "epoch": 0.23942857142857144, "grad_norm": 0.004125258419662714, "kl": 4.3332576751708984e-05, "learning_rate": 1.7174502842694212e-07, "loss": 0.2193, "reward": -0.18997978325933218, "reward_std": 0.4317042101174593, "rewards/cosine_scaled_reward": -0.26165657164528966, "rewards/format_reward": 0.3333333432674408, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1712.0417022705078, "epoch": 0.24, "grad_norm": 0.006640000734478235, "kl": 4.450976848602295e-05, "learning_rate": 1.7005243352409333e-07, "loss": 0.2217, "reward": 0.8606925960630178, "reward_std": 0.45840954408049583, "rewards/cosine_scaled_reward": 0.0761796347796917, "rewards/format_reward": 0.7083333432674408, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 3018.750030517578, "epoch": 0.24057142857142857, "grad_norm": 0.0025240823160856962, "kl": 4.553794860839844e-05, "learning_rate": 1.6837835672960831e-07, "loss": -0.0074, "reward": 0.1150827407836914, "reward_std": 0.7066870704293251, "rewards/cosine_scaled_reward": -0.1507919728755951, "rewards/format_reward": 0.4166666679084301, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 3565.3333740234375, "epoch": 0.24114285714285713, "grad_norm": 0.0018934713443741202, "kl": 3.5434961318969727e-05, "learning_rate": 1.6672287963562852e-07, "loss": 0.0108, "reward": -0.47955281287431717, "reward_std": 0.48351244255900383, "rewards/cosine_scaled_reward": -0.2606097348034382, "rewards/format_reward": 0.0416666679084301, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 2248.2916717529297, "epoch": 0.24171428571428571, "grad_norm": 0.003473414108157158, "kl": 4.051625728607178e-05, "learning_rate": 1.6508608292777203e-07, "loss": 0.0117, "reward": 0.4412359930574894, "reward_std": 0.5415541492402554, "rewards/cosine_scaled_reward": -0.07104866206645966, "rewards/format_reward": 0.5833333358168602, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 3247.4583740234375, "epoch": 0.2422857142857143, "grad_norm": 0.0033475859090685844, "kl": 4.826486110687256e-05, "learning_rate": 1.6346804638120098e-07, "loss": 0.0685, "reward": 0.21159496158361435, "reward_std": 0.8556940704584122, "rewards/cosine_scaled_reward": -0.06086919829249382, "rewards/format_reward": 0.3333333432674408, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.24285714285714285, "grad_norm": 0.0020190952345728874, "kl": 5.4836273193359375e-05, "learning_rate": 1.6186884885673413e-07, "loss": 0.0, "reward": -0.6794822365045547, "reward_std": 0.1823638565838337, "rewards/cosine_scaled_reward": -0.3397411182522774, "rewards/format_reward": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 3202.6666870117188, "epoch": 0.24342857142857144, "grad_norm": 0.001987931551411748, "kl": 3.119930624961853e-05, "learning_rate": 1.6028856829700258e-07, "loss": -0.0086, "reward": 0.025133922696113586, "reward_std": 0.16450994741171598, "rewards/cosine_scaled_reward": -0.1124330461025238, "rewards/format_reward": 0.25, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 2872.3750915527344, "epoch": 0.244, "grad_norm": 0.00237718946300447, "kl": 3.52710485458374e-05, "learning_rate": 1.5872728172265146e-07, "loss": 0.1456, "reward": 0.15694271447136998, "reward_std": 0.8154643476009369, "rewards/cosine_scaled_reward": -0.15069531090557575, "rewards/format_reward": 0.4583333507180214, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 2523.250030517578, "epoch": 0.24457142857142858, "grad_norm": 0.0036101643927395344, "kl": 4.348158836364746e-05, "learning_rate": 1.5718506522858572e-07, "loss": -0.037, "reward": 0.42383430153131485, "reward_std": 0.8960227519273758, "rewards/cosine_scaled_reward": -0.10058285482227802, "rewards/format_reward": 0.625, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 3070.291717529297, "epoch": 0.24514285714285713, "grad_norm": 0.0022330034989863634, "kl": 3.4183263778686523e-05, "learning_rate": 1.5566199398026147e-07, "loss": 0.0976, "reward": -0.28143755346536636, "reward_std": 0.5036581829190254, "rewards/cosine_scaled_reward": -0.286552120000124, "rewards/format_reward": 0.2916666679084301, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 3249.8750610351562, "epoch": 0.24571428571428572, "grad_norm": 0.002668174682185054, "kl": 4.754960536956787e-05, "learning_rate": 1.5415814221002265e-07, "loss": 0.1087, "reward": -0.3141830489039421, "reward_std": 0.988391324877739, "rewards/cosine_scaled_reward": -0.26125819608569145, "rewards/format_reward": 0.2083333358168602, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 3510.1666870117188, "epoch": 0.24628571428571427, "grad_norm": 0.0019668028689920902, "kl": 3.454089164733887e-05, "learning_rate": 1.5267358321348285e-07, "loss": 0.0449, "reward": -0.26002343744039536, "reward_std": 0.5662377625703812, "rewards/cosine_scaled_reward": -0.192511732224375, "rewards/format_reward": 0.1250000037252903, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 3141.3333435058594, "epoch": 0.24685714285714286, "grad_norm": 0.003482809057459235, "kl": 3.88026237487793e-05, "learning_rate": 1.5120838934595337e-07, "loss": -0.0043, "reward": -0.10190819203853607, "reward_std": 0.2963053174316883, "rewards/cosine_scaled_reward": -0.17595409229397774, "rewards/format_reward": 0.25, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 2228.041717529297, "epoch": 0.24742857142857144, "grad_norm": 0.0031109952833503485, "kl": 2.251565456390381e-05, "learning_rate": 1.4976263201891613e-07, "loss": 0.0041, "reward": 1.052326887845993, "reward_std": 0.527951080352068, "rewards/cosine_scaled_reward": 0.19283007830381393, "rewards/format_reward": 0.6666666716337204, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 3467.625, "epoch": 0.248, "grad_norm": 0.002682294463738799, "kl": 3.9517879486083984e-05, "learning_rate": 1.483363816965435e-07, "loss": 0.0189, "reward": 0.04943974316120148, "reward_std": 0.5982782319188118, "rewards/cosine_scaled_reward": -0.058613456785678864, "rewards/format_reward": 0.1666666716337204, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 2790.083335876465, "epoch": 0.24857142857142858, "grad_norm": 0.003148401854559779, "kl": 2.73287296295166e-05, "learning_rate": 1.469297078922642e-07, "loss": 0.0319, "reward": 0.5368229523301125, "reward_std": 0.4591746125370264, "rewards/cosine_scaled_reward": 0.1017447616904974, "rewards/format_reward": 0.3333333358168602, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 2851.5833740234375, "epoch": 0.24914285714285714, "grad_norm": 0.004788549616932869, "kl": 2.8088688850402832e-05, "learning_rate": 1.4554267916537495e-07, "loss": 0.1272, "reward": 0.8048395961523056, "reward_std": 0.9437854867428541, "rewards/cosine_scaled_reward": 0.15241979126585647, "rewards/format_reward": 0.5000000223517418, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1117.9167022705078, "epoch": 0.24971428571428572, "grad_norm": 0.0053713358938694, "kl": 3.5703182220458984e-05, "learning_rate": 1.4417536311769885e-07, "loss": 0.1976, "reward": 0.6703309891745448, "reward_std": 0.4448237419128418, "rewards/cosine_scaled_reward": -0.14400118589401245, "rewards/format_reward": 0.9583333432674408, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 3072.125, "epoch": 0.2502857142857143, "grad_norm": 0.0036464622244238853, "kl": 3.215670585632324e-05, "learning_rate": 1.4282782639029128e-07, "loss": -0.1335, "reward": -0.016481630504131317, "reward_std": 0.6812924295663834, "rewards/cosine_scaled_reward": -0.15407415106892586, "rewards/format_reward": 0.291666679084301, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 2314.8333892822266, "epoch": 0.25085714285714283, "grad_norm": 0.004501568619161844, "kl": 2.7614645659923553e-05, "learning_rate": 1.4150013466019114e-07, "loss": 0.101, "reward": 0.2146640159189701, "reward_std": 0.7314743623137474, "rewards/cosine_scaled_reward": -0.16350134275853634, "rewards/format_reward": 0.541666679084301, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1712.6666870117188, "epoch": 0.25142857142857145, "grad_norm": 0.003505983157083392, "kl": 3.999471664428711e-05, "learning_rate": 1.4019235263722034e-07, "loss": 0.0104, "reward": 0.25034987926483154, "reward_std": 0.5995085947215557, "rewards/cosine_scaled_reward": -0.24982509016990662, "rewards/format_reward": 0.75, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 3546.25, "epoch": 0.252, "grad_norm": 0.002209461061283946, "kl": 3.758072853088379e-05, "learning_rate": 1.3890454406082956e-07, "loss": 0.0221, "reward": -0.35466065257787704, "reward_std": 0.49806540086865425, "rewards/cosine_scaled_reward": -0.19816366396844387, "rewards/format_reward": 0.0416666679084301, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 2916.8333435058594, "epoch": 0.25257142857142856, "grad_norm": 0.0031127003021538258, "kl": 3.74913215637207e-05, "learning_rate": 1.3763677169699217e-07, "loss": 0.0835, "reward": 0.3389095589518547, "reward_std": 0.7792292460799217, "rewards/cosine_scaled_reward": -0.018045231699943542, "rewards/format_reward": 0.375, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 3364.25, "epoch": 0.25314285714285717, "grad_norm": 0.003212068462744355, "kl": 3.2901763916015625e-05, "learning_rate": 1.3638909733514452e-07, "loss": 0.0984, "reward": -0.482344675809145, "reward_std": 0.4047136679291725, "rewards/cosine_scaled_reward": -0.34533900767564774, "rewards/format_reward": 0.2083333395421505, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 3288.7916870117188, "epoch": 0.2537142857142857, "grad_norm": 0.0032868534326553345, "kl": 3.8504600524902344e-05, "learning_rate": 1.351615817851748e-07, "loss": 0.1118, "reward": -0.7099503725767136, "reward_std": 0.2111910916864872, "rewards/cosine_scaled_reward": -0.4174751862883568, "rewards/format_reward": 0.125, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 2749.8333435058594, "epoch": 0.2542857142857143, "grad_norm": 0.00827477965503931, "kl": 2.467632293701172e-05, "learning_rate": 1.3395428487445914e-07, "loss": 0.1266, "reward": 0.5643904823809862, "reward_std": 1.1060209525749087, "rewards/cosine_scaled_reward": 0.07386190351098776, "rewards/format_reward": 0.4166666716337204, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 2112.791717529297, "epoch": 0.25485714285714284, "grad_norm": 0.004392516333609819, "kl": 3.916025161743164e-05, "learning_rate": 1.3276726544494571e-07, "loss": 0.0797, "reward": 0.3459349423646927, "reward_std": 0.626835897564888, "rewards/cosine_scaled_reward": -0.1395325306802988, "rewards/format_reward": 0.6250000037252903, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 2798.0833740234375, "epoch": 0.25542857142857145, "grad_norm": 0.006370176561176777, "kl": 3.7223100662231445e-05, "learning_rate": 1.316005813502869e-07, "loss": 0.0973, "reward": -0.2408202663064003, "reward_std": 0.46487484872341156, "rewards/cosine_scaled_reward": -0.2662434671074152, "rewards/format_reward": 0.2916666679084301, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 3110.6666870117188, "epoch": 0.256, "grad_norm": 0.003158599603921175, "kl": 3.0681490898132324e-05, "learning_rate": 1.3045428945301953e-07, "loss": 0.1043, "reward": 1.0269762203097343, "reward_std": 0.9692520722746849, "rewards/cosine_scaled_reward": 0.28432143456302583, "rewards/format_reward": 0.4583333358168602, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 3133.9583740234375, "epoch": 0.25657142857142856, "grad_norm": 0.003640873124822974, "kl": 3.275275230407715e-05, "learning_rate": 1.2932844562179352e-07, "loss": 0.1736, "reward": 0.4554176330566406, "reward_std": 1.1091559082269669, "rewards/cosine_scaled_reward": 0.04020881466567516, "rewards/format_reward": 0.3750000149011612, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 2029.0000839233398, "epoch": 0.2571428571428571, "grad_norm": 0.004625717643648386, "kl": 1.671421341598034e-05, "learning_rate": 1.2822310472864885e-07, "loss": 0.3094, "reward": 0.5760099496692419, "reward_std": 0.6390558686107397, "rewards/cosine_scaled_reward": -0.02449503168463707, "rewards/format_reward": 0.6250000074505806, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 3325.0416870117188, "epoch": 0.25771428571428573, "grad_norm": 0.0026540823746472597, "kl": 5.924701690673828e-05, "learning_rate": 1.2713832064634125e-07, "loss": 0.0567, "reward": -0.5035366732627153, "reward_std": 0.5824164692312479, "rewards/cosine_scaled_reward": -0.3559350073337555, "rewards/format_reward": 0.2083333358168602, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 2121.0833587646484, "epoch": 0.2582857142857143, "grad_norm": 0.005077024921774864, "kl": 3.635883331298828e-05, "learning_rate": 1.260741462457165e-07, "loss": 0.1185, "reward": 0.1832403689622879, "reward_std": 0.5796112641692162, "rewards/cosine_scaled_reward": -0.20004649343900383, "rewards/format_reward": 0.5833333358168602, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 2773.8333587646484, "epoch": 0.25885714285714284, "grad_norm": 0.004313652869313955, "kl": 4.602968692779541e-05, "learning_rate": 1.2503063339313356e-07, "loss": 0.0262, "reward": 0.1849600374698639, "reward_std": 0.8068434670567513, "rewards/cosine_scaled_reward": -0.07418663054704666, "rewards/format_reward": 0.3333333358168602, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 2960.1250610351562, "epoch": 0.25942857142857145, "grad_norm": 0.003947154618799686, "kl": 5.765259265899658e-05, "learning_rate": 1.2400783294793668e-07, "loss": -0.0805, "reward": 0.42200128734111786, "reward_std": 0.6595911085605621, "rewards/cosine_scaled_reward": 0.023500639013946056, "rewards/format_reward": 0.375, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 3092.5833740234375, "epoch": 0.26, "grad_norm": 0.0020724909845739603, "kl": 2.4662003852427006e-05, "learning_rate": 1.2300579475997657e-07, "loss": 0.0212, "reward": 0.18169401306658983, "reward_std": 0.53652074187994, "rewards/cosine_scaled_reward": -0.07581967674195766, "rewards/format_reward": 0.3333333358168602, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 3508.25, "epoch": 0.26057142857142856, "grad_norm": 0.002207675715908408, "kl": 6.0498714447021484e-05, "learning_rate": 1.220245676671809e-07, "loss": 0.0432, "reward": -0.3331937964539975, "reward_std": 0.32354384288191795, "rewards/cosine_scaled_reward": -0.1874302327632904, "rewards/format_reward": 0.0416666679084301, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2852.875, "epoch": 0.2611428571428571, "grad_norm": 0.004505333956331015, "kl": 3.202259540557861e-05, "learning_rate": 1.2106419949317388e-07, "loss": 0.0693, "reward": -0.06501695513725281, "reward_std": 0.5639016218483448, "rewards/cosine_scaled_reward": -0.1991751492023468, "rewards/format_reward": 0.3333333358168602, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 2941.7916870117188, "epoch": 0.26171428571428573, "grad_norm": 0.003214410273358226, "kl": 5.9699639678001404e-05, "learning_rate": 1.2012473704494537e-07, "loss": -0.0677, "reward": 0.1302248314023018, "reward_std": 0.5533175133168697, "rewards/cosine_scaled_reward": -0.1223875880241394, "rewards/format_reward": 0.3750000037252903, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 2864.791717529297, "epoch": 0.2622857142857143, "grad_norm": 0.0027346203569322824, "kl": 2.7000904083251953e-05, "learning_rate": 1.1920622611056974e-07, "loss": 0.0438, "reward": -0.17386821657419205, "reward_std": 0.34645166620612144, "rewards/cosine_scaled_reward": -0.23276744224131107, "rewards/format_reward": 0.2916666679084301, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2982.291717529297, "epoch": 0.26285714285714284, "grad_norm": 0.002626008354127407, "kl": 6.622076034545898e-05, "learning_rate": 1.1830871145697412e-07, "loss": 0.0008, "reward": 0.1544975433498621, "reward_std": 0.5268663372844458, "rewards/cosine_scaled_reward": -0.06858456134796143, "rewards/format_reward": 0.2916666679084301, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 2816.250030517578, "epoch": 0.2634285714285714, "grad_norm": 0.003703025169670582, "kl": 6.437301635742188e-05, "learning_rate": 1.1743223682775649e-07, "loss": 0.0704, "reward": -0.15123708546161652, "reward_std": 0.4461169093847275, "rewards/cosine_scaled_reward": -0.24228521436452866, "rewards/format_reward": 0.3333333358168602, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 3322.9583740234375, "epoch": 0.264, "grad_norm": 0.002342699095606804, "kl": 2.6881694793701172e-05, "learning_rate": 1.1657684494105386e-07, "loss": 0.0368, "reward": 0.32628389447927475, "reward_std": 0.7125649503432214, "rewards/cosine_scaled_reward": 0.03814195655286312, "rewards/format_reward": 0.2500000111758709, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 3308.3333740234375, "epoch": 0.26457142857142857, "grad_norm": 0.0025879517197608948, "kl": 3.695487976074219e-05, "learning_rate": 1.1574257748745986e-07, "loss": 0.0016, "reward": -0.2570188567042351, "reward_std": 0.6527134664356709, "rewards/cosine_scaled_reward": -0.27434276416897774, "rewards/format_reward": 0.2916666716337204, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 2939.875030517578, "epoch": 0.2651428571428571, "grad_norm": 0.0039133490063250065, "kl": 2.7000904083251953e-05, "learning_rate": 1.1492947512799328e-07, "loss": 0.1896, "reward": 0.318607896566391, "reward_std": 0.6164628490805626, "rewards/cosine_scaled_reward": -0.0281960628926754, "rewards/format_reward": 0.3750000149011612, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 3552.5416870117188, "epoch": 0.26571428571428574, "grad_norm": 0.0021735162008553743, "kl": 4.029273986816406e-05, "learning_rate": 1.1413757749211602e-07, "loss": 0.0061, "reward": 0.23934463411569595, "reward_std": 0.917254988104105, "rewards/cosine_scaled_reward": 0.015505656599998474, "rewards/format_reward": 0.2083333395421505, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 2683.6250610351562, "epoch": 0.2662857142857143, "grad_norm": 0.0027948280330747366, "kl": 3.0606985092163086e-05, "learning_rate": 1.1336692317580158e-07, "loss": 0.1376, "reward": -0.14104346558451653, "reward_std": 0.6036871001124382, "rewards/cosine_scaled_reward": -0.2996883988380432, "rewards/format_reward": 0.4583333395421505, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 3156.7083740234375, "epoch": 0.26685714285714285, "grad_norm": 0.00244973530061543, "kl": 3.954768180847168e-05, "learning_rate": 1.1261754973965422e-07, "loss": 0.0728, "reward": 0.27295311354100704, "reward_std": 0.5461500771343708, "rewards/cosine_scaled_reward": -0.030190102756023407, "rewards/format_reward": 0.3333333432674408, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 2774.8333740234375, "epoch": 0.2674285714285714, "grad_norm": 0.0065203760750591755, "kl": 3.674626350402832e-05, "learning_rate": 1.1188949370707787e-07, "loss": 0.1816, "reward": 0.6325602661818266, "reward_std": 0.9874615147709846, "rewards/cosine_scaled_reward": 0.04544678330421448, "rewards/format_reward": 0.5416666865348816, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 2779.2916870117188, "epoch": 0.268, "grad_norm": 0.008147571235895157, "kl": 3.841519355773926e-05, "learning_rate": 1.1118279056249653e-07, "loss": 0.1902, "reward": -0.31290698423981667, "reward_std": 0.5613552518188953, "rewards/cosine_scaled_reward": -0.3231201581656933, "rewards/format_reward": 0.3333333432674408, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 2053.9166717529297, "epoch": 0.26857142857142857, "grad_norm": 0.004407951608300209, "kl": 3.729388117790222e-05, "learning_rate": 1.1049747474962444e-07, "loss": 0.0366, "reward": 0.7286863494664431, "reward_std": 0.6468604430556297, "rewards/cosine_scaled_reward": 0.09350983053445816, "rewards/format_reward": 0.5416666679084301, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 2322.8334045410156, "epoch": 0.26914285714285713, "grad_norm": 0.0044290050864219666, "kl": 3.1054019927978516e-05, "learning_rate": 1.0983357966978745e-07, "loss": 0.1892, "reward": 0.12340907007455826, "reward_std": 0.5407845675945282, "rewards/cosine_scaled_reward": -0.25079547613859177, "rewards/format_reward": 0.6250000149011612, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 3183.666748046875, "epoch": 0.26971428571428574, "grad_norm": 0.004356452263891697, "kl": 3.731250762939453e-05, "learning_rate": 1.0919113768029517e-07, "loss": 0.1507, "reward": -0.3689257865771651, "reward_std": 0.3060622923076153, "rewards/cosine_scaled_reward": -0.28862956492230296, "rewards/format_reward": 0.2083333358168602, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 3048.375030517578, "epoch": 0.2702857142857143, "grad_norm": 0.00265933433547616, "kl": 4.579126834869385e-05, "learning_rate": 1.0857018009286381e-07, "loss": -0.046, "reward": -0.20010873675346375, "reward_std": 0.5477882605046034, "rewards/cosine_scaled_reward": -0.26672103721648455, "rewards/format_reward": 0.3333333358168602, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2537.4583740234375, "epoch": 0.27085714285714285, "grad_norm": 0.002336263656616211, "kl": 1.895427703857422e-05, "learning_rate": 1.0797073717209013e-07, "loss": 0.0463, "reward": 0.570537842810154, "reward_std": 0.3717269003391266, "rewards/cosine_scaled_reward": -0.027231089770793915, "rewards/format_reward": 0.6250000037252903, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 2848.0000610351562, "epoch": 0.2714285714285714, "grad_norm": 0.0036343473475426435, "kl": 5.441904067993164e-05, "learning_rate": 1.0739283813397639e-07, "loss": 0.0328, "reward": 0.47458529472351074, "reward_std": 0.9797524027526379, "rewards/cosine_scaled_reward": 0.008125968277454376, "rewards/format_reward": 0.4583333507180214, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 3290.625, "epoch": 0.272, "grad_norm": 0.0035789578687399626, "kl": 4.13060188293457e-05, "learning_rate": 1.068365111445064e-07, "loss": 0.112, "reward": -0.6011077389121056, "reward_std": 0.2807481847703457, "rewards/cosine_scaled_reward": -0.3630538806319237, "rewards/format_reward": 0.125, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 2985.916717529297, "epoch": 0.2725714285714286, "grad_norm": 0.0031627591233700514, "kl": 3.9383769035339355e-05, "learning_rate": 1.063017833182728e-07, "loss": 0.0719, "reward": -0.1700519472360611, "reward_std": 0.6547817178070545, "rewards/cosine_scaled_reward": -0.23085930198431015, "rewards/format_reward": 0.291666679084301, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.27314285714285713, "grad_norm": 0.002635597251355648, "kl": 4.0918588638305664e-05, "learning_rate": 1.0578868071715544e-07, "loss": 0.0, "reward": -0.2367440052330494, "reward_std": 0.47697851806879044, "rewards/cosine_scaled_reward": -0.1392053384333849, "rewards/format_reward": 0.0416666679084301, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 2676.2500610351562, "epoch": 0.2737142857142857, "grad_norm": 0.0036513032391667366, "kl": 5.620718002319336e-05, "learning_rate": 1.0529722834905125e-07, "loss": 0.0769, "reward": 0.6267237924039364, "reward_std": 0.7198405005037785, "rewards/cosine_scaled_reward": 0.0841952059417963, "rewards/format_reward": 0.4583333395421505, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 3194.9583740234375, "epoch": 0.2742857142857143, "grad_norm": 0.006981777027249336, "kl": 5.0008296966552734e-05, "learning_rate": 1.0482745016665526e-07, "loss": 0.0839, "reward": 0.23194425785914063, "reward_std": 0.38723673298954964, "rewards/cosine_scaled_reward": -0.0715278685092926, "rewards/format_reward": 0.375, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 2998.125, "epoch": 0.27485714285714286, "grad_norm": 0.0035159483086317778, "kl": 7.155537605285645e-05, "learning_rate": 1.0437936906629334e-07, "loss": 0.0702, "reward": -0.281748965382576, "reward_std": 0.1873009279370308, "rewards/cosine_scaled_reward": -0.2658744901418686, "rewards/format_reward": 0.25, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 2691.125, "epoch": 0.2754285714285714, "grad_norm": 0.0028464747592806816, "kl": 3.9130449295043945e-05, "learning_rate": 1.0395300688680625e-07, "loss": 0.0843, "reward": -0.07830730825662613, "reward_std": 0.46800680086016655, "rewards/cosine_scaled_reward": -0.24748700112104416, "rewards/format_reward": 0.4166666716337204, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 3004.375030517578, "epoch": 0.276, "grad_norm": 0.003125025425106287, "kl": 5.441904067993164e-05, "learning_rate": 1.0354838440848501e-07, "loss": 0.0821, "reward": -0.26137344539165497, "reward_std": 0.4395611882209778, "rewards/cosine_scaled_reward": -0.29735338501632214, "rewards/format_reward": 0.3333333358168602, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 2520.9167098999023, "epoch": 0.2765714285714286, "grad_norm": 0.008685966953635216, "kl": 2.4566426873207092e-05, "learning_rate": 1.0316552135205837e-07, "loss": 0.1215, "reward": 0.03355927672237158, "reward_std": 0.3452487476170063, "rewards/cosine_scaled_reward": -0.23322037234902382, "rewards/format_reward": 0.5000000074505806, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 3118.8750610351562, "epoch": 0.27714285714285714, "grad_norm": 0.006218232214450836, "kl": 4.51207160949707e-05, "learning_rate": 1.0280443637773163e-07, "loss": 0.1853, "reward": -0.19294019415974617, "reward_std": 0.6819088384509087, "rewards/cosine_scaled_reward": -0.24230343848466873, "rewards/format_reward": 0.2916666716337204, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2653.7083587646484, "epoch": 0.2777142857142857, "grad_norm": 0.008190950378775597, "kl": 5.608797073364258e-05, "learning_rate": 1.0246514708427701e-07, "loss": 0.1149, "reward": 0.3064362294971943, "reward_std": 1.1603137403726578, "rewards/cosine_scaled_reward": -0.05511523166205734, "rewards/format_reward": 0.4166666828095913, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1755.1667022705078, "epoch": 0.2782857142857143, "grad_norm": 0.004699239507317543, "kl": 2.8789043426513672e-05, "learning_rate": 1.0214767000817596e-07, "loss": 0.1845, "reward": 0.8687861561775208, "reward_std": 0.8345159851014614, "rewards/cosine_scaled_reward": 0.10105973854660988, "rewards/format_reward": 0.6666666716337204, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 3323.916748046875, "epoch": 0.27885714285714286, "grad_norm": 0.0025124498642981052, "kl": 3.583729267120361e-05, "learning_rate": 1.0185202062281336e-07, "loss": 0.0357, "reward": 0.3644334077835083, "reward_std": 0.9635904431343079, "rewards/cosine_scaled_reward": -0.02611662377603352, "rewards/format_reward": 0.416666679084301, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 2957.7501220703125, "epoch": 0.2794285714285714, "grad_norm": 0.006344238296151161, "kl": 6.568431854248047e-05, "learning_rate": 1.0157821333772304e-07, "loss": 0.263, "reward": -0.2175223045051098, "reward_std": 0.7979172915220261, "rewards/cosine_scaled_reward": -0.27542782574892044, "rewards/format_reward": 0.3333333395421505, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 2238.6666717529297, "epoch": 0.28, "grad_norm": 0.003336644498631358, "kl": 4.173070192337036e-05, "learning_rate": 1.013262614978859e-07, "loss": 0.0455, "reward": 0.44363706558942795, "reward_std": 0.30416621919721365, "rewards/cosine_scaled_reward": -0.02818143740296364, "rewards/format_reward": 0.5, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 2797.9584045410156, "epoch": 0.2805714285714286, "grad_norm": 0.0057897004298865795, "kl": 3.555417060852051e-05, "learning_rate": 1.0109617738307911e-07, "loss": 0.1934, "reward": 0.4745689434930682, "reward_std": 0.9885793998837471, "rewards/cosine_scaled_reward": -0.012715548276901245, "rewards/format_reward": 0.5, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 2865.6666870117188, "epoch": 0.28114285714285714, "grad_norm": 0.0027676261961460114, "kl": 6.0677528381347656e-05, "learning_rate": 1.0088797220727779e-07, "loss": 0.064, "reward": 0.0014106929302215576, "reward_std": 0.5617579072713852, "rewards/cosine_scaled_reward": -0.18679466377943754, "rewards/format_reward": 0.3750000149011612, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 2518.0, "epoch": 0.2817142857142857, "grad_norm": 0.007662121206521988, "kl": 5.4836273193359375e-05, "learning_rate": 1.0070165611810855e-07, "loss": 0.1662, "reward": 0.1066875010728836, "reward_std": 0.6054680943489075, "rewards/cosine_scaled_reward": -0.1758229173719883, "rewards/format_reward": 0.4583333358168602, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 2421.416748046875, "epoch": 0.2822857142857143, "grad_norm": 0.004627623129636049, "kl": 4.842877388000488e-05, "learning_rate": 1.005372381963547e-07, "loss": 0.2092, "reward": 0.36174818873405457, "reward_std": 1.1407049596309662, "rewards/cosine_scaled_reward": -0.04829256609082222, "rewards/format_reward": 0.4583333469927311, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 3418.416748046875, "epoch": 0.28285714285714286, "grad_norm": 0.0037113623693585396, "kl": 4.0724873542785645e-05, "learning_rate": 1.0039472645551372e-07, "loss": 0.0797, "reward": -0.3331603854894638, "reward_std": 0.5591974928975105, "rewards/cosine_scaled_reward": -0.2499135248363018, "rewards/format_reward": 0.1666666716337204, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1695.8334350585938, "epoch": 0.2834285714285714, "grad_norm": 0.006037239450961351, "kl": 4.509091377258301e-05, "learning_rate": 1.002741278414069e-07, "loss": 0.133, "reward": 0.7991033643484116, "reward_std": 0.8886711373925209, "rewards/cosine_scaled_reward": -0.03794832527637482, "rewards/format_reward": 0.8750000149011612, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 3584.0, "epoch": 0.284, "grad_norm": 0.001694100210443139, "kl": 3.255903720855713e-05, "learning_rate": 1.0017544823184055e-07, "loss": 0.0, "reward": -0.34066522028297186, "reward_std": 0.3968076538294554, "rewards/cosine_scaled_reward": -0.19116594083607197, "rewards/format_reward": 0.0416666679084301, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 2825.500045776367, "epoch": 0.2845714285714286, "grad_norm": 0.0027246945537626743, "kl": 4.690885543823242e-05, "learning_rate": 1.0009869243631952e-07, "loss": 0.0815, "reward": 0.11544790863990784, "reward_std": 0.7895869575440884, "rewards/cosine_scaled_reward": -0.12977605033665895, "rewards/format_reward": 0.3750000037252903, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 2991.5001220703125, "epoch": 0.28514285714285714, "grad_norm": 0.0031819732394069433, "kl": 4.802830517292023e-05, "learning_rate": 1.000438641958131e-07, "loss": 0.0904, "reward": 0.7263234108686447, "reward_std": 1.3813723027706146, "rewards/cosine_scaled_reward": 0.11316169673227705, "rewards/format_reward": 0.5000000149011612, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 2482.4583435058594, "epoch": 0.2857142857142857, "grad_norm": 0.0063637965358793736, "kl": 3.319978713989258e-05, "learning_rate": 1.0001096618257236e-07, "loss": 0.048, "reward": -0.07245631515979767, "reward_std": 0.4825677201151848, "rewards/cosine_scaled_reward": -0.26539483666419983, "rewards/format_reward": 0.4583333432674408, "step": 500 }, { "epoch": 0.2857142857142857, "step": 500, "total_flos": 0.0, "train_loss": 0.06715515071895266, "train_runtime": 24398.5227, "train_samples_per_second": 0.492, "train_steps_per_second": 0.02 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }