{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1644.166748046875, "epoch": 0.001142857142857143, "grad_norm": 0.20607953518495117, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0022, "reward": -0.1127668060362339, "reward_std": 0.20213491283357143, "rewards/cosine_scaled_reward": -0.18138340720906854, "rewards/format_reward": 0.25, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1656.791748046875, "epoch": 0.002285714285714286, "grad_norm": 0.31679714617652144, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0623, "reward": -0.05582176148891449, "reward_std": 0.6275629922747612, "rewards/cosine_scaled_reward": -0.19457754865288734, "rewards/format_reward": 0.3333333432674408, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1606.7500610351562, "epoch": 0.0034285714285714284, "grad_norm": 0.2789602147805501, "kl": 3.388524055480957e-05, "learning_rate": 6e-08, "loss": 0.0376, "reward": -0.2583192214369774, "reward_std": 0.2636854462325573, "rewards/cosine_scaled_reward": -0.222909614443779, "rewards/format_reward": 0.1875000074505806, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 1690.6250610351562, "epoch": 0.004571428571428572, "grad_norm": 0.27232938747073254, "kl": 4.017353057861328e-05, "learning_rate": 8e-08, "loss": 0.0159, "reward": -0.40017254278063774, "reward_std": 0.17111004143953323, "rewards/cosine_scaled_reward": -0.3146696165204048, "rewards/format_reward": 0.2291666716337204, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1618.3541870117188, "epoch": 0.005714285714285714, "grad_norm": 0.2939867481096334, "kl": 2.8431415557861328e-05, "learning_rate": 1e-07, "loss": 0.0576, "reward": 0.13743871822953224, "reward_std": 0.7271581590175629, "rewards/cosine_scaled_reward": -0.12919731251895428, "rewards/format_reward": 0.3958333395421505, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1629.4791870117188, "epoch": 0.006857142857142857, "grad_norm": 0.248871735331751, "kl": 3.477931022644043e-05, "learning_rate": 1.2e-07, "loss": -0.0029, "reward": -0.029103130102157593, "reward_std": 0.5708433166146278, "rewards/cosine_scaled_reward": -0.1708015874028206, "rewards/format_reward": 0.3125000037252903, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 1490.6458740234375, "epoch": 0.008, "grad_norm": 0.22790937530079167, "kl": 3.007054328918457e-05, "learning_rate": 1.4e-07, "loss": 0.0903, "reward": 0.12145921215415001, "reward_std": 0.5416159555315971, "rewards/cosine_scaled_reward": -0.10593708232045174, "rewards/format_reward": 0.33333334140479565, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 1683.5000305175781, "epoch": 0.009142857142857144, "grad_norm": 0.20752077742039396, "kl": 4.646182060241699e-05, "learning_rate": 1.6e-07, "loss": 0.0277, "reward": -0.23692437633872032, "reward_std": 0.4620281979441643, "rewards/cosine_scaled_reward": -0.2747122012078762, "rewards/format_reward": 0.31250000558793545, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1719.2292175292969, "epoch": 0.010285714285714285, "grad_norm": 0.2983323511333683, "kl": 4.1991472244262695e-05, "learning_rate": 1.8e-07, "loss": 0.0511, "reward": -0.31221747025847435, "reward_std": 0.21310735493898392, "rewards/cosine_scaled_reward": -0.24985874257981777, "rewards/format_reward": 0.1875000074505806, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1477.2083740234375, "epoch": 0.011428571428571429, "grad_norm": 0.23645082786220448, "kl": 3.116577863693237e-05, "learning_rate": 2e-07, "loss": 0.0495, "reward": 0.37697479128837585, "reward_std": 0.44906593672931194, "rewards/cosine_scaled_reward": -0.05109592713415623, "rewards/format_reward": 0.4791666716337204, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1508.8958587646484, "epoch": 0.012571428571428572, "grad_norm": 0.339825377520832, "kl": 2.8848648071289062e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0535, "reward": -0.13005081936717033, "reward_std": 0.6173823103308678, "rewards/cosine_scaled_reward": -0.2525254301726818, "rewards/format_reward": 0.37500000558793545, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1631.1041870117188, "epoch": 0.013714285714285714, "grad_norm": 0.20658630326267732, "kl": 3.084540367126465e-05, "learning_rate": 2.4e-07, "loss": 0.0635, "reward": 0.03064786270260811, "reward_std": 0.4376446008682251, "rewards/cosine_scaled_reward": -0.1513427309691906, "rewards/format_reward": 0.33333334140479565, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1422.604232788086, "epoch": 0.014857142857142857, "grad_norm": 0.23614097630983502, "kl": 2.527981996536255e-05, "learning_rate": 2.6e-07, "loss": -0.0306, "reward": 0.4512472003698349, "reward_std": 0.40983884781599045, "rewards/cosine_scaled_reward": -0.02437640482094139, "rewards/format_reward": 0.5, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1652.3542175292969, "epoch": 0.016, "grad_norm": 0.2206408502680819, "kl": 3.93986701965332e-05, "learning_rate": 2.8e-07, "loss": 0.0059, "reward": -0.2542928569018841, "reward_std": 0.17246506363153458, "rewards/cosine_scaled_reward": -0.26256311126053333, "rewards/format_reward": 0.2708333395421505, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1679.229248046875, "epoch": 0.017142857142857144, "grad_norm": 0.2314183406404789, "kl": 4.3898820877075195e-05, "learning_rate": 3e-07, "loss": 0.0053, "reward": -0.258657343685627, "reward_std": 0.23606499657034874, "rewards/cosine_scaled_reward": -0.1918286692816764, "rewards/format_reward": 0.125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1396.7917175292969, "epoch": 0.018285714285714287, "grad_norm": 0.25436941656143647, "kl": 2.3171305656433105e-05, "learning_rate": 3.2e-07, "loss": 0.1053, "reward": 0.20216324925422668, "reward_std": 0.4999893419444561, "rewards/cosine_scaled_reward": -0.13850171491503716, "rewards/format_reward": 0.4791666716337204, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1719.416748046875, "epoch": 0.019428571428571427, "grad_norm": 0.23312894299622924, "kl": 4.0084123611450195e-05, "learning_rate": 3.4000000000000003e-07, "loss": -0.0007, "reward": -0.41149570792913437, "reward_std": 0.13166083209216595, "rewards/cosine_scaled_reward": -0.26824783720076084, "rewards/format_reward": 0.125, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1686.0833740234375, "epoch": 0.02057142857142857, "grad_norm": 0.24676487462788851, "kl": 4.7713518142700195e-05, "learning_rate": 3.6e-07, "loss": 0.0814, "reward": -0.32610235549509525, "reward_std": 0.23402154073119164, "rewards/cosine_scaled_reward": -0.25680116564035416, "rewards/format_reward": 0.18750000186264515, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 1773.6458740234375, "epoch": 0.021714285714285714, "grad_norm": 0.21561964662639843, "kl": 2.1457672119140625e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0164, "reward": -0.5961569249629974, "reward_std": 0.1714775264263153, "rewards/cosine_scaled_reward": -0.3501618057489395, "rewards/format_reward": 0.10416666977107525, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1529.3125610351562, "epoch": 0.022857142857142857, "grad_norm": 0.251130340260543, "kl": 3.24249267578125e-05, "learning_rate": 4e-07, "loss": 0.0293, "reward": -0.048260755836963654, "reward_std": 0.34835576079785824, "rewards/cosine_scaled_reward": -0.20121371746063232, "rewards/format_reward": 0.35416667722165585, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1494.6250305175781, "epoch": 0.024, "grad_norm": 0.3018968569179871, "kl": 2.6673078536987305e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0278, "reward": 0.021329142153263092, "reward_std": 0.45257429778575897, "rewards/cosine_scaled_reward": -0.15600210055708885, "rewards/format_reward": 0.3333333358168602, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 1778.5625610351562, "epoch": 0.025142857142857144, "grad_norm": 0.29253387654098556, "kl": 3.1888484954833984e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0494, "reward": -0.5034094974398613, "reward_std": 0.3080843798816204, "rewards/cosine_scaled_reward": -0.29337141662836075, "rewards/format_reward": 0.08333333395421505, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1762.8958740234375, "epoch": 0.026285714285714287, "grad_norm": 0.21053978305274443, "kl": 4.506111145019531e-05, "learning_rate": 4.6e-07, "loss": 0.0144, "reward": -0.028878159821033478, "reward_std": 0.5564102046191692, "rewards/cosine_scaled_reward": -0.10818908177316189, "rewards/format_reward": 0.1875000074505806, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 1352.5625305175781, "epoch": 0.027428571428571427, "grad_norm": 0.20202450012624545, "kl": 1.6548670828342438e-05, "learning_rate": 4.8e-07, "loss": 0.0005, "reward": 0.6555859744548798, "reward_std": 0.47822858951985836, "rewards/cosine_scaled_reward": 0.06737629324197769, "rewards/format_reward": 0.520833333954215, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1597.1875610351562, "epoch": 0.02857142857142857, "grad_norm": 0.4327230812041704, "kl": 3.0606985092163086e-05, "learning_rate": 5e-07, "loss": 0.0701, "reward": 0.05484675616025925, "reward_std": 0.6329891942441463, "rewards/cosine_scaled_reward": -0.11840994283556938, "rewards/format_reward": 0.29166667722165585, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1647.916748046875, "epoch": 0.029714285714285714, "grad_norm": 0.21123992049117873, "kl": 2.2917985916137695e-05, "learning_rate": 5.2e-07, "loss": 0.031, "reward": -0.24321994185447693, "reward_std": 0.12097731977701187, "rewards/cosine_scaled_reward": -0.18410997837781906, "rewards/format_reward": 0.125, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 1638.8958740234375, "epoch": 0.030857142857142857, "grad_norm": 0.21745088219923464, "kl": 3.2067298889160156e-05, "learning_rate": 5.4e-07, "loss": -0.0097, "reward": -0.3657397888600826, "reward_std": 0.24539830163121223, "rewards/cosine_scaled_reward": -0.2974532376974821, "rewards/format_reward": 0.2291666716337204, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1711.2709045410156, "epoch": 0.032, "grad_norm": 0.2552233664551883, "kl": 2.8468668460845947e-05, "learning_rate": 5.6e-07, "loss": 0.0256, "reward": -0.38710537925362587, "reward_std": 0.2530311979353428, "rewards/cosine_scaled_reward": -0.2768860347568989, "rewards/format_reward": 0.1666666716337204, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1713.8125610351562, "epoch": 0.03314285714285714, "grad_norm": 0.202249350617508, "kl": 2.86102294921875e-05, "learning_rate": 5.8e-07, "loss": 0.0135, "reward": -0.1931730881333351, "reward_std": 0.5632064789533615, "rewards/cosine_scaled_reward": -0.20075321290642023, "rewards/format_reward": 0.2083333358168602, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1732.291748046875, "epoch": 0.03428571428571429, "grad_norm": 0.23328556356102392, "kl": 2.165883779525757e-05, "learning_rate": 6e-07, "loss": 0.0564, "reward": -0.3746844604611397, "reward_std": 0.34011659026145935, "rewards/cosine_scaled_reward": -0.24984224140644073, "rewards/format_reward": 0.12500000186264515, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1445.3125305175781, "epoch": 0.03542857142857143, "grad_norm": 0.30643607095324277, "kl": 3.966689109802246e-05, "learning_rate": 6.2e-07, "loss": 0.0923, "reward": -0.09436208941042423, "reward_std": 0.3265727870166302, "rewards/cosine_scaled_reward": -0.21384770551230758, "rewards/format_reward": 0.33333333395421505, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1810.7917175292969, "epoch": 0.036571428571428574, "grad_norm": 0.20484433233713875, "kl": 2.8021633625030518e-05, "learning_rate": 6.4e-07, "loss": 0.0202, "reward": -0.5034667998552322, "reward_std": 0.15860500000417233, "rewards/cosine_scaled_reward": -0.2621500678360462, "rewards/format_reward": 0.02083333395421505, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1750.9584045410156, "epoch": 0.037714285714285714, "grad_norm": 0.2027434434467969, "kl": 2.5600194931030273e-05, "learning_rate": 6.6e-07, "loss": -0.0171, "reward": -0.25296103954315186, "reward_std": 0.4817052260041237, "rewards/cosine_scaled_reward": -0.2514805067330599, "rewards/format_reward": 0.25000000558793545, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 1634.8333740234375, "epoch": 0.038857142857142854, "grad_norm": 0.23764579059557195, "kl": 2.331659197807312e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0003, "reward": -0.3657361939549446, "reward_std": 0.2039697989821434, "rewards/cosine_scaled_reward": -0.25578476674854755, "rewards/format_reward": 0.14583333395421505, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1691.1875610351562, "epoch": 0.04, "grad_norm": 0.2390715088796384, "kl": 1.8522143363952637e-05, "learning_rate": 7e-07, "loss": 0.0579, "reward": -0.1916074175387621, "reward_std": 0.40257398039102554, "rewards/cosine_scaled_reward": -0.23122038505971432, "rewards/format_reward": 0.27083334885537624, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1526.2292175292969, "epoch": 0.04114285714285714, "grad_norm": 0.2361249356185026, "kl": 3.781914710998535e-05, "learning_rate": 7.2e-07, "loss": 0.0401, "reward": 0.35939645767211914, "reward_std": 0.39011720940470695, "rewards/cosine_scaled_reward": -0.01821846514940262, "rewards/format_reward": 0.3958333395421505, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1645.7708740234375, "epoch": 0.04228571428571429, "grad_norm": 0.26864783041008133, "kl": 3.820657730102539e-05, "learning_rate": 7.4e-07, "loss": 0.0746, "reward": -0.2870800420641899, "reward_std": 0.46812814101576805, "rewards/cosine_scaled_reward": -0.25812335684895515, "rewards/format_reward": 0.2291666679084301, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1722.5000610351562, "epoch": 0.04342857142857143, "grad_norm": 0.27664066975056834, "kl": 5.131959915161133e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0586, "reward": -0.15014038234949112, "reward_std": 0.4126087427139282, "rewards/cosine_scaled_reward": -0.2000702191144228, "rewards/format_reward": 0.2500000074505806, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1678.7083740234375, "epoch": 0.044571428571428574, "grad_norm": 0.3003829192682386, "kl": 4.968792200088501e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.097, "reward": -0.21257384680211544, "reward_std": 0.48539142310619354, "rewards/cosine_scaled_reward": -0.2312869280576706, "rewards/format_reward": 0.2500000111758709, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 1690.8958740234375, "epoch": 0.045714285714285714, "grad_norm": 0.20909108511646457, "kl": 5.0902366638183594e-05, "learning_rate": 8e-07, "loss": 0.0436, "reward": -0.5045258924365044, "reward_std": 0.2920587807893753, "rewards/cosine_scaled_reward": -0.3564296290278435, "rewards/format_reward": 0.2083333358168602, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1806.3334045410156, "epoch": 0.046857142857142854, "grad_norm": 0.2168555566166619, "kl": 3.137439489364624e-05, "learning_rate": 8.199999999999999e-07, "loss": -0.0012, "reward": 0.04771171510219574, "reward_std": 0.33250839821994305, "rewards/cosine_scaled_reward": -0.06989414617419243, "rewards/format_reward": 0.18750000186264515, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1300.6250457763672, "epoch": 0.048, "grad_norm": 0.40542845209419376, "kl": 0.000291675329208374, "learning_rate": 8.399999999999999e-07, "loss": 0.0768, "reward": 0.27488730661571026, "reward_std": 0.45710677094757557, "rewards/cosine_scaled_reward": -0.1646396858850494, "rewards/format_reward": 0.6041666716337204, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 1705.8750610351562, "epoch": 0.04914285714285714, "grad_norm": 0.21842925663095267, "kl": 3.538280725479126e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0308, "reward": -0.2755163535475731, "reward_std": 0.3637393806129694, "rewards/cosine_scaled_reward": -0.2210915139876306, "rewards/format_reward": 0.1666666679084301, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1665.0625305175781, "epoch": 0.05028571428571429, "grad_norm": 0.26271417694787236, "kl": 0.00046503543853759766, "learning_rate": 8.799999999999999e-07, "loss": 0.073, "reward": -0.12092901021242142, "reward_std": 0.5556337833404541, "rewards/cosine_scaled_reward": -0.17504783952608705, "rewards/format_reward": 0.2291666679084301, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1733.2084045410156, "epoch": 0.05142857142857143, "grad_norm": 0.21285192669515357, "kl": 5.0537288188934326e-05, "learning_rate": 9e-07, "loss": 0.0423, "reward": -0.05799056589603424, "reward_std": 0.4342048391699791, "rewards/cosine_scaled_reward": -0.14357861876487732, "rewards/format_reward": 0.22916666977107525, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1640.0834045410156, "epoch": 0.052571428571428575, "grad_norm": 0.2622293688477209, "kl": 0.00013068318367004395, "learning_rate": 9.2e-07, "loss": 0.0317, "reward": -0.005384169518947601, "reward_std": 0.3068407401442528, "rewards/cosine_scaled_reward": -0.1068587563931942, "rewards/format_reward": 0.20833333395421505, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1498.8333892822266, "epoch": 0.053714285714285714, "grad_norm": 0.274608905827555, "kl": 0.0001885145902633667, "learning_rate": 9.399999999999999e-07, "loss": 0.049, "reward": -0.002073638141155243, "reward_std": 0.4514222964644432, "rewards/cosine_scaled_reward": -0.17812015302479267, "rewards/format_reward": 0.3541666753590107, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1610.4792175292969, "epoch": 0.054857142857142854, "grad_norm": 0.24771930467103717, "kl": 0.00015616416931152344, "learning_rate": 9.6e-07, "loss": 0.0334, "reward": -0.22091616783291101, "reward_std": 0.33334225323051214, "rewards/cosine_scaled_reward": -0.21462474018335342, "rewards/format_reward": 0.20833334140479565, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1341.1458740234375, "epoch": 0.056, "grad_norm": 0.3710205417665813, "kl": 0.00029793381690979004, "learning_rate": 9.8e-07, "loss": 0.0862, "reward": 0.40674951672554016, "reward_std": 0.5115297809243202, "rewards/cosine_scaled_reward": -0.025791920721530914, "rewards/format_reward": 0.45833333395421505, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 1335.1667175292969, "epoch": 0.05714285714285714, "grad_norm": 0.3034272517231627, "kl": 0.0005925297737121582, "learning_rate": 1e-06, "loss": 0.1036, "reward": 0.36978277564048767, "reward_std": 0.4990865057334304, "rewards/cosine_scaled_reward": -0.033858626149594784, "rewards/format_reward": 0.43750002048909664, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1686.8959045410156, "epoch": 0.05828571428571429, "grad_norm": 0.3009121706411098, "kl": 0.00032591819763183594, "learning_rate": 9.999890338174275e-07, "loss": 0.0864, "reward": -0.20582207757979631, "reward_std": 0.5198994930833578, "rewards/cosine_scaled_reward": -0.19666103832423687, "rewards/format_reward": 0.1875000111758709, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1718.2291870117188, "epoch": 0.05942857142857143, "grad_norm": 0.21311754620957382, "kl": 0.0005127787590026855, "learning_rate": 9.999561358041868e-07, "loss": 0.0262, "reward": -0.39756081253290176, "reward_std": 0.34694093093276024, "rewards/cosine_scaled_reward": -0.2716970667243004, "rewards/format_reward": 0.1458333358168602, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1611.8334045410156, "epoch": 0.060571428571428575, "grad_norm": 0.22683388578373892, "kl": 0.0005531832575798035, "learning_rate": 9.999013075636804e-07, "loss": 0.068, "reward": -0.13391486555337906, "reward_std": 0.27848392724990845, "rewards/cosine_scaled_reward": -0.22320742718875408, "rewards/format_reward": 0.31250000186264515, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1442.0834045410156, "epoch": 0.061714285714285715, "grad_norm": 0.24769106962876689, "kl": 0.0002713203430175781, "learning_rate": 9.998245517681593e-07, "loss": 0.0911, "reward": -0.11875106766819954, "reward_std": 0.1542784534394741, "rewards/cosine_scaled_reward": -0.2572922073304653, "rewards/format_reward": 0.3958333432674408, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1688.4167175292969, "epoch": 0.06285714285714286, "grad_norm": 0.22851815885942953, "kl": 0.0001881718635559082, "learning_rate": 9.997258721585931e-07, "loss": 0.0068, "reward": -0.3640219047665596, "reward_std": 0.2585913948714733, "rewards/cosine_scaled_reward": -0.2965943031013012, "rewards/format_reward": 0.2291666753590107, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1569.4166870117188, "epoch": 0.064, "grad_norm": 0.2466081306910316, "kl": 0.0021448135375976562, "learning_rate": 9.996052735444862e-07, "loss": 0.096, "reward": -0.4589140391908586, "reward_std": 0.4320836700499058, "rewards/cosine_scaled_reward": -0.3440403640270233, "rewards/format_reward": 0.2291666679084301, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 1629.979248046875, "epoch": 0.06514285714285714, "grad_norm": 0.22573731739546327, "kl": 0.0010238885879516602, "learning_rate": 9.994627618036452e-07, "loss": 0.0592, "reward": -0.3061641752719879, "reward_std": 0.5002065226435661, "rewards/cosine_scaled_reward": -0.26766542345285416, "rewards/format_reward": 0.2291666716337204, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1660.4792175292969, "epoch": 0.06628571428571428, "grad_norm": 0.22190381637143303, "kl": 0.0011049509048461914, "learning_rate": 9.992983438818915e-07, "loss": 0.022, "reward": -0.32173825055360794, "reward_std": 0.27725364826619625, "rewards/cosine_scaled_reward": -0.2754524536430836, "rewards/format_reward": 0.2291666679084301, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1690.0417175292969, "epoch": 0.06742857142857143, "grad_norm": 0.21914617585966853, "kl": 0.0010164976119995117, "learning_rate": 9.991120277927223e-07, "loss": 0.0444, "reward": -0.021609768271446228, "reward_std": 0.3677750062197447, "rewards/cosine_scaled_reward": -0.135804895311594, "rewards/format_reward": 0.25000000558793545, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1581.6875305175781, "epoch": 0.06857142857142857, "grad_norm": 0.4016735260144472, "kl": 0.01423954963684082, "learning_rate": 9.989038226169207e-07, "loss": 0.0192, "reward": 0.11502109467983246, "reward_std": 0.29630398005247116, "rewards/cosine_scaled_reward": -0.057072801515460014, "rewards/format_reward": 0.2291666716337204, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1475.5833740234375, "epoch": 0.06971428571428571, "grad_norm": 0.24285848407581584, "kl": 0.0003628730773925781, "learning_rate": 9.98673738502114e-07, "loss": 0.0731, "reward": 0.5937481597065926, "reward_std": 0.6881431620568037, "rewards/cosine_scaled_reward": 0.046874068677425385, "rewards/format_reward": 0.5000000149011612, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1805.7500610351562, "epoch": 0.07085714285714285, "grad_norm": 0.19714468440546948, "kl": 0.0005519390106201172, "learning_rate": 9.98421786662277e-07, "loss": 0.0172, "reward": -0.4636555463075638, "reward_std": 0.3160466430708766, "rewards/cosine_scaled_reward": -0.2734944522380829, "rewards/format_reward": 0.08333333395421505, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1329.2917175292969, "epoch": 0.072, "grad_norm": 0.28510447078335305, "kl": 0.004929542541503906, "learning_rate": 9.981479793771866e-07, "loss": 0.1079, "reward": 0.30475724674761295, "reward_std": 0.4675188772380352, "rewards/cosine_scaled_reward": -0.0976213626563549, "rewards/format_reward": 0.5000000149011612, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1636.5208740234375, "epoch": 0.07314285714285715, "grad_norm": 0.20815660806735267, "kl": 0.0003604888916015625, "learning_rate": 9.97852329991824e-07, "loss": 0.0625, "reward": 0.29327625688165426, "reward_std": 0.5610844530165195, "rewards/cosine_scaled_reward": -0.03044520819094032, "rewards/format_reward": 0.354166679084301, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1559.5000305175781, "epoch": 0.07428571428571429, "grad_norm": 0.24172417943995111, "kl": 0.001363515853881836, "learning_rate": 9.975348529157229e-07, "loss": 0.0995, "reward": 0.1283707581460476, "reward_std": 0.7667413726449013, "rewards/cosine_scaled_reward": -0.13373128045350313, "rewards/format_reward": 0.3958333507180214, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1729.6667175292969, "epoch": 0.07542857142857143, "grad_norm": 0.20090852438136195, "kl": 0.00067138671875, "learning_rate": 9.971955636222684e-07, "loss": 0.0209, "reward": -0.39017004892230034, "reward_std": 0.32542612217366695, "rewards/cosine_scaled_reward": -0.3200850263237953, "rewards/format_reward": 0.2500000149011612, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1648.7292175292969, "epoch": 0.07657142857142857, "grad_norm": 0.18795555019652113, "kl": 0.0007681846618652344, "learning_rate": 9.968344786479415e-07, "loss": 0.0342, "reward": -0.1792638599872589, "reward_std": 0.3578680492937565, "rewards/cosine_scaled_reward": -0.20421527326107025, "rewards/format_reward": 0.2291666679084301, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1388.5625610351562, "epoch": 0.07771428571428571, "grad_norm": 0.3904259482407812, "kl": 0.00202178955078125, "learning_rate": 9.964516155915151e-07, "loss": 0.0637, "reward": 0.16577239707112312, "reward_std": 0.3421984985470772, "rewards/cosine_scaled_reward": -0.09419714100658894, "rewards/format_reward": 0.3541666716337204, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1507.8333740234375, "epoch": 0.07885714285714286, "grad_norm": 0.2361059164440503, "kl": 0.0008258819580078125, "learning_rate": 9.960469931131936e-07, "loss": 0.0613, "reward": 0.17160904966294765, "reward_std": 0.38275655917823315, "rewards/cosine_scaled_reward": -0.10169548355042934, "rewards/format_reward": 0.37500000558793545, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1690.3750305175781, "epoch": 0.08, "grad_norm": 0.19302606573391104, "kl": 0.002358675003051758, "learning_rate": 9.956206309337066e-07, "loss": 0.105, "reward": -0.1555338129401207, "reward_std": 0.37855083122849464, "rewards/cosine_scaled_reward": -0.20276692137122154, "rewards/format_reward": 0.25000000186264515, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1441.729232788086, "epoch": 0.08114285714285714, "grad_norm": 0.331702227116139, "kl": 0.0023870468139648438, "learning_rate": 9.951725498333448e-07, "loss": 0.1388, "reward": -0.2453744667582214, "reward_std": 0.15839526243507862, "rewards/cosine_scaled_reward": -0.3101872429251671, "rewards/format_reward": 0.3750000149011612, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1497.3959045410156, "epoch": 0.08228571428571428, "grad_norm": 0.33894190686830156, "kl": 0.0017808079719543457, "learning_rate": 9.947027716509488e-07, "loss": 0.0553, "reward": 0.09824148565530777, "reward_std": 0.1729265321046114, "rewards/cosine_scaled_reward": -0.08629592880606651, "rewards/format_reward": 0.2708333358168602, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1444.7708892822266, "epoch": 0.08342857142857144, "grad_norm": 0.9254159035231885, "kl": 0.039752960205078125, "learning_rate": 9.942113192828444e-07, "loss": 0.1025, "reward": 0.47389062121510506, "reward_std": 0.7162522077560425, "rewards/cosine_scaled_reward": -0.05472135776653886, "rewards/format_reward": 0.5833333507180214, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1484.7083740234375, "epoch": 0.08457142857142858, "grad_norm": 0.2164345231616129, "kl": 0.0021944046020507812, "learning_rate": 9.93698216681727e-07, "loss": 0.0129, "reward": -0.06718481332063675, "reward_std": 0.16878989525139332, "rewards/cosine_scaled_reward": -0.22109240666031837, "rewards/format_reward": 0.3750000037252903, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1526.0417175292969, "epoch": 0.08571428571428572, "grad_norm": 0.3075410122107456, "kl": 0.00359344482421875, "learning_rate": 9.931634888554935e-07, "loss": 0.0753, "reward": 0.17093585059046745, "reward_std": 0.4688509330153465, "rewards/cosine_scaled_reward": -0.08119874075055122, "rewards/format_reward": 0.33333334513008595, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1640.4583740234375, "epoch": 0.08685714285714285, "grad_norm": 0.20492660661291412, "kl": 0.00046312808990478516, "learning_rate": 9.926071618660237e-07, "loss": 0.0184, "reward": 0.029385031666606665, "reward_std": 0.6126945875585079, "rewards/cosine_scaled_reward": -0.151974156498909, "rewards/format_reward": 0.33333333395421505, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1674.5625610351562, "epoch": 0.088, "grad_norm": 0.21980728108796918, "kl": 0.0009822845458984375, "learning_rate": 9.9202926282791e-07, "loss": -0.0002, "reward": -0.18806731700897217, "reward_std": 0.12730432488024235, "rewards/cosine_scaled_reward": -0.15653366968035698, "rewards/format_reward": 0.125, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1518.0625610351562, "epoch": 0.08914285714285715, "grad_norm": 0.242785552217566, "kl": 0.0009822845458984375, "learning_rate": 9.91429819907136e-07, "loss": 0.0619, "reward": 0.13657424598932266, "reward_std": 0.4360465779900551, "rewards/cosine_scaled_reward": -0.10879619419574738, "rewards/format_reward": 0.35416666977107525, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1575.4792175292969, "epoch": 0.09028571428571429, "grad_norm": 0.24080955526978698, "kl": 0.0005426406860351562, "learning_rate": 9.908088623197048e-07, "loss": 0.0519, "reward": 0.016203314065933228, "reward_std": 0.6479124575853348, "rewards/cosine_scaled_reward": -0.1585650178603828, "rewards/format_reward": 0.3333333395421505, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1733.9167175292969, "epoch": 0.09142857142857143, "grad_norm": 0.2186002750502081, "kl": 0.0005044937133789062, "learning_rate": 9.901664203302124e-07, "loss": 0.031, "reward": -0.5251612327992916, "reward_std": 0.40141166001558304, "rewards/cosine_scaled_reward": -0.33549728989601135, "rewards/format_reward": 0.1458333358168602, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1728.479248046875, "epoch": 0.09257142857142857, "grad_norm": 0.21399417944679958, "kl": 0.0009112358093261719, "learning_rate": 9.895025252503755e-07, "loss": 0.0374, "reward": -0.19506264757364988, "reward_std": 0.48094464652240276, "rewards/cosine_scaled_reward": -0.1912813438102603, "rewards/format_reward": 0.18750000558793545, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1601.9792175292969, "epoch": 0.09371428571428571, "grad_norm": 0.2450961734236274, "kl": 0.0009531974792480469, "learning_rate": 9.888172094375033e-07, "loss": 0.077, "reward": -0.1917775571346283, "reward_std": 0.5255400985479355, "rewards/cosine_scaled_reward": -0.25213877484202385, "rewards/format_reward": 0.31250001303851604, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1748.1875610351562, "epoch": 0.09485714285714286, "grad_norm": 0.22448680749018862, "kl": 0.0004420280456542969, "learning_rate": 9.881105062929221e-07, "loss": 0.0159, "reward": -0.43924427404999733, "reward_std": 0.2609596960246563, "rewards/cosine_scaled_reward": -0.27170546911656857, "rewards/format_reward": 0.10416666977107525, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1515.7708435058594, "epoch": 0.096, "grad_norm": 0.2231038243696207, "kl": 0.0006551742553710938, "learning_rate": 9.873824502603459e-07, "loss": 0.0246, "reward": 0.36620646342635155, "reward_std": 0.884237602353096, "rewards/cosine_scaled_reward": -0.06689677853137255, "rewards/format_reward": 0.5000000074505806, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1701.2083740234375, "epoch": 0.09714285714285714, "grad_norm": 0.20906161676384463, "kl": 0.000804901123046875, "learning_rate": 9.866330768241983e-07, "loss": 0.0555, "reward": -0.39954638853669167, "reward_std": 0.31576116755604744, "rewards/cosine_scaled_reward": -0.2726898640394211, "rewards/format_reward": 0.14583333395421505, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1610.9792175292969, "epoch": 0.09828571428571428, "grad_norm": 0.22100681278056383, "kl": 0.0009822845458984375, "learning_rate": 9.85862422507884e-07, "loss": 0.0444, "reward": -0.24343110900372267, "reward_std": 0.2885846998542547, "rewards/cosine_scaled_reward": -0.30921556800603867, "rewards/format_reward": 0.375, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1695.354248046875, "epoch": 0.09942857142857142, "grad_norm": 0.24683069440334848, "kl": 0.0029506683349609375, "learning_rate": 9.850705248720068e-07, "loss": 0.0377, "reward": -0.09222975745797157, "reward_std": 0.24668438732624054, "rewards/cosine_scaled_reward": -0.1502815391868353, "rewards/format_reward": 0.2083333432674408, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1594.9167175292969, "epoch": 0.10057142857142858, "grad_norm": 0.27215086328931853, "kl": 0.0016989707946777344, "learning_rate": 9.8425742251254e-07, "loss": 0.1075, "reward": 0.18186672404408455, "reward_std": 0.9013341814279556, "rewards/cosine_scaled_reward": -0.07573332265019417, "rewards/format_reward": 0.3333333432674408, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1738.7292175292969, "epoch": 0.10171428571428572, "grad_norm": 0.1946900134085172, "kl": 0.000820159912109375, "learning_rate": 9.83423155058946e-07, "loss": 0.0331, "reward": -0.28752805292606354, "reward_std": 0.4243736080825329, "rewards/cosine_scaled_reward": -0.22709737345576286, "rewards/format_reward": 0.16666667722165585, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1572.2916870117188, "epoch": 0.10285714285714286, "grad_norm": 0.20694868118264276, "kl": 0.0007328987121582031, "learning_rate": 9.825677631722435e-07, "loss": 0.0753, "reward": -0.08595774043351412, "reward_std": 0.5348180644214153, "rewards/cosine_scaled_reward": -0.18881220323964953, "rewards/format_reward": 0.2916666679084301, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1601.5625610351562, "epoch": 0.104, "grad_norm": 0.20840771038907893, "kl": 0.0007948875427246094, "learning_rate": 9.816912885430258e-07, "loss": 0.0808, "reward": -0.015035435557365417, "reward_std": 0.14022575318813324, "rewards/cosine_scaled_reward": -0.1429343856871128, "rewards/format_reward": 0.2708333432674408, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1498.2292175292969, "epoch": 0.10514285714285715, "grad_norm": 0.20771988001872319, "kl": 0.0009174346923828125, "learning_rate": 9.807937738894303e-07, "loss": 0.0994, "reward": 0.07728531863540411, "reward_std": 0.508693166077137, "rewards/cosine_scaled_reward": -0.1384406816214323, "rewards/format_reward": 0.35416666977107525, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1347.8125305175781, "epoch": 0.10628571428571429, "grad_norm": 0.27527082284418775, "kl": 0.0021848678588867188, "learning_rate": 9.798752629550546e-07, "loss": 0.0296, "reward": 0.30088429898023605, "reward_std": 0.5643313899636269, "rewards/cosine_scaled_reward": -0.10997452400624752, "rewards/format_reward": 0.5208333432674408, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 1733.2500610351562, "epoch": 0.10742857142857143, "grad_norm": 0.23935442867120157, "kl": 0.0012607574462890625, "learning_rate": 9.78935800506826e-07, "loss": 0.021, "reward": -0.34041892923414707, "reward_std": 0.2469240017235279, "rewards/cosine_scaled_reward": -0.26395946741104126, "rewards/format_reward": 0.18750000186264515, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1738.5625610351562, "epoch": 0.10857142857142857, "grad_norm": 0.21273217079983556, "kl": 0.0006814002990722656, "learning_rate": 9.779754323328192e-07, "loss": -0.0093, "reward": -0.5389137789607048, "reward_std": 0.17841140553355217, "rewards/cosine_scaled_reward": -0.3423735648393631, "rewards/format_reward": 0.14583333395421505, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1433.9375610351562, "epoch": 0.10971428571428571, "grad_norm": 0.30691056711732384, "kl": 0.002574920654296875, "learning_rate": 9.769942052400235e-07, "loss": 0.137, "reward": 0.296867486089468, "reward_std": 0.3943296894431114, "rewards/cosine_scaled_reward": -0.04948292672634125, "rewards/format_reward": 0.3958333432674408, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1567.2500610351562, "epoch": 0.11085714285714286, "grad_norm": 0.25051085956589897, "kl": 0.0013968944549560547, "learning_rate": 9.759921670520634e-07, "loss": 0.0267, "reward": -0.15386457741260529, "reward_std": 0.37108149379491806, "rewards/cosine_scaled_reward": -0.21234895661473274, "rewards/format_reward": 0.2708333432674408, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1406.0208740234375, "epoch": 0.112, "grad_norm": 0.366560785041491, "kl": 0.0012578964233398438, "learning_rate": 9.749693666068663e-07, "loss": 0.099, "reward": 0.3372333124279976, "reward_std": 0.3852754198014736, "rewards/cosine_scaled_reward": -0.12305000983178616, "rewards/format_reward": 0.5833333507180214, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1598.7917175292969, "epoch": 0.11314285714285714, "grad_norm": 0.2584279138871096, "kl": 0.0010881423950195312, "learning_rate": 9.739258537542835e-07, "loss": 0.0536, "reward": 0.1023973822593689, "reward_std": 0.4502338841557503, "rewards/cosine_scaled_reward": -0.1258846465498209, "rewards/format_reward": 0.3541666828095913, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1557.479248046875, "epoch": 0.11428571428571428, "grad_norm": 0.23713752727518134, "kl": 0.0009851455688476562, "learning_rate": 9.728616793536587e-07, "loss": 0.0694, "reward": -0.15063253417611122, "reward_std": 0.3854830376803875, "rewards/cosine_scaled_reward": -0.23156626150012016, "rewards/format_reward": 0.3125000111758709, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1387.4583435058594, "epoch": 0.11542857142857142, "grad_norm": 0.32157411791816565, "kl": 0.001094818115234375, "learning_rate": 9.717768952713511e-07, "loss": 0.1116, "reward": 0.07011325657367706, "reward_std": 0.3243808038532734, "rewards/cosine_scaled_reward": -0.19411004893481731, "rewards/format_reward": 0.4583333395421505, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1449.3750610351562, "epoch": 0.11657142857142858, "grad_norm": 0.2168599934302549, "kl": 0.0015411376953125, "learning_rate": 9.706715543782064e-07, "loss": 0.0577, "reward": -0.21096567437052727, "reward_std": 0.29599858447909355, "rewards/cosine_scaled_reward": -0.3138161562383175, "rewards/format_reward": 0.4166666865348816, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1715.166748046875, "epoch": 0.11771428571428572, "grad_norm": 0.21920178674297372, "kl": 0.0015869140625, "learning_rate": 9.695457105469804e-07, "loss": 0.0667, "reward": -0.18699942529201508, "reward_std": 0.5092732682824135, "rewards/cosine_scaled_reward": -0.22891639173030853, "rewards/format_reward": 0.2708333395421505, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1304.4583435058594, "epoch": 0.11885714285714286, "grad_norm": 0.22942484314958453, "kl": 0.0013804435729980469, "learning_rate": 9.683994186497132e-07, "loss": 0.0839, "reward": 0.5173723250627518, "reward_std": 0.5176322646439075, "rewards/cosine_scaled_reward": -0.001730518415570259, "rewards/format_reward": 0.5208333358168602, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1364.8333740234375, "epoch": 0.12, "grad_norm": 0.25403433256650454, "kl": 0.0016727447509765625, "learning_rate": 9.672327345550543e-07, "loss": 0.1156, "reward": 0.28816052433103323, "reward_std": 0.240465197712183, "rewards/cosine_scaled_reward": -0.1267530769109726, "rewards/format_reward": 0.541666679084301, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 1570.6667175292969, "epoch": 0.12114285714285715, "grad_norm": 0.2462172191203138, "kl": 0.0020122528076171875, "learning_rate": 9.66045715125541e-07, "loss": 0.0866, "reward": 0.34020555624738336, "reward_std": 0.7328735627233982, "rewards/cosine_scaled_reward": -0.038230573292821646, "rewards/format_reward": 0.41666666977107525, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1243.4583740234375, "epoch": 0.12228571428571429, "grad_norm": 0.22392855280151888, "kl": 0.001399993896484375, "learning_rate": 9.648384182148252e-07, "loss": 0.0861, "reward": 0.19801579043269157, "reward_std": 0.4772573560476303, "rewards/cosine_scaled_reward": -0.18224211037158966, "rewards/format_reward": 0.5625000149011612, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 1376.5625610351562, "epoch": 0.12342857142857143, "grad_norm": 0.2328882803373465, "kl": 0.0032482147216796875, "learning_rate": 9.636109026648554e-07, "loss": 0.0636, "reward": 0.6495321169495583, "reward_std": 0.5899618566036224, "rewards/cosine_scaled_reward": 0.06434935945435427, "rewards/format_reward": 0.5208333488553762, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1368.0625305175781, "epoch": 0.12457142857142857, "grad_norm": 0.3696050391986309, "kl": 0.0028667449951171875, "learning_rate": 9.623632283030077e-07, "loss": 0.1246, "reward": -0.031360091641545296, "reward_std": 0.4002140313386917, "rewards/cosine_scaled_reward": -0.2656800393015146, "rewards/format_reward": 0.5, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1444.6666870117188, "epoch": 0.12571428571428572, "grad_norm": 0.35213532577859125, "kl": 0.0029430389404296875, "learning_rate": 9.610954559391704e-07, "loss": 0.1339, "reward": 0.6942434869706631, "reward_std": 0.9198908805847168, "rewards/cosine_scaled_reward": 0.06587174534797668, "rewards/format_reward": 0.5625000149011612, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 1072.7708587646484, "epoch": 0.12685714285714286, "grad_norm": 0.2985726423715741, "kl": 0.001979827880859375, "learning_rate": 9.598076473627796e-07, "loss": 0.0476, "reward": 0.7408694333862513, "reward_std": 0.7333548963069916, "rewards/cosine_scaled_reward": -0.004565277136862278, "rewards/format_reward": 0.7500000149011612, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1633.4167175292969, "epoch": 0.128, "grad_norm": 0.22471101395696397, "kl": 0.00258636474609375, "learning_rate": 9.58499865339809e-07, "loss": 0.0346, "reward": -0.05079384706914425, "reward_std": 0.4366183038800955, "rewards/cosine_scaled_reward": -0.2337302602827549, "rewards/format_reward": 0.4166666865348816, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 1319.7291870117188, "epoch": 0.12914285714285714, "grad_norm": 0.27063696127291986, "kl": 0.0033721923828125, "learning_rate": 9.571721736097088e-07, "loss": 0.0833, "reward": 0.6321319434791803, "reward_std": 0.5336715504527092, "rewards/cosine_scaled_reward": -0.006850697100162506, "rewards/format_reward": 0.6458333507180214, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 1052.3958892822266, "epoch": 0.13028571428571428, "grad_norm": 0.250125198289797, "kl": 0.0016460418701171875, "learning_rate": 9.55824636882301e-07, "loss": 0.0768, "reward": 0.653087726328522, "reward_std": 0.35864404030144215, "rewards/cosine_scaled_reward": -0.017206139862537384, "rewards/format_reward": 0.6875000149011612, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1440.2083740234375, "epoch": 0.13142857142857142, "grad_norm": 0.29266585256345196, "kl": 0.0030155181884765625, "learning_rate": 9.54457320834625e-07, "loss": 0.0755, "reward": 0.21958831325173378, "reward_std": 0.704796127974987, "rewards/cosine_scaled_reward": -0.16103917988948524, "rewards/format_reward": 0.5416666865348816, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 1472.5416870117188, "epoch": 0.13257142857142856, "grad_norm": 0.26433038131357134, "kl": 0.003147125244140625, "learning_rate": 9.530702921077358e-07, "loss": 0.073, "reward": 0.018861573189496994, "reward_std": 0.3587416708469391, "rewards/cosine_scaled_reward": -0.18848587945103645, "rewards/format_reward": 0.39583334140479565, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1545.9375305175781, "epoch": 0.1337142857142857, "grad_norm": 0.21836493727001577, "kl": 0.002864837646484375, "learning_rate": 9.516636183034564e-07, "loss": 0.1366, "reward": -0.32600877061486244, "reward_std": 0.43822694569826126, "rewards/cosine_scaled_reward": -0.35050439089536667, "rewards/format_reward": 0.3750000074505806, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 1334.1250305175781, "epoch": 0.13485714285714287, "grad_norm": 0.24394321780710398, "kl": 0.0026493072509765625, "learning_rate": 9.502373679810839e-07, "loss": 0.035, "reward": 0.457018606364727, "reward_std": 0.5285698734223843, "rewards/cosine_scaled_reward": -0.09440736100077629, "rewards/format_reward": 0.645833358168602, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1220.3333740234375, "epoch": 0.136, "grad_norm": 0.28272459137828676, "kl": 0.0042877197265625, "learning_rate": 9.487916106540465e-07, "loss": 0.0804, "reward": 0.3442453145980835, "reward_std": 0.564174473285675, "rewards/cosine_scaled_reward": -0.12996070086956024, "rewards/format_reward": 0.6041666716337204, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 1528.6042175292969, "epoch": 0.13714285714285715, "grad_norm": 0.2668307726658885, "kl": 0.00232696533203125, "learning_rate": 9.473264167865171e-07, "loss": 0.1032, "reward": -0.03986197151243687, "reward_std": 0.37811761628836393, "rewards/cosine_scaled_reward": -0.2282643192447722, "rewards/format_reward": 0.4166666716337204, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 1584.8125610351562, "epoch": 0.1382857142857143, "grad_norm": 0.22786468100552407, "kl": 0.002208709716796875, "learning_rate": 9.458418577899774e-07, "loss": 0.0046, "reward": 0.16309459879994392, "reward_std": 0.2453223168849945, "rewards/cosine_scaled_reward": -0.1372026912868023, "rewards/format_reward": 0.4375000074505806, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 1462.6875610351562, "epoch": 0.13942857142857143, "grad_norm": 0.28816738889821486, "kl": 0.00514984130859375, "learning_rate": 9.443380060197385e-07, "loss": 0.0974, "reward": -0.12114270869642496, "reward_std": 0.2534109205007553, "rewards/cosine_scaled_reward": -0.2689046934247017, "rewards/format_reward": 0.41666666977107525, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 1375.7708740234375, "epoch": 0.14057142857142857, "grad_norm": 0.32101258824146217, "kl": 0.0041351318359375, "learning_rate": 9.428149347714143e-07, "loss": 0.1284, "reward": 0.18988706171512604, "reward_std": 0.8535008877515793, "rewards/cosine_scaled_reward": -0.17588980495929718, "rewards/format_reward": 0.541666679084301, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 1374.7292175292969, "epoch": 0.1417142857142857, "grad_norm": 0.2425349865258595, "kl": 0.00324249267578125, "learning_rate": 9.412727182773486e-07, "loss": 0.0382, "reward": 0.07038946449756622, "reward_std": 0.49846766516566277, "rewards/cosine_scaled_reward": -0.15230527985841036, "rewards/format_reward": 0.375, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 1664.8541870117188, "epoch": 0.14285714285714285, "grad_norm": 0.2457250240943947, "kl": 0.0025730133056640625, "learning_rate": 9.397114317029974e-07, "loss": 0.0291, "reward": 0.004289238480851054, "reward_std": 0.32331261597573757, "rewards/cosine_scaled_reward": -0.15410537272691727, "rewards/format_reward": 0.31250000186264515, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 1422.3750305175781, "epoch": 0.144, "grad_norm": 0.32285843347583837, "kl": 0.005126953125, "learning_rate": 9.381311511432658e-07, "loss": 0.0961, "reward": 0.19516459852457047, "reward_std": 0.6147220581769943, "rewards/cosine_scaled_reward": -0.162834367249161, "rewards/format_reward": 0.5208333507180214, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 1370.7708435058594, "epoch": 0.14514285714285713, "grad_norm": 0.24341515642410516, "kl": 0.0030078887939453125, "learning_rate": 9.36531953618799e-07, "loss": 0.0726, "reward": -0.08839717879891396, "reward_std": 0.4017263073474169, "rewards/cosine_scaled_reward": -0.2941986061632633, "rewards/format_reward": 0.5000000149011612, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 1219.9167175292969, "epoch": 0.1462857142857143, "grad_norm": 0.2623858416818109, "kl": 0.004070281982421875, "learning_rate": 9.34913917072228e-07, "loss": 0.0537, "reward": 0.43044765666127205, "reward_std": 0.49690980464220047, "rewards/cosine_scaled_reward": -0.15977618098258972, "rewards/format_reward": 0.7500000149011612, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1109.1875305175781, "epoch": 0.14742857142857144, "grad_norm": 0.2829401049059584, "kl": 0.00757598876953125, "learning_rate": 9.332771203643714e-07, "loss": 0.0692, "reward": 0.6423492059111595, "reward_std": 0.4438105970621109, "rewards/cosine_scaled_reward": -0.03299206681549549, "rewards/format_reward": 0.7083333358168602, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 1495.8333740234375, "epoch": 0.14857142857142858, "grad_norm": 0.23104014201895975, "kl": 0.00299835205078125, "learning_rate": 9.316216432703916e-07, "loss": 0.0064, "reward": -0.09923176001757383, "reward_std": 0.43960002437233925, "rewards/cosine_scaled_reward": -0.29961589351296425, "rewards/format_reward": 0.5000000149011612, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1543.2500305175781, "epoch": 0.14971428571428572, "grad_norm": 0.22132261730116032, "kl": 0.0033931732177734375, "learning_rate": 9.299475664759068e-07, "loss": 0.1051, "reward": -0.012558471411466599, "reward_std": 0.5053001046180725, "rewards/cosine_scaled_reward": -0.24586258456110954, "rewards/format_reward": 0.47916667722165585, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1477.0625, "epoch": 0.15085714285714286, "grad_norm": 0.2442588816427236, "kl": 0.004528045654296875, "learning_rate": 9.282549715730579e-07, "loss": 0.0768, "reward": -0.11025669425725937, "reward_std": 0.18197684548795223, "rewards/cosine_scaled_reward": -0.284295029938221, "rewards/format_reward": 0.4583333432674408, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1563.3958740234375, "epoch": 0.152, "grad_norm": 0.21023108591248665, "kl": 0.00415802001953125, "learning_rate": 9.265439410565328e-07, "loss": 0.0672, "reward": 0.13176406361162663, "reward_std": 0.5022407323122025, "rewards/cosine_scaled_reward": -0.18411797285079956, "rewards/format_reward": 0.5000000149011612, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 1049.1042022705078, "epoch": 0.15314285714285714, "grad_norm": 0.3838039161390532, "kl": 0.00562286376953125, "learning_rate": 9.248145583195447e-07, "loss": 0.1973, "reward": 0.4749515192816034, "reward_std": 0.3580738380551338, "rewards/cosine_scaled_reward": -0.15835759788751602, "rewards/format_reward": 0.7916666865348816, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 1351.5416870117188, "epoch": 0.15428571428571428, "grad_norm": 0.34500799880157473, "kl": 0.00400543212890625, "learning_rate": 9.230669076497687e-07, "loss": 0.143, "reward": 0.2647483544424176, "reward_std": 0.5427017770707607, "rewards/cosine_scaled_reward": -0.11762583442032337, "rewards/format_reward": 0.5000000204890966, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 1168.0625305175781, "epoch": 0.15542857142857142, "grad_norm": 0.31218899888892226, "kl": 0.004955291748046875, "learning_rate": 9.213010742252327e-07, "loss": 0.0562, "reward": 0.3584494572132826, "reward_std": 0.5529016815125942, "rewards/cosine_scaled_reward": -0.17494194395840168, "rewards/format_reward": 0.7083333432674408, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 1282.0416870117188, "epoch": 0.15657142857142858, "grad_norm": 0.2721613126225875, "kl": 0.007869720458984375, "learning_rate": 9.195171441101668e-07, "loss": 0.1358, "reward": 0.2924184873700142, "reward_std": 0.5777250528335571, "rewards/cosine_scaled_reward": -0.16629073955118656, "rewards/format_reward": 0.6250000055879354, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 1041.8958740234375, "epoch": 0.15771428571428572, "grad_norm": 0.30890354701331296, "kl": 0.00521087646484375, "learning_rate": 9.177152042508077e-07, "loss": 0.0338, "reward": 0.860385000705719, "reward_std": 0.8024220168590546, "rewards/cosine_scaled_reward": 0.023942476138472557, "rewards/format_reward": 0.8125000149011612, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 1192.8125457763672, "epoch": 0.15885714285714286, "grad_norm": 0.2622844914783918, "kl": 0.00412750244140625, "learning_rate": 9.158953424711624e-07, "loss": 0.0825, "reward": 0.5425689108669758, "reward_std": 0.5253265127539635, "rewards/cosine_scaled_reward": -0.12454888969659805, "rewards/format_reward": 0.7916666865348816, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1283.4375305175781, "epoch": 0.16, "grad_norm": 0.24897560413424463, "kl": 0.004375457763671875, "learning_rate": 9.140576474687263e-07, "loss": 0.1075, "reward": 0.3927510902285576, "reward_std": 0.43108681961894035, "rewards/cosine_scaled_reward": -0.1577911265194416, "rewards/format_reward": 0.7083333507180214, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 1262.7916717529297, "epoch": 0.16114285714285714, "grad_norm": 0.3772239136734691, "kl": 0.005344390869140625, "learning_rate": 9.122022088101613e-07, "loss": 0.1713, "reward": 0.37745123356580734, "reward_std": 0.5623941943049431, "rewards/cosine_scaled_reward": -0.13419108092784882, "rewards/format_reward": 0.6458333432674408, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1246.6042022705078, "epoch": 0.16228571428571428, "grad_norm": 0.28965855619789826, "kl": 0.0045166015625, "learning_rate": 9.103291169269299e-07, "loss": 0.0725, "reward": 0.5083264335989952, "reward_std": 0.5853047892451286, "rewards/cosine_scaled_reward": -0.047920111566782, "rewards/format_reward": 0.6041666828095913, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 1448.9375610351562, "epoch": 0.16342857142857142, "grad_norm": 0.2549900151123108, "kl": 0.006290435791015625, "learning_rate": 9.084384631108882e-07, "loss": 0.1142, "reward": 0.13985165720805526, "reward_std": 0.2659877985715866, "rewards/cosine_scaled_reward": -0.20090750604867935, "rewards/format_reward": 0.5416666865348816, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 1203.2083740234375, "epoch": 0.16457142857142856, "grad_norm": 0.2224971436424363, "kl": 0.005550384521484375, "learning_rate": 9.065303395098358e-07, "loss": 0.085, "reward": 0.5334329381585121, "reward_std": 0.5584629252552986, "rewards/cosine_scaled_reward": -0.10828354395925999, "rewards/format_reward": 0.75, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1086.9792175292969, "epoch": 0.1657142857142857, "grad_norm": 0.3813865927902241, "kl": 0.0063629150390625, "learning_rate": 9.046048391230247e-07, "loss": 0.1879, "reward": 0.2875216994434595, "reward_std": 0.5303685888648033, "rewards/cosine_scaled_reward": -0.23123916238546371, "rewards/format_reward": 0.7500000149011612, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 1285.5625610351562, "epoch": 0.16685714285714287, "grad_norm": 0.2972743546494519, "kl": 0.0059814453125, "learning_rate": 9.026620557966279e-07, "loss": 0.0594, "reward": 0.2565866466611624, "reward_std": 0.46598899737000465, "rewards/cosine_scaled_reward": -0.25712333619594574, "rewards/format_reward": 0.770833358168602, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 896.6250305175781, "epoch": 0.168, "grad_norm": 0.32647401434979056, "kl": 0.006877899169921875, "learning_rate": 9.007020842191634e-07, "loss": -0.0011, "reward": 1.0985181145370007, "reward_std": 0.5338096916675568, "rewards/cosine_scaled_reward": 0.05967570189386606, "rewards/format_reward": 0.9791666716337204, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 1160.9583435058594, "epoch": 0.16914285714285715, "grad_norm": 0.2816274273158885, "kl": 0.00585174560546875, "learning_rate": 8.987250199168808e-07, "loss": 0.0442, "reward": 0.18387611024081707, "reward_std": 0.2959946282207966, "rewards/cosine_scaled_reward": -0.3143119588494301, "rewards/format_reward": 0.8125000149011612, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 1223.9791870117188, "epoch": 0.1702857142857143, "grad_norm": 0.2823488259457116, "kl": 0.00612640380859375, "learning_rate": 8.967309592491052e-07, "loss": 0.0654, "reward": 0.47756416723132133, "reward_std": 0.7413289695978165, "rewards/cosine_scaled_reward": -0.11538459919393063, "rewards/format_reward": 0.7083333432674408, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 986.6042022705078, "epoch": 0.17142857142857143, "grad_norm": 0.318064277562745, "kl": 0.0080413818359375, "learning_rate": 8.9471999940354e-07, "loss": 0.1332, "reward": 0.401881605386734, "reward_std": 0.6674076840281487, "rewards/cosine_scaled_reward": -0.20530920289456844, "rewards/format_reward": 0.8125000298023224, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1111.4375610351562, "epoch": 0.17257142857142857, "grad_norm": 0.23750964874516883, "kl": 0.005405426025390625, "learning_rate": 8.926922383915315e-07, "loss": 0.0547, "reward": 0.42157261446118355, "reward_std": 0.2637167125940323, "rewards/cosine_scaled_reward": -0.14338038116693497, "rewards/format_reward": 0.7083333432674408, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 1390.2292175292969, "epoch": 0.1737142857142857, "grad_norm": 0.3108688575018839, "kl": 0.008697509765625, "learning_rate": 8.906477750432903e-07, "loss": 0.1077, "reward": 0.11867762915790081, "reward_std": 0.5801703371107578, "rewards/cosine_scaled_reward": -0.2739945203065872, "rewards/format_reward": 0.6666666716337204, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 1189.4792022705078, "epoch": 0.17485714285714285, "grad_norm": 0.22859697466435477, "kl": 0.006011962890625, "learning_rate": 8.88586709003076e-07, "loss": 0.0402, "reward": 0.46854234486818314, "reward_std": 0.5257667489349842, "rewards/cosine_scaled_reward": -0.07822884852066636, "rewards/format_reward": 0.6250000149011612, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 1227.7292175292969, "epoch": 0.176, "grad_norm": 0.23458511838935112, "kl": 0.0063323974609375, "learning_rate": 8.865091407243394e-07, "loss": 0.129, "reward": 0.7308447554241866, "reward_std": 0.4724605940282345, "rewards/cosine_scaled_reward": 0.011255700141191483, "rewards/format_reward": 0.7083333432674408, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 1320.6666870117188, "epoch": 0.17714285714285713, "grad_norm": 0.29059316598505575, "kl": 0.007198333740234375, "learning_rate": 8.844151714648274e-07, "loss": 0.1327, "reward": -0.1417454145848751, "reward_std": 0.3702365458011627, "rewards/cosine_scaled_reward": -0.3521227166056633, "rewards/format_reward": 0.5625000055879354, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 1116.000015258789, "epoch": 0.1782857142857143, "grad_norm": 0.37926874198201693, "kl": 0.00821685791015625, "learning_rate": 8.823049032816478e-07, "loss": 0.2189, "reward": 0.15536441165022552, "reward_std": 0.2769140414893627, "rewards/cosine_scaled_reward": -0.2869011387228966, "rewards/format_reward": 0.729166679084301, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 1049.5000305175781, "epoch": 0.17942857142857144, "grad_norm": 0.3728612044799926, "kl": 0.02156829833984375, "learning_rate": 8.801784390262943e-07, "loss": 0.0389, "reward": 0.7612650550436229, "reward_std": 0.31401624344289303, "rewards/cosine_scaled_reward": 0.005632489919662476, "rewards/format_reward": 0.75, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1386.0833740234375, "epoch": 0.18057142857142858, "grad_norm": 0.3889317879381384, "kl": 0.00850677490234375, "learning_rate": 8.780358823396352e-07, "loss": 0.1308, "reward": 0.06261628679931164, "reward_std": 0.3530626520514488, "rewards/cosine_scaled_reward": -0.3124418593943119, "rewards/format_reward": 0.6875000298023224, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1129.7708740234375, "epoch": 0.18171428571428572, "grad_norm": 0.3220977926530576, "kl": 0.00748443603515625, "learning_rate": 8.758773376468604e-07, "loss": 0.1262, "reward": 0.6195714063942432, "reward_std": 0.6696993261575699, "rewards/cosine_scaled_reward": -0.08604763355106115, "rewards/format_reward": 0.7916666865348816, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 1160.4167175292969, "epoch": 0.18285714285714286, "grad_norm": 0.2845576646765754, "kl": 0.0069122314453125, "learning_rate": 8.737029101523929e-07, "loss": 0.1064, "reward": 0.6454856535419822, "reward_std": 0.8377318382263184, "rewards/cosine_scaled_reward": -0.08350718393921852, "rewards/format_reward": 0.8125000149011612, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 1429.4792175292969, "epoch": 0.184, "grad_norm": 0.25219633802451885, "kl": 0.00858306884765625, "learning_rate": 8.715127058347614e-07, "loss": 0.0965, "reward": 0.009432412683963776, "reward_std": 0.3042390923947096, "rewards/cosine_scaled_reward": -0.27653381787240505, "rewards/format_reward": 0.5625000149011612, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 1226.7917175292969, "epoch": 0.18514285714285714, "grad_norm": 0.26347106975524837, "kl": 0.0080108642578125, "learning_rate": 8.693068314414344e-07, "loss": 0.077, "reward": 0.24512136541306973, "reward_std": 0.43705228716135025, "rewards/cosine_scaled_reward": -0.29410600662231445, "rewards/format_reward": 0.8333333432674408, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 926.1250305175781, "epoch": 0.18628571428571428, "grad_norm": 0.41739039022115654, "kl": 0.0112457275390625, "learning_rate": 8.670853944836176e-07, "loss": 0.1827, "reward": 0.7628292813897133, "reward_std": 0.8151352852582932, "rewards/cosine_scaled_reward": -0.04566871002316475, "rewards/format_reward": 0.8541666716337204, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 946.125, "epoch": 0.18742857142857142, "grad_norm": 0.36967841595429546, "kl": 0.01031494140625, "learning_rate": 8.648485032310144e-07, "loss": 0.1834, "reward": 0.6057916302233934, "reward_std": 0.48515384271740913, "rewards/cosine_scaled_reward": -0.10335419327020645, "rewards/format_reward": 0.8125000298023224, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1174.4792175292969, "epoch": 0.18857142857142858, "grad_norm": 0.2708332867444507, "kl": 0.00788116455078125, "learning_rate": 8.625962667065487e-07, "loss": 0.0394, "reward": 0.32264771312475204, "reward_std": 0.4833778813481331, "rewards/cosine_scaled_reward": -0.24492615275084972, "rewards/format_reward": 0.8125000149011612, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 907.0833740234375, "epoch": 0.18971428571428572, "grad_norm": 0.3261002575687217, "kl": 0.00717926025390625, "learning_rate": 8.603287946810513e-07, "loss": 0.1283, "reward": 0.6173169314861298, "reward_std": 0.2740478292107582, "rewards/cosine_scaled_reward": -0.1705082282423973, "rewards/format_reward": 0.9583333432674408, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1270.0417175292969, "epoch": 0.19085714285714286, "grad_norm": 0.3129560409827591, "kl": 0.00971221923828125, "learning_rate": 8.580461976679099e-07, "loss": 0.1093, "reward": 0.3518100567162037, "reward_std": 0.5595069229602814, "rewards/cosine_scaled_reward": -0.1470116525888443, "rewards/format_reward": 0.645833358168602, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1077.1041870117188, "epoch": 0.192, "grad_norm": 0.26915582005747507, "kl": 0.0078125, "learning_rate": 8.557485869176825e-07, "loss": 0.1617, "reward": 0.2642595246434212, "reward_std": 0.46994560211896896, "rewards/cosine_scaled_reward": -0.26370356790721416, "rewards/format_reward": 0.7916666865348816, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1156.2500305175781, "epoch": 0.19314285714285714, "grad_norm": 0.35785552736378773, "kl": 0.0098724365234375, "learning_rate": 8.534360744126753e-07, "loss": 0.0922, "reward": 0.77548947930336, "reward_std": 0.7726699560880661, "rewards/cosine_scaled_reward": 0.0023280568420886993, "rewards/format_reward": 0.770833358168602, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 1073.7292175292969, "epoch": 0.19428571428571428, "grad_norm": 0.32755955253118335, "kl": 0.0117034912109375, "learning_rate": 8.511087728614862e-07, "loss": 0.0752, "reward": 0.19202834740281105, "reward_std": 0.3850276917219162, "rewards/cosine_scaled_reward": -0.3206525072455406, "rewards/format_reward": 0.8333333432674408, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 918.8333511352539, "epoch": 0.19542857142857142, "grad_norm": 0.3616914993674, "kl": 0.00833892822265625, "learning_rate": 8.487667956935087e-07, "loss": 0.0904, "reward": 0.5478162653744221, "reward_std": 0.6629246398806572, "rewards/cosine_scaled_reward": -0.1948418878018856, "rewards/format_reward": 0.9375, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1036.1458740234375, "epoch": 0.19657142857142856, "grad_norm": 0.3354400822116869, "kl": 0.0130157470703125, "learning_rate": 8.464102570534061e-07, "loss": 0.0669, "reward": 0.7608658275566995, "reward_std": 0.6014236621558666, "rewards/cosine_scaled_reward": -0.04665040969848633, "rewards/format_reward": 0.8541666865348816, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1106.4583740234375, "epoch": 0.1977142857142857, "grad_norm": 0.3236947350770136, "kl": 0.0121307373046875, "learning_rate": 8.440392717955475e-07, "loss": 0.093, "reward": 0.7088564559817314, "reward_std": 0.4235651511698961, "rewards/cosine_scaled_reward": -0.010155089199543, "rewards/format_reward": 0.7291666865348816, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1109.5000305175781, "epoch": 0.19885714285714284, "grad_norm": 0.37244639543702895, "kl": 0.015838623046875, "learning_rate": 8.416539554784089e-07, "loss": 0.1098, "reward": 0.17886048182845116, "reward_std": 0.35543810576200485, "rewards/cosine_scaled_reward": -0.30640310421586037, "rewards/format_reward": 0.7916666865348816, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 972.9167022705078, "epoch": 0.2, "grad_norm": 0.6554774460546362, "kl": 0.0153045654296875, "learning_rate": 8.392544243589427e-07, "loss": 0.2068, "reward": 0.607050247490406, "reward_std": 0.4396999180316925, "rewards/cosine_scaled_reward": -0.14439154416322708, "rewards/format_reward": 0.895833358168602, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 998.2291870117188, "epoch": 0.20114285714285715, "grad_norm": 0.28748166515655293, "kl": 0.0133819580078125, "learning_rate": 8.368407953869103e-07, "loss": 0.0371, "reward": 0.486224377527833, "reward_std": 0.6124172061681747, "rewards/cosine_scaled_reward": -0.17355448007583618, "rewards/format_reward": 0.8333333432674408, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 916.2291870117188, "epoch": 0.2022857142857143, "grad_norm": 0.4438177799902679, "kl": 0.0131378173828125, "learning_rate": 8.344131861991828e-07, "loss": 0.1487, "reward": 0.8074519336223602, "reward_std": 0.4988584369421005, "rewards/cosine_scaled_reward": -0.05460738018155098, "rewards/format_reward": 0.9166666865348816, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 822.8333740234375, "epoch": 0.20342857142857143, "grad_norm": 0.5173286289503403, "kl": 0.0179443359375, "learning_rate": 8.319717151140072e-07, "loss": 0.1961, "reward": 1.0362385213375092, "reward_std": 0.5397170335054398, "rewards/cosine_scaled_reward": 0.13270257785916328, "rewards/format_reward": 0.770833358168602, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 959.9792022705078, "epoch": 0.20457142857142857, "grad_norm": 0.369107073779179, "kl": 0.016815185546875, "learning_rate": 8.295165011252396e-07, "loss": 0.1417, "reward": 0.6556574255228043, "reward_std": 0.4815560430288315, "rewards/cosine_scaled_reward": -0.10967130470089614, "rewards/format_reward": 0.8750000149011612, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1162.7292022705078, "epoch": 0.2057142857142857, "grad_norm": 0.5036563993456736, "kl": 0.01904296875, "learning_rate": 8.270476638965461e-07, "loss": 0.0949, "reward": 0.2779462654143572, "reward_std": 0.4615231901407242, "rewards/cosine_scaled_reward": -0.24644354078918695, "rewards/format_reward": 0.7708333432674408, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1078.8333435058594, "epoch": 0.20685714285714285, "grad_norm": 0.4317948665990577, "kl": 0.0135955810546875, "learning_rate": 8.245653237555705e-07, "loss": 0.1473, "reward": 0.6264736168086529, "reward_std": 0.5298948585987091, "rewards/cosine_scaled_reward": -0.10342983156442642, "rewards/format_reward": 0.8333333432674408, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1065.8125305175781, "epoch": 0.208, "grad_norm": 0.5168299485262725, "kl": 0.02105712890625, "learning_rate": 8.220696016880687e-07, "loss": 0.1884, "reward": 0.3882112614810467, "reward_std": 0.5859006345272064, "rewards/cosine_scaled_reward": -0.2017277143895626, "rewards/format_reward": 0.7916666865348816, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1069.4583740234375, "epoch": 0.20914285714285713, "grad_norm": 0.5024855038579699, "kl": 0.0205078125, "learning_rate": 8.195606193320136e-07, "loss": 0.1323, "reward": 0.24412129819393158, "reward_std": 0.47408775985240936, "rewards/cosine_scaled_reward": -0.2529393620789051, "rewards/format_reward": 0.7500000149011612, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 936.4792022705078, "epoch": 0.2102857142857143, "grad_norm": 0.4981488833418968, "kl": 0.017730712890625, "learning_rate": 8.170384989716657e-07, "loss": 0.137, "reward": 0.6930912919342518, "reward_std": 0.5617035925388336, "rewards/cosine_scaled_reward": -0.03887102263979614, "rewards/format_reward": 0.7708333432674408, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1030.4166717529297, "epoch": 0.21142857142857144, "grad_norm": 0.4904295101939947, "kl": 0.0301513671875, "learning_rate": 8.145033635316128e-07, "loss": 0.1667, "reward": 0.07037857547402382, "reward_std": 0.27715054154396057, "rewards/cosine_scaled_reward": -0.33981072157621384, "rewards/format_reward": 0.7500000298023224, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1297.3750457763672, "epoch": 0.21257142857142858, "grad_norm": 0.359329704495533, "kl": 0.02447509765625, "learning_rate": 8.119553365707802e-07, "loss": 0.0722, "reward": 0.27740756422281265, "reward_std": 0.35020239651203156, "rewards/cosine_scaled_reward": -0.20504622161388397, "rewards/format_reward": 0.6875000149011612, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1278.2291870117188, "epoch": 0.21371428571428572, "grad_norm": 0.6229091446373484, "kl": 0.037841796875, "learning_rate": 8.093945422764069e-07, "loss": 0.159, "reward": 0.6862413678318262, "reward_std": 0.806188240647316, "rewards/cosine_scaled_reward": -0.011045984923839569, "rewards/format_reward": 0.7083333507180214, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 828.7083587646484, "epoch": 0.21485714285714286, "grad_norm": 0.8396211982213951, "kl": 0.029296875, "learning_rate": 8.068211054579943e-07, "loss": 0.1705, "reward": 0.5941705331206322, "reward_std": 0.6708386167883873, "rewards/cosine_scaled_reward": -0.12999806739389896, "rewards/format_reward": 0.8541666865348816, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1126.9583740234375, "epoch": 0.216, "grad_norm": 1.0692586435721545, "kl": 0.05059814453125, "learning_rate": 8.04235151541222e-07, "loss": 0.2306, "reward": 0.3716874085366726, "reward_std": 0.6852569133043289, "rewards/cosine_scaled_reward": -0.17873962549492717, "rewards/format_reward": 0.7291666939854622, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1441.7292175292969, "epoch": 0.21714285714285714, "grad_norm": 0.4556901372243305, "kl": 0.0775146484375, "learning_rate": 8.01636806561836e-07, "loss": 0.0641, "reward": -0.02832420915365219, "reward_std": 0.41898399591445923, "rewards/cosine_scaled_reward": -0.21207877062261105, "rewards/format_reward": 0.3958333432674408, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1079.3541870117188, "epoch": 0.21828571428571428, "grad_norm": 0.7752155732582218, "kl": 0.05340576171875, "learning_rate": 7.990261971595048e-07, "loss": 0.1862, "reward": 0.4970630258321762, "reward_std": 0.6355597376823425, "rewards/cosine_scaled_reward": -0.1264684833586216, "rewards/format_reward": 0.7500000149011612, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1067.5417175292969, "epoch": 0.21942857142857142, "grad_norm": 0.9433921479755671, "kl": 0.0628662109375, "learning_rate": 7.964034505716476e-07, "loss": 0.1345, "reward": 0.34896004013717175, "reward_std": 0.44530968368053436, "rewards/cosine_scaled_reward": -0.19010332133620977, "rewards/format_reward": 0.7291666865348816, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1153.1458892822266, "epoch": 0.22057142857142858, "grad_norm": 0.557299045473737, "kl": 0.07440185546875, "learning_rate": 7.93768694627233e-07, "loss": 0.0623, "reward": 0.3937496952712536, "reward_std": 0.4528709352016449, "rewards/cosine_scaled_reward": -0.14687515422701836, "rewards/format_reward": 0.6875000149011612, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 772.7708435058594, "epoch": 0.22171428571428572, "grad_norm": 1.0195572615380695, "kl": 0.03753662109375, "learning_rate": 7.911220577405484e-07, "loss": 0.1994, "reward": 1.379511073231697, "reward_std": 0.604660227894783, "rewards/cosine_scaled_reward": 0.23142218962311745, "rewards/format_reward": 0.9166666865348816, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1043.2708740234375, "epoch": 0.22285714285714286, "grad_norm": 0.9603645520119819, "kl": 0.057830810546875, "learning_rate": 7.884636689049422e-07, "loss": 0.101, "reward": 0.9527463093400002, "reward_std": 0.651703879237175, "rewards/cosine_scaled_reward": 0.12220647558569908, "rewards/format_reward": 0.7083333432674408, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1171.8125305175781, "epoch": 0.224, "grad_norm": 1.0759043540199384, "kl": 0.094970703125, "learning_rate": 7.857936576865356e-07, "loss": 0.0986, "reward": 0.22757766395807266, "reward_std": 0.5421559736132622, "rewards/cosine_scaled_reward": -0.14662783965468407, "rewards/format_reward": 0.5208333488553762, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1254.6458892822266, "epoch": 0.22514285714285714, "grad_norm": 1.2281398548522355, "kl": 0.1163330078125, "learning_rate": 7.831121542179086e-07, "loss": 0.2334, "reward": 0.1120694987475872, "reward_std": 0.406834427267313, "rewards/cosine_scaled_reward": -0.21479860320687294, "rewards/format_reward": 0.541666679084301, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1551.9792175292969, "epoch": 0.22628571428571428, "grad_norm": 1.2807709220712407, "kl": 0.1573486328125, "learning_rate": 7.804192891917571e-07, "loss": 0.1642, "reward": 0.1520095318555832, "reward_std": 0.5469059012830257, "rewards/cosine_scaled_reward": -0.16357857827097178, "rewards/format_reward": 0.4791666716337204, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1243.7500457763672, "epoch": 0.22742857142857142, "grad_norm": 1.2387930807523095, "kl": 0.1546630859375, "learning_rate": 7.777151938545235e-07, "loss": 0.0664, "reward": 0.5908387266099453, "reward_std": 0.44286736100912094, "rewards/cosine_scaled_reward": 0.014169345609843731, "rewards/format_reward": 0.5625000149011612, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 998.3542022705078, "epoch": 0.22857142857142856, "grad_norm": 1.6258231243608119, "kl": 0.146240234375, "learning_rate": 7.75e-07, "loss": 0.223, "reward": 0.9689896870404482, "reward_std": 0.6490836925804615, "rewards/cosine_scaled_reward": 0.10949480719864368, "rewards/format_reward": 0.7500000298023224, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1197.5208587646484, "epoch": 0.2297142857142857, "grad_norm": 1.2117522808382983, "kl": 0.15203857421875, "learning_rate": 7.72273839962904e-07, "loss": 0.1108, "reward": 0.29535099118947983, "reward_std": 0.6659888252615929, "rewards/cosine_scaled_reward": -0.18565785279497504, "rewards/format_reward": 0.6666666865348816, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1035.2292022705078, "epoch": 0.23085714285714284, "grad_norm": 2.430024645446878, "kl": 0.1729736328125, "learning_rate": 7.695368466124296e-07, "loss": 0.1341, "reward": 0.4362456016242504, "reward_std": 0.665816992521286, "rewards/cosine_scaled_reward": -0.13604386523365974, "rewards/format_reward": 0.7083333432674408, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1285.2083587646484, "epoch": 0.232, "grad_norm": 3.5252314114631926, "kl": 0.2603759765625, "learning_rate": 7.667891533457718e-07, "loss": 0.2005, "reward": 0.48519248701632023, "reward_std": 0.612464651465416, "rewards/cosine_scaled_reward": -0.08032042533159256, "rewards/format_reward": 0.6458333432674408, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 835.0833587646484, "epoch": 0.23314285714285715, "grad_norm": 1.8381824332135883, "kl": 0.1859130859375, "learning_rate": 7.640308940816239e-07, "loss": 0.053, "reward": 1.2399137616157532, "reward_std": 0.6745168194174767, "rewards/cosine_scaled_reward": 0.2241235449910164, "rewards/format_reward": 0.7916666865348816, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1421.9583435058594, "epoch": 0.2342857142857143, "grad_norm": 1.7567396533005133, "kl": 0.362548828125, "learning_rate": 7.612622032536507e-07, "loss": 0.1051, "reward": 0.3085259608924389, "reward_std": 0.6349210105836391, "rewards/cosine_scaled_reward": -0.08532036282122135, "rewards/format_reward": 0.4791666865348816, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1228.7083892822266, "epoch": 0.23542857142857143, "grad_norm": 2.3389392066981562, "kl": 0.30615234375, "learning_rate": 7.584832158039378e-07, "loss": 0.0693, "reward": 0.18148453161120415, "reward_std": 0.5284193530678749, "rewards/cosine_scaled_reward": -0.24259107932448387, "rewards/format_reward": 0.6666666716337204, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1070.4792022705078, "epoch": 0.23657142857142857, "grad_norm": 3.543320557594463, "kl": 0.26416015625, "learning_rate": 7.556940671764124e-07, "loss": 0.1883, "reward": 0.542645301669836, "reward_std": 0.5379708558320999, "rewards/cosine_scaled_reward": -0.12451068125665188, "rewards/format_reward": 0.7916666716337204, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1317.2708740234375, "epoch": 0.2377142857142857, "grad_norm": 1.9754558032385148, "kl": 0.6748046875, "learning_rate": 7.528948933102438e-07, "loss": 0.1365, "reward": 0.09549727046396583, "reward_std": 0.3623932749032974, "rewards/cosine_scaled_reward": -0.2126680426299572, "rewards/format_reward": 0.5208333395421505, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 919.9375457763672, "epoch": 0.23885714285714285, "grad_norm": 3.305425458945869, "kl": 0.474609375, "learning_rate": 7.500858306332172e-07, "loss": 0.0593, "reward": 0.7489641904830933, "reward_std": 0.4507276937365532, "rewards/cosine_scaled_reward": 0.030732073821127415, "rewards/format_reward": 0.6875000149011612, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 908.8958435058594, "epoch": 0.24, "grad_norm": 3.7173678494051496, "kl": 0.403564453125, "learning_rate": 7.472670160550848e-07, "loss": 0.1606, "reward": 0.7559212893247604, "reward_std": 0.5382421165704727, "rewards/cosine_scaled_reward": -0.007456040009856224, "rewards/format_reward": 0.7708333432674408, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1479.0416870117188, "epoch": 0.24114285714285713, "grad_norm": 39.96198653082631, "kl": 2.5693359375, "learning_rate": 7.444385869608921e-07, "loss": 0.2707, "reward": 0.03475058265030384, "reward_std": 0.3246455695480108, "rewards/cosine_scaled_reward": -0.18054138123989105, "rewards/format_reward": 0.39583333395421505, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1051.0208587646484, "epoch": 0.2422857142857143, "grad_norm": 3.2393755765757777, "kl": 0.53466796875, "learning_rate": 7.416006812042827e-07, "loss": 0.0958, "reward": 0.6123923324048519, "reward_std": 0.5387515500187874, "rewards/cosine_scaled_reward": -0.04797050543129444, "rewards/format_reward": 0.708333358168602, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1301.3750610351562, "epoch": 0.24342857142857144, "grad_norm": 2.65733014184082, "kl": 0.755859375, "learning_rate": 7.387534371007797e-07, "loss": 0.1374, "reward": 0.1711240354925394, "reward_std": 0.42111407220363617, "rewards/cosine_scaled_reward": -0.16443797945976257, "rewards/format_reward": 0.5000000149011612, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1203.6875610351562, "epoch": 0.24457142857142858, "grad_norm": 2.501952170306742, "kl": 0.50732421875, "learning_rate": 7.358969934210438e-07, "loss": 0.1105, "reward": 0.22278533224016428, "reward_std": 0.434869222342968, "rewards/cosine_scaled_reward": -0.22194067016243935, "rewards/format_reward": 0.6666667014360428, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1249.6667022705078, "epoch": 0.24571428571428572, "grad_norm": 4.086485386572322, "kl": 0.880859375, "learning_rate": 7.330314893841101e-07, "loss": 0.0173, "reward": 0.3316160347312689, "reward_std": 0.5279753059148788, "rewards/cosine_scaled_reward": -0.14669198356568813, "rewards/format_reward": 0.6250000223517418, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1369.1666870117188, "epoch": 0.24685714285714286, "grad_norm": 3.328918162087878, "kl": 0.773193359375, "learning_rate": 7.301570646506027e-07, "loss": 0.1402, "reward": 0.2145287273451686, "reward_std": 0.5796768814325333, "rewards/cosine_scaled_reward": -0.16356897167861462, "rewards/format_reward": 0.5416666865348816, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1269.8542175292969, "epoch": 0.248, "grad_norm": 2.8333189883709515, "kl": 0.75927734375, "learning_rate": 7.27273859315928e-07, "loss": -0.0115, "reward": 0.5310591869056225, "reward_std": 0.4825605973601341, "rewards/cosine_scaled_reward": -0.057387083768844604, "rewards/format_reward": 0.645833358168602, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1252.2708740234375, "epoch": 0.24914285714285714, "grad_norm": 4.762778702423241, "kl": 0.74072265625, "learning_rate": 7.243820139034464e-07, "loss": 0.1477, "reward": 0.5015929639339447, "reward_std": 0.3994259871542454, "rewards/cosine_scaled_reward": -0.07212021434679627, "rewards/format_reward": 0.6458333507180214, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1017.5416870117188, "epoch": 0.2502857142857143, "grad_norm": 4.164501369060878, "kl": 1.07958984375, "learning_rate": 7.214816693576234e-07, "loss": 0.1337, "reward": 0.767455330118537, "reward_std": 0.5030167028307915, "rewards/cosine_scaled_reward": 0.07122766599059105, "rewards/format_reward": 0.6250000223517418, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1053.1875457763672, "epoch": 0.25142857142857145, "grad_norm": 3.588996799420188, "kl": 0.71142578125, "learning_rate": 7.185729670371604e-07, "loss": 0.1866, "reward": 0.48609594255685806, "reward_std": 0.617650680243969, "rewards/cosine_scaled_reward": -0.11111870361492038, "rewards/format_reward": 0.7083333432674408, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1244.687515258789, "epoch": 0.25257142857142856, "grad_norm": 2.946733537468475, "kl": 1.330078125, "learning_rate": 7.156560487081051e-07, "loss": 0.1268, "reward": 0.4570632018148899, "reward_std": 0.36856189370155334, "rewards/cosine_scaled_reward": -0.0006350576877593994, "rewards/format_reward": 0.4583333395421505, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1376.9167175292969, "epoch": 0.2537142857142857, "grad_norm": 3.53418042013775, "kl": 1.1337890625, "learning_rate": 7.127310565369415e-07, "loss": 0.2362, "reward": 0.1362705221399665, "reward_std": 0.3934030085802078, "rewards/cosine_scaled_reward": -0.19228141009807587, "rewards/format_reward": 0.5208333358168602, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1156.750015258789, "epoch": 0.25485714285714284, "grad_norm": 35.23833796360462, "kl": 2.369140625, "learning_rate": 7.097981330836616e-07, "loss": 0.1765, "reward": 0.6305188983678818, "reward_std": 0.5979669764637947, "rewards/cosine_scaled_reward": 0.023592765908688307, "rewards/format_reward": 0.5833333507180214, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1251.3125305175781, "epoch": 0.256, "grad_norm": 3.4418620220945138, "kl": 1.376953125, "learning_rate": 7.068574212948169e-07, "loss": 0.1723, "reward": 0.5104624545201659, "reward_std": 0.25178899243474007, "rewards/cosine_scaled_reward": -0.06768545880913734, "rewards/format_reward": 0.6458333432674408, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 924.3541870117188, "epoch": 0.2571428571428571, "grad_norm": 6.348797231777103, "kl": 0.9375, "learning_rate": 7.039090644965509e-07, "loss": 0.1337, "reward": 0.7791457176208496, "reward_std": 0.7603946030139923, "rewards/cosine_scaled_reward": 0.07707285927608609, "rewards/format_reward": 0.6250000223517418, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1052.5000305175781, "epoch": 0.2582857142857143, "grad_norm": 3.9386080018485288, "kl": 1.52734375, "learning_rate": 7.009532063876148e-07, "loss": 0.2459, "reward": 0.46499455720186234, "reward_std": 0.6090477257966995, "rewards/cosine_scaled_reward": -0.09041939489543438, "rewards/format_reward": 0.6458333432674408, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 892.1875152587891, "epoch": 0.25942857142857145, "grad_norm": 3.4313724086317445, "kl": 1.125, "learning_rate": 6.979899910323624e-07, "loss": 0.1959, "reward": 0.5925753712654114, "reward_std": 0.8098603934049606, "rewards/cosine_scaled_reward": -0.04746231180615723, "rewards/format_reward": 0.6875000298023224, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1091.6458740234375, "epoch": 0.26057142857142856, "grad_norm": 4.447647427267497, "kl": 1.66015625, "learning_rate": 6.950195628537299e-07, "loss": 0.1179, "reward": 0.24639339372515678, "reward_std": 0.48318010196089745, "rewards/cosine_scaled_reward": -0.17888664733618498, "rewards/format_reward": 0.6041666865348816, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 809.2916870117188, "epoch": 0.26171428571428573, "grad_norm": 8.169532609480521, "kl": 2.046875, "learning_rate": 6.920420666261961e-07, "loss": 0.3082, "reward": 0.5617873594164848, "reward_std": 0.7489510700106621, "rewards/cosine_scaled_reward": -0.07327299565076828, "rewards/format_reward": 0.7083333432674408, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1110.3542175292969, "epoch": 0.26285714285714284, "grad_norm": 2.921180843223507, "kl": 2.2265625, "learning_rate": 6.890576474687263e-07, "loss": 0.1487, "reward": 0.4394577872008085, "reward_std": 0.4748491495847702, "rewards/cosine_scaled_reward": -0.05110444873571396, "rewards/format_reward": 0.5416666977107525, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1026.3542175292969, "epoch": 0.264, "grad_norm": 2.544177744090501, "kl": 1.572265625, "learning_rate": 6.860664508377001e-07, "loss": 0.1564, "reward": 0.2407762985676527, "reward_std": 0.5902754589915276, "rewards/cosine_scaled_reward": -0.20252852141857147, "rewards/format_reward": 0.645833358168602, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1030.8958587646484, "epoch": 0.2651428571428571, "grad_norm": 3.5304119337525526, "kl": 1.529296875, "learning_rate": 6.83068622519821e-07, "loss": 0.1109, "reward": 0.42541009094566107, "reward_std": 0.6807678937911987, "rewards/cosine_scaled_reward": -0.11021162755787373, "rewards/format_reward": 0.6458333507180214, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1073.3333435058594, "epoch": 0.2662857142857143, "grad_norm": 3.0267711493511382, "kl": 1.1796875, "learning_rate": 6.800643086250121e-07, "loss": 0.2702, "reward": 0.42545080557465553, "reward_std": 0.48426005244255066, "rewards/cosine_scaled_reward": -0.15185793861746788, "rewards/format_reward": 0.7291666716337204, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1166.9791870117188, "epoch": 0.2674285714285714, "grad_norm": 2.956369605796136, "kl": 1.1279296875, "learning_rate": 6.770536555792944e-07, "loss": 0.1076, "reward": 0.3714570254087448, "reward_std": 0.650765061378479, "rewards/cosine_scaled_reward": -0.13718816195614636, "rewards/format_reward": 0.645833358168602, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1054.6667175292969, "epoch": 0.26857142857142857, "grad_norm": 4.47554265499188, "kl": 1.21484375, "learning_rate": 6.740368101176495e-07, "loss": 0.2849, "reward": 0.6623743935488164, "reward_std": 0.7155829221010208, "rewards/cosine_scaled_reward": -0.012562822550535202, "rewards/format_reward": 0.6875000223517418, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1096.1875457763672, "epoch": 0.26971428571428574, "grad_norm": 4.925975683565178, "kl": 1.3408203125, "learning_rate": 6.710139192768694e-07, "loss": 0.2351, "reward": 0.26786297000944614, "reward_std": 0.5117842257022858, "rewards/cosine_scaled_reward": -0.2202351950109005, "rewards/format_reward": 0.708333358168602, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 983.2291870117188, "epoch": 0.27085714285714285, "grad_norm": 2.226077510557553, "kl": 0.77294921875, "learning_rate": 6.679851303883891e-07, "loss": 0.1527, "reward": 0.5171467587351799, "reward_std": 0.5790724456310272, "rewards/cosine_scaled_reward": -0.10600997135043144, "rewards/format_reward": 0.7291666865348816, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1015.9167175292969, "epoch": 0.272, "grad_norm": 2.746018994596942, "kl": 1.0703125, "learning_rate": 6.649505910711058e-07, "loss": 0.1685, "reward": 0.4093864783644676, "reward_std": 0.5853541940450668, "rewards/cosine_scaled_reward": -0.1911400929093361, "rewards/format_reward": 0.7916667014360428, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1138.8542022705078, "epoch": 0.27314285714285713, "grad_norm": 2.366422791383297, "kl": 1.3916015625, "learning_rate": 6.619104492241847e-07, "loss": 0.1319, "reward": 0.03224743437021971, "reward_std": 0.40017952769994736, "rewards/cosine_scaled_reward": -0.2963762879371643, "rewards/format_reward": 0.6250000298023224, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 916.8125305175781, "epoch": 0.2742857142857143, "grad_norm": 1.7577643969871468, "kl": 1.291015625, "learning_rate": 6.588648530198504e-07, "loss": 0.13, "reward": 0.8863477371633053, "reward_std": 0.6274040639400482, "rewards/cosine_scaled_reward": 0.10984052997082472, "rewards/format_reward": 0.6666666865348816, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 891.0417022705078, "epoch": 0.2754285714285714, "grad_norm": 2.841473966918375, "kl": 1.0361328125, "learning_rate": 6.558139508961654e-07, "loss": 0.1554, "reward": 0.48904264718294144, "reward_std": 0.669127531349659, "rewards/cosine_scaled_reward": -0.16172868385910988, "rewards/format_reward": 0.8125000149011612, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 900.8541870117188, "epoch": 0.2765714285714286, "grad_norm": 4.202915193648642, "kl": 0.96337890625, "learning_rate": 6.527578915497951e-07, "loss": 0.1132, "reward": 0.6491687893867493, "reward_std": 0.6397206410765648, "rewards/cosine_scaled_reward": -0.08166561461985111, "rewards/format_reward": 0.8125000149011612, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 871.3542022705078, "epoch": 0.2777142857142857, "grad_norm": 4.013401867872089, "kl": 1.2275390625, "learning_rate": 6.496968239287603e-07, "loss": 0.0343, "reward": 0.6437305957078934, "reward_std": 0.566775843501091, "rewards/cosine_scaled_reward": -0.06355137238278985, "rewards/format_reward": 0.7708333432674408, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1051.8541870117188, "epoch": 0.27885714285714286, "grad_norm": 2.0640323982742346, "kl": 1.2119140625, "learning_rate": 6.466308972251785e-07, "loss": 0.1283, "reward": 0.6993502229452133, "reward_std": 0.8381707072257996, "rewards/cosine_scaled_reward": -0.04615823458880186, "rewards/format_reward": 0.7916667014360428, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 926.8958587646484, "epoch": 0.28, "grad_norm": 2.3095581027269456, "kl": 1.2373046875, "learning_rate": 6.435602608679916e-07, "loss": 0.1728, "reward": 0.5032865107059479, "reward_std": 0.4741464629769325, "rewards/cosine_scaled_reward": -0.15460674837231636, "rewards/format_reward": 0.8125000298023224, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 948.7292022705078, "epoch": 0.28114285714285714, "grad_norm": 2.2705966167509697, "kl": 1.0166015625, "learning_rate": 6.404850645156841e-07, "loss": 0.0879, "reward": 0.5439350083470345, "reward_std": 0.6458217911422253, "rewards/cosine_scaled_reward": -0.11344920098781586, "rewards/format_reward": 0.770833358168602, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 766.6250152587891, "epoch": 0.2822857142857143, "grad_norm": 4.218176679768865, "kl": 1.375, "learning_rate": 6.374054580489873e-07, "loss": 0.1529, "reward": 0.7583817802369595, "reward_std": 0.9407426938414574, "rewards/cosine_scaled_reward": 0.02502422034740448, "rewards/format_reward": 0.7083333432674408, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1149.2708435058594, "epoch": 0.2834285714285714, "grad_norm": 2.966316254338991, "kl": 1.69921875, "learning_rate": 6.343215915635761e-07, "loss": 0.1307, "reward": 0.37028552405536175, "reward_std": 0.35450038872659206, "rewards/cosine_scaled_reward": -0.15860724076628685, "rewards/format_reward": 0.6875000298023224, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1236.3750305175781, "epoch": 0.2845714285714286, "grad_norm": 2.8644099570080126, "kl": 1.646484375, "learning_rate": 6.31233615362752e-07, "loss": 0.142, "reward": 0.3449726775288582, "reward_std": 0.7856429815292358, "rewards/cosine_scaled_reward": -0.09834698960185051, "rewards/format_reward": 0.5416666865348816, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 977.7500305175781, "epoch": 0.2857142857142857, "grad_norm": 1.9099821609277308, "kl": 0.921875, "learning_rate": 6.281416799501187e-07, "loss": 0.0404, "reward": 0.6945669716224074, "reward_std": 0.822948083281517, "rewards/cosine_scaled_reward": -0.048549871891736984, "rewards/format_reward": 0.7916666716337204, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1265.9583740234375, "epoch": 0.28685714285714287, "grad_norm": 2.751476452748249, "kl": 1.216796875, "learning_rate": 1.000438641958131e-07, "loss": 0.1111, "reward": 0.12667130306363106, "reward_std": 0.7467320710420609, "rewards/cosine_scaled_reward": -0.17624769732356071, "rewards/format_reward": 0.4791666865348816, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1031.0833740234375, "epoch": 0.288, "grad_norm": 3.701835452468544, "kl": 1.033203125, "learning_rate": 6.219465344613258e-07, "loss": 0.2332, "reward": 0.3126375643769279, "reward_std": 0.748970627784729, "rewards/cosine_scaled_reward": -0.09368122089654207, "rewards/format_reward": 0.5000000074505806, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 889.5833587646484, "epoch": 0.28914285714285715, "grad_norm": 5.141640270028422, "kl": 1.69921875, "learning_rate": 6.188436263278172e-07, "loss": -0.1188, "reward": 0.23392239259555936, "reward_std": 0.8090809062123299, "rewards/cosine_scaled_reward": -0.11220548488199711, "rewards/format_reward": 0.4583333432674408, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 912.5208587646484, "epoch": 0.29028571428571426, "grad_norm": 3.5136083178201183, "kl": 1.1953125, "learning_rate": 6.157373628530852e-07, "loss": 0.1793, "reward": 0.7197382766753435, "reward_std": 0.9268201515078545, "rewards/cosine_scaled_reward": 0.057785794138908386, "rewards/format_reward": 0.6041666865348816, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1043.3333740234375, "epoch": 0.2914285714285714, "grad_norm": 2.8576463073310023, "kl": 1.361328125, "learning_rate": 6.126278954320294e-07, "loss": 0.1618, "reward": 0.21097473427653313, "reward_std": 0.8950171619653702, "rewards/cosine_scaled_reward": -0.08201263658702374, "rewards/format_reward": 0.3750000074505806, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1132.9166870117188, "epoch": 0.2925714285714286, "grad_norm": 2.6390372016890877, "kl": 0.9296875, "learning_rate": 6.095153756157051e-07, "loss": 0.1517, "reward": 0.3409617803990841, "reward_std": 0.7687749713659286, "rewards/cosine_scaled_reward": -0.142019122838974, "rewards/format_reward": 0.6250000149011612, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1172.1458740234375, "epoch": 0.2937142857142857, "grad_norm": 1.7999790033387904, "kl": 0.8994140625, "learning_rate": 6.06399955103937e-07, "loss": 0.0345, "reward": 0.24714069813489914, "reward_std": 0.526521310210228, "rewards/cosine_scaled_reward": -0.20976299978792667, "rewards/format_reward": 0.6666667014360428, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1025.2292022705078, "epoch": 0.2948571428571429, "grad_norm": 3.7817000702854284, "kl": 0.9970703125, "learning_rate": 6.032817857379256e-07, "loss": 0.0254, "reward": 0.371606208384037, "reward_std": 0.8782027065753937, "rewards/cosine_scaled_reward": -0.10586357489228249, "rewards/format_reward": 0.583333358168602, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 1027.1041870117188, "epoch": 0.296, "grad_norm": 2.2007546083055627, "kl": 1.23828125, "learning_rate": 6.001610194928464e-07, "loss": 0.1329, "reward": 0.2863161154091358, "reward_std": 0.6974881812930107, "rewards/cosine_scaled_reward": -0.16934195160865784, "rewards/format_reward": 0.6250000149011612, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1060.1666870117188, "epoch": 0.29714285714285715, "grad_norm": 2.0712856185453226, "kl": 1.314453125, "learning_rate": 5.97037808470444e-07, "loss": -0.0031, "reward": 0.05191618762910366, "reward_std": 0.5254812240600586, "rewards/cosine_scaled_reward": -0.1927919089794159, "rewards/format_reward": 0.4375000074505806, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 800.5625152587891, "epoch": 0.29828571428571427, "grad_norm": 3.953323642394609, "kl": 1.18359375, "learning_rate": 5.939123048916173e-07, "loss": 0.1926, "reward": 0.16135332686826587, "reward_std": 0.6497361660003662, "rewards/cosine_scaled_reward": -0.21099001914262772, "rewards/format_reward": 0.5833333432674408, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 906.3542022705078, "epoch": 0.29942857142857143, "grad_norm": 6.975231366994329, "kl": 1.1025390625, "learning_rate": 5.907846610890011e-07, "loss": 0.2163, "reward": 0.13131073210388422, "reward_std": 0.5159479975700378, "rewards/cosine_scaled_reward": -0.1739279804751277, "rewards/format_reward": 0.47916667722165585, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 981.3958587646484, "epoch": 0.30057142857142854, "grad_norm": 3.6462739135853304, "kl": 0.93359375, "learning_rate": 5.87655029499542e-07, "loss": 0.2144, "reward": 0.2528093755245209, "reward_std": 0.6878427565097809, "rewards/cosine_scaled_reward": -0.19651199039071798, "rewards/format_reward": 0.645833358168602, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1036.4792022705078, "epoch": 0.3017142857142857, "grad_norm": 2.4186369761638797, "kl": 1.11328125, "learning_rate": 5.845235626570683e-07, "loss": 0.0094, "reward": 0.34765794809209183, "reward_std": 0.7917995601892471, "rewards/cosine_scaled_reward": -0.10742103308439255, "rewards/format_reward": 0.5625000149011612, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 988.1458587646484, "epoch": 0.3028571428571429, "grad_norm": 3.8358402184782845, "kl": 1.125, "learning_rate": 5.813904131848564e-07, "loss": 0.1412, "reward": 0.22985844686627388, "reward_std": 0.4855259954929352, "rewards/cosine_scaled_reward": -0.17673744820058346, "rewards/format_reward": 0.5833333358168602, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 853.1042022705078, "epoch": 0.304, "grad_norm": 3.155418565951925, "kl": 1.138671875, "learning_rate": 5.78255733788191e-07, "loss": -0.0981, "reward": 0.23544084653258324, "reward_std": 0.5617225617170334, "rewards/cosine_scaled_reward": -0.18436292186379433, "rewards/format_reward": 0.6041666865348816, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1040.7291870117188, "epoch": 0.30514285714285716, "grad_norm": 4.49377424287265, "kl": 1.8671875, "learning_rate": 5.751196772469237e-07, "loss": 0.3133, "reward": 0.019660448655486107, "reward_std": 0.5969599932432175, "rewards/cosine_scaled_reward": -0.14641978219151497, "rewards/format_reward": 0.3125000149011612, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1120.3334045410156, "epoch": 0.3062857142857143, "grad_norm": 2.9296163486934588, "kl": 1.455078125, "learning_rate": 5.71982396408026e-07, "loss": 0.0891, "reward": 0.019381534308195114, "reward_std": 0.6385679095983505, "rewards/cosine_scaled_reward": -0.188225906342268, "rewards/format_reward": 0.3958333432674408, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 940.0833587646484, "epoch": 0.30742857142857144, "grad_norm": 3.99474649335861, "kl": 1.58203125, "learning_rate": 5.688440441781398e-07, "loss": 0.2037, "reward": 0.21233398653566837, "reward_std": 0.5940781682729721, "rewards/cosine_scaled_reward": -0.17508301883935928, "rewards/format_reward": 0.5625000149011612, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 820.8541870117188, "epoch": 0.30857142857142855, "grad_norm": 3.64920081986899, "kl": 1.548828125, "learning_rate": 5.657047735161255e-07, "loss": 0.187, "reward": 0.287849310785532, "reward_std": 0.7942548245191574, "rewards/cosine_scaled_reward": -0.16857536626048386, "rewards/format_reward": 0.6250000298023224, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 918.2708435058594, "epoch": 0.3097142857142857, "grad_norm": 4.142397150940974, "kl": 1.3642578125, "learning_rate": 5.625647374256061e-07, "loss": -0.0034, "reward": 0.21712711825966835, "reward_std": 0.7582554370164871, "rewards/cosine_scaled_reward": -0.1726864455267787, "rewards/format_reward": 0.5625000223517418, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1067.3125305175781, "epoch": 0.31085714285714283, "grad_norm": 5.568481701496752, "kl": 1.576171875, "learning_rate": 5.594240889475106e-07, "loss": 0.2629, "reward": 0.07018839695956558, "reward_std": 0.6307368651032448, "rewards/cosine_scaled_reward": -0.17323914170265198, "rewards/format_reward": 0.4166666753590107, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1032.7916870117188, "epoch": 0.312, "grad_norm": 2.7380334201594207, "kl": 1.763671875, "learning_rate": 5.562829811526154e-07, "loss": 0.1532, "reward": 0.1198783004656434, "reward_std": 0.5959479659795761, "rewards/cosine_scaled_reward": -0.15881085954606533, "rewards/format_reward": 0.4375000149011612, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1005.0000305175781, "epoch": 0.31314285714285717, "grad_norm": 3.288058849096818, "kl": 1.3232421875, "learning_rate": 5.531415671340826e-07, "loss": 0.0679, "reward": 0.33828355744481087, "reward_std": 0.7625949904322624, "rewards/cosine_scaled_reward": -0.1329415813088417, "rewards/format_reward": 0.6041666716337204, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1209.2083892822266, "epoch": 0.3142857142857143, "grad_norm": 3.384369498507843, "kl": 1.3759765625, "learning_rate": 5.5e-07, "loss": 0.1487, "reward": 0.2773652821779251, "reward_std": 0.7781829237937927, "rewards/cosine_scaled_reward": -0.09048402030020952, "rewards/format_reward": 0.45833334885537624, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 871.2500152587891, "epoch": 0.31542857142857145, "grad_norm": 3.6001944034052666, "kl": 1.2470703125, "learning_rate": 5.468584328659172e-07, "loss": 0.2545, "reward": 0.4259207919239998, "reward_std": 0.7986200153827667, "rewards/cosine_scaled_reward": -0.1099562719464302, "rewards/format_reward": 0.6458333432674408, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 970.1458740234375, "epoch": 0.31657142857142856, "grad_norm": 5.098242367200561, "kl": 1.9375, "learning_rate": 5.437170188473847e-07, "loss": 0.0347, "reward": 0.1577397957444191, "reward_std": 0.8665766268968582, "rewards/cosine_scaled_reward": -0.16071344492956996, "rewards/format_reward": 0.479166679084301, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1138.0417175292969, "epoch": 0.3177142857142857, "grad_norm": 4.893358334263393, "kl": 1.51953125, "learning_rate": 5.405759110524894e-07, "loss": 0.2335, "reward": 0.2129652127623558, "reward_std": 0.8123987764120102, "rewards/cosine_scaled_reward": -0.1122674010694027, "rewards/format_reward": 0.4375000149011612, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1055.5416870117188, "epoch": 0.31885714285714284, "grad_norm": 11.325087114885777, "kl": 1.70703125, "learning_rate": 5.37435262574394e-07, "loss": 0.1758, "reward": 0.2276703668758273, "reward_std": 0.7087787315249443, "rewards/cosine_scaled_reward": -0.14658149890601635, "rewards/format_reward": 0.520833358168602, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1201.5625305175781, "epoch": 0.32, "grad_norm": 4.499791162135755, "kl": 1.3359375, "learning_rate": 5.342952264838747e-07, "loss": 0.199, "reward": 0.4334499780088663, "reward_std": 0.8222155347466469, "rewards/cosine_scaled_reward": -0.0853583601419814, "rewards/format_reward": 0.6041666716337204, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1183.5833740234375, "epoch": 0.3211428571428571, "grad_norm": 3.6400895329844336, "kl": 1.931640625, "learning_rate": 5.311559558218603e-07, "loss": 0.0286, "reward": -0.14555206894874573, "reward_std": 0.4930955022573471, "rewards/cosine_scaled_reward": -0.2081927042454481, "rewards/format_reward": 0.2708333395421505, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1227.6875305175781, "epoch": 0.3222857142857143, "grad_norm": 3.351330372342759, "kl": 1.51953125, "learning_rate": 5.28017603591974e-07, "loss": 0.1735, "reward": 0.08991836942732334, "reward_std": 0.7664570957422256, "rewards/cosine_scaled_reward": -0.1946241520345211, "rewards/format_reward": 0.479166679084301, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1011.8958435058594, "epoch": 0.32342857142857145, "grad_norm": 3.607306150140324, "kl": 1.52734375, "learning_rate": 5.248803227530763e-07, "loss": 0.1756, "reward": -0.16347728297114372, "reward_std": 0.6131603866815567, "rewards/cosine_scaled_reward": -0.269238643348217, "rewards/format_reward": 0.3750000074505806, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1238.1875305175781, "epoch": 0.32457142857142857, "grad_norm": 3.700854838943554, "kl": 1.3291015625, "learning_rate": 5.21744266211809e-07, "loss": 0.0644, "reward": 0.19410160928964615, "reward_std": 0.6351519152522087, "rewards/cosine_scaled_reward": -0.16336587071418762, "rewards/format_reward": 0.520833358168602, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1302.5833435058594, "epoch": 0.32571428571428573, "grad_norm": 5.590443825333452, "kl": 1.396484375, "learning_rate": 5.186095868151436e-07, "loss": 0.1172, "reward": 0.0053066437467350625, "reward_std": 0.6190855652093887, "rewards/cosine_scaled_reward": -0.1952633447945118, "rewards/format_reward": 0.3958333507180214, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1408.7708587646484, "epoch": 0.32685714285714285, "grad_norm": 5820.413747461295, "kl": 44.6220703125, "learning_rate": 5.154764373429315e-07, "loss": 2.1366, "reward": 0.321873364970088, "reward_std": 0.7274122461676598, "rewards/cosine_scaled_reward": -0.06822998262941837, "rewards/format_reward": 0.45833334885537624, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1293.6875305175781, "epoch": 0.328, "grad_norm": 10688.293773017389, "kl": 90.048828125, "learning_rate": 5.123449705004581e-07, "loss": 3.6012, "reward": 0.22728685289621353, "reward_std": 0.6926668882369995, "rewards/cosine_scaled_reward": -0.10510657541453838, "rewards/format_reward": 0.4375000074505806, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1143.1042175292969, "epoch": 0.3291428571428571, "grad_norm": 69995.08344409091, "kl": 821.830078125, "learning_rate": 5.09215338910999e-07, "loss": 50.9221, "reward": 0.3029659762978554, "reward_std": 0.8068300932645798, "rewards/cosine_scaled_reward": -0.04643368790857494, "rewards/format_reward": 0.3958333432674408, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1325.7084045410156, "epoch": 0.3302857142857143, "grad_norm": 62.300695111663714, "kl": 1.5146484375, "learning_rate": 5.060876951083828e-07, "loss": 0.1171, "reward": 0.10640177875757217, "reward_std": 0.6392035633325577, "rewards/cosine_scaled_reward": -0.08221577852964401, "rewards/format_reward": 0.2708333358168602, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1066.375015258789, "epoch": 0.3314285714285714, "grad_norm": 3.0451709688438138, "kl": 0.85791015625, "learning_rate": 5.02962191529556e-07, "loss": 0.0875, "reward": 0.4837397076189518, "reward_std": 0.6303973346948624, "rewards/cosine_scaled_reward": -0.008130142465233803, "rewards/format_reward": 0.5000000074505806, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1176.2292175292969, "epoch": 0.3325714285714286, "grad_norm": 6.431194933370891, "kl": 1.169921875, "learning_rate": 4.998389805071536e-07, "loss": 0.0944, "reward": 0.004224353935569525, "reward_std": 0.7458223477005959, "rewards/cosine_scaled_reward": -0.17497116327285767, "rewards/format_reward": 0.354166679084301, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1279.1875610351562, "epoch": 0.33371428571428574, "grad_norm": 11.784461019524304, "kl": 1.0419921875, "learning_rate": 4.967182142620745e-07, "loss": 0.0752, "reward": -0.019843921065330505, "reward_std": 0.5733096897602081, "rewards/cosine_scaled_reward": -0.21825530380010605, "rewards/format_reward": 0.4166666753590107, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1270.3750305175781, "epoch": 0.33485714285714285, "grad_norm": 12451.222306718704, "kl": 56.82421875, "learning_rate": 4.93600044896063e-07, "loss": 2.6089, "reward": -0.0518635269254446, "reward_std": 0.4941852539777756, "rewards/cosine_scaled_reward": -0.22384843230247498, "rewards/format_reward": 0.3958333507180214, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1304.8750457763672, "epoch": 0.336, "grad_norm": 354145.9079404987, "kl": 3584.8046875, "learning_rate": 4.904846243842949e-07, "loss": 283.5748, "reward": 0.06046904996037483, "reward_std": 0.7505204379558563, "rewards/cosine_scaled_reward": -0.13643214339390397, "rewards/format_reward": 0.3333333358168602, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1317.5625610351562, "epoch": 0.33714285714285713, "grad_norm": 5.242464203702877, "kl": 1.0029296875, "learning_rate": 4.873721045679706e-07, "loss": 0.1195, "reward": 0.005757967010140419, "reward_std": 0.6009484976530075, "rewards/cosine_scaled_reward": -0.12212102208286524, "rewards/format_reward": 0.2500000111758709, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1103.0625610351562, "epoch": 0.3382857142857143, "grad_norm": 4.2430557491796055, "kl": 0.8115234375, "learning_rate": 4.842626371469149e-07, "loss": 0.0632, "reward": 0.0580328986980021, "reward_std": 0.6936925277113914, "rewards/cosine_scaled_reward": -0.15848355647176504, "rewards/format_reward": 0.3750000149011612, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1432.3333435058594, "epoch": 0.3394285714285714, "grad_norm": 2.9908283966206457, "kl": 0.7646484375, "learning_rate": 4.811563736721829e-07, "loss": 0.1022, "reward": -0.011708778678439558, "reward_std": 0.5683621913194656, "rewards/cosine_scaled_reward": -0.12043773010373116, "rewards/format_reward": 0.2291666716337204, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1451.8750305175781, "epoch": 0.3405714285714286, "grad_norm": 4.214445887739457, "kl": 0.673828125, "learning_rate": 4.780534655386743e-07, "loss": -0.0113, "reward": -0.12220606487244368, "reward_std": 0.5942584052681923, "rewards/cosine_scaled_reward": -0.18610304035246372, "rewards/format_reward": 0.2500000074505806, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1379.4791870117188, "epoch": 0.3417142857142857, "grad_norm": 4.524572878515851, "kl": 0.5302734375, "learning_rate": 4.749540639777539e-07, "loss": -0.0319, "reward": -0.08997016213834286, "reward_std": 0.6837709844112396, "rewards/cosine_scaled_reward": -0.1804017536342144, "rewards/format_reward": 0.27083334140479565, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1242.2083740234375, "epoch": 0.34285714285714286, "grad_norm": 22.44129435449986, "kl": 0.6015625, "learning_rate": 4.7185832004988133e-07, "loss": 0.047, "reward": 0.4733648784458637, "reward_std": 0.6498839557170868, "rewards/cosine_scaled_reward": -0.013317572651430964, "rewards/format_reward": 0.5000000111758709, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1358.5000610351562, "epoch": 0.344, "grad_norm": 5.451894779313779, "kl": 0.55419921875, "learning_rate": 4.68766384637248e-07, "loss": 0.0201, "reward": 0.012628388591110706, "reward_std": 0.6598528623580933, "rewards/cosine_scaled_reward": -0.11868580989539623, "rewards/format_reward": 0.2500000037252903, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 1208.0000610351562, "epoch": 0.34514285714285714, "grad_norm": 2.502133066720727, "kl": 0.5078125, "learning_rate": 4.656784084364238e-07, "loss": 0.0976, "reward": 0.01287244912236929, "reward_std": 0.6720428466796875, "rewards/cosine_scaled_reward": -0.14981378242373466, "rewards/format_reward": 0.3125000111758709, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1372.2917175292969, "epoch": 0.3462857142857143, "grad_norm": 9.527527408809727, "kl": 0.591796875, "learning_rate": 4.6259454195101267e-07, "loss": -0.0351, "reward": -0.0026968184392899275, "reward_std": 0.7502148300409317, "rewards/cosine_scaled_reward": -0.1784317558631301, "rewards/format_reward": 0.3541666716337204, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 1228.1458435058594, "epoch": 0.3474285714285714, "grad_norm": 5.5176774561091655, "kl": 0.4345703125, "learning_rate": 4.59514935484316e-07, "loss": 0.1598, "reward": 0.39222877379506826, "reward_std": 0.840458020567894, "rewards/cosine_scaled_reward": -0.03305228240787983, "rewards/format_reward": 0.4583333358168602, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1448.8125305175781, "epoch": 0.3485714285714286, "grad_norm": 7.801875434214254, "kl": 0.3525390625, "learning_rate": 4.5643973913200837e-07, "loss": 0.0808, "reward": 0.005279352888464928, "reward_std": 0.6858643740415573, "rewards/cosine_scaled_reward": -0.1536103216931224, "rewards/format_reward": 0.3125000074505806, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1402.6666870117188, "epoch": 0.3497142857142857, "grad_norm": 3.566822202421308, "kl": 0.29638671875, "learning_rate": 4.5336910277482155e-07, "loss": 0.0791, "reward": 0.18335522711277008, "reward_std": 0.6350644528865814, "rewards/cosine_scaled_reward": -0.13748905574902892, "rewards/format_reward": 0.4583333432674408, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 1421.8541870117188, "epoch": 0.35085714285714287, "grad_norm": 1.9532542741070622, "kl": 0.289794921875, "learning_rate": 4.503031760712397e-07, "loss": 0.0514, "reward": 0.2609965428709984, "reward_std": 0.7012953609228134, "rewards/cosine_scaled_reward": -0.06741839554160833, "rewards/format_reward": 0.39583334885537624, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1331.2500305175781, "epoch": 0.352, "grad_norm": 2.135773174322825, "kl": 0.26416015625, "learning_rate": 4.4724210845020494e-07, "loss": 0.1508, "reward": 0.21997906267642975, "reward_std": 0.6842755973339081, "rewards/cosine_scaled_reward": -0.1191771375015378, "rewards/format_reward": 0.4583333358168602, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 1291.1666870117188, "epoch": 0.35314285714285715, "grad_norm": 3.030174625800062, "kl": 0.323486328125, "learning_rate": 4.441860491038345e-07, "loss": 0.1012, "reward": -0.060309079475700855, "reward_std": 0.48270438611507416, "rewards/cosine_scaled_reward": -0.16557121649384499, "rewards/format_reward": 0.2708333469927311, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 1296.8125457763672, "epoch": 0.35428571428571426, "grad_norm": 3.288974321286699, "kl": 0.30712890625, "learning_rate": 4.4113514698014953e-07, "loss": 0.1053, "reward": 0.3812438789755106, "reward_std": 0.6454566046595573, "rewards/cosine_scaled_reward": 0.0031219255179166794, "rewards/format_reward": 0.37500000558793545, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 1582.4167175292969, "epoch": 0.3554285714285714, "grad_norm": 11.037201589242047, "kl": 0.3916015625, "learning_rate": 4.3808955077581546e-07, "loss": 0.0554, "reward": 0.011564895510673523, "reward_std": 0.5866778641939163, "rewards/cosine_scaled_reward": -0.12963422574102879, "rewards/format_reward": 0.27083333767950535, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1511.1458740234375, "epoch": 0.3565714285714286, "grad_norm": 541.360267852673, "kl": 2.48046875, "learning_rate": 4.350494089288943e-07, "loss": 0.1743, "reward": 0.09507806971669197, "reward_std": 0.7126565277576447, "rewards/cosine_scaled_reward": -0.12954430282115936, "rewards/format_reward": 0.3541666716337204, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 1310.4792022705078, "epoch": 0.3577142857142857, "grad_norm": 1.6060292822301743, "kl": 0.2235107421875, "learning_rate": 4.3201486961161093e-07, "loss": 0.0119, "reward": 0.19681214727461338, "reward_std": 0.5347588732838631, "rewards/cosine_scaled_reward": -0.14117726124823093, "rewards/format_reward": 0.4791666902601719, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 1412.7708740234375, "epoch": 0.3588571428571429, "grad_norm": 0.9234012789427545, "kl": 0.2705078125, "learning_rate": 4.2898608072313045e-07, "loss": 0.0522, "reward": 0.1253851738292724, "reward_std": 0.5503663271665573, "rewards/cosine_scaled_reward": -0.1352240853011608, "rewards/format_reward": 0.3958333469927311, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 1183.7916870117188, "epoch": 0.36, "grad_norm": 1.7837131712349448, "kl": 0.248779296875, "learning_rate": 4.2596318988235037e-07, "loss": 0.1102, "reward": 0.06632774323225021, "reward_std": 0.8003478944301605, "rewards/cosine_scaled_reward": -0.14391947723925114, "rewards/format_reward": 0.354166679084301, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1321.1667022705078, "epoch": 0.36114285714285715, "grad_norm": 3.8904561936208473, "kl": 0.311279296875, "learning_rate": 4.2294634442070553e-07, "loss": 0.0684, "reward": -0.12211128510534763, "reward_std": 0.3644377589225769, "rewards/cosine_scaled_reward": -0.19647231698036194, "rewards/format_reward": 0.27083333767950535, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1472.5208435058594, "epoch": 0.36228571428571427, "grad_norm": 0.6761305622628668, "kl": 0.2392578125, "learning_rate": 4.1993569137498776e-07, "loss": 0.0051, "reward": 0.07694595551583916, "reward_std": 0.698570191860199, "rewards/cosine_scaled_reward": -0.08652702532708645, "rewards/format_reward": 0.25000000186264515, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1446.3959045410156, "epoch": 0.36342857142857143, "grad_norm": 1.610083766620256, "kl": 0.23046875, "learning_rate": 4.1693137748017915e-07, "loss": 0.1272, "reward": 0.22593690548092127, "reward_std": 0.7007799595594406, "rewards/cosine_scaled_reward": -0.11619820445775986, "rewards/format_reward": 0.45833336375653744, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1419.7500610351562, "epoch": 0.36457142857142855, "grad_norm": 1.3177147357732026, "kl": 0.3505859375, "learning_rate": 4.1393354916230005e-07, "loss": 0.0908, "reward": 0.05421498417854309, "reward_std": 0.6087209582328796, "rewards/cosine_scaled_reward": -0.10830917488783598, "rewards/format_reward": 0.27083334140479565, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1459.2917175292969, "epoch": 0.3657142857142857, "grad_norm": 2.383045046585821, "kl": 0.177001953125, "learning_rate": 4.1094235253127374e-07, "loss": 0.143, "reward": 0.23994141444563866, "reward_std": 0.7169264256954193, "rewards/cosine_scaled_reward": -0.08836262859404087, "rewards/format_reward": 0.4166666753590107, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1525.8542175292969, "epoch": 0.3668571428571429, "grad_norm": 1.4014039132566267, "kl": 0.327880859375, "learning_rate": 4.079579333738039e-07, "loss": 0.0636, "reward": 0.07618786534294486, "reward_std": 0.6110149621963501, "rewards/cosine_scaled_reward": -0.17023939825594425, "rewards/format_reward": 0.416666679084301, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1505.1042022705078, "epoch": 0.368, "grad_norm": 0.9016635108285753, "kl": 0.18017578125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0766, "reward": 0.1637781597673893, "reward_std": 0.6868859454989433, "rewards/cosine_scaled_reward": -0.13686091732233763, "rewards/format_reward": 0.4375000186264515, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1378.7500305175781, "epoch": 0.36914285714285716, "grad_norm": 1.1982814454055981, "kl": 0.39306640625, "learning_rate": 4.020100089676376e-07, "loss": 0.0913, "reward": 0.17529202857986093, "reward_std": 0.6956184059381485, "rewards/cosine_scaled_reward": -0.14152065757662058, "rewards/format_reward": 0.4583333432674408, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1414.5208740234375, "epoch": 0.3702857142857143, "grad_norm": 5.168812943695912, "kl": 0.2706298828125, "learning_rate": 3.9904679361238526e-07, "loss": 0.0758, "reward": -0.05163134215399623, "reward_std": 0.573038712143898, "rewards/cosine_scaled_reward": -0.2133156731724739, "rewards/format_reward": 0.37500000558793545, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1026.2292175292969, "epoch": 0.37142857142857144, "grad_norm": 2.717389747197644, "kl": 0.2042236328125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0446, "reward": 0.35916636511683464, "reward_std": 0.7165441811084747, "rewards/cosine_scaled_reward": -0.11208349000662565, "rewards/format_reward": 0.583333358168602, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1296.2083435058594, "epoch": 0.37257142857142855, "grad_norm": 0.9706132798560072, "kl": 0.1436767578125, "learning_rate": 3.931425787051832e-07, "loss": 0.0264, "reward": 0.03931037150323391, "reward_std": 0.5944674462080002, "rewards/cosine_scaled_reward": -0.24076148495078087, "rewards/format_reward": 0.5208333395421505, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 1119.3958740234375, "epoch": 0.3737142857142857, "grad_norm": 7.20904098295775, "kl": 0.34619140625, "learning_rate": 3.902018669163384e-07, "loss": 0.0023, "reward": 0.5026027010753751, "reward_std": 0.4505321756005287, "rewards/cosine_scaled_reward": 0.011718038469552994, "rewards/format_reward": 0.4791666716337204, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 1340.9791717529297, "epoch": 0.37485714285714283, "grad_norm": 1.2860908020915138, "kl": 0.416259765625, "learning_rate": 3.872689434630585e-07, "loss": 0.1449, "reward": 0.15127216652035713, "reward_std": 0.6304197087883949, "rewards/cosine_scaled_reward": -0.15353058651089668, "rewards/format_reward": 0.4583333507180214, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 1571.3750610351562, "epoch": 0.376, "grad_norm": 0.8293118478307562, "kl": 0.242431640625, "learning_rate": 3.843439512918949e-07, "loss": 0.0229, "reward": 0.09288652800023556, "reward_std": 0.5842361897230148, "rewards/cosine_scaled_reward": -0.15147340297698975, "rewards/format_reward": 0.3958333432674408, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 1407.9583740234375, "epoch": 0.37714285714285717, "grad_norm": 1.189781094856149, "kl": 0.1077880859375, "learning_rate": 3.8142703296283953e-07, "loss": 0.0681, "reward": -0.09090141206979752, "reward_std": 0.5390855148434639, "rewards/cosine_scaled_reward": -0.21211737021803856, "rewards/format_reward": 0.3333333395421505, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1255.6875457763672, "epoch": 0.3782857142857143, "grad_norm": 1.046472107288498, "kl": 0.10308837890625, "learning_rate": 3.785183306423767e-07, "loss": 0.0811, "reward": -0.12841611605836079, "reward_std": 0.39798876643180847, "rewards/cosine_scaled_reward": -0.3350413963198662, "rewards/format_reward": 0.5416666865348816, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1201.8958740234375, "epoch": 0.37942857142857145, "grad_norm": 1.123018980255247, "kl": 0.117584228515625, "learning_rate": 3.7561798609655373e-07, "loss": 0.072, "reward": 0.499036006629467, "reward_std": 0.6711834743618965, "rewards/cosine_scaled_reward": -0.03173201950266957, "rewards/format_reward": 0.5625000074505806, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1133.3333587646484, "epoch": 0.38057142857142856, "grad_norm": 2.177638459571002, "kl": 0.14453125, "learning_rate": 3.72726140684072e-07, "loss": 0.1488, "reward": 0.03351620538160205, "reward_std": 0.4431127682328224, "rewards/cosine_scaled_reward": -0.27490856871008873, "rewards/format_reward": 0.5833333432674408, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1252.5833587646484, "epoch": 0.38171428571428573, "grad_norm": 1.6680786188797292, "kl": 2.4984130859375, "learning_rate": 3.6984293534939737e-07, "loss": 0.1246, "reward": -0.1514057070016861, "reward_std": 0.5695896856486797, "rewards/cosine_scaled_reward": -0.26320285350084305, "rewards/format_reward": 0.3750000074505806, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1232.0000610351562, "epoch": 0.38285714285714284, "grad_norm": 1.828125714274309, "kl": 0.07843017578125, "learning_rate": 3.6696851061588994e-07, "loss": 0.1105, "reward": 0.07522661844268441, "reward_std": 0.5525132827460766, "rewards/cosine_scaled_reward": -0.21238669380545616, "rewards/format_reward": 0.5000000149011612, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1240.4375305175781, "epoch": 0.384, "grad_norm": 3.2255921262965432, "kl": 0.19232177734375, "learning_rate": 3.641030065789562e-07, "loss": 0.2104, "reward": -0.07903135940432549, "reward_std": 0.4235813617706299, "rewards/cosine_scaled_reward": -0.3103490248322487, "rewards/format_reward": 0.5416666865348816, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1136.2500457763672, "epoch": 0.3851428571428571, "grad_norm": 2.1359050155328076, "kl": 0.298095703125, "learning_rate": 3.612465628992203e-07, "loss": 0.1271, "reward": 0.29203586652874947, "reward_std": 0.6221929639577866, "rewards/cosine_scaled_reward": -0.14564874302595854, "rewards/format_reward": 0.583333358168602, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 1304.4792175292969, "epoch": 0.3862857142857143, "grad_norm": 1.42801024449987, "kl": 0.2041015625, "learning_rate": 3.5839931879571725e-07, "loss": 0.0306, "reward": -0.07640792615711689, "reward_std": 0.29374565184116364, "rewards/cosine_scaled_reward": -0.30903729796409607, "rewards/format_reward": 0.5416666716337204, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 1483.5625305175781, "epoch": 0.38742857142857146, "grad_norm": 4.530770296915891, "kl": 0.2216796875, "learning_rate": 3.555614130391079e-07, "loss": 0.0756, "reward": -0.22593690641224384, "reward_std": 0.42642898857593536, "rewards/cosine_scaled_reward": -0.31088512018322945, "rewards/format_reward": 0.39583334513008595, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1301.6875305175781, "epoch": 0.38857142857142857, "grad_norm": 32.229056752997074, "kl": 0.72998046875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0451, "reward": 0.1187155619263649, "reward_std": 0.6100866496562958, "rewards/cosine_scaled_reward": -0.16980887576937675, "rewards/format_reward": 0.4583333432674408, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1222.5000610351562, "epoch": 0.38971428571428574, "grad_norm": 31.15024931066955, "kl": 1.814453125, "learning_rate": 3.4991416936678276e-07, "loss": 0.0053, "reward": 0.4647822715342045, "reward_std": 0.8535723686218262, "rewards/cosine_scaled_reward": 0.013641122728586197, "rewards/format_reward": 0.4375000149011612, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1550.9583740234375, "epoch": 0.39085714285714285, "grad_norm": 5.073035047796139, "kl": 0.40185546875, "learning_rate": 3.471051066897562e-07, "loss": 0.1274, "reward": -0.049222253262996674, "reward_std": 0.6296448782086372, "rewards/cosine_scaled_reward": -0.1704444605857134, "rewards/format_reward": 0.29166667349636555, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1254.1458740234375, "epoch": 0.392, "grad_norm": 2.9987047793682247, "kl": 0.191650390625, "learning_rate": 3.4430593282358777e-07, "loss": 0.132, "reward": 0.4507103096693754, "reward_std": 0.46682045608758926, "rewards/cosine_scaled_reward": -0.11839485540986061, "rewards/format_reward": 0.6875000298023224, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1259.0833740234375, "epoch": 0.3931428571428571, "grad_norm": 11.834773130920754, "kl": 0.765625, "learning_rate": 3.4151678419606233e-07, "loss": 0.1692, "reward": 0.04102582670748234, "reward_std": 0.6375212371349335, "rewards/cosine_scaled_reward": -0.16698708944022655, "rewards/format_reward": 0.3750000111758709, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 948.8333587646484, "epoch": 0.3942857142857143, "grad_norm": 4.082579051373274, "kl": 0.11126708984375, "learning_rate": 3.387377967463493e-07, "loss": 0.1531, "reward": 0.32552773877978325, "reward_std": 0.5937002822756767, "rewards/cosine_scaled_reward": -0.18098615854978561, "rewards/format_reward": 0.6875000149011612, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1348.5208740234375, "epoch": 0.3954285714285714, "grad_norm": 4.16581520032074, "kl": 0.233154296875, "learning_rate": 3.359691059183761e-07, "loss": 0.0891, "reward": -0.024696938693523407, "reward_std": 0.6840994879603386, "rewards/cosine_scaled_reward": -0.2310984805226326, "rewards/format_reward": 0.4375000074505806, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1152.9583435058594, "epoch": 0.3965714285714286, "grad_norm": 6.491892036842968, "kl": 0.688232421875, "learning_rate": 3.3321084665422803e-07, "loss": 0.1813, "reward": 0.7761995047330856, "reward_std": 0.9014021009206772, "rewards/cosine_scaled_reward": 0.08601640490815043, "rewards/format_reward": 0.6041666865348816, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1119.6875305175781, "epoch": 0.3977142857142857, "grad_norm": 6.465035426418669, "kl": 0.2274169921875, "learning_rate": 3.3046315338757026e-07, "loss": 0.3084, "reward": 0.1041297996416688, "reward_std": 0.5661944150924683, "rewards/cosine_scaled_reward": -0.2187684327363968, "rewards/format_reward": 0.541666679084301, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 1280.5000305175781, "epoch": 0.39885714285714285, "grad_norm": 5.965340713566614, "kl": 0.0919189453125, "learning_rate": 3.2772616003709616e-07, "loss": 0.264, "reward": 0.5343287643045187, "reward_std": 1.0619665831327438, "rewards/cosine_scaled_reward": -0.024502300075255334, "rewards/format_reward": 0.5833333432674408, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1420.2291870117188, "epoch": 0.4, "grad_norm": 2.925238124886515, "kl": 0.185791015625, "learning_rate": 3.250000000000001e-07, "loss": 0.1961, "reward": 0.12700789980590343, "reward_std": 0.8331074118614197, "rewards/cosine_scaled_reward": -0.1656627282500267, "rewards/format_reward": 0.4583333432674408, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 1083.6042175292969, "epoch": 0.40114285714285713, "grad_norm": 3.606767246674259, "kl": 0.18115234375, "learning_rate": 3.222848061454764e-07, "loss": -0.0154, "reward": 0.25727599672973156, "reward_std": 0.6387183666229248, "rewards/cosine_scaled_reward": -0.18386201839894056, "rewards/format_reward": 0.6250000074505806, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1439.1250610351562, "epoch": 0.4022857142857143, "grad_norm": 1.929818758425276, "kl": 0.1397705078125, "learning_rate": 3.195807108082429e-07, "loss": 0.1728, "reward": -0.14825151395052671, "reward_std": 0.5558790042996407, "rewards/cosine_scaled_reward": -0.2824591100215912, "rewards/format_reward": 0.4166666716337204, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1523.3542175292969, "epoch": 0.4034285714285714, "grad_norm": 1.501137402879622, "kl": 0.15606689453125, "learning_rate": 3.168878457820915e-07, "loss": 0.1054, "reward": -0.2005203291773796, "reward_std": 0.5384240373969078, "rewards/cosine_scaled_reward": -0.2565101645886898, "rewards/format_reward": 0.31250000558793545, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1457.8125305175781, "epoch": 0.4045714285714286, "grad_norm": 1.7447813143906967, "kl": 0.19287109375, "learning_rate": 3.142063423134644e-07, "loss": 0.0946, "reward": -0.07205517496913671, "reward_std": 0.5912996232509613, "rewards/cosine_scaled_reward": -0.27561092376708984, "rewards/format_reward": 0.4791666865348816, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 935.1041793823242, "epoch": 0.4057142857142857, "grad_norm": 5.735907828017728, "kl": 0.416259765625, "learning_rate": 3.115363310950578e-07, "loss": 0.2126, "reward": 0.6018264503218234, "reward_std": 0.43670547753572464, "rewards/cosine_scaled_reward": -0.04283679276704788, "rewards/format_reward": 0.6875000149011612, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1400.4375610351562, "epoch": 0.40685714285714286, "grad_norm": 4.2513620245343855, "kl": 0.2110595703125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0986, "reward": 0.07107849605381489, "reward_std": 0.6532387360930443, "rewards/cosine_scaled_reward": -0.22487742826342583, "rewards/format_reward": 0.5208333432674408, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1214.1250305175781, "epoch": 0.408, "grad_norm": 1.8135177203210504, "kl": 0.1314697265625, "learning_rate": 3.062313053727671e-07, "loss": 0.1426, "reward": 0.03724817745387554, "reward_std": 0.5181447230279446, "rewards/cosine_scaled_reward": -0.2730425810441375, "rewards/format_reward": 0.5833333507180214, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1284.7500305175781, "epoch": 0.40914285714285714, "grad_norm": 3.565695417018542, "kl": 0.18597412109375, "learning_rate": 3.0359654942835247e-07, "loss": 0.1245, "reward": 0.04130622744560242, "reward_std": 0.7205251231789589, "rewards/cosine_scaled_reward": -0.19809689931571484, "rewards/format_reward": 0.4375000111758709, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1343.2083740234375, "epoch": 0.4102857142857143, "grad_norm": 3.2057830256260917, "kl": 0.14324951171875, "learning_rate": 3.0097380284049523e-07, "loss": -0.0078, "reward": 0.1697351299226284, "reward_std": 0.3564612567424774, "rewards/cosine_scaled_reward": -0.13388244062662125, "rewards/format_reward": 0.4375000111758709, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1433.2291870117188, "epoch": 0.4114285714285714, "grad_norm": 1.6762255456245136, "kl": 0.1739501953125, "learning_rate": 2.9836319343816397e-07, "loss": 0.1781, "reward": 0.21988008171319962, "reward_std": 0.7903619408607483, "rewards/cosine_scaled_reward": -0.10880996193736792, "rewards/format_reward": 0.4375000223517418, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1309.6250457763672, "epoch": 0.4125714285714286, "grad_norm": 0.9826821036841882, "kl": 0.135498046875, "learning_rate": 2.9576484845877793e-07, "loss": 0.0186, "reward": 0.33486853912472725, "reward_std": 0.500580433756113, "rewards/cosine_scaled_reward": -0.11381572997197509, "rewards/format_reward": 0.5625000149011612, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1173.5625305175781, "epoch": 0.4137142857142857, "grad_norm": 4.226570835684713, "kl": 0.0926513671875, "learning_rate": 2.931788945420058e-07, "loss": 0.18, "reward": 0.15393588319420815, "reward_std": 0.5774414390325546, "rewards/cosine_scaled_reward": -0.20428206771612167, "rewards/format_reward": 0.5625000149011612, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1250.0416870117188, "epoch": 0.41485714285714287, "grad_norm": 1.978862188088671, "kl": 0.09033203125, "learning_rate": 2.9060545772359305e-07, "loss": 0.1327, "reward": 0.2741839215159416, "reward_std": 0.6551093906164169, "rewards/cosine_scaled_reward": -0.17540805786848068, "rewards/format_reward": 0.6250000149011612, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1354.9583892822266, "epoch": 0.416, "grad_norm": 1.500971160749094, "kl": 0.1431884765625, "learning_rate": 2.8804466342921987e-07, "loss": 0.1556, "reward": 0.09914333745837212, "reward_std": 0.5969183072447777, "rewards/cosine_scaled_reward": -0.17959501221776009, "rewards/format_reward": 0.4583333432674408, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1443.8958435058594, "epoch": 0.41714285714285715, "grad_norm": 1.5336203716893533, "kl": 0.14166259765625, "learning_rate": 2.854966364683872e-07, "loss": 0.0794, "reward": 0.08230920624919236, "reward_std": 0.7491874545812607, "rewards/cosine_scaled_reward": -0.18801206350326538, "rewards/format_reward": 0.4583333358168602, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1280.0833740234375, "epoch": 0.41828571428571426, "grad_norm": 5.917103922817008, "kl": 0.1395263671875, "learning_rate": 2.829615010283344e-07, "loss": 0.2201, "reward": 0.30844624526798725, "reward_std": 0.6032929718494415, "rewards/cosine_scaled_reward": -0.11661022901535034, "rewards/format_reward": 0.5416666865348816, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1071.7083587646484, "epoch": 0.41942857142857143, "grad_norm": 6.764653159306351, "kl": 1.21533203125, "learning_rate": 2.8043938066798645e-07, "loss": 0.2217, "reward": 0.5629880558699369, "reward_std": 0.7271402254700661, "rewards/cosine_scaled_reward": -0.02058931067585945, "rewards/format_reward": 0.6041666716337204, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 1272.2708892822266, "epoch": 0.4205714285714286, "grad_norm": 3.868751461553908, "kl": 0.376953125, "learning_rate": 2.7793039831193133e-07, "loss": 0.0282, "reward": 0.2414314430207014, "reward_std": 0.783539354801178, "rewards/cosine_scaled_reward": -0.13970092684030533, "rewards/format_reward": 0.5208333544433117, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1453.8333740234375, "epoch": 0.4217142857142857, "grad_norm": 1.6191974125060598, "kl": 0.29150390625, "learning_rate": 2.7543467624442956e-07, "loss": 0.172, "reward": 0.11266430467367172, "reward_std": 0.7149153798818588, "rewards/cosine_scaled_reward": -0.1415845244191587, "rewards/format_reward": 0.39583334140479565, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 1463.3959045410156, "epoch": 0.4228571428571429, "grad_norm": 4.101308083609096, "kl": 0.56884765625, "learning_rate": 2.729523361034538e-07, "loss": 0.2149, "reward": -0.2552230432629585, "reward_std": 0.5415500551462173, "rewards/cosine_scaled_reward": -0.26302820444107056, "rewards/format_reward": 0.27083334140479565, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1240.8542175292969, "epoch": 0.424, "grad_norm": 3.8927886605185447, "kl": 0.30340576171875, "learning_rate": 2.7048349887476037e-07, "loss": 0.1602, "reward": 0.1614240426570177, "reward_std": 0.5875495374202728, "rewards/cosine_scaled_reward": -0.15887131541967392, "rewards/format_reward": 0.479166679084301, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1177.3958740234375, "epoch": 0.42514285714285716, "grad_norm": 3.066569475752354, "kl": 0.1824951171875, "learning_rate": 2.6802828488599294e-07, "loss": 0.1059, "reward": 0.2956250160932541, "reward_std": 0.6594211757183075, "rewards/cosine_scaled_reward": -0.12302083522081375, "rewards/format_reward": 0.5416666939854622, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 1163.4375610351562, "epoch": 0.42628571428571427, "grad_norm": 5.09566585578463, "kl": 0.2724609375, "learning_rate": 2.655868138008171e-07, "loss": 0.1544, "reward": 0.07318597589619458, "reward_std": 0.7096846550703049, "rewards/cosine_scaled_reward": -0.2759070098400116, "rewards/format_reward": 0.6250000149011612, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1246.7500457763672, "epoch": 0.42742857142857144, "grad_norm": 32.203352857308325, "kl": 0.839111328125, "learning_rate": 2.631592046130896e-07, "loss": 0.1927, "reward": 0.08969515189528465, "reward_std": 0.6610818058252335, "rewards/cosine_scaled_reward": -0.22598576080054045, "rewards/format_reward": 0.5416666865348816, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1280.7708740234375, "epoch": 0.42857142857142855, "grad_norm": 63.335567619096544, "kl": 0.94873046875, "learning_rate": 2.6074557564105724e-07, "loss": 0.2184, "reward": 0.18546735402196646, "reward_std": 0.9102050960063934, "rewards/cosine_scaled_reward": -0.17809965554624796, "rewards/format_reward": 0.5416667014360428, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1256.6250305175781, "epoch": 0.4297142857142857, "grad_norm": 3.6519960558396716, "kl": 0.310302734375, "learning_rate": 2.583460445215911e-07, "loss": 0.1114, "reward": 0.1940733604133129, "reward_std": 0.5819907337427139, "rewards/cosine_scaled_reward": -0.1946299858391285, "rewards/format_reward": 0.583333358168602, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 1285.6458740234375, "epoch": 0.4308571428571429, "grad_norm": 5.319529708040252, "kl": 0.3394775390625, "learning_rate": 2.5596072820445254e-07, "loss": 0.0359, "reward": 0.25018906872719526, "reward_std": 0.8042758777737617, "rewards/cosine_scaled_reward": -0.13532213680446148, "rewards/format_reward": 0.5208333432674408, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 1374.5416870117188, "epoch": 0.432, "grad_norm": 20.461016176615068, "kl": 0.70166015625, "learning_rate": 2.5358974294659373e-07, "loss": 0.2346, "reward": -0.005498896003700793, "reward_std": 0.5357099026441574, "rewards/cosine_scaled_reward": -0.22149945423007011, "rewards/format_reward": 0.4375000074505806, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1435.1875, "epoch": 0.43314285714285716, "grad_norm": 2.391831824846237, "kl": 0.292236328125, "learning_rate": 2.512332043064913e-07, "loss": 0.1982, "reward": 0.012932289391756058, "reward_std": 0.799980454146862, "rewards/cosine_scaled_reward": -0.20186719112098217, "rewards/format_reward": 0.4166666865348816, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1369.5833740234375, "epoch": 0.4342857142857143, "grad_norm": 2.2747857355280208, "kl": 0.1715087890625, "learning_rate": 2.488912271385139e-07, "loss": 0.1725, "reward": -0.22791396314278245, "reward_std": 0.4170580878853798, "rewards/cosine_scaled_reward": -0.3431236445903778, "rewards/format_reward": 0.4583333544433117, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1187.6458587646484, "epoch": 0.43542857142857144, "grad_norm": 2.7553958817382593, "kl": 0.16162109375, "learning_rate": 2.465639255873246e-07, "loss": 0.1247, "reward": 0.19117721682414412, "reward_std": 0.46048377081751823, "rewards/cosine_scaled_reward": -0.23774472624063492, "rewards/format_reward": 0.6666667014360428, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1300.3125305175781, "epoch": 0.43657142857142855, "grad_norm": 2.0362039263750082, "kl": 0.1822509765625, "learning_rate": 2.4425141308231765e-07, "loss": 0.1158, "reward": 0.2739548869431019, "reward_std": 0.603746622800827, "rewards/cosine_scaled_reward": -0.09218922536820173, "rewards/format_reward": 0.4583333432674408, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1263.7917175292969, "epoch": 0.4377142857142857, "grad_norm": 7.617696331239462, "kl": 0.2333984375, "learning_rate": 2.4195380233209006e-07, "loss": 0.1076, "reward": 0.12070683389902115, "reward_std": 0.38592402543872595, "rewards/cosine_scaled_reward": -0.18964658118784428, "rewards/format_reward": 0.5, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1170.3750457763672, "epoch": 0.43885714285714283, "grad_norm": 3.2601623912372233, "kl": 0.2103271484375, "learning_rate": 2.3967120531894857e-07, "loss": 0.1471, "reward": -0.021999074146151543, "reward_std": 0.34355130419135094, "rewards/cosine_scaled_reward": -0.31308288127183914, "rewards/format_reward": 0.6041666865348816, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1233.7083740234375, "epoch": 0.44, "grad_norm": 2.1916650637468025, "kl": 0.16259765625, "learning_rate": 2.374037332934512e-07, "loss": 0.0922, "reward": 0.054161038249731064, "reward_std": 0.7442760765552521, "rewards/cosine_scaled_reward": -0.2541694864630699, "rewards/format_reward": 0.5625000149011612, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 870.3333740234375, "epoch": 0.44114285714285717, "grad_norm": 1.4860604247340325, "kl": 0.0894775390625, "learning_rate": 2.3515149676898552e-07, "loss": 0.1158, "reward": 0.28954136464744806, "reward_std": 0.5479708462953568, "rewards/cosine_scaled_reward": -0.240646006539464, "rewards/format_reward": 0.770833358168602, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1227.2291870117188, "epoch": 0.4422857142857143, "grad_norm": 1.687755076974517, "kl": 0.2470703125, "learning_rate": 2.3291460551638237e-07, "loss": 0.151, "reward": -0.0012904666364192963, "reward_std": 0.4440325200557709, "rewards/cosine_scaled_reward": -0.2714785784482956, "rewards/format_reward": 0.5416666865348816, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1033.6042022705078, "epoch": 0.44342857142857145, "grad_norm": 2.5341444420596884, "kl": 0.164947509765625, "learning_rate": 2.306931685585657e-07, "loss": 0.0897, "reward": 0.4180222749710083, "reward_std": 0.754804901778698, "rewards/cosine_scaled_reward": -0.14515553694218397, "rewards/format_reward": 0.7083333432674408, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1356.8333740234375, "epoch": 0.44457142857142856, "grad_norm": 3.704344948231344, "kl": 0.372314453125, "learning_rate": 2.2848729416523859e-07, "loss": 0.102, "reward": 0.2806839719414711, "reward_std": 0.6125510483980179, "rewards/cosine_scaled_reward": -0.07840801030397415, "rewards/format_reward": 0.4375000074505806, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 1090.2708740234375, "epoch": 0.44571428571428573, "grad_norm": 14.470921296685399, "kl": 0.47216796875, "learning_rate": 2.2629708984760706e-07, "loss": 0.2654, "reward": 0.07703178748488426, "reward_std": 0.5665107443928719, "rewards/cosine_scaled_reward": -0.26356743834912777, "rewards/format_reward": 0.6041666939854622, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1098.7500305175781, "epoch": 0.44685714285714284, "grad_norm": 2.4001916122615157, "kl": 0.26904296875, "learning_rate": 2.2412266235313973e-07, "loss": 0.1304, "reward": 0.2017030455172062, "reward_std": 0.5325312875211239, "rewards/cosine_scaled_reward": -0.20123182306997478, "rewards/format_reward": 0.6041666865348816, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1404.0625610351562, "epoch": 0.448, "grad_norm": 12.93850484473414, "kl": 0.662109375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0663, "reward": 0.39279897045344114, "reward_std": 0.9181084930896759, "rewards/cosine_scaled_reward": -0.04318385384976864, "rewards/format_reward": 0.4791666939854622, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1046.0417175292969, "epoch": 0.4491428571428571, "grad_norm": 3.1910943036863695, "kl": 0.2236328125, "learning_rate": 2.1982156097370557e-07, "loss": 0.094, "reward": 0.1259294361807406, "reward_std": 0.620373547077179, "rewards/cosine_scaled_reward": -0.23911861330270767, "rewards/format_reward": 0.6041666865348816, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 898.3333511352539, "epoch": 0.4502857142857143, "grad_norm": 4.93057169428389, "kl": 0.25714111328125, "learning_rate": 2.1769509671835223e-07, "loss": 0.2665, "reward": 0.2223543766885996, "reward_std": 0.4368506968021393, "rewards/cosine_scaled_reward": -0.23257281631231308, "rewards/format_reward": 0.6875000149011612, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1140.0000457763672, "epoch": 0.4514285714285714, "grad_norm": 2.3738396662205945, "kl": 0.35986328125, "learning_rate": 2.1558482853517253e-07, "loss": 0.3105, "reward": 0.10918148793280125, "reward_std": 0.5202281884849072, "rewards/cosine_scaled_reward": -0.21624258160591125, "rewards/format_reward": 0.5416666865348816, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1151.3958740234375, "epoch": 0.45257142857142857, "grad_norm": 2.5367764499763257, "kl": 0.3154296875, "learning_rate": 2.134908592756607e-07, "loss": 0.1917, "reward": 0.17909681051969528, "reward_std": 0.7349686250090599, "rewards/cosine_scaled_reward": -0.2021182719618082, "rewards/format_reward": 0.5833333432674408, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1223.8958740234375, "epoch": 0.45371428571428574, "grad_norm": 3.0861426217577645, "kl": 0.38720703125, "learning_rate": 2.1141329099692406e-07, "loss": 0.2308, "reward": 0.6319128852337599, "reward_std": 0.8242618143558502, "rewards/cosine_scaled_reward": 0.04512310400605202, "rewards/format_reward": 0.541666679084301, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1218.8541870117188, "epoch": 0.45485714285714285, "grad_norm": 18.365837770437405, "kl": 0.6829833984375, "learning_rate": 2.0935222495670968e-07, "loss": 0.189, "reward": 0.27588833356276155, "reward_std": 0.8127910792827606, "rewards/cosine_scaled_reward": -0.19538918882608414, "rewards/format_reward": 0.6666666939854622, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1264.1666870117188, "epoch": 0.456, "grad_norm": 3.8049582826738373, "kl": 0.47314453125, "learning_rate": 2.0730776160846853e-07, "loss": 0.1823, "reward": 0.055698491632938385, "reward_std": 0.49411067366600037, "rewards/cosine_scaled_reward": -0.21173409838229418, "rewards/format_reward": 0.4791666828095913, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1013.3750457763672, "epoch": 0.45714285714285713, "grad_norm": 7.251771375036044, "kl": 0.4078369140625, "learning_rate": 2.0528000059645995e-07, "loss": 0.175, "reward": 0.2562308683991432, "reward_std": 0.2563706263899803, "rewards/cosine_scaled_reward": -0.2260512337088585, "rewards/format_reward": 0.7083333488553762, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 952.7291870117188, "epoch": 0.4582857142857143, "grad_norm": 8.82258461767532, "kl": 0.45355224609375, "learning_rate": 2.032690407508949e-07, "loss": 0.1529, "reward": 0.4902263447875157, "reward_std": 0.5446355119347572, "rewards/cosine_scaled_reward": -0.11947017908096313, "rewards/format_reward": 0.729166679084301, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 1302.8542175292969, "epoch": 0.4594285714285714, "grad_norm": 9.144934630730456, "kl": 0.51953125, "learning_rate": 2.0127498008311922e-07, "loss": 0.1489, "reward": 0.0001004636287689209, "reward_std": 0.5631029531359673, "rewards/cosine_scaled_reward": -0.28119976818561554, "rewards/format_reward": 0.5625000223517418, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1430.5833740234375, "epoch": 0.4605714285714286, "grad_norm": 1.9477748820622875, "kl": 0.3515625, "learning_rate": 1.9929791578083655e-07, "loss": 0.2408, "reward": -0.06413780152797699, "reward_std": 0.7934899777173996, "rewards/cosine_scaled_reward": -0.2195689007639885, "rewards/format_reward": 0.3750000037252903, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1262.8333587646484, "epoch": 0.4617142857142857, "grad_norm": 3.330875199108497, "kl": 0.19140625, "learning_rate": 1.9733794420337213e-07, "loss": 0.1344, "reward": 0.1329102972522378, "reward_std": 0.5511343032121658, "rewards/cosine_scaled_reward": -0.25646152906119823, "rewards/format_reward": 0.645833358168602, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 1030.2916870117188, "epoch": 0.46285714285714286, "grad_norm": 7.2006501333137996, "kl": 0.147216796875, "learning_rate": 1.9539516087697517e-07, "loss": 0.1614, "reward": 0.41257511638104916, "reward_std": 0.4603617787361145, "rewards/cosine_scaled_reward": -0.14787913113832474, "rewards/format_reward": 0.7083333432674408, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1085.1666870117188, "epoch": 0.464, "grad_norm": 1.7990135612572722, "kl": 0.25146484375, "learning_rate": 1.934696604901642e-07, "loss": 0.1199, "reward": -0.0262349434196949, "reward_std": 0.4924147129058838, "rewards/cosine_scaled_reward": -0.2839508093893528, "rewards/format_reward": 0.5416666828095913, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 953.4375, "epoch": 0.46514285714285714, "grad_norm": 2.205384781012483, "kl": 0.16357421875, "learning_rate": 1.915615368891117e-07, "loss": 0.0901, "reward": 0.5169772207736969, "reward_std": 0.28926569409668446, "rewards/cosine_scaled_reward": -0.0748447310179472, "rewards/format_reward": 0.6666666828095913, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 914.9583587646484, "epoch": 0.4662857142857143, "grad_norm": 2.497956913876472, "kl": 0.27496337890625, "learning_rate": 1.8967088307307e-07, "loss": 0.1155, "reward": 0.3262156348209828, "reward_std": 0.6255160942673683, "rewards/cosine_scaled_reward": -0.13897553086280823, "rewards/format_reward": 0.6041666716337204, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 1040.8333435058594, "epoch": 0.4674285714285714, "grad_norm": 8.69885706641019, "kl": 0.2950439453125, "learning_rate": 1.8779779118983867e-07, "loss": 0.1446, "reward": 0.45548180863261223, "reward_std": 0.683892697095871, "rewards/cosine_scaled_reward": -0.1472591133788228, "rewards/format_reward": 0.7500000149011612, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 1113.8750305175781, "epoch": 0.4685714285714286, "grad_norm": 2.827095364325863, "kl": 0.17364501953125, "learning_rate": 1.8594235253127372e-07, "loss": 0.1365, "reward": -0.055647075176239014, "reward_std": 0.5701718181371689, "rewards/cosine_scaled_reward": -0.3299068883061409, "rewards/format_reward": 0.6041666939854622, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1180.7500305175781, "epoch": 0.4697142857142857, "grad_norm": 6.312691045251246, "kl": 0.2171630859375, "learning_rate": 1.8410465752883758e-07, "loss": 0.26, "reward": -0.027378916274756193, "reward_std": 0.5135050415992737, "rewards/cosine_scaled_reward": -0.33660613000392914, "rewards/format_reward": 0.6458333432674408, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 936.0625152587891, "epoch": 0.47085714285714286, "grad_norm": 8.466457070247934, "kl": 0.207763671875, "learning_rate": 1.822847957491922e-07, "loss": 0.2152, "reward": 0.2903781367931515, "reward_std": 0.6151079386472702, "rewards/cosine_scaled_reward": -0.24022759683430195, "rewards/format_reward": 0.770833358168602, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1127.9375610351562, "epoch": 0.472, "grad_norm": 7.005816452720984, "kl": 0.23388671875, "learning_rate": 1.804828558898332e-07, "loss": 0.2359, "reward": -0.05256163072772324, "reward_std": 0.5086416229605675, "rewards/cosine_scaled_reward": -0.30753082782030106, "rewards/format_reward": 0.5625000223517418, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 1254.2292175292969, "epoch": 0.47314285714285714, "grad_norm": 3.1930529627345146, "kl": 0.30908203125, "learning_rate": 1.7869892577476722e-07, "loss": 0.091, "reward": 0.27630291134119034, "reward_std": 0.601336345076561, "rewards/cosine_scaled_reward": -0.12226520664989948, "rewards/format_reward": 0.520833358168602, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 1198.7083740234375, "epoch": 0.4742857142857143, "grad_norm": 1.9203274121236615, "kl": 0.27783203125, "learning_rate": 1.7693309235023127e-07, "loss": 0.1839, "reward": 0.15045135095715523, "reward_std": 0.8359555453062057, "rewards/cosine_scaled_reward": -0.21644099615514278, "rewards/format_reward": 0.5833333507180214, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1303.2708435058594, "epoch": 0.4754285714285714, "grad_norm": 5.219130595783076, "kl": 0.288330078125, "learning_rate": 1.7518544168045524e-07, "loss": 0.2384, "reward": 0.06198018416762352, "reward_std": 0.7209452688694, "rewards/cosine_scaled_reward": -0.2502599246799946, "rewards/format_reward": 0.5625000149011612, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1104.3958740234375, "epoch": 0.4765714285714286, "grad_norm": 343.4311543801194, "kl": 3.455078125, "learning_rate": 1.7345605894346726e-07, "loss": 0.3667, "reward": 0.25671100057661533, "reward_std": 0.5841851308941841, "rewards/cosine_scaled_reward": -0.19456118065863848, "rewards/format_reward": 0.645833358168602, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 1071.9375305175781, "epoch": 0.4777142857142857, "grad_norm": 3.5739561302927703, "kl": 0.18438720703125, "learning_rate": 1.7174502842694212e-07, "loss": 0.0318, "reward": 0.18263494968414307, "reward_std": 0.688008576631546, "rewards/cosine_scaled_reward": -0.25243253633379936, "rewards/format_reward": 0.6875000149011612, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1059.2500457763672, "epoch": 0.47885714285714287, "grad_norm": 42.82614000306872, "kl": 14.88720703125, "learning_rate": 1.7005243352409333e-07, "loss": 0.182, "reward": 0.10820261249318719, "reward_std": 0.658612459897995, "rewards/cosine_scaled_reward": -0.24798204004764557, "rewards/format_reward": 0.6041666716337204, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1066.7917022705078, "epoch": 0.48, "grad_norm": 7.563689623131912, "kl": 0.54296875, "learning_rate": 1.6837835672960831e-07, "loss": 0.1366, "reward": 0.24830662203021348, "reward_std": 0.6641267538070679, "rewards/cosine_scaled_reward": -0.19876337423920631, "rewards/format_reward": 0.6458333432674408, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 1140.8958740234375, "epoch": 0.48114285714285715, "grad_norm": 5.102712434876203, "kl": 0.455322265625, "learning_rate": 1.6672287963562852e-07, "loss": 0.238, "reward": 0.22175164567306638, "reward_std": 0.48806294053792953, "rewards/cosine_scaled_reward": -0.19120752811431885, "rewards/format_reward": 0.6041666865348816, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1181.3333587646484, "epoch": 0.48228571428571426, "grad_norm": 11.187728016893017, "kl": 0.7470703125, "learning_rate": 1.6508608292777203e-07, "loss": 0.2428, "reward": 0.016264647245407104, "reward_std": 0.7520715892314911, "rewards/cosine_scaled_reward": -0.27311767637729645, "rewards/format_reward": 0.5625000149011612, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1151.5625305175781, "epoch": 0.48342857142857143, "grad_norm": 36.484656907353894, "kl": 1.12109375, "learning_rate": 1.6346804638120098e-07, "loss": 0.225, "reward": 0.166658578440547, "reward_std": 0.5137820392847061, "rewards/cosine_scaled_reward": -0.20833738893270493, "rewards/format_reward": 0.5833333432674408, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 1222.4167022705078, "epoch": 0.4845714285714286, "grad_norm": 5.314021913144739, "kl": 0.468994140625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0791, "reward": -0.053052062867209315, "reward_std": 0.5032695159316063, "rewards/cosine_scaled_reward": -0.349442720413208, "rewards/format_reward": 0.6458333507180214, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1303.0417022705078, "epoch": 0.4857142857142857, "grad_norm": 15.439357915372184, "kl": 0.76171875, "learning_rate": 1.6028856829700258e-07, "loss": 0.1567, "reward": 0.06288054899778217, "reward_std": 0.8221424967050552, "rewards/cosine_scaled_reward": -0.24980972707271576, "rewards/format_reward": 0.5625000111758709, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1363.2292175292969, "epoch": 0.4868571428571429, "grad_norm": 16.190560753791, "kl": 0.64306640625, "learning_rate": 1.5872728172265146e-07, "loss": 0.2057, "reward": 0.0070614293217659, "reward_std": 0.8801029026508331, "rewards/cosine_scaled_reward": -0.18396929651498795, "rewards/format_reward": 0.3750000111758709, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1062.6250457763672, "epoch": 0.488, "grad_norm": 5.208018104302035, "kl": 0.289306640625, "learning_rate": 1.5718506522858572e-07, "loss": 0.2392, "reward": 0.1040960568934679, "reward_std": 0.7021225243806839, "rewards/cosine_scaled_reward": -0.21878531202673912, "rewards/format_reward": 0.5416666865348816, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1168.6250457763672, "epoch": 0.48914285714285716, "grad_norm": 1.7936384513629215, "kl": 0.194366455078125, "learning_rate": 1.5566199398026147e-07, "loss": 0.1231, "reward": 0.1094297245144844, "reward_std": 0.5426923930644989, "rewards/cosine_scaled_reward": -0.247368473559618, "rewards/format_reward": 0.6041666716337204, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1089.6875305175781, "epoch": 0.49028571428571427, "grad_norm": 3.242866515089598, "kl": 0.18408203125, "learning_rate": 1.5415814221002265e-07, "loss": 0.1081, "reward": 0.4839252680540085, "reward_std": 0.5947171896696091, "rewards/cosine_scaled_reward": -0.03928736597299576, "rewards/format_reward": 0.5625000260770321, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1313.5833587646484, "epoch": 0.49142857142857144, "grad_norm": 1.478054069262014, "kl": 0.21612548828125, "learning_rate": 1.5267358321348285e-07, "loss": 0.1273, "reward": 0.15572084113955498, "reward_std": 0.5618212074041367, "rewards/cosine_scaled_reward": -0.18255625164601952, "rewards/format_reward": 0.5208333358168602, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 975.4791870117188, "epoch": 0.49257142857142855, "grad_norm": 3.541465585724065, "kl": 0.1605224609375, "learning_rate": 1.5120838934595337e-07, "loss": 0.1148, "reward": 0.420807933434844, "reward_std": 0.890654593706131, "rewards/cosine_scaled_reward": -0.11251270584762096, "rewards/format_reward": 0.6458333507180214, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1181.062515258789, "epoch": 0.4937142857142857, "grad_norm": 3.350973639300781, "kl": 0.155517578125, "learning_rate": 1.4976263201891613e-07, "loss": 0.1027, "reward": 0.032605723943561316, "reward_std": 0.5731803774833679, "rewards/cosine_scaled_reward": -0.2753637991845608, "rewards/format_reward": 0.5833333507180214, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1207.0417175292969, "epoch": 0.4948571428571429, "grad_norm": 4.990349151202906, "kl": 0.185546875, "learning_rate": 1.483363816965435e-07, "loss": 0.1393, "reward": 0.08886189805343747, "reward_std": 0.4594448246061802, "rewards/cosine_scaled_reward": -0.23681906727142632, "rewards/format_reward": 0.5625000298023224, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 859.1250457763672, "epoch": 0.496, "grad_norm": 1.9877951359345267, "kl": 0.17950439453125, "learning_rate": 1.469297078922642e-07, "loss": 0.0512, "reward": 1.2721150815486908, "reward_std": 0.6770742386579514, "rewards/cosine_scaled_reward": 0.20897419564425945, "rewards/format_reward": 0.8541666716337204, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1266.375015258789, "epoch": 0.49714285714285716, "grad_norm": 1.8972601097369153, "kl": 0.2255859375, "learning_rate": 1.4554267916537495e-07, "loss": 0.1234, "reward": 0.10697830189019442, "reward_std": 0.531020175665617, "rewards/cosine_scaled_reward": -0.22776086255908012, "rewards/format_reward": 0.5625000149011612, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1315.5625305175781, "epoch": 0.4982857142857143, "grad_norm": 2.490164316553904, "kl": 0.2635498046875, "learning_rate": 1.4417536311769885e-07, "loss": 0.1196, "reward": -0.10972822457551956, "reward_std": 0.5596715956926346, "rewards/cosine_scaled_reward": -0.2840307876467705, "rewards/format_reward": 0.4583333395421505, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1060.5208587646484, "epoch": 0.49942857142857144, "grad_norm": 1.9387158266765225, "kl": 0.294189453125, "learning_rate": 1.4282782639029128e-07, "loss": 0.0174, "reward": 0.52107123285532, "reward_std": 0.5726887807250023, "rewards/cosine_scaled_reward": -0.05196441989392042, "rewards/format_reward": 0.6250000298023224, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 921.3125457763672, "epoch": 0.5005714285714286, "grad_norm": 8.654811309244227, "kl": 0.2535400390625, "learning_rate": 1.4150013466019114e-07, "loss": 0.1354, "reward": 0.20009983237832785, "reward_std": 0.6868909299373627, "rewards/cosine_scaled_reward": -0.20203341665910557, "rewards/format_reward": 0.6041666865348816, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1083.0833740234375, "epoch": 0.5017142857142857, "grad_norm": 5.3889889905872375, "kl": 0.40216064453125, "learning_rate": 1.4019235263722034e-07, "loss": 0.2461, "reward": 0.11843711510300636, "reward_std": 0.5985070914030075, "rewards/cosine_scaled_reward": -0.2636981066316366, "rewards/format_reward": 0.6458333432674408, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1088.8750305175781, "epoch": 0.5028571428571429, "grad_norm": 4.149099334589977, "kl": 0.30615234375, "learning_rate": 1.3890454406082956e-07, "loss": 0.033, "reward": 0.11394692957401276, "reward_std": 0.6579174622893333, "rewards/cosine_scaled_reward": -0.24510987009853125, "rewards/format_reward": 0.6041666865348816, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 984.3125610351562, "epoch": 0.504, "grad_norm": 16.782102815445953, "kl": 0.367919921875, "learning_rate": 1.3763677169699217e-07, "loss": 0.0977, "reward": 0.3985663428902626, "reward_std": 0.42315196245908737, "rewards/cosine_scaled_reward": -0.134050190448761, "rewards/format_reward": 0.6666666828095913, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 988.4583587646484, "epoch": 0.5051428571428571, "grad_norm": 4.106390753028162, "kl": 0.31378173828125, "learning_rate": 1.3638909733514452e-07, "loss": 0.0056, "reward": 0.13055693171918392, "reward_std": 0.48535653203725815, "rewards/cosine_scaled_reward": -0.27847154438495636, "rewards/format_reward": 0.6875000223517418, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1004.3750152587891, "epoch": 0.5062857142857143, "grad_norm": 8.08171445493757, "kl": 0.16259765625, "learning_rate": 1.351615817851748e-07, "loss": 0.2301, "reward": 0.30977149307727814, "reward_std": 0.6895428746938705, "rewards/cosine_scaled_reward": -0.18886426091194153, "rewards/format_reward": 0.6875000298023224, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1087.9375305175781, "epoch": 0.5074285714285715, "grad_norm": 3.849891000062917, "kl": 0.1669921875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0975, "reward": 0.4580417312681675, "reward_std": 0.640699241310358, "rewards/cosine_scaled_reward": -0.14597914181649685, "rewards/format_reward": 0.7500000298023224, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1224.2500457763672, "epoch": 0.5085714285714286, "grad_norm": 23.59411548899569, "kl": 0.82958984375, "learning_rate": 1.3276726544494571e-07, "loss": 0.2131, "reward": 0.051861570216715336, "reward_std": 0.6278680041432381, "rewards/cosine_scaled_reward": -0.2553192190825939, "rewards/format_reward": 0.5625, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1067.9791870117188, "epoch": 0.5097142857142857, "grad_norm": 3.656936199785778, "kl": 0.24365234375, "learning_rate": 1.316005813502869e-07, "loss": 0.0234, "reward": 0.36385649256408215, "reward_std": 0.7834623008966446, "rewards/cosine_scaled_reward": -0.1722384188324213, "rewards/format_reward": 0.7083333432674408, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 1174.7083740234375, "epoch": 0.5108571428571429, "grad_norm": 2.1759216078948036, "kl": 0.3828125, "learning_rate": 1.3045428945301953e-07, "loss": 0.2194, "reward": 0.23982627410441637, "reward_std": 0.5332969650626183, "rewards/cosine_scaled_reward": -0.21342020854353905, "rewards/format_reward": 0.6666666865348816, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1180.2083740234375, "epoch": 0.512, "grad_norm": 4.2366039265569135, "kl": 0.31494140625, "learning_rate": 1.2932844562179352e-07, "loss": 0.1963, "reward": 0.3762773834168911, "reward_std": 0.6801744475960732, "rewards/cosine_scaled_reward": -0.14519466273486614, "rewards/format_reward": 0.666666679084301, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 994.3958740234375, "epoch": 0.5131428571428571, "grad_norm": 4.146583173336839, "kl": 0.148681640625, "learning_rate": 1.2822310472864885e-07, "loss": 0.1922, "reward": 0.36078188568353653, "reward_std": 0.737194113433361, "rewards/cosine_scaled_reward": -0.15294241392984986, "rewards/format_reward": 0.6666667014360428, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 851.8958587646484, "epoch": 0.5142857142857142, "grad_norm": 51.8228987068238, "kl": 0.534942626953125, "learning_rate": 1.2713832064634125e-07, "loss": 0.1269, "reward": 0.5865043960511684, "reward_std": 0.4706997238099575, "rewards/cosine_scaled_reward": -0.10258114710450172, "rewards/format_reward": 0.7916666865348816, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1079.7916870117188, "epoch": 0.5154285714285715, "grad_norm": 6.392599999015184, "kl": 0.2735595703125, "learning_rate": 1.260741462457165e-07, "loss": 0.2626, "reward": 0.22280075028538704, "reward_std": 0.6088056340813637, "rewards/cosine_scaled_reward": -0.18026629835367203, "rewards/format_reward": 0.5833333395421505, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1046.8542022705078, "epoch": 0.5165714285714286, "grad_norm": 8.715599338320725, "kl": 0.152099609375, "learning_rate": 1.2503063339313356e-07, "loss": 0.2189, "reward": -0.0023173224180936813, "reward_std": 0.5100973732769489, "rewards/cosine_scaled_reward": -0.3136586770415306, "rewards/format_reward": 0.6250000298023224, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1271.9375305175781, "epoch": 0.5177142857142857, "grad_norm": 2.4908553038859917, "kl": 0.40869140625, "learning_rate": 1.2400783294793668e-07, "loss": 0.1805, "reward": 0.027155719697475433, "reward_std": 0.5863115191459656, "rewards/cosine_scaled_reward": -0.23642215505242348, "rewards/format_reward": 0.5000000149011612, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1104.6667175292969, "epoch": 0.5188571428571429, "grad_norm": 42.815024473876115, "kl": 2.04296875, "learning_rate": 1.2300579475997657e-07, "loss": 0.039, "reward": 0.2878073714673519, "reward_std": 0.6589629650115967, "rewards/cosine_scaled_reward": -0.13734631799161434, "rewards/format_reward": 0.5625000223517418, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1183.250015258789, "epoch": 0.52, "grad_norm": 2.6108705721314824, "kl": 0.306640625, "learning_rate": 1.220245676671809e-07, "loss": 0.1199, "reward": 0.3790533752180636, "reward_std": 0.4862861856818199, "rewards/cosine_scaled_reward": -0.13338997215032578, "rewards/format_reward": 0.645833358168602, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1168.4375305175781, "epoch": 0.5211428571428571, "grad_norm": 25.701940158931713, "kl": 0.477294921875, "learning_rate": 1.2106419949317388e-07, "loss": 0.1681, "reward": 0.2994745699688792, "reward_std": 0.7066301554441452, "rewards/cosine_scaled_reward": -0.15234605269506574, "rewards/format_reward": 0.6041666865348816, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1007.4167022705078, "epoch": 0.5222857142857142, "grad_norm": 8.570796851314613, "kl": 0.186279296875, "learning_rate": 1.2012473704494537e-07, "loss": 0.3132, "reward": 0.400404367595911, "reward_std": 0.5747000873088837, "rewards/cosine_scaled_reward": -0.15396450087428093, "rewards/format_reward": 0.708333358168602, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1272.8958435058594, "epoch": 0.5234285714285715, "grad_norm": 4.345232068890798, "kl": 0.48974609375, "learning_rate": 1.1920622611056974e-07, "loss": 0.2715, "reward": 0.2525772713124752, "reward_std": 0.8047986179590225, "rewards/cosine_scaled_reward": -0.12371136248111725, "rewards/format_reward": 0.5000000298023224, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1063.625015258789, "epoch": 0.5245714285714286, "grad_norm": 5.1821320020363615, "kl": 0.15771484375, "learning_rate": 1.1830871145697412e-07, "loss": 0.1275, "reward": 0.02093285135924816, "reward_std": 0.42146630585193634, "rewards/cosine_scaled_reward": -0.3124502506107092, "rewards/format_reward": 0.645833358168602, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1044.2708435058594, "epoch": 0.5257142857142857, "grad_norm": 3.3010395921201514, "kl": 0.267333984375, "learning_rate": 1.1743223682775649e-07, "loss": 0.1046, "reward": 0.2678923445455439, "reward_std": 0.896328404545784, "rewards/cosine_scaled_reward": -0.12647049874067307, "rewards/format_reward": 0.5208333656191826, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1053.4583587646484, "epoch": 0.5268571428571428, "grad_norm": 1.478075619341315, "kl": 0.21600341796875, "learning_rate": 1.1657684494105386e-07, "loss": 0.0555, "reward": 0.3391416594386101, "reward_std": 0.9088789522647858, "rewards/cosine_scaled_reward": -0.20542917400598526, "rewards/format_reward": 0.7500000223517418, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1103.2083740234375, "epoch": 0.528, "grad_norm": 9530.342121672113, "kl": 28.46978759765625, "learning_rate": 1.1574257748745986e-07, "loss": 1.3293, "reward": 0.14297988126054406, "reward_std": 0.5064843520522118, "rewards/cosine_scaled_reward": -0.25142673472873867, "rewards/format_reward": 0.645833358168602, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1043.3333587646484, "epoch": 0.5291428571428571, "grad_norm": 4.739243744092973, "kl": 0.39892578125, "learning_rate": 1.1492947512799328e-07, "loss": 0.2493, "reward": 0.6755956448614597, "reward_std": 0.4871959462761879, "rewards/cosine_scaled_reward": 0.025297801941633224, "rewards/format_reward": 0.6250000298023224, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 992.2916717529297, "epoch": 0.5302857142857142, "grad_norm": 122.21833055898026, "kl": 1.33843994140625, "learning_rate": 1.1413757749211602e-07, "loss": 0.2572, "reward": 0.29958341596648097, "reward_std": 0.8296171501278877, "rewards/cosine_scaled_reward": -0.20437496528029442, "rewards/format_reward": 0.7083333432674408, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1240.5417175292969, "epoch": 0.5314285714285715, "grad_norm": 6.845067293294205, "kl": 0.55908203125, "learning_rate": 1.1336692317580158e-07, "loss": 0.1986, "reward": 0.072305912617594, "reward_std": 0.4831778481602669, "rewards/cosine_scaled_reward": -0.2138470560312271, "rewards/format_reward": 0.5000000186264515, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1192.9167175292969, "epoch": 0.5325714285714286, "grad_norm": 9.93163371597492, "kl": 0.49163818359375, "learning_rate": 1.1261754973965422e-07, "loss": 0.0928, "reward": 0.04001780319958925, "reward_std": 0.44342009350657463, "rewards/cosine_scaled_reward": -0.28207441698759794, "rewards/format_reward": 0.6041666865348816, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1341.1667175292969, "epoch": 0.5337142857142857, "grad_norm": 19.835786495839272, "kl": 0.8251953125, "learning_rate": 1.1188949370707787e-07, "loss": 0.2635, "reward": 0.1306269969791174, "reward_std": 0.6591696962714195, "rewards/cosine_scaled_reward": -0.21593650616705418, "rewards/format_reward": 0.5625000298023224, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1138.7708740234375, "epoch": 0.5348571428571428, "grad_norm": 13.935934940776233, "kl": 0.576904296875, "learning_rate": 1.1118279056249653e-07, "loss": 0.0976, "reward": 0.37931894324719906, "reward_std": 0.5462356135249138, "rewards/cosine_scaled_reward": -0.10200719349086285, "rewards/format_reward": 0.5833333432674408, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 1028.6041870117188, "epoch": 0.536, "grad_norm": 2.6967193006473216, "kl": 0.26171875, "learning_rate": 1.1049747474962444e-07, "loss": 0.0529, "reward": 0.40807172656059265, "reward_std": 0.6494475156068802, "rewards/cosine_scaled_reward": -0.13971414044499397, "rewards/format_reward": 0.6875000149011612, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 848.5833587646484, "epoch": 0.5371428571428571, "grad_norm": 1.7855628531087904, "kl": 0.1328125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0217, "reward": 0.6918718162924051, "reward_std": 0.5211210399866104, "rewards/cosine_scaled_reward": -0.0811474658548832, "rewards/format_reward": 0.8541666865348816, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1070.7500457763672, "epoch": 0.5382857142857143, "grad_norm": 6.038242137859596, "kl": 0.169921875, "learning_rate": 1.0919113768029517e-07, "loss": 0.2149, "reward": 0.06172482669353485, "reward_std": 0.5211478099226952, "rewards/cosine_scaled_reward": -0.29205426201224327, "rewards/format_reward": 0.6458333432674408, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 842.7083435058594, "epoch": 0.5394285714285715, "grad_norm": 12.109988243714355, "kl": 0.2894287109375, "learning_rate": 1.0857018009286381e-07, "loss": -0.0496, "reward": 0.46792223304510117, "reward_std": 0.54752978682518, "rewards/cosine_scaled_reward": -0.17228887975215912, "rewards/format_reward": 0.8125000149011612, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 1181.1875305175781, "epoch": 0.5405714285714286, "grad_norm": 6.409193674543087, "kl": 0.3604736328125, "learning_rate": 1.0797073717209013e-07, "loss": 0.0613, "reward": -0.049041745252907276, "reward_std": 0.5112807080149651, "rewards/cosine_scaled_reward": -0.2641042061150074, "rewards/format_reward": 0.4791666865348816, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 990.3750305175781, "epoch": 0.5417142857142857, "grad_norm": 9.264374683069418, "kl": 0.11328125, "learning_rate": 1.0739283813397639e-07, "loss": 0.1628, "reward": 0.32282854616642, "reward_std": 0.7814144194126129, "rewards/cosine_scaled_reward": -0.20316907577216625, "rewards/format_reward": 0.7291667014360428, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 962.4792022705078, "epoch": 0.5428571428571428, "grad_norm": 4.242696196218054, "kl": 0.12603759765625, "learning_rate": 1.068365111445064e-07, "loss": 0.1483, "reward": 0.08424473810009658, "reward_std": 0.48827143758535385, "rewards/cosine_scaled_reward": -0.28079431876540184, "rewards/format_reward": 0.6458333544433117, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1048.8958587646484, "epoch": 0.544, "grad_norm": 18.108937894089827, "kl": 0.4520263671875, "learning_rate": 1.063017833182728e-07, "loss": 0.1426, "reward": 0.3813807927072048, "reward_std": 0.6394810080528259, "rewards/cosine_scaled_reward": -0.05930961295962334, "rewards/format_reward": 0.5000000223517418, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 723.4791870117188, "epoch": 0.5451428571428572, "grad_norm": 5.68071346914076, "kl": 0.156005859375, "learning_rate": 1.0578868071715544e-07, "loss": 0.0836, "reward": 0.7284884303808212, "reward_std": 0.6032212525606155, "rewards/cosine_scaled_reward": -0.08367248624563217, "rewards/format_reward": 0.8958333432674408, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1013.3750305175781, "epoch": 0.5462857142857143, "grad_norm": 4.283725447191352, "kl": 0.0955657958984375, "learning_rate": 1.0529722834905125e-07, "loss": 0.1397, "reward": 0.49316432885825634, "reward_std": 0.45135799795389175, "rewards/cosine_scaled_reward": -0.12841782718896866, "rewards/format_reward": 0.7500000149011612, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 998.3125, "epoch": 0.5474285714285714, "grad_norm": 1.2914537281090146, "kl": 0.15277099609375, "learning_rate": 1.0482745016665526e-07, "loss": 0.0674, "reward": 0.31127920374274254, "reward_std": 0.6323697119951248, "rewards/cosine_scaled_reward": -0.16727706603705883, "rewards/format_reward": 0.6458333432674408, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 984.8750152587891, "epoch": 0.5485714285714286, "grad_norm": 1.5465556570908303, "kl": 0.077606201171875, "learning_rate": 1.0437936906629334e-07, "loss": 0.0463, "reward": 0.5994082670658827, "reward_std": 0.37920307368040085, "rewards/cosine_scaled_reward": -0.1169625474140048, "rewards/format_reward": 0.8333333432674408, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1145.3541870117188, "epoch": 0.5497142857142857, "grad_norm": 2.3612933288431535, "kl": 0.150390625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0852, "reward": 0.18339010886847973, "reward_std": 0.6312093585729599, "rewards/cosine_scaled_reward": -0.1999716181308031, "rewards/format_reward": 0.5833333432674408, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1099.2291870117188, "epoch": 0.5508571428571428, "grad_norm": 3.3478046762143303, "kl": 0.1209716796875, "learning_rate": 1.0354838440848501e-07, "loss": 0.1351, "reward": 0.4303822033107281, "reward_std": 0.5441673323512077, "rewards/cosine_scaled_reward": -0.10772557370364666, "rewards/format_reward": 0.6458333488553762, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 972.8958587646484, "epoch": 0.552, "grad_norm": 2.070698520552659, "kl": 0.1766357421875, "learning_rate": 1.0316552135205837e-07, "loss": 0.1898, "reward": 0.3021550700068474, "reward_std": 0.6595650911331177, "rewards/cosine_scaled_reward": -0.244755819439888, "rewards/format_reward": 0.7916666865348816, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 966.4375, "epoch": 0.5531428571428572, "grad_norm": 7.4718244138049785, "kl": 0.11669921875, "learning_rate": 1.0280443637773163e-07, "loss": 0.1535, "reward": 0.5982861579395831, "reward_std": 0.72054024040699, "rewards/cosine_scaled_reward": -0.034190285950899124, "rewards/format_reward": 0.6666666716337204, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 850.2292022705078, "epoch": 0.5542857142857143, "grad_norm": 2.6747580946696172, "kl": 0.171142578125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0845, "reward": 0.440962532768026, "reward_std": 0.4621984176337719, "rewards/cosine_scaled_reward": -0.1545187532901764, "rewards/format_reward": 0.7500000298023224, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1098.4791870117188, "epoch": 0.5554285714285714, "grad_norm": 5.009044167350138, "kl": 0.1492919921875, "learning_rate": 1.0214767000817596e-07, "loss": 0.1657, "reward": 0.31744778295978904, "reward_std": 0.8680954575538635, "rewards/cosine_scaled_reward": -0.15377611527219415, "rewards/format_reward": 0.6250000149011612, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1250.9167022705078, "epoch": 0.5565714285714286, "grad_norm": 2.168704280565103, "kl": 0.3330078125, "learning_rate": 1.0185202062281336e-07, "loss": 0.1075, "reward": 0.09373046457767487, "reward_std": 0.7844668254256248, "rewards/cosine_scaled_reward": -0.18230143561959267, "rewards/format_reward": 0.4583333432674408, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 1182.0417175292969, "epoch": 0.5577142857142857, "grad_norm": 68.49302633471272, "kl": 1.04931640625, "learning_rate": 1.0157821333772304e-07, "loss": 0.2607, "reward": 0.0013678865507245064, "reward_std": 0.5483251512050629, "rewards/cosine_scaled_reward": -0.2805660478770733, "rewards/format_reward": 0.5625000149011612, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 898.8958740234375, "epoch": 0.5588571428571428, "grad_norm": 3.3267908174810983, "kl": 0.22412109375, "learning_rate": 1.013262614978859e-07, "loss": 0.1155, "reward": 0.9380166502669454, "reward_std": 0.38279012218117714, "rewards/cosine_scaled_reward": 0.10442498326301575, "rewards/format_reward": 0.7291666716337204, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1022.5417175292969, "epoch": 0.56, "grad_norm": 1.2964210055945644, "kl": 0.142425537109375, "learning_rate": 1.0109617738307911e-07, "loss": 0.1375, "reward": 0.1352614858187735, "reward_std": 0.5779989808797836, "rewards/cosine_scaled_reward": -0.29695259779691696, "rewards/format_reward": 0.7291666716337204, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1071.2708740234375, "epoch": 0.5611428571428572, "grad_norm": 5.198263303721915, "kl": 0.270751953125, "learning_rate": 1.0088797220727779e-07, "loss": 0.1398, "reward": 0.360213914886117, "reward_std": 0.5864584296941757, "rewards/cosine_scaled_reward": -0.12197639048099518, "rewards/format_reward": 0.6041666716337204, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 1146.0208740234375, "epoch": 0.5622857142857143, "grad_norm": 25.32884427185481, "kl": 0.860107421875, "learning_rate": 1.0070165611810855e-07, "loss": 0.279, "reward": 0.3603329248726368, "reward_std": 0.4203804060816765, "rewards/cosine_scaled_reward": -0.11150021478533745, "rewards/format_reward": 0.583333358168602, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1079.6250457763672, "epoch": 0.5634285714285714, "grad_norm": 5.39013483275012, "kl": 0.4027099609375, "learning_rate": 1.005372381963547e-07, "loss": 0.2018, "reward": 0.24866360798478127, "reward_std": 0.6557547599077225, "rewards/cosine_scaled_reward": -0.21941821463406086, "rewards/format_reward": 0.6875000298023224, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1020.0417022705078, "epoch": 0.5645714285714286, "grad_norm": 39.118419014119006, "kl": 1.0677490234375, "learning_rate": 1.0039472645551372e-07, "loss": 0.2737, "reward": 0.027191074565052986, "reward_std": 0.4351058676838875, "rewards/cosine_scaled_reward": -0.3301544785499573, "rewards/format_reward": 0.6875000149011612, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1194.8958740234375, "epoch": 0.5657142857142857, "grad_norm": 5.938670898030579, "kl": 0.630859375, "learning_rate": 1.002741278414069e-07, "loss": 0.2055, "reward": 0.374758190009743, "reward_std": 0.6815578863024712, "rewards/cosine_scaled_reward": -0.09387091733515263, "rewards/format_reward": 0.5625000298023224, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1120.3542022705078, "epoch": 0.5668571428571428, "grad_norm": 16.235625518016562, "kl": 0.50927734375, "learning_rate": 1.0017544823184055e-07, "loss": 0.297, "reward": 0.40772235160693526, "reward_std": 0.8966069668531418, "rewards/cosine_scaled_reward": -0.09822217002511024, "rewards/format_reward": 0.6041666865348816, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 827.4791870117188, "epoch": 0.568, "grad_norm": 6.892460021170429, "kl": 6.8536376953125, "learning_rate": 1.0009869243631952e-07, "loss": 0.2026, "reward": 0.8302161321043968, "reward_std": 0.560060553252697, "rewards/cosine_scaled_reward": 0.06094140186905861, "rewards/format_reward": 0.708333358168602, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 1077.1250305175781, "epoch": 0.5691428571428572, "grad_norm": 10.255398655040155, "kl": 0.54931640625, "learning_rate": 1.000438641958131e-07, "loss": 0.2299, "reward": 0.033572545275092125, "reward_std": 0.4632219597697258, "rewards/cosine_scaled_reward": -0.30613040924072266, "rewards/format_reward": 0.645833358168602, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1336.7083435058594, "epoch": 0.5702857142857143, "grad_norm": 27.33692044026225, "kl": 0.934326171875, "learning_rate": 1.0001096618257236e-07, "loss": 0.1642, "reward": -0.13300850987434387, "reward_std": 0.6832303777337074, "rewards/cosine_scaled_reward": -0.28525424748659134, "rewards/format_reward": 0.4375000149011612, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1017.9375305175781, "epoch": 0.5714285714285714, "grad_norm": 2.4162040111238334, "kl": 0.2301025390625, "learning_rate": 1e-07, "loss": 0.1131, "reward": 0.13043908029794693, "reward_std": 0.5788910314440727, "rewards/cosine_scaled_reward": -0.29936380684375763, "rewards/format_reward": 0.729166679084301, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.7532739232839085, "train_runtime": 13678.504, "train_samples_per_second": 1.755, "train_steps_per_second": 0.037 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }