{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5725714285714286, "eval_steps": 500, "global_step": 501, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2422.4583740234375, "epoch": 0.001142857142857143, "grad_norm": 0.2217974215745926, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, "reward": 0.08467595651745796, "reward_std": 0.16326796263456345, "rewards/cosine_scaled_reward": -0.1178167313337326, "rewards/format_reward": 0.5625000055879354, "step": 1 }, { "completion_length": 2853.5209045410156, "epoch": 0.002285714285714286, "grad_norm": 0.20924700796604156, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0, "reward": 0.10309412330389023, "reward_std": 0.17991704028099775, "rewards/cosine_scaled_reward": -0.002566009759902954, "rewards/format_reward": 0.3958333432674408, "step": 2 }, { "completion_length": 3397.4375610351562, "epoch": 0.0034285714285714284, "grad_norm": 0.19675247371196747, "kl": 4.780292510986328e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.0882117748260498, "reward_std": 0.15910092927515507, "rewards/cosine_scaled_reward": -0.24059850473713595, "rewards/format_reward": 0.1458333395421505, "step": 3 }, { "completion_length": 2135.854248046875, "epoch": 0.004571428571428572, "grad_norm": 0.24512815475463867, "kl": 3.281235694885254e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.06895642727613449, "reward_std": 0.18431758135557175, "rewards/cosine_scaled_reward": -0.2114272527396679, "rewards/format_reward": 0.6875, "step": 4 }, { "completion_length": 3394.7916870117188, "epoch": 0.005714285714285714, "grad_norm": 0.16346529126167297, "kl": 4.309415817260742e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.0332807507365942, "reward_std": 0.15036426112055779, "rewards/cosine_scaled_reward": -0.20040559396147728, "rewards/format_reward": 0.27083334513008595, "step": 5 }, { "completion_length": 3109.8125610351562, "epoch": 0.006857142857142857, "grad_norm": 0.1748201549053192, "kl": 5.0067901611328125e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.004397711425554007, "reward_std": 0.17683846689760685, "rewards/cosine_scaled_reward": -0.18609080463647842, "rewards/format_reward": 0.3541666716337204, "step": 6 }, { "completion_length": 2984.1875915527344, "epoch": 0.008, "grad_norm": 0.17349053919315338, "kl": 2.6702880859375e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.10000149812549353, "reward_std": 0.21995110251009464, "rewards/cosine_scaled_reward": -0.08740782551467419, "rewards/format_reward": 0.5625000149011612, "step": 7 }, { "completion_length": 2732.604248046875, "epoch": 0.009142857142857144, "grad_norm": 0.20784002542495728, "kl": 2.619624137878418e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.17328114074189216, "reward_std": 0.17557422909885645, "rewards/cosine_scaled_reward": 0.11275472119450569, "rewards/format_reward": 0.4375000149011612, "step": 8 }, { "completion_length": 3065.7501220703125, "epoch": 0.010285714285714285, "grad_norm": 0.20398905873298645, "kl": 3.987550735473633e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 0.037340812385082245, "reward_std": 0.1735632587224245, "rewards/cosine_scaled_reward": -0.12494563823565841, "rewards/format_reward": 0.3958333432674408, "step": 9 }, { "completion_length": 2657.9375610351562, "epoch": 0.011428571428571429, "grad_norm": 0.19967561960220337, "kl": 2.9385089874267578e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.010853782296180725, "reward_std": 0.15543875843286514, "rewards/cosine_scaled_reward": -0.19134788773953915, "rewards/format_reward": 0.416666679084301, "step": 10 }, { "completion_length": 3354.8750610351562, "epoch": 0.012571428571428572, "grad_norm": 0.1932716816663742, "kl": 3.567337989807129e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.11862106062471867, "reward_std": 0.09151901677250862, "rewards/cosine_scaled_reward": -0.2814338691532612, "rewards/format_reward": 0.10416666977107525, "step": 11 }, { "completion_length": 2655.7500610351562, "epoch": 0.013714285714285714, "grad_norm": 0.17851270735263824, "kl": 2.9772520065307617e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.10605565179139376, "reward_std": 0.22570379823446274, "rewards/cosine_scaled_reward": -0.10964457504451275, "rewards/format_reward": 0.6250000223517418, "step": 12 }, { "completion_length": 3022.9375, "epoch": 0.014857142857142857, "grad_norm": 0.17700907588005066, "kl": 3.466010093688965e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.07587421184871346, "reward_std": 0.16161255538463593, "rewards/cosine_scaled_reward": -0.05128329060971737, "rewards/format_reward": 0.3958333507180214, "step": 13 }, { "completion_length": 2932.8334350585938, "epoch": 0.016, "grad_norm": 0.254742294549942, "kl": 2.9355287551879883e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.025470299180597067, "reward_std": 0.1742431577295065, "rewards/cosine_scaled_reward": -0.1397148072719574, "rewards/format_reward": 0.3750000074505806, "step": 14 }, { "completion_length": 2626.7708740234375, "epoch": 0.017142857142857144, "grad_norm": 0.22797353565692902, "kl": 2.1383166313171387e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.1231200685724616, "reward_std": 0.1542098424397409, "rewards/cosine_scaled_reward": -0.003275785595178604, "rewards/format_reward": 0.479166679084301, "step": 15 }, { "completion_length": 3389.6875610351562, "epoch": 0.018285714285714287, "grad_norm": 0.2034512609243393, "kl": 3.987550735473633e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.1371253952383995, "reward_std": 0.09440502244979143, "rewards/cosine_scaled_reward": -0.30822891741991043, "rewards/format_reward": 0.08333333395421505, "step": 16 }, { "completion_length": 2332.000045776367, "epoch": 0.019428571428571427, "grad_norm": 0.2589235007762909, "kl": 4.1365623474121094e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.11966872774064541, "reward_std": 0.19295101519674063, "rewards/cosine_scaled_reward": -0.06464683637022972, "rewards/format_reward": 0.5833333432674408, "step": 17 }, { "completion_length": 2953.0625610351562, "epoch": 0.02057142857142857, "grad_norm": 0.2123759686946869, "kl": 2.2158026695251465e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 0.014435535296797752, "reward_std": 0.14266836643218994, "rewards/cosine_scaled_reward": -0.14808162674307823, "rewards/format_reward": 0.35416666977107525, "step": 18 }, { "completion_length": 2997.0208740234375, "epoch": 0.021714285714285714, "grad_norm": 0.17227208614349365, "kl": 2.823770046234131e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.10392235964536667, "reward_std": 0.18285458162426949, "rewards/cosine_scaled_reward": 0.03459303081035614, "rewards/format_reward": 0.3333333432674408, "step": 19 }, { "completion_length": 2278.2083740234375, "epoch": 0.022857142857142857, "grad_norm": 0.19066420197486877, "kl": 1.7568469047546387e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.3048325777053833, "reward_std": 0.22199945896863937, "rewards/cosine_scaled_reward": 0.18835287913680077, "rewards/format_reward": 0.7916666716337204, "step": 20 }, { "completion_length": 2712.2500610351562, "epoch": 0.024, "grad_norm": 0.31355181336402893, "kl": 4.044175148010254e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.06256838142871857, "reward_std": 0.09118476416915655, "rewards/cosine_scaled_reward": -0.08784030005335808, "rewards/format_reward": 0.4166666679084301, "step": 21 }, { "completion_length": 1939.479248046875, "epoch": 0.025142857142857144, "grad_norm": 0.2796088457107544, "kl": 2.4668872356414795e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.14422845281660557, "reward_std": 0.16695418395102024, "rewards/cosine_scaled_reward": -0.08684419468045235, "rewards/format_reward": 0.7291666865348816, "step": 22 }, { "completion_length": 2500.5000915527344, "epoch": 0.026285714285714287, "grad_norm": 0.20349635183811188, "kl": 2.5181099772453308e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.1400964381173253, "reward_std": 0.19208323769271374, "rewards/cosine_scaled_reward": 0.02415267378091812, "rewards/format_reward": 0.5000000055879354, "step": 23 }, { "completion_length": 2973.8750610351562, "epoch": 0.027428571428571427, "grad_norm": 0.24578233063220978, "kl": 2.899765968322754e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.12173501029610634, "reward_std": 0.2856454011052847, "rewards/cosine_scaled_reward": 0.019161097705364227, "rewards/format_reward": 0.4375000149011612, "step": 24 }, { "completion_length": 2711.7708740234375, "epoch": 0.02857142857142857, "grad_norm": 0.21293707191944122, "kl": 2.804398536682129e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.14046749907174672, "reward_std": 0.18061792477965355, "rewards/cosine_scaled_reward": 0.04513426125049591, "rewards/format_reward": 0.43750001303851604, "step": 25 }, { "completion_length": 3003.5833740234375, "epoch": 0.029714285714285714, "grad_norm": 0.15608130395412445, "kl": 3.5881996154785156e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": -0.01579144282732159, "reward_std": 0.16360154375433922, "rewards/cosine_scaled_reward": -0.2414436973631382, "rewards/format_reward": 0.416666679084301, "step": 26 }, { "completion_length": 2902.5001220703125, "epoch": 0.030857142857142857, "grad_norm": 0.18425676226615906, "kl": 3.713369369506836e-05, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.08107080624904484, "reward_std": 0.20751382037997246, "rewards/cosine_scaled_reward": -0.0728834755718708, "rewards/format_reward": 0.4583333507180214, "step": 27 }, { "completion_length": 2746.5625610351562, "epoch": 0.032, "grad_norm": 0.1847945749759674, "kl": 2.713128924369812e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.12077801371924579, "reward_std": 0.10570883378386497, "rewards/cosine_scaled_reward": -0.006200019270181656, "rewards/format_reward": 0.4791666716337204, "step": 28 }, { "completion_length": 3335.1041870117188, "epoch": 0.03314285714285714, "grad_norm": 0.25038447976112366, "kl": 3.349781036376953e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.08451610105112195, "reward_std": 0.1400617677718401, "rewards/cosine_scaled_reward": -0.24863700196146965, "rewards/format_reward": 0.1666666716337204, "step": 29 }, { "completion_length": 3021.479248046875, "epoch": 0.03428571428571429, "grad_norm": 0.1678503155708313, "kl": 2.0056962966918945e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.09385669013136066, "reward_std": 0.2677013725042343, "rewards/cosine_scaled_reward": -0.06881354004144669, "rewards/format_reward": 0.5000000204890966, "step": 30 }, { "completion_length": 3090.5000610351562, "epoch": 0.03542857142857143, "grad_norm": 0.18752458691596985, "kl": 4.112720489501953e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": 0.002387374173849821, "reward_std": 0.13765401393175125, "rewards/cosine_scaled_reward": -0.11916127428412437, "rewards/format_reward": 0.25, "step": 31 }, { "completion_length": 3108.416748046875, "epoch": 0.036571428571428574, "grad_norm": 0.16679808497428894, "kl": 4.252791404724121e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.06192967155948281, "reward_std": 0.14284559153020382, "rewards/cosine_scaled_reward": -0.08569362014532089, "rewards/format_reward": 0.41666667722165585, "step": 32 }, { "completion_length": 3207.25, "epoch": 0.037714285714285714, "grad_norm": 0.15977373719215393, "kl": 4.246830940246582e-05, "learning_rate": 6.6e-07, "loss": 0.0, "reward": 0.058800650760531425, "reward_std": 0.24303261190652847, "rewards/cosine_scaled_reward": -0.052079036831855774, "rewards/format_reward": 0.33333334140479565, "step": 33 }, { "completion_length": 2264.1875610351562, "epoch": 0.038857142857142854, "grad_norm": 0.3427360951900482, "kl": 4.741549491882324e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.1383383944630623, "reward_std": 0.16219486575573683, "rewards/cosine_scaled_reward": -0.015951670706272125, "rewards/format_reward": 0.5625, "step": 34 }, { "completion_length": 2973.2709045410156, "epoch": 0.04, "grad_norm": 0.23589861392974854, "kl": 2.699345350265503e-05, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.08740013558417559, "reward_std": 0.199846301227808, "rewards/cosine_scaled_reward": -0.025347106158733368, "rewards/format_reward": 0.37500001676380634, "step": 35 }, { "completion_length": 3439.7708740234375, "epoch": 0.04114285714285714, "grad_norm": 0.17564038932323456, "kl": 3.764033317565918e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.0576315198559314, "reward_std": 0.16324451565742493, "rewards/cosine_scaled_reward": -0.18479974195361137, "rewards/format_reward": 0.14583333767950535, "step": 36 }, { "completion_length": 3292.8125610351562, "epoch": 0.04228571428571429, "grad_norm": 0.17926602065563202, "kl": 3.460794687271118e-05, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.06420499971136451, "reward_std": 0.0939055010676384, "rewards/cosine_scaled_reward": -0.22917584516108036, "rewards/format_reward": 0.2083333432674408, "step": 37 }, { "completion_length": 3287.8333740234375, "epoch": 0.04342857142857143, "grad_norm": 0.16871048510074615, "kl": 2.1064653992652893e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.06460437178611755, "reward_std": 0.12681686133146286, "rewards/cosine_scaled_reward": -0.19554772414267063, "rewards/format_reward": 0.14583333395421505, "step": 38 }, { "completion_length": 2764.0834350585938, "epoch": 0.044571428571428574, "grad_norm": 0.19446837902069092, "kl": 2.3424625396728516e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.14937732741236687, "reward_std": 0.16726944874972105, "rewards/cosine_scaled_reward": 0.029096633195877075, "rewards/format_reward": 0.5208333432674408, "step": 39 }, { "completion_length": 2582.2291870117188, "epoch": 0.045714285714285714, "grad_norm": 0.22617147862911224, "kl": 8.857250213623047e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.0381743386387825, "reward_std": 0.1509313378483057, "rewards/cosine_scaled_reward": -0.17509616166353226, "rewards/format_reward": 0.5000000298023224, "step": 40 }, { "completion_length": 2984.5208740234375, "epoch": 0.046857142857142854, "grad_norm": 0.15349487960338593, "kl": 3.3795833587646484e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": 0.02739573549479246, "reward_std": 0.17404801957309246, "rewards/cosine_scaled_reward": -0.1257568709552288, "rewards/format_reward": 0.35416667349636555, "step": 41 }, { "completion_length": 2834.354248046875, "epoch": 0.048, "grad_norm": 0.2802078127861023, "kl": 4.3451786041259766e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.10002315789461136, "reward_std": 0.06375296134501696, "rewards/cosine_scaled_reward": -0.33834878355264664, "rewards/format_reward": 0.2916666679084301, "step": 42 }, { "completion_length": 3081.2708740234375, "epoch": 0.04914285714285714, "grad_norm": 0.1934126913547516, "kl": 4.869326949119568e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": -0.020910249557346106, "reward_std": 0.1607950385659933, "rewards/cosine_scaled_reward": -0.17546747345477343, "rewards/format_reward": 0.27083333767950535, "step": 43 }, { "completion_length": 2764.1041870117188, "epoch": 0.05028571428571429, "grad_norm": 0.2302480936050415, "kl": 0.0001684427261352539, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.10999520868062973, "reward_std": 0.1614558193832636, "rewards/cosine_scaled_reward": -0.03578119818121195, "rewards/format_reward": 0.5000000167638063, "step": 44 }, { "completion_length": 3439.75, "epoch": 0.05142857142857143, "grad_norm": 0.1567574143409729, "kl": 7.593631744384766e-05, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.0033219801262021065, "reward_std": 0.18966009467840195, "rewards/cosine_scaled_reward": -0.09686505701392889, "rewards/format_reward": 0.2083333395421505, "step": 45 }, { "completion_length": 3180.3125610351562, "epoch": 0.052571428571428575, "grad_norm": 0.1937006413936615, "kl": 8.797645568847656e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.12023838749155402, "reward_std": 0.10050216782838106, "rewards/cosine_scaled_reward": -0.30665014684200287, "rewards/format_reward": 0.14583333395421505, "step": 46 }, { "completion_length": 2772.8750610351562, "epoch": 0.053714285714285714, "grad_norm": 0.26727426052093506, "kl": 3.7726014852523804e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.1865750551223755, "reward_std": 0.2490063440054655, "rewards/cosine_scaled_reward": 0.08368490543216467, "rewards/format_reward": 0.5416666772216558, "step": 47 }, { "completion_length": 2653.0833435058594, "epoch": 0.054857142857142854, "grad_norm": 0.24322590231895447, "kl": 0.00028768181800842285, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.012025836622342467, "reward_std": 0.15032149478793144, "rewards/cosine_scaled_reward": -0.164933942258358, "rewards/format_reward": 0.37500000558793545, "step": 48 }, { "completion_length": 2394.625030517578, "epoch": 0.056, "grad_norm": 0.2265518307685852, "kl": 0.00019857287406921387, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.05718126241117716, "reward_std": 0.20790210086852312, "rewards/cosine_scaled_reward": -0.14911458548158407, "rewards/format_reward": 0.5208333432674408, "step": 49 }, { "completion_length": 2885.375030517578, "epoch": 0.05714285714285714, "grad_norm": 0.16697874665260315, "kl": 0.00014549493789672852, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.13894587382674217, "reward_std": 0.18473267927765846, "rewards/cosine_scaled_reward": 0.08396464586257935, "rewards/format_reward": 0.3750000111758709, "step": 50 }, { "completion_length": 2379.729248046875, "epoch": 0.05828571428571429, "grad_norm": 0.24292853474617004, "kl": 0.0005617141723632812, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.013561536557972431, "reward_std": 0.11890909075737, "rewards/cosine_scaled_reward": -0.22164202854037285, "rewards/format_reward": 0.5, "step": 51 }, { "completion_length": 2913.3750610351562, "epoch": 0.05942857142857143, "grad_norm": 0.20502108335494995, "kl": 0.0010059773921966553, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.11341270804405212, "reward_std": 0.22254380024969578, "rewards/cosine_scaled_reward": 0.017230519093573093, "rewards/format_reward": 0.3958333432674408, "step": 52 }, { "completion_length": 2824.0000610351562, "epoch": 0.060571428571428575, "grad_norm": 0.19779708981513977, "kl": 0.0003744959831237793, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.13543249666690826, "reward_std": 0.2240281980484724, "rewards/cosine_scaled_reward": 0.0030871573835611343, "rewards/format_reward": 0.5208333507180214, "step": 53 }, { "completion_length": 2788.7708740234375, "epoch": 0.061714285714285715, "grad_norm": 0.15000097453594208, "kl": 0.00011551380157470703, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.21685698628425598, "reward_std": 0.19154726713895798, "rewards/cosine_scaled_reward": 0.14863055292516947, "rewards/format_reward": 0.5416666846722364, "step": 54 }, { "completion_length": 2974.1458740234375, "epoch": 0.06285714285714286, "grad_norm": 0.21617470681667328, "kl": 0.0005542635917663574, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.08489698823541403, "reward_std": 0.2278457134962082, "rewards/cosine_scaled_reward": -0.019719617441296577, "rewards/format_reward": 0.3750000037252903, "step": 55 }, { "completion_length": 2817.2291870117188, "epoch": 0.064, "grad_norm": 0.18829624354839325, "kl": 0.0001971721649169922, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.06905208062380552, "reward_std": 0.1425587795674801, "rewards/cosine_scaled_reward": -0.07445717975497246, "rewards/format_reward": 0.41666667349636555, "step": 56 }, { "completion_length": 3364.979248046875, "epoch": 0.06514285714285714, "grad_norm": 0.14393611252307892, "kl": 0.00021660327911376953, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": 0.00830092839896679, "reward_std": 0.16145257651805878, "rewards/cosine_scaled_reward": -0.12042163184378296, "rewards/format_reward": 0.27083334513008595, "step": 57 }, { "completion_length": 2198.6875915527344, "epoch": 0.06628571428571428, "grad_norm": 0.1959652602672577, "kl": 0.0017719268798828125, "learning_rate": 9.992983438818915e-07, "loss": 0.0001, "reward": 0.15597197646275163, "reward_std": 0.21204736083745956, "rewards/cosine_scaled_reward": -0.04709303192794323, "rewards/format_reward": 0.6875000111758709, "step": 58 }, { "completion_length": 2729.5209350585938, "epoch": 0.06742857142857143, "grad_norm": 0.19808489084243774, "kl": 0.00023877620697021484, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.05562853440642357, "reward_std": 0.11674479860812426, "rewards/cosine_scaled_reward": -0.08065373566932976, "rewards/format_reward": 0.3750000111758709, "step": 59 }, { "completion_length": 2926.0209045410156, "epoch": 0.06857142857142857, "grad_norm": 0.16942426562309265, "kl": 0.0005669593811035156, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.034913462586700916, "reward_std": 0.20762313529849052, "rewards/cosine_scaled_reward": -0.14165519177913666, "rewards/format_reward": 0.41666667722165585, "step": 60 }, { "completion_length": 3075.6875610351562, "epoch": 0.06971428571428571, "grad_norm": 0.1606554239988327, "kl": 0.0004697442054748535, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.07319872255902737, "reward_std": 0.22705655079334974, "rewards/cosine_scaled_reward": -0.08769511990249157, "rewards/format_reward": 0.4583333358168602, "step": 61 }, { "completion_length": 2523.416748046875, "epoch": 0.07085714285714285, "grad_norm": 0.20859591662883759, "kl": 0.0012617111206054688, "learning_rate": 9.98421786662277e-07, "loss": 0.0001, "reward": 0.07555722631514072, "reward_std": 0.2749594282358885, "rewards/cosine_scaled_reward": -0.11566509678959846, "rewards/format_reward": 0.5208333488553762, "step": 62 }, { "completion_length": 2357.104248046875, "epoch": 0.072, "grad_norm": 0.20600537955760956, "kl": 0.0014723539352416992, "learning_rate": 9.981479793771866e-07, "loss": 0.0001, "reward": 0.173787857638672, "reward_std": 0.22000426054000854, "rewards/cosine_scaled_reward": 0.0005267849192023277, "rewards/format_reward": 0.6666666828095913, "step": 63 }, { "completion_length": 2755.9376220703125, "epoch": 0.07314285714285715, "grad_norm": 0.1938377171754837, "kl": 0.0010743141174316406, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.07827219244791195, "reward_std": 0.16359133645892143, "rewards/cosine_scaled_reward": -0.09208118915557861, "rewards/format_reward": 0.4791666865348816, "step": 64 }, { "completion_length": 2812.6250610351562, "epoch": 0.07428571428571429, "grad_norm": 0.20439141988754272, "kl": 0.000774383544921875, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.06977530382573605, "reward_std": 0.1315849106758833, "rewards/cosine_scaled_reward": -0.0596696212887764, "rewards/format_reward": 0.39583334885537624, "step": 65 }, { "completion_length": 2111.187515258789, "epoch": 0.07542857142857143, "grad_norm": 0.42952272295951843, "kl": 0.012227535247802734, "learning_rate": 9.971955636222684e-07, "loss": 0.0005, "reward": 0.1832641065120697, "reward_std": 0.10912141762673855, "rewards/cosine_scaled_reward": 0.08996204845607281, "rewards/format_reward": 0.520833333954215, "step": 66 }, { "completion_length": 3414.4375610351562, "epoch": 0.07657142857142857, "grad_norm": 0.13563436269760132, "kl": 0.0007442962378263474, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.06709635467268527, "reward_std": 0.10008220560848713, "rewards/cosine_scaled_reward": -0.21500887908041477, "rewards/format_reward": 0.1666666716337204, "step": 67 }, { "completion_length": 2049.916732788086, "epoch": 0.07771428571428571, "grad_norm": 0.30434882640838623, "kl": 0.0040187835693359375, "learning_rate": 9.964516155915151e-07, "loss": 0.0002, "reward": 0.1919882227666676, "reward_std": 0.2173364106565714, "rewards/cosine_scaled_reward": 0.06528550758957863, "rewards/format_reward": 0.6041666716337204, "step": 68 }, { "completion_length": 2336.0834045410156, "epoch": 0.07885714285714286, "grad_norm": 0.24304541945457458, "kl": 0.0032415390014648438, "learning_rate": 9.960469931131936e-07, "loss": 0.0001, "reward": 0.011424195021390915, "reward_std": 0.1404279638081789, "rewards/cosine_scaled_reward": -0.23959477618336678, "rewards/format_reward": 0.5208333507180214, "step": 69 }, { "completion_length": 3096.729248046875, "epoch": 0.08, "grad_norm": 0.1957111656665802, "kl": 0.0008525848388671875, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": -0.011866447050124407, "reward_std": 0.15496444329619408, "rewards/cosine_scaled_reward": -0.17906717211008072, "rewards/format_reward": 0.31250001303851604, "step": 70 }, { "completion_length": 2526.9583740234375, "epoch": 0.08114285714285714, "grad_norm": 0.23573040962219238, "kl": 0.0010199546813964844, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": 0.08477263187523931, "reward_std": 0.1251183496788144, "rewards/cosine_scaled_reward": -0.0376198529265821, "rewards/format_reward": 0.3958333395421505, "step": 71 }, { "completion_length": 2892.0625610351562, "epoch": 0.08228571428571428, "grad_norm": 0.2383488118648529, "kl": 0.0015397071838378906, "learning_rate": 9.947027716509488e-07, "loss": 0.0001, "reward": 0.018520042300224304, "reward_std": 0.20084666088223457, "rewards/cosine_scaled_reward": -0.16570908017456532, "rewards/format_reward": 0.3958333358168602, "step": 72 }, { "completion_length": 3483.604248046875, "epoch": 0.08342857142857144, "grad_norm": 0.16850998997688293, "kl": 0.0005273818969726562, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.08224166510626674, "reward_std": 0.16050567850470543, "rewards/cosine_scaled_reward": -0.21999531239271164, "rewards/format_reward": 0.12500000186264515, "step": 73 }, { "completion_length": 3002.0626220703125, "epoch": 0.08457142857142858, "grad_norm": 0.20608282089233398, "kl": 0.0013448596000671387, "learning_rate": 9.93698216681727e-07, "loss": 0.0001, "reward": 0.07716270606033504, "reward_std": 0.188118364661932, "rewards/cosine_scaled_reward": -0.005718638189136982, "rewards/format_reward": 0.3125000111758709, "step": 74 }, { "completion_length": 2866.187530517578, "epoch": 0.08571428571428572, "grad_norm": 0.17602171003818512, "kl": 0.0012335777282714844, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.12930196640081704, "reward_std": 0.1540745897218585, "rewards/cosine_scaled_reward": 0.009427577257156372, "rewards/format_reward": 0.4791666753590107, "step": 75 }, { "completion_length": 2869.7501220703125, "epoch": 0.08685714285714285, "grad_norm": 0.18825611472129822, "kl": 0.0005376338958740234, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.019133321475237608, "reward_std": 0.12899490632116795, "rewards/cosine_scaled_reward": -0.25955629348754883, "rewards/format_reward": 0.4375000186264515, "step": 76 }, { "completion_length": 3044.3333740234375, "epoch": 0.088, "grad_norm": 0.16967883706092834, "kl": 0.000682830810546875, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.0321506354957819, "reward_std": 0.13603493571281433, "rewards/cosine_scaled_reward": -0.2693170513957739, "rewards/format_reward": 0.41666667349636555, "step": 77 }, { "completion_length": 2871.6250610351562, "epoch": 0.08914285714285715, "grad_norm": 0.17775395512580872, "kl": 0.0005066394805908203, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.08913127763662487, "reward_std": 0.2095542810857296, "rewards/cosine_scaled_reward": -0.024471648037433624, "rewards/format_reward": 0.3958333432674408, "step": 78 }, { "completion_length": 2288.125045776367, "epoch": 0.09028571428571429, "grad_norm": 0.24125170707702637, "kl": 0.0012928247451782227, "learning_rate": 9.908088623197048e-07, "loss": 0.0001, "reward": 0.127546064555645, "reward_std": 0.16784489154815674, "rewards/cosine_scaled_reward": -0.05105599761009216, "rewards/format_reward": 0.5833333544433117, "step": 79 }, { "completion_length": 3323.729248046875, "epoch": 0.09142857142857143, "grad_norm": 0.17265021800994873, "kl": 0.0015497207641601562, "learning_rate": 9.901664203302124e-07, "loss": 0.0001, "reward": 0.02169090462848544, "reward_std": 0.18521080538630486, "rewards/cosine_scaled_reward": -0.09200811013579369, "rewards/format_reward": 0.27083333395421505, "step": 80 }, { "completion_length": 2767.479217529297, "epoch": 0.09257142857142857, "grad_norm": 0.37805935740470886, "kl": 0.0035076141357421875, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.07034268043935299, "reward_std": 0.1365505950525403, "rewards/cosine_scaled_reward": -0.32657285034656525, "rewards/format_reward": 0.3750000149011612, "step": 81 }, { "completion_length": 2669.354248046875, "epoch": 0.09371428571428571, "grad_norm": 0.21049590408802032, "kl": 0.0014095306396484375, "learning_rate": 9.888172094375033e-07, "loss": 0.0001, "reward": 0.09870939329266548, "reward_std": 0.15531447157263756, "rewards/cosine_scaled_reward": -0.018129711039364338, "rewards/format_reward": 0.4166666716337204, "step": 82 }, { "completion_length": 2694.4791870117188, "epoch": 0.09485714285714286, "grad_norm": 0.23234984278678894, "kl": 0.001300811767578125, "learning_rate": 9.881105062929221e-07, "loss": 0.0001, "reward": -0.022123297676444054, "reward_std": 0.10592562891542912, "rewards/cosine_scaled_reward": -0.20149537362158298, "rewards/format_reward": 0.3125, "step": 83 }, { "completion_length": 3089.9375610351562, "epoch": 0.096, "grad_norm": 0.1673467606306076, "kl": 0.0008716583251953125, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.06725652515888214, "reward_std": 0.1797430757433176, "rewards/cosine_scaled_reward": -0.06916102161630988, "rewards/format_reward": 0.3958333358168602, "step": 84 }, { "completion_length": 2968.2500610351562, "epoch": 0.09714285714285714, "grad_norm": 0.18632689118385315, "kl": 0.0009512901306152344, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": 0.03277226095087826, "reward_std": 0.22309906780719757, "rewards/cosine_scaled_reward": -0.18480883911252022, "rewards/format_reward": 0.500000013038516, "step": 85 }, { "completion_length": 2637.3125610351562, "epoch": 0.09828571428571428, "grad_norm": 0.19769161939620972, "kl": 0.0012178421020507812, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.06074523401912302, "reward_std": 0.09560158289968967, "rewards/cosine_scaled_reward": -0.11061662063002586, "rewards/format_reward": 0.4583333432674408, "step": 86 }, { "completion_length": 2545.104217529297, "epoch": 0.09942857142857142, "grad_norm": 0.28536996245384216, "kl": 0.0024814605712890625, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": 0.09510067413793877, "reward_std": 0.19984697178006172, "rewards/cosine_scaled_reward": -0.11003985954448581, "rewards/format_reward": 0.5833333432674408, "step": 87 }, { "completion_length": 2481.416748046875, "epoch": 0.10057142857142858, "grad_norm": 0.19572964310646057, "kl": 0.0021457672119140625, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.07864797674119473, "reward_std": 0.2205824851989746, "rewards/cosine_scaled_reward": -0.13493631035089493, "rewards/format_reward": 0.5625000074505806, "step": 88 }, { "completion_length": 2725.3958740234375, "epoch": 0.10171428571428572, "grad_norm": 0.22061605751514435, "kl": 0.001773834228515625, "learning_rate": 9.83423155058946e-07, "loss": 0.0001, "reward": 0.04609313979744911, "reward_std": 0.23168375715613365, "rewards/cosine_scaled_reward": -0.14034401858225465, "rewards/format_reward": 0.4583333544433117, "step": 89 }, { "completion_length": 2356.3125610351562, "epoch": 0.10285714285714286, "grad_norm": 0.2666766047477722, "kl": 0.00469207763671875, "learning_rate": 9.825677631722435e-07, "loss": 0.0002, "reward": 0.015945925377309322, "reward_std": 0.1416610088199377, "rewards/cosine_scaled_reward": -0.25287802144885063, "rewards/format_reward": 0.5625000204890966, "step": 90 }, { "completion_length": 2725.7084045410156, "epoch": 0.104, "grad_norm": 0.22179532051086426, "kl": 0.0018901824951171875, "learning_rate": 9.816912885430258e-07, "loss": 0.0001, "reward": 0.03749770554713905, "reward_std": 0.19320211559534073, "rewards/cosine_scaled_reward": -0.1673196665942669, "rewards/format_reward": 0.4791666679084301, "step": 91 }, { "completion_length": 2502.5209045410156, "epoch": 0.10514285714285715, "grad_norm": 0.21331754326820374, "kl": 0.0034551620483398438, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": 0.020164599642157555, "reward_std": 0.13452178053557873, "rewards/cosine_scaled_reward": -0.22247609682381153, "rewards/format_reward": 0.5208333358168602, "step": 92 }, { "completion_length": 3322.6666870117188, "epoch": 0.10628571428571429, "grad_norm": 0.1798219233751297, "kl": 0.00267791748046875, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.10304510593414307, "reward_std": 0.08899393863976002, "rewards/cosine_scaled_reward": -0.27239324152469635, "rewards/format_reward": 0.1458333395421505, "step": 93 }, { "completion_length": 2910.2708740234375, "epoch": 0.10742857142857143, "grad_norm": 0.24701499938964844, "kl": 0.003047943115234375, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": 0.08131833001971245, "reward_std": 0.09326184960082173, "rewards/cosine_scaled_reward": -0.01211006660014391, "rewards/format_reward": 0.3333333432674408, "step": 94 }, { "completion_length": 3434.4375, "epoch": 0.10857142857142857, "grad_norm": 0.14336159825325012, "kl": 0.001483917236328125, "learning_rate": 9.779754323328192e-07, "loss": 0.0001, "reward": -0.030428330413997173, "reward_std": 0.16425743699073792, "rewards/cosine_scaled_reward": -0.17323063872754574, "rewards/format_reward": 0.22916667349636555, "step": 95 }, { "completion_length": 2605.7083740234375, "epoch": 0.10971428571428571, "grad_norm": 0.19360791146755219, "kl": 0.0031585693359375, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": 0.09746365912724286, "reward_std": 0.14501910284161568, "rewards/cosine_scaled_reward": -0.017119438387453556, "rewards/format_reward": 0.4166666679084301, "step": 96 }, { "completion_length": 2752.5001220703125, "epoch": 0.11085714285714286, "grad_norm": 0.25784891843795776, "kl": 0.0029249191284179688, "learning_rate": 9.759921670520634e-07, "loss": 0.0001, "reward": 0.07194726821035147, "reward_std": 0.18822367396205664, "rewards/cosine_scaled_reward": -0.0948387160897255, "rewards/format_reward": 0.45833335630595684, "step": 97 }, { "completion_length": 2697.916717529297, "epoch": 0.112, "grad_norm": 0.20602712035179138, "kl": 0.0015430450439453125, "learning_rate": 9.749693666068663e-07, "loss": 0.0001, "reward": 0.04305866325739771, "reward_std": 0.0769603131338954, "rewards/cosine_scaled_reward": -0.19654148817062378, "rewards/format_reward": 0.5625000149011612, "step": 98 }, { "completion_length": 2768.1458587646484, "epoch": 0.11314285714285714, "grad_norm": 0.251437783241272, "kl": 0.0034656524658203125, "learning_rate": 9.739258537542835e-07, "loss": 0.0001, "reward": 0.03451013471931219, "reward_std": 0.13935161381959915, "rewards/cosine_scaled_reward": -0.1048834016546607, "rewards/format_reward": 0.33333334140479565, "step": 99 }, { "completion_length": 2503.8959045410156, "epoch": 0.11428571428571428, "grad_norm": 0.19386953115463257, "kl": 0.0031108856201171875, "learning_rate": 9.728616793536587e-07, "loss": 0.0001, "reward": 0.13897301501128823, "reward_std": 0.17272601835429668, "rewards/cosine_scaled_reward": 0.008669085800647736, "rewards/format_reward": 0.5208333507180214, "step": 100 }, { "completion_length": 2197.6250610351562, "epoch": 0.11542857142857142, "grad_norm": 0.28790751099586487, "kl": 0.0026721954345703125, "learning_rate": 9.717768952713511e-07, "loss": 0.0001, "reward": 0.034005239605903625, "reward_std": 0.06839179154485464, "rewards/cosine_scaled_reward": -0.2176961973309517, "rewards/format_reward": 0.5625000055879354, "step": 101 }, { "completion_length": 1823.7500305175781, "epoch": 0.11657142857142858, "grad_norm": 0.2623526155948639, "kl": 0.00592041015625, "learning_rate": 9.706715543782064e-07, "loss": 0.0002, "reward": 0.07911303732544184, "reward_std": 0.20092780143022537, "rewards/cosine_scaled_reward": -0.20255927927792072, "rewards/format_reward": 0.708333358168602, "step": 102 }, { "completion_length": 2647.9583740234375, "epoch": 0.11771428571428572, "grad_norm": 0.24559658765792847, "kl": 0.0040340423583984375, "learning_rate": 9.695457105469804e-07, "loss": 0.0002, "reward": 0.08460952155292034, "reward_std": 0.18700673431158066, "rewards/cosine_scaled_reward": -0.09807442128658295, "rewards/format_reward": 0.5208333544433117, "step": 103 }, { "completion_length": 2513.1666870117188, "epoch": 0.11885714285714286, "grad_norm": 0.32273057103157043, "kl": 0.003582000732421875, "learning_rate": 9.683994186497132e-07, "loss": 0.0001, "reward": 0.03470417996868491, "reward_std": 0.16441558115184307, "rewards/cosine_scaled_reward": -0.18330081552267075, "rewards/format_reward": 0.5, "step": 104 }, { "completion_length": 2185.4584350585938, "epoch": 0.12, "grad_norm": 0.23857823014259338, "kl": 0.0029306411743164062, "learning_rate": 9.672327345550543e-07, "loss": 0.0001, "reward": 0.08454111497849226, "reward_std": 0.22059074230492115, "rewards/cosine_scaled_reward": -0.12463724735425785, "rewards/format_reward": 0.5625000149011612, "step": 105 }, { "completion_length": 2065.8958587646484, "epoch": 0.12114285714285715, "grad_norm": 0.1924944669008255, "kl": 0.0017528533935546875, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.31528329849243164, "reward_std": 0.17143471166491508, "rewards/cosine_scaled_reward": 0.2455898243933916, "rewards/format_reward": 0.7291666828095913, "step": 106 }, { "completion_length": 2458.8959350585938, "epoch": 0.12228571428571429, "grad_norm": 0.2932869493961334, "kl": 0.00742340087890625, "learning_rate": 9.648384182148252e-07, "loss": 0.0003, "reward": 0.08245569933205843, "reward_std": 0.15556151047348976, "rewards/cosine_scaled_reward": -0.1426069140434265, "rewards/format_reward": 0.6041666865348816, "step": 107 }, { "completion_length": 2469.7083740234375, "epoch": 0.12342857142857143, "grad_norm": 0.2758703827857971, "kl": 0.0032558441162109375, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": 0.14540181402117014, "reward_std": 0.13690084591507912, "rewards/cosine_scaled_reward": 0.003204292617738247, "rewards/format_reward": 0.5625000149011612, "step": 108 }, { "completion_length": 2933.104248046875, "epoch": 0.12457142857142857, "grad_norm": 0.21141788363456726, "kl": 0.0032711029052734375, "learning_rate": 9.623632283030077e-07, "loss": 0.0001, "reward": 0.003511659801006317, "reward_std": 0.10087954625487328, "rewards/cosine_scaled_reward": -0.1504979059100151, "rewards/format_reward": 0.31250000186264515, "step": 109 }, { "completion_length": 2657.3750915527344, "epoch": 0.12571428571428572, "grad_norm": 0.20710720121860504, "kl": 0.0029621124267578125, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, "reward": 0.1238765474408865, "reward_std": 0.2178866695612669, "rewards/cosine_scaled_reward": -0.048304086085408926, "rewards/format_reward": 0.5625, "step": 110 }, { "completion_length": 3061.3333740234375, "epoch": 0.12685714285714286, "grad_norm": 0.2273649424314499, "kl": 0.0052642822265625, "learning_rate": 9.598076473627796e-07, "loss": 0.0002, "reward": 0.015934795141220093, "reward_std": 0.1893920563161373, "rewards/cosine_scaled_reward": -0.1148507222533226, "rewards/format_reward": 0.29166667722165585, "step": 111 }, { "completion_length": 2947.8750610351562, "epoch": 0.128, "grad_norm": 0.1635778397321701, "kl": 0.0029401779174804688, "learning_rate": 9.58499865339809e-07, "loss": 0.0001, "reward": 0.0888732923194766, "reward_std": 0.20546680688858032, "rewards/cosine_scaled_reward": -0.035623449832201004, "rewards/format_reward": 0.416666679084301, "step": 112 }, { "completion_length": 1932.7291870117188, "epoch": 0.12914285714285714, "grad_norm": 0.2718316316604614, "kl": 0.004009246826171875, "learning_rate": 9.571721736097088e-07, "loss": 0.0002, "reward": 0.15375141613185406, "reward_std": 0.172605000436306, "rewards/cosine_scaled_reward": -0.07143362984061241, "rewards/format_reward": 0.7291666865348816, "step": 113 }, { "completion_length": 2297.2083740234375, "epoch": 0.13028571428571428, "grad_norm": 0.21740828454494476, "kl": 0.003414154052734375, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": 0.08895601518452168, "reward_std": 0.12472244258970022, "rewards/cosine_scaled_reward": -0.14328179135918617, "rewards/format_reward": 0.6250000149011612, "step": 114 }, { "completion_length": 2783.791717529297, "epoch": 0.13142857142857142, "grad_norm": 0.1956191211938858, "kl": 0.0034732818603515625, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": 0.05227021034806967, "reward_std": 0.16717309318482876, "rewards/cosine_scaled_reward": -0.1101748300716281, "rewards/format_reward": 0.416666679084301, "step": 115 }, { "completion_length": 3274.3333740234375, "epoch": 0.13257142857142856, "grad_norm": 0.17732104659080505, "kl": 0.00379180908203125, "learning_rate": 9.530702921077358e-07, "loss": 0.0002, "reward": -0.029843004420399666, "reward_std": 0.12473610881716013, "rewards/cosine_scaled_reward": -0.16310015879571438, "rewards/format_reward": 0.2083333358168602, "step": 116 }, { "completion_length": 2867.0208740234375, "epoch": 0.1337142857142857, "grad_norm": 0.17805035412311554, "kl": 0.00621795654296875, "learning_rate": 9.516636183034564e-07, "loss": 0.0002, "reward": -0.017832530662417412, "reward_std": 0.18124784901738167, "rewards/cosine_scaled_reward": -0.2212834618985653, "rewards/format_reward": 0.37500000931322575, "step": 117 }, { "completion_length": 2933.2709350585938, "epoch": 0.13485714285714287, "grad_norm": 0.17978739738464355, "kl": 0.002979278564453125, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.06652609840966761, "reward_std": 0.2244921512901783, "rewards/cosine_scaled_reward": -0.06611594557762146, "rewards/format_reward": 0.3958333395421505, "step": 118 }, { "completion_length": 1993.6667175292969, "epoch": 0.136, "grad_norm": 0.2700473666191101, "kl": 0.0044612884521484375, "learning_rate": 9.487916106540465e-07, "loss": 0.0002, "reward": 0.1687219077721238, "reward_std": 0.11934308893978596, "rewards/cosine_scaled_reward": -0.020776577293872833, "rewards/format_reward": 0.6875000149011612, "step": 119 }, { "completion_length": 2145.3959350585938, "epoch": 0.13714285714285715, "grad_norm": 0.2796230912208557, "kl": 0.0054168701171875, "learning_rate": 9.473264167865171e-07, "loss": 0.0002, "reward": 0.1229192796163261, "reward_std": 0.11130889505147934, "rewards/cosine_scaled_reward": -0.05790109490044415, "rewards/format_reward": 0.583333358168602, "step": 120 }, { "completion_length": 1227.0208587646484, "epoch": 0.1382857142857143, "grad_norm": 0.27715376019477844, "kl": 0.003421783447265625, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.24974779039621353, "reward_std": 0.13967820443212986, "rewards/cosine_scaled_reward": 0.03080811072140932, "rewards/format_reward": 0.8958333432674408, "step": 121 }, { "completion_length": 2661.104217529297, "epoch": 0.13942857142857143, "grad_norm": 0.20075838267803192, "kl": 0.00502777099609375, "learning_rate": 9.443380060197385e-07, "loss": 0.0002, "reward": 0.09934048354625702, "reward_std": 0.13651033863425255, "rewards/cosine_scaled_reward": -0.05175904370844364, "rewards/format_reward": 0.47916667722165585, "step": 122 }, { "completion_length": 2671.2708435058594, "epoch": 0.14057142857142857, "grad_norm": 0.2119200974702835, "kl": 0.00371551513671875, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": 0.09425406157970428, "reward_std": 0.16130803525447845, "rewards/cosine_scaled_reward": -0.058947376906871796, "rewards/format_reward": 0.4791666716337204, "step": 123 }, { "completion_length": 2159.729248046875, "epoch": 0.1417142857142857, "grad_norm": 0.2579522430896759, "kl": 0.008348464965820312, "learning_rate": 9.412727182773486e-07, "loss": 0.0003, "reward": 0.15733091381844133, "reward_std": 0.22176896408200264, "rewards/cosine_scaled_reward": -0.009153680875897408, "rewards/format_reward": 0.6250000149011612, "step": 124 }, { "completion_length": 2673.312530517578, "epoch": 0.14285714285714285, "grad_norm": 0.180611252784729, "kl": 0.002933502197265625, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": 0.0762571319937706, "reward_std": 0.15331977792084217, "rewards/cosine_scaled_reward": -0.050458282232284546, "rewards/format_reward": 0.3958333358168602, "step": 125 }, { "completion_length": 2936.3125610351562, "epoch": 0.144, "grad_norm": 0.19579581916332245, "kl": 0.002716064453125, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": 0.056234278716146946, "reward_std": 0.19673431850969791, "rewards/cosine_scaled_reward": -0.11217856779694557, "rewards/format_reward": 0.4375000149011612, "step": 126 }, { "completion_length": 2970.2709350585938, "epoch": 0.14514285714285713, "grad_norm": 0.17747008800506592, "kl": 0.00502777099609375, "learning_rate": 9.36531953618799e-07, "loss": 0.0002, "reward": -0.00631782878190279, "reward_std": 0.1282958798110485, "rewards/cosine_scaled_reward": -0.15889330580830574, "rewards/format_reward": 0.29166667349636555, "step": 127 }, { "completion_length": 2776.479217529297, "epoch": 0.1462857142857143, "grad_norm": 0.1870415359735489, "kl": 0.004581451416015625, "learning_rate": 9.34913917072228e-07, "loss": 0.0002, "reward": 0.14590335823595524, "reward_std": 0.15995058370754123, "rewards/cosine_scaled_reward": 0.04219770431518555, "rewards/format_reward": 0.4791666716337204, "step": 128 }, { "completion_length": 3122.1875610351562, "epoch": 0.14742857142857144, "grad_norm": 0.20150424540042877, "kl": 0.00522613525390625, "learning_rate": 9.332771203643714e-07, "loss": 0.0002, "reward": -0.016852401662617922, "reward_std": 0.20990226045250893, "rewards/cosine_scaled_reward": -0.17971983924508095, "rewards/format_reward": 0.29166667349636555, "step": 129 }, { "completion_length": 2580.7500610351562, "epoch": 0.14857142857142858, "grad_norm": 0.21353274583816528, "kl": 0.0032367706298828125, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.017789321020245552, "reward_std": 0.11740593425929546, "rewards/cosine_scaled_reward": -0.23309939168393612, "rewards/format_reward": 0.39583334885537624, "step": 130 }, { "completion_length": 2513.166717529297, "epoch": 0.14971428571428572, "grad_norm": 0.2525118887424469, "kl": 0.005584716796875, "learning_rate": 9.299475664759068e-07, "loss": 0.0002, "reward": 0.2067520273849368, "reward_std": 0.14082133676856756, "rewards/cosine_scaled_reward": 0.11485651880502701, "rewards/format_reward": 0.5625000074505806, "step": 131 }, { "completion_length": 2662.729248046875, "epoch": 0.15085714285714286, "grad_norm": 0.1865711510181427, "kl": 0.003742218017578125, "learning_rate": 9.282549715730579e-07, "loss": 0.0002, "reward": 0.12205465510487556, "reward_std": 0.19720683433115482, "rewards/cosine_scaled_reward": 0.016718512400984764, "rewards/format_reward": 0.4375000111758709, "step": 132 }, { "completion_length": 2677.687530517578, "epoch": 0.152, "grad_norm": 0.20966395735740662, "kl": 0.0047454833984375, "learning_rate": 9.265439410565328e-07, "loss": 0.0002, "reward": -0.03675843123346567, "reward_std": 0.10049015143886209, "rewards/cosine_scaled_reward": -0.2899044156074524, "rewards/format_reward": 0.4375000149011612, "step": 133 }, { "completion_length": 2357.9375610351562, "epoch": 0.15314285714285714, "grad_norm": 0.23465649783611298, "kl": 0.0073699951171875, "learning_rate": 9.248145583195447e-07, "loss": 0.0003, "reward": 0.062477301340550184, "reward_std": 0.1254201289266348, "rewards/cosine_scaled_reward": -0.14919789135456085, "rewards/format_reward": 0.5416666865348816, "step": 134 }, { "completion_length": 1581.6875305175781, "epoch": 0.15428571428571428, "grad_norm": 0.2664959728717804, "kl": 0.006793975830078125, "learning_rate": 9.230669076497687e-07, "loss": 0.0003, "reward": 0.3097005020827055, "reward_std": 0.17315510800108314, "rewards/cosine_scaled_reward": 0.22469835728406906, "rewards/format_reward": 0.7291666716337204, "step": 135 }, { "completion_length": 1855.5416870117188, "epoch": 0.15542857142857142, "grad_norm": 0.20499049127101898, "kl": 0.005344390869140625, "learning_rate": 9.213010742252327e-07, "loss": 0.0002, "reward": 0.20292381010949612, "reward_std": 0.16355958580970764, "rewards/cosine_scaled_reward": 0.014593502506613731, "rewards/format_reward": 0.7500000149011612, "step": 136 }, { "completion_length": 2680.916748046875, "epoch": 0.15657142857142858, "grad_norm": 0.16747643053531647, "kl": 0.0043182373046875, "learning_rate": 9.195171441101668e-07, "loss": 0.0002, "reward": -0.0012684596003964543, "reward_std": 0.13910184241831303, "rewards/cosine_scaled_reward": -0.2319859191775322, "rewards/format_reward": 0.4583333544433117, "step": 137 }, { "completion_length": 1877.8750305175781, "epoch": 0.15771428571428572, "grad_norm": 0.18857340514659882, "kl": 0.004955291748046875, "learning_rate": 9.177152042508077e-07, "loss": 0.0002, "reward": 0.12323585152626038, "reward_std": 0.15464136935770512, "rewards/cosine_scaled_reward": -0.15813787409570068, "rewards/format_reward": 0.7916666865348816, "step": 138 }, { "completion_length": 2733.104248046875, "epoch": 0.15885714285714286, "grad_norm": 0.21658539772033691, "kl": 0.00566864013671875, "learning_rate": 9.158953424711624e-07, "loss": 0.0002, "reward": 0.016465216875076294, "reward_std": 0.21444908529520035, "rewards/cosine_scaled_reward": -0.20799199864268303, "rewards/format_reward": 0.479166679084301, "step": 139 }, { "completion_length": 2077.416778564453, "epoch": 0.16, "grad_norm": 0.4652515947818756, "kl": 0.007762908935546875, "learning_rate": 9.140576474687263e-07, "loss": 0.0003, "reward": 0.09515286330133677, "reward_std": 0.08214545156806707, "rewards/cosine_scaled_reward": -0.10833002626895905, "rewards/format_reward": 0.5833333432674408, "step": 140 }, { "completion_length": 1763.3958740234375, "epoch": 0.16114285714285714, "grad_norm": 0.2535211443901062, "kl": 0.00640869140625, "learning_rate": 9.122022088101613e-07, "loss": 0.0003, "reward": 0.11987213883548975, "reward_std": 0.2014489434659481, "rewards/cosine_scaled_reward": -0.14208032563328743, "rewards/format_reward": 0.7500000149011612, "step": 141 }, { "completion_length": 2025.5625305175781, "epoch": 0.16228571428571428, "grad_norm": 0.24902018904685974, "kl": 0.0047359466552734375, "learning_rate": 9.103291169269299e-07, "loss": 0.0002, "reward": 0.09324084036052227, "reward_std": 0.1687856111675501, "rewards/cosine_scaled_reward": -0.19433005712926388, "rewards/format_reward": 0.7500000074505806, "step": 142 }, { "completion_length": 1862.187515258789, "epoch": 0.16342857142857142, "grad_norm": 0.29880306124687195, "kl": 0.009990692138671875, "learning_rate": 9.084384631108882e-07, "loss": 0.0004, "reward": 0.11504282057285309, "reward_std": 0.12508848682045937, "rewards/cosine_scaled_reward": -0.13555557653307915, "rewards/format_reward": 0.7083333432674408, "step": 143 }, { "completion_length": 2255.604248046875, "epoch": 0.16457142857142856, "grad_norm": 0.24714051187038422, "kl": 0.005096435546875, "learning_rate": 9.065303395098358e-07, "loss": 0.0002, "reward": 0.1180584009271115, "reward_std": 0.1518205851316452, "rewards/cosine_scaled_reward": -0.07565461099147797, "rewards/format_reward": 0.6041666716337204, "step": 144 }, { "completion_length": 1702.604232788086, "epoch": 0.1657142857142857, "grad_norm": 0.26052579283714294, "kl": 0.00627899169921875, "learning_rate": 9.046048391230247e-07, "loss": 0.0003, "reward": 0.2528261498955544, "reward_std": 0.16752439364790916, "rewards/cosine_scaled_reward": 0.11284054815769196, "rewards/format_reward": 0.7500000149011612, "step": 145 }, { "completion_length": 1841.0625457763672, "epoch": 0.16685714285714287, "grad_norm": 0.21570232510566711, "kl": 0.00521087646484375, "learning_rate": 9.026620557966279e-07, "loss": 0.0002, "reward": 0.07159285247325897, "reward_std": 0.12313777953386307, "rewards/cosine_scaled_reward": -0.22563189081847668, "rewards/format_reward": 0.7291666716337204, "step": 146 }, { "completion_length": 1733.9792022705078, "epoch": 0.168, "grad_norm": 0.2516489624977112, "kl": 0.0065460205078125, "learning_rate": 9.007020842191634e-07, "loss": 0.0003, "reward": 0.1541627710685134, "reward_std": 0.16804708167910576, "rewards/cosine_scaled_reward": -0.08956858143210411, "rewards/format_reward": 0.7708333432674408, "step": 147 }, { "completion_length": 1162.2500457763672, "epoch": 0.16914285714285715, "grad_norm": 0.2465033233165741, "kl": 0.00592041015625, "learning_rate": 8.987250199168808e-07, "loss": 0.0002, "reward": 0.18499324843287468, "reward_std": 0.12528827972710133, "rewards/cosine_scaled_reward": -0.11035721749067307, "rewards/format_reward": 0.9375000149011612, "step": 148 }, { "completion_length": 1805.7500762939453, "epoch": 0.1702857142857143, "grad_norm": 0.2204548716545105, "kl": 0.0048065185546875, "learning_rate": 8.967309592491052e-07, "loss": 0.0002, "reward": 0.17903725989162922, "reward_std": 0.18916313350200653, "rewards/cosine_scaled_reward": -0.05147257540374994, "rewards/format_reward": 0.791666679084301, "step": 149 }, { "completion_length": 1744.7500305175781, "epoch": 0.17142857142857143, "grad_norm": 0.30003637075424194, "kl": 0.0056915283203125, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.1566769634373486, "reward_std": 0.20850111544132233, "rewards/cosine_scaled_reward": -0.07834354601800442, "rewards/format_reward": 0.7500000298023224, "step": 150 }, { "completion_length": 1863.3750610351562, "epoch": 0.17257142857142857, "grad_norm": 0.25705352425575256, "kl": 0.0056915283203125, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.15223714988678694, "reward_std": 0.20249696634709835, "rewards/cosine_scaled_reward": -0.06782015133649111, "rewards/format_reward": 0.7083333432674408, "step": 151 }, { "completion_length": 2103.9375915527344, "epoch": 0.1737142857142857, "grad_norm": 0.3347667455673218, "kl": 0.01369476318359375, "learning_rate": 8.906477750432903e-07, "loss": 0.0005, "reward": 0.07069700676947832, "reward_std": 0.19799878261983395, "rewards/cosine_scaled_reward": -0.16938491538167, "rewards/format_reward": 0.6041666716337204, "step": 152 }, { "completion_length": 2156.0833740234375, "epoch": 0.17485714285714285, "grad_norm": 0.28918376564979553, "kl": 0.0103912353515625, "learning_rate": 8.88586709003076e-07, "loss": 0.0004, "reward": -0.005467447452247143, "reward_std": 0.1560390181839466, "rewards/cosine_scaled_reward": -0.31410151347517967, "rewards/format_reward": 0.604166679084301, "step": 153 }, { "completion_length": 2673.4375915527344, "epoch": 0.176, "grad_norm": 0.19442158937454224, "kl": 0.00507354736328125, "learning_rate": 8.865091407243394e-07, "loss": 0.0002, "reward": 0.16628247933113016, "reward_std": 0.2260502576828003, "rewards/cosine_scaled_reward": 0.04180636256933212, "rewards/format_reward": 0.5625, "step": 154 }, { "completion_length": 1959.3750610351562, "epoch": 0.17714285714285713, "grad_norm": 0.26108378171920776, "kl": 0.0069427490234375, "learning_rate": 8.844151714648274e-07, "loss": 0.0003, "reward": 0.11699163541197777, "reward_std": 0.15181562304496765, "rewards/cosine_scaled_reward": -0.09085014648735523, "rewards/format_reward": 0.6250000223517418, "step": 155 }, { "completion_length": 2226.208465576172, "epoch": 0.1782857142857143, "grad_norm": 0.20904701948165894, "kl": 0.00543212890625, "learning_rate": 8.823049032816478e-07, "loss": 0.0002, "reward": 0.16564705595374107, "reward_std": 0.20499678701162338, "rewards/cosine_scaled_reward": 0.02991688810288906, "rewards/format_reward": 0.5833333395421505, "step": 156 }, { "completion_length": 2092.666748046875, "epoch": 0.17942857142857144, "grad_norm": 0.21900877356529236, "kl": 0.00652313232421875, "learning_rate": 8.801784390262943e-07, "loss": 0.0003, "reward": 0.07087472919374704, "reward_std": 0.140676137059927, "rewards/cosine_scaled_reward": -0.22038735449314117, "rewards/format_reward": 0.7083333432674408, "step": 157 }, { "completion_length": 1413.7917175292969, "epoch": 0.18057142857142858, "grad_norm": 0.2717598080635071, "kl": 0.0084686279296875, "learning_rate": 8.780358823396352e-07, "loss": 0.0003, "reward": 0.2156252167187631, "reward_std": 0.17803873494267464, "rewards/cosine_scaled_reward": -0.02755984291434288, "rewards/format_reward": 0.8750000149011612, "step": 158 }, { "completion_length": 2011.4167175292969, "epoch": 0.18171428571428572, "grad_norm": 0.23411135375499725, "kl": 0.006175994873046875, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": -0.005382127594202757, "reward_std": 0.08645158167928457, "rewards/cosine_scaled_reward": -0.32433751225471497, "rewards/format_reward": 0.6250000055879354, "step": 159 }, { "completion_length": 1737.875015258789, "epoch": 0.18285714285714286, "grad_norm": 0.2569064199924469, "kl": 0.00926971435546875, "learning_rate": 8.737029101523929e-07, "loss": 0.0004, "reward": 0.13197340350598097, "reward_std": 0.16681465320289135, "rewards/cosine_scaled_reward": -0.10385934263467789, "rewards/format_reward": 0.7083333432674408, "step": 160 }, { "completion_length": 1677.7083740234375, "epoch": 0.184, "grad_norm": 0.35787054896354675, "kl": 0.0095367431640625, "learning_rate": 8.715127058347614e-07, "loss": 0.0004, "reward": 0.14398455526679754, "reward_std": 0.1860233135521412, "rewards/cosine_scaled_reward": -0.14442253485321999, "rewards/format_reward": 0.8333333432674408, "step": 161 }, { "completion_length": 1775.1041870117188, "epoch": 0.18514285714285714, "grad_norm": 0.2751931846141815, "kl": 0.01062774658203125, "learning_rate": 8.693068314414344e-07, "loss": 0.0004, "reward": 0.06949560716748238, "reward_std": 0.13854376738891006, "rewards/cosine_scaled_reward": -0.22351703885942698, "rewards/format_reward": 0.7083333432674408, "step": 162 }, { "completion_length": 1850.6250305175781, "epoch": 0.18628571428571428, "grad_norm": 0.23756393790245056, "kl": 0.00836181640625, "learning_rate": 8.670853944836176e-07, "loss": 0.0003, "reward": 0.19672297313809395, "reward_std": 0.14653840195387602, "rewards/cosine_scaled_reward": 0.020640600472688675, "rewards/format_reward": 0.7083333432674408, "step": 163 }, { "completion_length": 996.1875305175781, "epoch": 0.18742857142857142, "grad_norm": 0.2397727370262146, "kl": 0.00862884521484375, "learning_rate": 8.648485032310144e-07, "loss": 0.0003, "reward": 0.19845793303102255, "reward_std": 0.14494088664650917, "rewards/cosine_scaled_reward": -0.09628591779619455, "rewards/format_reward": 0.9583333432674408, "step": 164 }, { "completion_length": 1629.5417785644531, "epoch": 0.18857142857142858, "grad_norm": 0.2794504463672638, "kl": 0.0097198486328125, "learning_rate": 8.625962667065487e-07, "loss": 0.0004, "reward": 0.1287133637815714, "reward_std": 0.14059951156377792, "rewards/cosine_scaled_reward": -0.12269960716366768, "rewards/format_reward": 0.7500000149011612, "step": 165 }, { "completion_length": 1685.6459197998047, "epoch": 0.18971428571428572, "grad_norm": 0.19377848505973816, "kl": 0.00667572021484375, "learning_rate": 8.603287946810513e-07, "loss": 0.0003, "reward": 0.13115534372627735, "reward_std": 0.12431170884519815, "rewards/cosine_scaled_reward": -0.13324624300003052, "rewards/format_reward": 0.7708333432674408, "step": 166 }, { "completion_length": 1135.8541717529297, "epoch": 0.19085714285714286, "grad_norm": 0.2520783543586731, "kl": 0.01160430908203125, "learning_rate": 8.580461976679099e-07, "loss": 0.0005, "reward": 0.1306325439363718, "reward_std": 0.1219279421493411, "rewards/cosine_scaled_reward": -0.25054977741092443, "rewards/format_reward": 1.0, "step": 167 }, { "completion_length": 2059.979248046875, "epoch": 0.192, "grad_norm": 0.28084325790405273, "kl": 0.00867462158203125, "learning_rate": 8.557485869176825e-07, "loss": 0.0003, "reward": 0.057329361559823155, "reward_std": 0.1840306594967842, "rewards/cosine_scaled_reward": -0.22565960884094238, "rewards/format_reward": 0.6666666865348816, "step": 168 }, { "completion_length": 1109.2291870117188, "epoch": 0.19314285714285714, "grad_norm": 0.2355085015296936, "kl": 0.0082244873046875, "learning_rate": 8.534360744126753e-07, "loss": 0.0003, "reward": 0.353638730943203, "reward_std": 0.13861669786274433, "rewards/cosine_scaled_reward": 0.20219615730457008, "rewards/format_reward": 0.9583333432674408, "step": 169 }, { "completion_length": 1543.041732788086, "epoch": 0.19428571428571428, "grad_norm": 0.24035528302192688, "kl": 0.00717926025390625, "learning_rate": 8.511087728614862e-07, "loss": 0.0003, "reward": 0.22046024352312088, "reward_std": 0.10455834213644266, "rewards/cosine_scaled_reward": 0.03571847453713417, "rewards/format_reward": 0.7708333432674408, "step": 170 }, { "completion_length": 1828.3542175292969, "epoch": 0.19542857142857142, "grad_norm": 0.22787348926067352, "kl": 0.006011962890625, "learning_rate": 8.487667956935087e-07, "loss": 0.0002, "reward": 0.11731340177357197, "reward_std": 0.12587883695960045, "rewards/cosine_scaled_reward": -0.10857231728732586, "rewards/format_reward": 0.6666666679084301, "step": 171 }, { "completion_length": 1744.6875762939453, "epoch": 0.19657142857142856, "grad_norm": 0.28158068656921387, "kl": 0.01123046875, "learning_rate": 8.464102570534061e-07, "loss": 0.0004, "reward": 0.24253746680915356, "reward_std": 0.1607684139162302, "rewards/cosine_scaled_reward": 0.10688387602567673, "rewards/format_reward": 0.7083333432674408, "step": 172 }, { "completion_length": 1016.3750152587891, "epoch": 0.1977142857142857, "grad_norm": 0.3139473795890808, "kl": 0.008453369140625, "learning_rate": 8.440392717955475e-07, "loss": 0.0003, "reward": 0.18927664309740067, "reward_std": 0.13135373406112194, "rewards/cosine_scaled_reward": -0.09657359251286834, "rewards/format_reward": 0.9166666716337204, "step": 173 }, { "completion_length": 1124.9167022705078, "epoch": 0.19885714285714284, "grad_norm": 0.2685360610485077, "kl": 0.0121917724609375, "learning_rate": 8.416539554784089e-07, "loss": 0.0005, "reward": 0.1454550283960998, "reward_std": 0.15982593782246113, "rewards/cosine_scaled_reward": -0.17720130027737468, "rewards/format_reward": 0.9166666865348816, "step": 174 }, { "completion_length": 1657.1041870117188, "epoch": 0.2, "grad_norm": 0.3601691722869873, "kl": 0.0103912353515625, "learning_rate": 8.392544243589427e-07, "loss": 0.0004, "reward": 0.12271542008966208, "reward_std": 0.15946769155561924, "rewards/cosine_scaled_reward": -0.12916328758001328, "rewards/format_reward": 0.7291666865348816, "step": 175 }, { "completion_length": 1527.2917022705078, "epoch": 0.20114285714285715, "grad_norm": 0.26887398958206177, "kl": 0.00707244873046875, "learning_rate": 8.368407953869103e-07, "loss": 0.0003, "reward": 0.18501460179686546, "reward_std": 0.19407307356595993, "rewards/cosine_scaled_reward": -0.07830056175589561, "rewards/format_reward": 0.8750000149011612, "step": 176 }, { "completion_length": 1690.9584045410156, "epoch": 0.2022857142857143, "grad_norm": 0.31082624197006226, "kl": 0.0101470947265625, "learning_rate": 8.344131861991828e-07, "loss": 0.0004, "reward": 0.18209635582752526, "reward_std": 0.17130816169083118, "rewards/cosine_scaled_reward": -0.045992735773324966, "rewards/format_reward": 0.7916666865348816, "step": 177 }, { "completion_length": 1539.2708587646484, "epoch": 0.20342857142857143, "grad_norm": 0.2660159170627594, "kl": 0.0113372802734375, "learning_rate": 8.319717151140072e-07, "loss": 0.0005, "reward": 0.16729554254561663, "reward_std": 0.17553477734327316, "rewards/cosine_scaled_reward": -0.10018930211663246, "rewards/format_reward": 0.8333333432674408, "step": 178 }, { "completion_length": 1726.9166870117188, "epoch": 0.20457142857142857, "grad_norm": 0.28772586584091187, "kl": 0.01047515869140625, "learning_rate": 8.295165011252396e-07, "loss": 0.0004, "reward": 0.055085474625229836, "reward_std": 0.13400842808187008, "rewards/cosine_scaled_reward": -0.24172668159008026, "rewards/format_reward": 0.6875000149011612, "step": 179 }, { "completion_length": 1345.9583740234375, "epoch": 0.2057142857142857, "grad_norm": 0.31531670689582825, "kl": 0.01061248779296875, "learning_rate": 8.270476638965461e-07, "loss": 0.0004, "reward": 0.3026774749159813, "reward_std": 0.21598787233233452, "rewards/cosine_scaled_reward": 0.16247792541980743, "rewards/format_reward": 0.8333333432674408, "step": 180 }, { "completion_length": 1659.3125915527344, "epoch": 0.20685714285714285, "grad_norm": 0.2561354339122772, "kl": 0.0099945068359375, "learning_rate": 8.245653237555705e-07, "loss": 0.0004, "reward": 0.10958964750170708, "reward_std": 0.12489994987845421, "rewards/cosine_scaled_reward": -0.16683262959122658, "rewards/format_reward": 0.7500000149011612, "step": 181 }, { "completion_length": 1639.8542022705078, "epoch": 0.208, "grad_norm": 0.23428499698638916, "kl": 0.0068817138671875, "learning_rate": 8.220696016880687e-07, "loss": 0.0003, "reward": 0.1196091203019023, "reward_std": 0.1348266238346696, "rewards/cosine_scaled_reward": -0.1318911425769329, "rewards/format_reward": 0.7291666865348816, "step": 182 }, { "completion_length": 914.8958587646484, "epoch": 0.20914285714285713, "grad_norm": 0.27674025297164917, "kl": 0.009857177734375, "learning_rate": 8.195606193320136e-07, "loss": 0.0004, "reward": 0.2514411360025406, "reward_std": 0.1492533190175891, "rewards/cosine_scaled_reward": -0.020158587023615837, "rewards/format_reward": 1.0, "step": 183 }, { "completion_length": 1333.4166870117188, "epoch": 0.2102857142857143, "grad_norm": 0.43891313672065735, "kl": 0.0129852294921875, "learning_rate": 8.170384989716657e-07, "loss": 0.0005, "reward": 0.10875812824815512, "reward_std": 0.12206488661468029, "rewards/cosine_scaled_reward": -0.20721979439258575, "rewards/format_reward": 0.833333358168602, "step": 184 }, { "completion_length": 1179.8542022705078, "epoch": 0.21142857142857144, "grad_norm": 0.2733169496059418, "kl": 0.0095977783203125, "learning_rate": 8.145033635316128e-07, "loss": 0.0004, "reward": 0.18451187200844288, "reward_std": 0.11414193641394377, "rewards/cosine_scaled_reward": -0.10295613948255777, "rewards/format_reward": 0.9166666716337204, "step": 185 }, { "completion_length": 1856.5000915527344, "epoch": 0.21257142857142858, "grad_norm": 0.2664458155632019, "kl": 0.01073455810546875, "learning_rate": 8.119553365707802e-07, "loss": 0.0004, "reward": 0.043668435886502266, "reward_std": 0.10437687300145626, "rewards/cosine_scaled_reward": -0.23532377928495407, "rewards/format_reward": 0.645833333954215, "step": 186 }, { "completion_length": 1468.1667175292969, "epoch": 0.21371428571428572, "grad_norm": 0.2900819778442383, "kl": 0.01163482666015625, "learning_rate": 8.093945422764069e-07, "loss": 0.0005, "reward": 0.08582017477601767, "reward_std": 0.06885203067213297, "rewards/cosine_scaled_reward": -0.26070513017475605, "rewards/format_reward": 0.8541666716337204, "step": 187 }, { "completion_length": 1885.1250305175781, "epoch": 0.21485714285714286, "grad_norm": 0.35954371094703674, "kl": 0.0137176513671875, "learning_rate": 8.068211054579943e-07, "loss": 0.0005, "reward": 0.022899489849805832, "reward_std": 0.13436525873839855, "rewards/cosine_scaled_reward": -0.2905898615717888, "rewards/format_reward": 0.6666666865348816, "step": 188 }, { "completion_length": 1000.6667175292969, "epoch": 0.216, "grad_norm": 0.4214836061000824, "kl": 0.011138916015625, "learning_rate": 8.04235151541222e-07, "loss": 0.0004, "reward": 0.12046158779412508, "reward_std": 0.11919869109988213, "rewards/cosine_scaled_reward": -0.21752063930034637, "rewards/format_reward": 0.8958333432674408, "step": 189 }, { "completion_length": 864.8333435058594, "epoch": 0.21714285714285714, "grad_norm": 0.31973347067832947, "kl": 0.0091705322265625, "learning_rate": 8.01636806561836e-07, "loss": 0.0004, "reward": 0.2107975222170353, "reward_std": 0.12566878646612167, "rewards/cosine_scaled_reward": -0.08725808188319206, "rewards/format_reward": 0.9791666716337204, "step": 190 }, { "completion_length": 880.0417022705078, "epoch": 0.21828571428571428, "grad_norm": 0.28507736325263977, "kl": 0.00927734375, "learning_rate": 7.990261971595048e-07, "loss": 0.0004, "reward": 0.2328539378941059, "reward_std": 0.11715810932219028, "rewards/cosine_scaled_reward": -0.04595772549510002, "rewards/format_reward": 0.9791666716337204, "step": 191 }, { "completion_length": 1500.5208435058594, "epoch": 0.21942857142857142, "grad_norm": 0.2340683490037918, "kl": 0.009979248046875, "learning_rate": 7.964034505716476e-07, "loss": 0.0004, "reward": 0.06548994965851307, "reward_std": 0.1052602413110435, "rewards/cosine_scaled_reward": -0.2918730489909649, "rewards/format_reward": 0.8333333432674408, "step": 192 }, { "completion_length": 1814.8333740234375, "epoch": 0.22057142857142858, "grad_norm": 0.41579189896583557, "kl": 0.0130615234375, "learning_rate": 7.93768694627233e-07, "loss": 0.0005, "reward": 0.07099857088178396, "reward_std": 0.21988878399133682, "rewards/cosine_scaled_reward": -0.20412776619195938, "rewards/format_reward": 0.666666679084301, "step": 193 }, { "completion_length": 1969.8125305175781, "epoch": 0.22171428571428572, "grad_norm": 0.24314527213573456, "kl": 0.0137786865234375, "learning_rate": 7.911220577405484e-07, "loss": 0.0006, "reward": 0.16568910889327526, "reward_std": 0.20464131236076355, "rewards/cosine_scaled_reward": -0.05609820410609245, "rewards/format_reward": 0.7500000298023224, "step": 194 }, { "completion_length": 895.2500152587891, "epoch": 0.22285714285714286, "grad_norm": 0.24260254204273224, "kl": 0.0110931396484375, "learning_rate": 7.884636689049422e-07, "loss": 0.0004, "reward": 0.20478515326976776, "reward_std": 0.15891419537365437, "rewards/cosine_scaled_reward": -0.09407798573374748, "rewards/format_reward": 0.9791666716337204, "step": 195 }, { "completion_length": 1451.8125610351562, "epoch": 0.224, "grad_norm": 0.294848769903183, "kl": 0.01446533203125, "learning_rate": 7.857936576865356e-07, "loss": 0.0006, "reward": 0.1500989394262433, "reward_std": 0.17363658919930458, "rewards/cosine_scaled_reward": -0.13708563521504402, "rewards/format_reward": 0.8541666865348816, "step": 196 }, { "completion_length": 878.0000305175781, "epoch": 0.22514285714285714, "grad_norm": 0.5663582682609558, "kl": 0.0134735107421875, "learning_rate": 7.831121542179086e-07, "loss": 0.0005, "reward": 0.2094090636819601, "reward_std": 0.18339472636580467, "rewards/cosine_scaled_reward": -0.07614962011575699, "rewards/format_reward": 0.9583333432674408, "step": 197 }, { "completion_length": 1199.4792022705078, "epoch": 0.22628571428571428, "grad_norm": 0.3617823123931885, "kl": 0.01446533203125, "learning_rate": 7.804192891917571e-07, "loss": 0.0006, "reward": 0.21828181482851505, "reward_std": 0.1545956265181303, "rewards/cosine_scaled_reward": 0.009393613785505295, "rewards/format_reward": 0.8125000149011612, "step": 198 }, { "completion_length": 1102.9583587646484, "epoch": 0.22742857142857142, "grad_norm": 0.30051255226135254, "kl": 0.012176513671875, "learning_rate": 7.777151938545235e-07, "loss": 0.0005, "reward": 0.12175077851861715, "reward_std": 0.14734287559986115, "rewards/cosine_scaled_reward": -0.21858063712716103, "rewards/format_reward": 0.8958333432674408, "step": 199 }, { "completion_length": 1168.0208740234375, "epoch": 0.22857142857142856, "grad_norm": 0.3199549913406372, "kl": 0.0143585205078125, "learning_rate": 7.75e-07, "loss": 0.0006, "reward": 0.21139243245124817, "reward_std": 0.21977539733052254, "rewards/cosine_scaled_reward": -0.05342887528240681, "rewards/format_reward": 0.9166666716337204, "step": 200 }, { "completion_length": 1201.8542022705078, "epoch": 0.2297142857142857, "grad_norm": 0.31781795620918274, "kl": 0.0167999267578125, "learning_rate": 7.72273839962904e-07, "loss": 0.0007, "reward": 0.2658701539039612, "reward_std": 0.16695143841207027, "rewards/cosine_scaled_reward": 0.11313707195222378, "rewards/format_reward": 0.7708333432674408, "step": 201 }, { "completion_length": 1165.8333740234375, "epoch": 0.23085714285714284, "grad_norm": 0.2970917224884033, "kl": 0.0138092041015625, "learning_rate": 7.695368466124296e-07, "loss": 0.0006, "reward": 0.30872404226101935, "reward_std": 0.10743364132940769, "rewards/cosine_scaled_reward": 0.15722724050283432, "rewards/format_reward": 0.8541666716337204, "step": 202 }, { "completion_length": 936.3750305175781, "epoch": 0.232, "grad_norm": 0.2789294123649597, "kl": 0.014068603515625, "learning_rate": 7.667891533457718e-07, "loss": 0.0006, "reward": 0.1976920198649168, "reward_std": 0.12933177780359983, "rewards/cosine_scaled_reward": -0.047469355165958405, "rewards/format_reward": 0.8541666716337204, "step": 203 }, { "completion_length": 823.8958740234375, "epoch": 0.23314285714285715, "grad_norm": 0.3535368740558624, "kl": 0.021209716796875, "learning_rate": 7.640308940816239e-07, "loss": 0.0008, "reward": 0.272999856621027, "reward_std": 0.13164477981626987, "rewards/cosine_scaled_reward": 0.03568187169730663, "rewards/format_reward": 0.9791666716337204, "step": 204 }, { "completion_length": 1051.4375305175781, "epoch": 0.2342857142857143, "grad_norm": 0.38391605019569397, "kl": 0.0142364501953125, "learning_rate": 7.612622032536507e-07, "loss": 0.0006, "reward": 0.31229156255722046, "reward_std": 0.18256139010190964, "rewards/cosine_scaled_reward": 0.09398576989769936, "rewards/format_reward": 0.9791666716337204, "step": 205 }, { "completion_length": 1573.5625305175781, "epoch": 0.23542857142857143, "grad_norm": 0.25632062554359436, "kl": 0.0153961181640625, "learning_rate": 7.584832158039378e-07, "loss": 0.0006, "reward": 0.08040904346853495, "reward_std": 0.13550865650177002, "rewards/cosine_scaled_reward": -0.24311146512627602, "rewards/format_reward": 0.7916666865348816, "step": 206 }, { "completion_length": 1301.8750305175781, "epoch": 0.23657142857142857, "grad_norm": 0.4267790615558624, "kl": 0.0200653076171875, "learning_rate": 7.556940671764124e-07, "loss": 0.0008, "reward": 0.1367853432893753, "reward_std": 0.15140823274850845, "rewards/cosine_scaled_reward": -0.1685753520578146, "rewards/format_reward": 0.8541666865348816, "step": 207 }, { "completion_length": 981.8541717529297, "epoch": 0.2377142857142857, "grad_norm": 0.30258461833000183, "kl": 0.01776123046875, "learning_rate": 7.528948933102438e-07, "loss": 0.0007, "reward": 0.23792664706707, "reward_std": 0.07922358997166157, "rewards/cosine_scaled_reward": 0.021127969026565552, "rewards/format_reward": 0.8750000149011612, "step": 208 }, { "completion_length": 871.6667022705078, "epoch": 0.23885714285714285, "grad_norm": 0.39612019062042236, "kl": 0.020599365234375, "learning_rate": 7.500858306332172e-07, "loss": 0.0008, "reward": 0.2716928757727146, "reward_std": 0.07322061248123646, "rewards/cosine_scaled_reward": 0.03265971131622791, "rewards/format_reward": 0.9791666716337204, "step": 209 }, { "completion_length": 1105.1666870117188, "epoch": 0.24, "grad_norm": 0.2929202616214752, "kl": 0.0206146240234375, "learning_rate": 7.472670160550848e-07, "loss": 0.0008, "reward": 0.1834174320101738, "reward_std": 0.13372715562582016, "rewards/cosine_scaled_reward": -0.10380983352661133, "rewards/format_reward": 0.9166666865348816, "step": 210 }, { "completion_length": 1300.9167022705078, "epoch": 0.24114285714285713, "grad_norm": 0.3637829124927521, "kl": 0.025543212890625, "learning_rate": 7.444385869608921e-07, "loss": 0.001, "reward": 0.2249005874618888, "reward_std": 0.1836324967443943, "rewards/cosine_scaled_reward": 0.016723649576306343, "rewards/format_reward": 0.833333358168602, "step": 211 }, { "completion_length": 604.8958511352539, "epoch": 0.2422857142857143, "grad_norm": 0.3450697362422943, "kl": 0.020843505859375, "learning_rate": 7.416006812042827e-07, "loss": 0.0008, "reward": 0.4489349313080311, "reward_std": 0.18488763272762299, "rewards/cosine_scaled_reward": 0.34709828346967697, "rewards/format_reward": 1.0, "step": 212 }, { "completion_length": 895.4583740234375, "epoch": 0.24342857142857144, "grad_norm": 0.4644909203052521, "kl": 0.03045654296875, "learning_rate": 7.387534371007797e-07, "loss": 0.0012, "reward": 0.20922063663601875, "reward_std": 0.15041885478422046, "rewards/cosine_scaled_reward": -0.07130975648760796, "rewards/format_reward": 0.9375000149011612, "step": 213 }, { "completion_length": 1157.5208587646484, "epoch": 0.24457142857142858, "grad_norm": 0.37042832374572754, "kl": 0.0235595703125, "learning_rate": 7.358969934210438e-07, "loss": 0.0009, "reward": 0.2893129959702492, "reward_std": 0.10477330908179283, "rewards/cosine_scaled_reward": 0.09050430357456207, "rewards/format_reward": 0.9166666865348816, "step": 214 }, { "completion_length": 1058.2292175292969, "epoch": 0.24571428571428572, "grad_norm": 0.3636704683303833, "kl": 0.0178985595703125, "learning_rate": 7.330314893841101e-07, "loss": 0.0007, "reward": 0.16766999289393425, "reward_std": 0.10021563433110714, "rewards/cosine_scaled_reward": -0.11640440672636032, "rewards/format_reward": 0.8750000149011612, "step": 215 }, { "completion_length": 720.9583587646484, "epoch": 0.24685714285714286, "grad_norm": 0.3740937411785126, "kl": 0.028900146484375, "learning_rate": 7.301570646506027e-07, "loss": 0.0012, "reward": 0.40459009259939194, "reward_std": 0.15503079071640968, "rewards/cosine_scaled_reward": 0.2827944550663233, "rewards/format_reward": 0.9791666716337204, "step": 216 }, { "completion_length": 911.9375305175781, "epoch": 0.248, "grad_norm": 0.3024314343929291, "kl": 0.026031494140625, "learning_rate": 7.27273859315928e-07, "loss": 0.001, "reward": 0.27230124548077583, "reward_std": 0.1195518053136766, "rewards/cosine_scaled_reward": 0.0563269704580307, "rewards/format_reward": 0.9375, "step": 217 }, { "completion_length": 943.0000152587891, "epoch": 0.24914285714285714, "grad_norm": 0.3462475836277008, "kl": 0.02203369140625, "learning_rate": 7.243820139034464e-07, "loss": 0.0009, "reward": 0.18083516135811806, "reward_std": 0.11691945977509022, "rewards/cosine_scaled_reward": -0.11971011944115162, "rewards/format_reward": 0.9375, "step": 218 }, { "completion_length": 860.8958740234375, "epoch": 0.2502857142857143, "grad_norm": 0.5894182920455933, "kl": 0.042236328125, "learning_rate": 7.214816693576234e-07, "loss": 0.0017, "reward": 0.28585051745176315, "reward_std": 0.17869126796722412, "rewards/cosine_scaled_reward": 0.06777300871908665, "rewards/format_reward": 0.9583333432674408, "step": 219 }, { "completion_length": 1099.4792175292969, "epoch": 0.25142857142857145, "grad_norm": 0.6433284282684326, "kl": 0.0411224365234375, "learning_rate": 7.185729670371604e-07, "loss": 0.0016, "reward": 0.136014673858881, "reward_std": 0.10110826417803764, "rewards/cosine_scaled_reward": -0.178237933665514, "rewards/format_reward": 0.8750000149011612, "step": 220 }, { "completion_length": 755.8958435058594, "epoch": 0.25257142857142856, "grad_norm": 0.31108343601226807, "kl": 0.026397705078125, "learning_rate": 7.156560487081051e-07, "loss": 0.0011, "reward": 0.28532416746020317, "reward_std": 0.13717094622552395, "rewards/cosine_scaled_reward": 0.05212143436074257, "rewards/format_reward": 1.0, "step": 221 }, { "completion_length": 852.5000152587891, "epoch": 0.2537142857142857, "grad_norm": 0.4194568395614624, "kl": 0.02886962890625, "learning_rate": 7.127310565369415e-07, "loss": 0.0012, "reward": 0.28025223314762115, "reward_std": 0.11780218174681067, "rewards/cosine_scaled_reward": 0.06205835938453674, "rewards/format_reward": 0.9375000149011612, "step": 222 }, { "completion_length": 974.6250305175781, "epoch": 0.25485714285714284, "grad_norm": 0.3448916971683502, "kl": 0.024322509765625, "learning_rate": 7.097981330836616e-07, "loss": 0.001, "reward": 0.2804653272032738, "reward_std": 0.11768303625285625, "rewards/cosine_scaled_reward": 0.08824966102838516, "rewards/format_reward": 0.8750000149011612, "step": 223 }, { "completion_length": 1306.0833740234375, "epoch": 0.256, "grad_norm": 0.2909132242202759, "kl": 0.023040771484375, "learning_rate": 7.068574212948169e-07, "loss": 0.0009, "reward": 0.15027360804378986, "reward_std": 0.10174741875380278, "rewards/cosine_scaled_reward": -0.14634443912655115, "rewards/format_reward": 0.875, "step": 224 }, { "completion_length": 896.0833435058594, "epoch": 0.2571428571428571, "grad_norm": 0.36576658487319946, "kl": 0.024383544921875, "learning_rate": 7.039090644965509e-07, "loss": 0.001, "reward": 0.24017007276415825, "reward_std": 0.14254865422844887, "rewards/cosine_scaled_reward": -0.03498839866369963, "rewards/format_reward": 0.9791666716337204, "step": 225 }, { "completion_length": 1024.3542175292969, "epoch": 0.2582857142857143, "grad_norm": 0.2723851799964905, "kl": 0.022674560546875, "learning_rate": 7.009532063876148e-07, "loss": 0.0009, "reward": 0.3273099660873413, "reward_std": 0.1705116555094719, "rewards/cosine_scaled_reward": 0.13967445865273476, "rewards/format_reward": 0.9791666716337204, "step": 226 }, { "completion_length": 715.3958511352539, "epoch": 0.25942857142857145, "grad_norm": 0.46781402826309204, "kl": 0.02642822265625, "learning_rate": 6.979899910323624e-07, "loss": 0.0011, "reward": 0.23077429085969925, "reward_std": 0.1300568049773574, "rewards/cosine_scaled_reward": -0.04618716798722744, "rewards/format_reward": 0.9583333432674408, "step": 227 }, { "completion_length": 701.3333587646484, "epoch": 0.26057142857142856, "grad_norm": 0.5052315592765808, "kl": 0.03411865234375, "learning_rate": 6.950195628537299e-07, "loss": 0.0014, "reward": 0.34306391701102257, "reward_std": 0.17169221863150597, "rewards/cosine_scaled_reward": 0.15333737805485725, "rewards/format_reward": 0.9791666716337204, "step": 228 }, { "completion_length": 923.5625152587891, "epoch": 0.26171428571428573, "grad_norm": 0.3489846885204315, "kl": 0.027099609375, "learning_rate": 6.920420666261961e-07, "loss": 0.0011, "reward": 0.30176902934908867, "reward_std": 0.09760609082877636, "rewards/cosine_scaled_reward": 0.12013271264731884, "rewards/format_reward": 0.895833358168602, "step": 229 }, { "completion_length": 1267.5000305175781, "epoch": 0.26285714285714284, "grad_norm": 0.4087165594100952, "kl": 0.021514892578125, "learning_rate": 6.890576474687263e-07, "loss": 0.0009, "reward": 0.16810158640146255, "reward_std": 0.1460794433951378, "rewards/cosine_scaled_reward": -0.12243080325424671, "rewards/format_reward": 0.8958333432674408, "step": 230 }, { "completion_length": 1029.8542175292969, "epoch": 0.264, "grad_norm": 0.34104540944099426, "kl": 0.02410888671875, "learning_rate": 6.860664508377001e-07, "loss": 0.001, "reward": 0.23954131081700325, "reward_std": 0.15675314888358116, "rewards/cosine_scaled_reward": 0.012257816269993782, "rewards/format_reward": 0.8958333432674408, "step": 231 }, { "completion_length": 998.5417022705078, "epoch": 0.2651428571428571, "grad_norm": 0.36861056089401245, "kl": 0.025665283203125, "learning_rate": 6.83068622519821e-07, "loss": 0.001, "reward": 0.1624880088493228, "reward_std": 0.11741040972992778, "rewards/cosine_scaled_reward": -0.17423859052360058, "rewards/format_reward": 0.9583333432674408, "step": 232 }, { "completion_length": 762.1666870117188, "epoch": 0.2662857142857143, "grad_norm": 0.39426419138908386, "kl": 0.02581787109375, "learning_rate": 6.800643086250121e-07, "loss": 0.001, "reward": 0.17850053682923317, "reward_std": 0.04843510780483484, "rewards/cosine_scaled_reward": -0.15792253613471985, "rewards/format_reward": 1.0, "step": 233 }, { "completion_length": 1197.9167175292969, "epoch": 0.2674285714285714, "grad_norm": 0.4176238179206848, "kl": 0.0281982421875, "learning_rate": 6.770536555792944e-07, "loss": 0.0011, "reward": 0.19166827760636806, "reward_std": 0.08724211249500513, "rewards/cosine_scaled_reward": -0.06389070302248001, "rewards/format_reward": 0.8541666716337204, "step": 234 }, { "completion_length": 955.2916870117188, "epoch": 0.26857142857142857, "grad_norm": 0.36116015911102295, "kl": 0.025665283203125, "learning_rate": 6.740368101176495e-07, "loss": 0.001, "reward": 0.3791688084602356, "reward_std": 0.148554977029562, "rewards/cosine_scaled_reward": 0.2871663346886635, "rewards/format_reward": 0.8750000149011612, "step": 235 }, { "completion_length": 1169.2083740234375, "epoch": 0.26971428571428574, "grad_norm": 0.33890408277511597, "kl": 0.026214599609375, "learning_rate": 6.710139192768694e-07, "loss": 0.001, "reward": 0.20721980184316635, "reward_std": 0.20111924316734076, "rewards/cosine_scaled_reward": -0.03280853480100632, "rewards/format_reward": 0.8541666716337204, "step": 236 }, { "completion_length": 1131.5833435058594, "epoch": 0.27085714285714285, "grad_norm": 0.3021850883960724, "kl": 0.02716064453125, "learning_rate": 6.679851303883891e-07, "loss": 0.0011, "reward": 0.2407778836786747, "reward_std": 0.1393668632954359, "rewards/cosine_scaled_reward": 0.03790582902729511, "rewards/format_reward": 0.8541666716337204, "step": 237 }, { "completion_length": 834.9375305175781, "epoch": 0.272, "grad_norm": 0.42039623856544495, "kl": 0.028900146484375, "learning_rate": 6.649505910711058e-07, "loss": 0.0012, "reward": 0.29237839579582214, "reward_std": 0.12461261451244354, "rewards/cosine_scaled_reward": 0.07683929987251759, "rewards/format_reward": 0.9583333432674408, "step": 238 }, { "completion_length": 969.5417022705078, "epoch": 0.27314285714285713, "grad_norm": 0.41346174478530884, "kl": 0.0263671875, "learning_rate": 6.619104492241847e-07, "loss": 0.0011, "reward": 0.33568411134183407, "reward_std": 0.16493239253759384, "rewards/cosine_scaled_reward": 0.17511842213571072, "rewards/format_reward": 0.9166666865348816, "step": 239 }, { "completion_length": 789.6250305175781, "epoch": 0.2742857142857143, "grad_norm": 0.6287127733230591, "kl": 0.0360107421875, "learning_rate": 6.588648530198504e-07, "loss": 0.0014, "reward": 0.22957515716552734, "reward_std": 0.13560683466494083, "rewards/cosine_scaled_reward": -0.04795573343290016, "rewards/format_reward": 0.9791666716337204, "step": 240 }, { "completion_length": 1139.4583892822266, "epoch": 0.2754285714285714, "grad_norm": 0.46157172322273254, "kl": 0.031707763671875, "learning_rate": 6.558139508961654e-07, "loss": 0.0013, "reward": 0.14395110495388508, "reward_std": 0.17530641239136457, "rewards/cosine_scaled_reward": -0.15174288675189018, "rewards/format_reward": 0.8333333432674408, "step": 241 }, { "completion_length": 829.8333435058594, "epoch": 0.2765714285714286, "grad_norm": 0.6682753562927246, "kl": 0.03924560546875, "learning_rate": 6.527578915497951e-07, "loss": 0.0016, "reward": 0.23567525297403336, "reward_std": 0.10210139374248683, "rewards/cosine_scaled_reward": -0.014930504374206066, "rewards/format_reward": 0.9375000149011612, "step": 242 }, { "completion_length": 1312.354232788086, "epoch": 0.2777142857142857, "grad_norm": 0.3977852463722229, "kl": 0.031341552734375, "learning_rate": 6.496968239287603e-07, "loss": 0.0013, "reward": 0.18383393343538046, "reward_std": 0.17372377216815948, "rewards/cosine_scaled_reward": -0.05972663313150406, "rewards/format_reward": 0.8125000149011612, "step": 243 }, { "completion_length": 1070.1458740234375, "epoch": 0.27885714285714286, "grad_norm": 0.43918874859809875, "kl": 0.02935791015625, "learning_rate": 6.466308972251785e-07, "loss": 0.0012, "reward": 0.33063168078660965, "reward_std": 0.2109587863087654, "rewards/cosine_scaled_reward": 0.16319299302995205, "rewards/format_reward": 0.9375000149011612, "step": 244 }, { "completion_length": 1171.3125457763672, "epoch": 0.28, "grad_norm": 0.36638176441192627, "kl": 0.0313720703125, "learning_rate": 6.435602608679916e-07, "loss": 0.0013, "reward": 0.2441106867045164, "reward_std": 0.241492111235857, "rewards/cosine_scaled_reward": 0.011673031374812126, "rewards/format_reward": 0.895833358168602, "step": 245 }, { "completion_length": 978.7083740234375, "epoch": 0.28114285714285714, "grad_norm": 0.3780803084373474, "kl": 0.03387451171875, "learning_rate": 6.404850645156841e-07, "loss": 0.0014, "reward": 0.2604539059102535, "reward_std": 0.13962816260755062, "rewards/cosine_scaled_reward": 0.054232267662882805, "rewards/format_reward": 0.8958333432674408, "step": 246 }, { "completion_length": 1271.2083587646484, "epoch": 0.2822857142857143, "grad_norm": 0.5234243869781494, "kl": 0.030364990234375, "learning_rate": 6.374054580489873e-07, "loss": 0.0012, "reward": 0.1341675203293562, "reward_std": 0.137354108504951, "rewards/cosine_scaled_reward": -0.1583083188161254, "rewards/format_reward": 0.8333333432674408, "step": 247 }, { "completion_length": 912.6666870117188, "epoch": 0.2834285714285714, "grad_norm": 0.42227983474731445, "kl": 0.03240966796875, "learning_rate": 6.343215915635761e-07, "loss": 0.0013, "reward": 0.34676025272347033, "reward_std": 0.14511901792138815, "rewards/cosine_scaled_reward": 0.2087520956993103, "rewards/format_reward": 0.8958333432674408, "step": 248 }, { "completion_length": 1055.1042022705078, "epoch": 0.2845714285714286, "grad_norm": 0.4296765625476837, "kl": 0.036590576171875, "learning_rate": 6.31233615362752e-07, "loss": 0.0015, "reward": 0.38768328726291656, "reward_std": 0.19723143242299557, "rewards/cosine_scaled_reward": 0.32842716574668884, "rewards/format_reward": 0.8125, "step": 249 }, { "completion_length": 900.5000152587891, "epoch": 0.2857142857142857, "grad_norm": 0.5194025635719299, "kl": 0.035919189453125, "learning_rate": 6.281416799501187e-07, "loss": 0.0014, "reward": 0.23128428310155869, "reward_std": 0.14652666449546814, "rewards/cosine_scaled_reward": -0.04282047459855676, "rewards/format_reward": 0.9583333432674408, "step": 250 }, { "completion_length": 789.0833435058594, "epoch": 0.28685714285714287, "grad_norm": 0.7570492625236511, "kl": 0.038604736328125, "learning_rate": 6.25045936022246e-07, "loss": 0.0015, "reward": 0.2817927971482277, "reward_std": 0.15719792805612087, "rewards/cosine_scaled_reward": 0.08707437012344599, "rewards/format_reward": 0.9166666716337204, "step": 251 }, { "completion_length": 967.3750305175781, "epoch": 0.288, "grad_norm": 1.0730613470077515, "kl": 0.037933349609375, "learning_rate": 6.219465344613258e-07, "loss": 0.0015, "reward": 0.22731942497193813, "reward_std": 0.06802771054208279, "rewards/cosine_scaled_reward": -0.03143314644694328, "rewards/format_reward": 0.9166666865348816, "step": 252 }, { "completion_length": 1083.3750305175781, "epoch": 0.28914285714285715, "grad_norm": 0.5252629518508911, "kl": 0.04913330078125, "learning_rate": 6.188436263278172e-07, "loss": 0.002, "reward": 0.2587835565209389, "reward_std": 0.22102728486061096, "rewards/cosine_scaled_reward": 0.03293989598751068, "rewards/format_reward": 0.895833358168602, "step": 253 }, { "completion_length": 1079.0208587646484, "epoch": 0.29028571428571426, "grad_norm": 0.5835139751434326, "kl": 0.040008544921875, "learning_rate": 6.157373628530852e-07, "loss": 0.0016, "reward": 0.2297833226621151, "reward_std": 0.1996668577194214, "rewards/cosine_scaled_reward": -0.020830905064940453, "rewards/format_reward": 0.9166666716337204, "step": 254 }, { "completion_length": 1205.0833587646484, "epoch": 0.2914285714285714, "grad_norm": 0.5719550848007202, "kl": 0.03509521484375, "learning_rate": 6.126278954320294e-07, "loss": 0.0014, "reward": 0.14546580612659454, "reward_std": 0.15089455991983414, "rewards/cosine_scaled_reward": -0.14702815748751163, "rewards/format_reward": 0.8541666865348816, "step": 255 }, { "completion_length": 993.3750457763672, "epoch": 0.2925714285714286, "grad_norm": 0.5411979556083679, "kl": 0.036834716796875, "learning_rate": 6.095153756157051e-07, "loss": 0.0015, "reward": 0.23095286265015602, "reward_std": 0.09542735107243061, "rewards/cosine_scaled_reward": -0.014969693031162024, "rewards/format_reward": 0.9166666865348816, "step": 256 }, { "completion_length": 1043.5208740234375, "epoch": 0.2937142857142857, "grad_norm": 0.363566130399704, "kl": 0.030120849609375, "learning_rate": 6.06399955103937e-07, "loss": 0.0012, "reward": 0.26867521926760674, "reward_std": 0.19283109530806541, "rewards/cosine_scaled_reward": 0.029048915952444077, "rewards/format_reward": 0.9791666716337204, "step": 257 }, { "completion_length": 1490.562515258789, "epoch": 0.2948571428571429, "grad_norm": 0.4172511696815491, "kl": 0.036346435546875, "learning_rate": 6.032817857379256e-07, "loss": 0.0015, "reward": 0.24154049530625343, "reward_std": 0.22705786675214767, "rewards/cosine_scaled_reward": 0.04046456143260002, "rewards/format_reward": 0.8541666865348816, "step": 258 }, { "completion_length": 878.7500457763672, "epoch": 0.296, "grad_norm": 0.44050225615501404, "kl": 0.040191650390625, "learning_rate": 6.001610194928464e-07, "loss": 0.0016, "reward": 0.3169959019869566, "reward_std": 0.10027684271335602, "rewards/cosine_scaled_reward": 0.12391193583607674, "rewards/format_reward": 0.9583333432674408, "step": 259 }, { "completion_length": 697.8541946411133, "epoch": 0.29714285714285715, "grad_norm": 0.5850368738174438, "kl": 0.05401611328125, "learning_rate": 5.97037808470444e-07, "loss": 0.0022, "reward": 0.3795708492398262, "reward_std": 0.14518114551901817, "rewards/cosine_scaled_reward": 0.24731983616948128, "rewards/format_reward": 0.9583333432674408, "step": 260 }, { "completion_length": 1203.5208435058594, "epoch": 0.29828571428571427, "grad_norm": 1.0009416341781616, "kl": 0.041900634765625, "learning_rate": 5.939123048916173e-07, "loss": 0.0017, "reward": 0.1595192812383175, "reward_std": 0.15826651267707348, "rewards/cosine_scaled_reward": -0.1179631557315588, "rewards/format_reward": 0.8541666865348816, "step": 261 }, { "completion_length": 1150.1250305175781, "epoch": 0.29942857142857143, "grad_norm": 0.6016647219657898, "kl": 0.0426025390625, "learning_rate": 5.907846610890011e-07, "loss": 0.0017, "reward": 0.13380123488605022, "reward_std": 0.07536024926230311, "rewards/cosine_scaled_reward": -0.17903191782534122, "rewards/format_reward": 0.8750000149011612, "step": 262 }, { "completion_length": 1022.6458740234375, "epoch": 0.30057142857142854, "grad_norm": 0.6816222667694092, "kl": 0.04901123046875, "learning_rate": 5.87655029499542e-07, "loss": 0.002, "reward": 0.18971112743020058, "reward_std": 0.11624582670629025, "rewards/cosine_scaled_reward": -0.12517122831195593, "rewards/format_reward": 0.9791666716337204, "step": 263 }, { "completion_length": 949.7917022705078, "epoch": 0.3017142857142857, "grad_norm": 0.5342040657997131, "kl": 0.0382080078125, "learning_rate": 5.845235626570683e-07, "loss": 0.0015, "reward": 0.2280478999018669, "reward_std": 0.14583226293325424, "rewards/cosine_scaled_reward": -0.03753554215654731, "rewards/format_reward": 0.9375000149011612, "step": 264 }, { "completion_length": 884.4375457763672, "epoch": 0.3028571428571429, "grad_norm": 0.6255229115486145, "kl": 0.03778076171875, "learning_rate": 5.813904131848564e-07, "loss": 0.0015, "reward": 0.21849878132343292, "reward_std": 0.15134539641439915, "rewards/cosine_scaled_reward": -0.03120946791023016, "rewards/format_reward": 0.9166666865348816, "step": 265 }, { "completion_length": 1083.125015258789, "epoch": 0.304, "grad_norm": 231.69403076171875, "kl": 2.13037109375, "learning_rate": 5.78255733788191e-07, "loss": 0.0847, "reward": 0.199378851801157, "reward_std": 0.16411220282316208, "rewards/cosine_scaled_reward": -0.05068040080368519, "rewards/format_reward": 0.8750000149011612, "step": 266 }, { "completion_length": 1565.2500305175781, "epoch": 0.30514285714285716, "grad_norm": 0.4458254873752594, "kl": 0.07904052734375, "learning_rate": 5.751196772469237e-07, "loss": 0.0032, "reward": 0.08514338824898005, "reward_std": 0.040075195487588644, "rewards/cosine_scaled_reward": -0.20036707818508148, "rewards/format_reward": 0.7291666716337204, "step": 267 }, { "completion_length": 750.1666717529297, "epoch": 0.3062857142857143, "grad_norm": 0.46984851360321045, "kl": 0.044677734375, "learning_rate": 5.71982396408026e-07, "loss": 0.0018, "reward": 0.2486685924232006, "reward_std": 0.15828723087906837, "rewards/cosine_scaled_reward": 0.00051740906201303, "rewards/format_reward": 0.9583333432674408, "step": 268 }, { "completion_length": 1221.5833587646484, "epoch": 0.30742857142857144, "grad_norm": 0.4291362762451172, "kl": 0.04345703125, "learning_rate": 5.688440441781398e-07, "loss": 0.0017, "reward": 0.23198699206113815, "reward_std": 0.1171283945441246, "rewards/cosine_scaled_reward": 0.009990142658352852, "rewards/format_reward": 0.8750000149011612, "step": 269 }, { "completion_length": 1106.333366394043, "epoch": 0.30857142857142855, "grad_norm": 0.37922003865242004, "kl": 0.050811767578125, "learning_rate": 5.657047735161255e-07, "loss": 0.002, "reward": 0.30448032915592194, "reward_std": 0.2079874724149704, "rewards/cosine_scaled_reward": 0.13500287476927042, "rewards/format_reward": 0.9166666716337204, "step": 270 }, { "completion_length": 952.1667022705078, "epoch": 0.3097142857142857, "grad_norm": 0.5318814516067505, "kl": 0.0511474609375, "learning_rate": 5.625647374256061e-07, "loss": 0.002, "reward": 0.29093681275844574, "reward_std": 0.07009987439960241, "rewards/cosine_scaled_reward": 0.10035991668701172, "rewards/format_reward": 0.9166666716337204, "step": 271 }, { "completion_length": 1200.0833740234375, "epoch": 0.31085714285714283, "grad_norm": 1.1470524072647095, "kl": 0.0628662109375, "learning_rate": 5.594240889475106e-07, "loss": 0.0025, "reward": 0.18854557536542416, "reward_std": 0.16053328290581703, "rewards/cosine_scaled_reward": -0.05628577619791031, "rewards/format_reward": 0.8333333432674408, "step": 272 }, { "completion_length": 865.2917022705078, "epoch": 0.312, "grad_norm": 0.6232411861419678, "kl": 0.05157470703125, "learning_rate": 5.562829811526154e-07, "loss": 0.0021, "reward": 0.29768793657422066, "reward_std": 0.14682695735245943, "rewards/cosine_scaled_reward": 0.10405893321149051, "rewards/format_reward": 0.9375000149011612, "step": 273 }, { "completion_length": 670.333366394043, "epoch": 0.31314285714285717, "grad_norm": 0.6015110015869141, "kl": 0.0523681640625, "learning_rate": 5.531415671340826e-07, "loss": 0.0021, "reward": 0.29236118495464325, "reward_std": 0.203184824436903, "rewards/cosine_scaled_reward": 0.09015073929913342, "rewards/format_reward": 0.9375000149011612, "step": 274 }, { "completion_length": 1250.3750305175781, "epoch": 0.3142857142857143, "grad_norm": 0.9596053957939148, "kl": 0.08544921875, "learning_rate": 5.5e-07, "loss": 0.0034, "reward": 0.2736106924712658, "reward_std": 0.17104216665029526, "rewards/cosine_scaled_reward": 0.09908981248736382, "rewards/format_reward": 0.8541666716337204, "step": 275 }, { "completion_length": 841.0208511352539, "epoch": 0.31542857142857145, "grad_norm": 0.7267608642578125, "kl": 0.07403564453125, "learning_rate": 5.468584328659172e-07, "loss": 0.003, "reward": 0.22102046012878418, "reward_std": 0.12854661233723164, "rewards/cosine_scaled_reward": -0.028625067323446274, "rewards/format_reward": 0.9166666716337204, "step": 276 }, { "completion_length": 798.1458511352539, "epoch": 0.31657142857142856, "grad_norm": 0.9901447892189026, "kl": 0.062255859375, "learning_rate": 5.437170188473847e-07, "loss": 0.0025, "reward": 0.333826519548893, "reward_std": 0.15261034481227398, "rewards/cosine_scaled_reward": 0.1640820149332285, "rewards/format_reward": 0.9583333432674408, "step": 277 }, { "completion_length": 1076.7500305175781, "epoch": 0.3177142857142857, "grad_norm": 0.7171893119812012, "kl": 0.0806884765625, "learning_rate": 5.405759110524894e-07, "loss": 0.0032, "reward": 0.2774466313421726, "reward_std": 0.08555892016738653, "rewards/cosine_scaled_reward": 0.05030255578458309, "rewards/format_reward": 0.9375000149011612, "step": 278 }, { "completion_length": 1309.4167175292969, "epoch": 0.31885714285714284, "grad_norm": 1.2347135543823242, "kl": 0.0941162109375, "learning_rate": 5.37435262574394e-07, "loss": 0.0038, "reward": 0.2149769775569439, "reward_std": 0.15968616120517254, "rewards/cosine_scaled_reward": -0.03698759526014328, "rewards/format_reward": 0.895833358168602, "step": 279 }, { "completion_length": 1025.1250305175781, "epoch": 0.32, "grad_norm": 0.9825677275657654, "kl": 0.119384765625, "learning_rate": 5.342952264838747e-07, "loss": 0.0048, "reward": 0.39643559604883194, "reward_std": 0.2180429883301258, "rewards/cosine_scaled_reward": 0.2991796247661114, "rewards/format_reward": 0.9166666865348816, "step": 280 }, { "completion_length": 1768.3959045410156, "epoch": 0.3211428571428571, "grad_norm": 1.470959186553955, "kl": 0.212890625, "learning_rate": 5.311559558218603e-07, "loss": 0.0085, "reward": 0.1398650612682104, "reward_std": 0.20232918485999107, "rewards/cosine_scaled_reward": -0.08653179882094264, "rewards/format_reward": 0.708333358168602, "step": 281 }, { "completion_length": 946.9583435058594, "epoch": 0.3222857142857143, "grad_norm": 0.9021422863006592, "kl": 0.12164306640625, "learning_rate": 5.28017603591974e-07, "loss": 0.0049, "reward": 0.3111142925918102, "reward_std": 0.13951029535382986, "rewards/cosine_scaled_reward": 0.1274205455556512, "rewards/format_reward": 0.9375000149011612, "step": 282 }, { "completion_length": 1651.0000915527344, "epoch": 0.32342857142857145, "grad_norm": 1.3504232168197632, "kl": 0.2061767578125, "learning_rate": 5.248803227530763e-07, "loss": 0.0083, "reward": 0.24972382094711065, "reward_std": 0.2437318116426468, "rewards/cosine_scaled_reward": 0.0964762894436717, "rewards/format_reward": 0.7708333507180214, "step": 283 }, { "completion_length": 753.3750305175781, "epoch": 0.32457142857142857, "grad_norm": 0.621091365814209, "kl": 0.04949951171875, "learning_rate": 5.21744266211809e-07, "loss": 0.002, "reward": 0.22959614172577858, "reward_std": 0.09632645733654499, "rewards/cosine_scaled_reward": -0.04986579902470112, "rewards/format_reward": 0.9791666716337204, "step": 284 }, { "completion_length": 664.3541717529297, "epoch": 0.32571428571428573, "grad_norm": 0.7218449115753174, "kl": 0.06561279296875, "learning_rate": 5.186095868151436e-07, "loss": 0.0026, "reward": 0.26567720249295235, "reward_std": 0.0979077285155654, "rewards/cosine_scaled_reward": 0.012893570587038994, "rewards/format_reward": 0.9791666716337204, "step": 285 }, { "completion_length": 982.6875457763672, "epoch": 0.32685714285714285, "grad_norm": 1.1418261528015137, "kl": 0.12713623046875, "learning_rate": 5.154764373429315e-07, "loss": 0.0051, "reward": 0.21292153897229582, "reward_std": 0.12249759212136269, "rewards/cosine_scaled_reward": -0.03761478420346975, "rewards/format_reward": 0.8958333432674408, "step": 286 }, { "completion_length": 719.6250305175781, "epoch": 0.328, "grad_norm": 1.5979467630386353, "kl": 0.135498046875, "learning_rate": 5.123449705004581e-07, "loss": 0.0054, "reward": 0.2671673148870468, "reward_std": 0.1586742140352726, "rewards/cosine_scaled_reward": 0.039879145566374063, "rewards/format_reward": 0.9375, "step": 287 }, { "completion_length": 1212.6250305175781, "epoch": 0.3291428571428571, "grad_norm": 1.60840904712677, "kl": 0.2393798828125, "learning_rate": 5.09215338910999e-07, "loss": 0.0096, "reward": 0.1874447762966156, "reward_std": 0.18980350345373154, "rewards/cosine_scaled_reward": -0.07510642020497471, "rewards/format_reward": 0.8541666716337204, "step": 288 }, { "completion_length": 839.2916946411133, "epoch": 0.3302857142857143, "grad_norm": 1.0780935287475586, "kl": 0.0870361328125, "learning_rate": 5.060876951083828e-07, "loss": 0.0035, "reward": 0.32771413400769234, "reward_std": 0.0728826243430376, "rewards/cosine_scaled_reward": 0.15436820732429624, "rewards/format_reward": 0.9583333432674408, "step": 289 }, { "completion_length": 715.3750305175781, "epoch": 0.3314285714285714, "grad_norm": 1.1252237558364868, "kl": 0.13330078125, "learning_rate": 5.02962191529556e-07, "loss": 0.0053, "reward": 0.30614541471004486, "reward_std": 0.14806292206048965, "rewards/cosine_scaled_reward": 0.09028282668441534, "rewards/format_reward": 0.9791666716337204, "step": 290 }, { "completion_length": 1231.5208587646484, "epoch": 0.3325714285714286, "grad_norm": 1.5286831855773926, "kl": 0.339599609375, "learning_rate": 4.998389805071536e-07, "loss": 0.0136, "reward": 0.24789002537727356, "reward_std": 0.18409395217895508, "rewards/cosine_scaled_reward": 0.038463436998426914, "rewards/format_reward": 0.8750000149011612, "step": 291 }, { "completion_length": 1147.9375305175781, "epoch": 0.33371428571428574, "grad_norm": 2.318183183670044, "kl": 0.29669189453125, "learning_rate": 4.967182142620745e-07, "loss": 0.0119, "reward": 0.16479212266858667, "reward_std": 0.12424674537032843, "rewards/cosine_scaled_reward": -0.13350017089396715, "rewards/format_reward": 0.8958333432674408, "step": 292 }, { "completion_length": 702.208366394043, "epoch": 0.33485714285714285, "grad_norm": 1.4020707607269287, "kl": 0.141845703125, "learning_rate": 4.93600044896063e-07, "loss": 0.0057, "reward": 0.2788621224462986, "reward_std": 0.0915048886090517, "rewards/cosine_scaled_reward": 0.037897709757089615, "rewards/format_reward": 0.9791666716337204, "step": 293 }, { "completion_length": 1155.2291793823242, "epoch": 0.336, "grad_norm": 2.4730300903320312, "kl": 0.3013916015625, "learning_rate": 4.904846243842949e-07, "loss": 0.0121, "reward": 0.24240204505622387, "reward_std": 0.16233912482857704, "rewards/cosine_scaled_reward": 0.022549863904714584, "rewards/format_reward": 0.8750000149011612, "step": 294 }, { "completion_length": 1060.7917175292969, "epoch": 0.33714285714285713, "grad_norm": 4.8620381355285645, "kl": 0.2877197265625, "learning_rate": 4.873721045679706e-07, "loss": 0.0115, "reward": 0.3373976834118366, "reward_std": 0.09286624100059271, "rewards/cosine_scaled_reward": 0.20970024168491364, "rewards/format_reward": 0.8750000149011612, "step": 295 }, { "completion_length": 919.1042022705078, "epoch": 0.3382857142857143, "grad_norm": 5.048527240753174, "kl": 0.2198486328125, "learning_rate": 4.842626371469149e-07, "loss": 0.0088, "reward": 0.2283915039151907, "reward_std": 0.15014977380633354, "rewards/cosine_scaled_reward": -0.03464532503858209, "rewards/format_reward": 0.9375000149011612, "step": 296 }, { "completion_length": 1544.4375610351562, "epoch": 0.3394285714285714, "grad_norm": 3.76399564743042, "kl": 0.8408203125, "learning_rate": 4.811563736721829e-07, "loss": 0.0336, "reward": 0.26248094253242016, "reward_std": 0.22022581845521927, "rewards/cosine_scaled_reward": 0.07221121434122324, "rewards/format_reward": 0.8541666716337204, "step": 297 }, { "completion_length": 751.5208435058594, "epoch": 0.3405714285714286, "grad_norm": 14.281402587890625, "kl": 0.94287109375, "learning_rate": 4.780534655386743e-07, "loss": 0.0377, "reward": 0.2596895806491375, "reward_std": 0.12608115747570992, "rewards/cosine_scaled_reward": 0.009351173415780067, "rewards/format_reward": 0.9791666716337204, "step": 298 }, { "completion_length": 1187.4583892822266, "epoch": 0.3417142857142857, "grad_norm": 14.567778587341309, "kl": 1.1982421875, "learning_rate": 4.749540639777539e-07, "loss": 0.0478, "reward": 0.2817165367305279, "reward_std": 0.17178547009825706, "rewards/cosine_scaled_reward": 0.0913948267698288, "rewards/format_reward": 0.8958333432674408, "step": 299 }, { "completion_length": 1541.5208740234375, "epoch": 0.34285714285714286, "grad_norm": 8.866561889648438, "kl": 1.501953125, "learning_rate": 4.7185832004988133e-07, "loss": 0.0602, "reward": 0.24100394919514656, "reward_std": 0.16152354702353477, "rewards/cosine_scaled_reward": 0.07104794564656913, "rewards/format_reward": 0.770833358168602, "step": 300 }, { "completion_length": 1074.4792022705078, "epoch": 0.344, "grad_norm": 9.721318244934082, "kl": 1.5791015625, "learning_rate": 4.68766384637248e-07, "loss": 0.0632, "reward": 0.21970979683101177, "reward_std": 0.14997759833931923, "rewards/cosine_scaled_reward": -0.01651475703692995, "rewards/format_reward": 0.8541666716337204, "step": 301 }, { "completion_length": 1186.3958435058594, "epoch": 0.34514285714285714, "grad_norm": 7.556268692016602, "kl": 0.8861083984375, "learning_rate": 4.656784084364238e-07, "loss": 0.0355, "reward": 0.27779902052134275, "reward_std": 0.17153581604361534, "rewards/cosine_scaled_reward": 0.11099794041365385, "rewards/format_reward": 0.833333358168602, "step": 302 }, { "completion_length": 650.3958435058594, "epoch": 0.3462857142857143, "grad_norm": 1.4734262228012085, "kl": 0.1495361328125, "learning_rate": 4.6259454195101267e-07, "loss": 0.006, "reward": 0.28030719608068466, "reward_std": 0.12815302750095725, "rewards/cosine_scaled_reward": 0.03950038552284241, "rewards/format_reward": 1.0, "step": 303 }, { "completion_length": 1164.5833740234375, "epoch": 0.3474285714285714, "grad_norm": 5.996427536010742, "kl": 0.5029296875, "learning_rate": 4.59514935484316e-07, "loss": 0.0201, "reward": 0.2070623217150569, "reward_std": 0.17195116728544235, "rewards/cosine_scaled_reward": -0.03806167421862483, "rewards/format_reward": 0.875, "step": 304 }, { "completion_length": 1264.6875457763672, "epoch": 0.3485714285714286, "grad_norm": 4.440830230712891, "kl": 0.78515625, "learning_rate": 4.5643973913200837e-07, "loss": 0.0314, "reward": 0.19386939704418182, "reward_std": 0.1649198681116104, "rewards/cosine_scaled_reward": -0.07410681061446667, "rewards/format_reward": 0.8958333432674408, "step": 305 }, { "completion_length": 1062.0208587646484, "epoch": 0.3497142857142857, "grad_norm": 3.8561646938323975, "kl": 0.7784423828125, "learning_rate": 4.5336910277482155e-07, "loss": 0.0312, "reward": 0.3398265913128853, "reward_std": 0.16305011231452227, "rewards/cosine_scaled_reward": 0.22889345418661833, "rewards/format_reward": 0.8541666716337204, "step": 306 }, { "completion_length": 815.958366394043, "epoch": 0.35085714285714287, "grad_norm": 3.5555624961853027, "kl": 0.2808837890625, "learning_rate": 4.503031760712397e-07, "loss": 0.0112, "reward": 0.23237643763422966, "reward_std": 0.15361445024609566, "rewards/cosine_scaled_reward": -0.018825537525117397, "rewards/format_reward": 0.9166666716337204, "step": 307 }, { "completion_length": 1562.2500610351562, "epoch": 0.352, "grad_norm": 4.974001884460449, "kl": 1.275390625, "learning_rate": 4.4724210845020494e-07, "loss": 0.051, "reward": 0.22685429267585278, "reward_std": 0.15579739212989807, "rewards/cosine_scaled_reward": 0.051724355667829514, "rewards/format_reward": 0.7708333432674408, "step": 308 }, { "completion_length": 1321.6250305175781, "epoch": 0.35314285714285715, "grad_norm": 4.7091288566589355, "kl": 1.13330078125, "learning_rate": 4.441860491038345e-07, "loss": 0.0454, "reward": 0.155959352850914, "reward_std": 0.14562787115573883, "rewards/cosine_scaled_reward": -0.12656080474698683, "rewards/format_reward": 0.8541666865348816, "step": 309 }, { "completion_length": 743.4583587646484, "epoch": 0.35428571428571426, "grad_norm": 3.701563596725464, "kl": 0.5528564453125, "learning_rate": 4.4113514698014953e-07, "loss": 0.0221, "reward": 0.21535425633192062, "reward_std": 0.08894449099898338, "rewards/cosine_scaled_reward": -0.06778281182050705, "rewards/format_reward": 0.9583333432674408, "step": 310 }, { "completion_length": 968.6042022705078, "epoch": 0.3554285714285714, "grad_norm": 2.395068407058716, "kl": 0.6717529296875, "learning_rate": 4.3808955077581546e-07, "loss": 0.0269, "reward": 0.2611798755824566, "reward_std": 0.1802541073411703, "rewards/cosine_scaled_reward": 0.031661394983530045, "rewards/format_reward": 0.9375, "step": 311 }, { "completion_length": 940.3958740234375, "epoch": 0.3565714285714286, "grad_norm": 2.636352777481079, "kl": 0.6422119140625, "learning_rate": 4.350494089288943e-07, "loss": 0.0257, "reward": 0.3778625950217247, "reward_std": 0.08777109067887068, "rewards/cosine_scaled_reward": 0.23694994347169995, "rewards/format_reward": 0.9791666716337204, "step": 312 }, { "completion_length": 1889.1458740234375, "epoch": 0.3577142857142857, "grad_norm": 13.118053436279297, "kl": 2.5419921875, "learning_rate": 4.3201486961161093e-07, "loss": 0.1018, "reward": 0.17284774128347635, "reward_std": 0.15150598622858524, "rewards/cosine_scaled_reward": 0.028017327189445496, "rewards/format_reward": 0.6041666865348816, "step": 313 }, { "completion_length": 990.4375, "epoch": 0.3588571428571429, "grad_norm": 4.434592247009277, "kl": 0.952392578125, "learning_rate": 4.2898608072313045e-07, "loss": 0.038, "reward": 0.3436597026884556, "reward_std": 0.1606369400396943, "rewards/cosine_scaled_reward": 0.18015967262908816, "rewards/format_reward": 0.9375000149011612, "step": 314 }, { "completion_length": 1430.0417175292969, "epoch": 0.36, "grad_norm": 22.22004508972168, "kl": 2.7587890625, "learning_rate": 4.2596318988235037e-07, "loss": 0.1104, "reward": 0.20831938460469246, "reward_std": 0.18246332369744778, "rewards/cosine_scaled_reward": 0.010364960879087448, "rewards/format_reward": 0.7708333432674408, "step": 315 }, { "completion_length": 1698.8958892822266, "epoch": 0.36114285714285715, "grad_norm": 19.014558792114258, "kl": 2.826171875, "learning_rate": 4.2294634442070553e-07, "loss": 0.1132, "reward": 0.09894545935094357, "reward_std": 0.14997925981879234, "rewards/cosine_scaled_reward": -0.16700951755046844, "rewards/format_reward": 0.708333358168602, "step": 316 }, { "completion_length": 1184.3750457763672, "epoch": 0.36228571428571427, "grad_norm": 6.429854393005371, "kl": 0.93701171875, "learning_rate": 4.1993569137498776e-07, "loss": 0.0375, "reward": 0.22195260971784592, "reward_std": 0.12669595796614885, "rewards/cosine_scaled_reward": 0.025904617039486766, "rewards/format_reward": 0.8125000149011612, "step": 317 }, { "completion_length": 627.1666717529297, "epoch": 0.36342857142857143, "grad_norm": 5.908717155456543, "kl": 0.2835693359375, "learning_rate": 4.1693137748017915e-07, "loss": 0.0113, "reward": 0.2619051970541477, "reward_std": 0.09561531990766525, "rewards/cosine_scaled_reward": 0.01050032302737236, "rewards/format_reward": 0.9791666716337204, "step": 318 }, { "completion_length": 804.3541946411133, "epoch": 0.36457142857142855, "grad_norm": 2.954686403274536, "kl": 0.2249755859375, "learning_rate": 4.1393354916230005e-07, "loss": 0.009, "reward": 0.22523995116353035, "reward_std": 0.12681083008646965, "rewards/cosine_scaled_reward": -0.04524955153465271, "rewards/format_reward": 0.9583333432674408, "step": 319 }, { "completion_length": 783.7083587646484, "epoch": 0.3657142857142857, "grad_norm": 2.1158158779144287, "kl": 0.711181640625, "learning_rate": 4.1094235253127374e-07, "loss": 0.0285, "reward": 0.3419484347105026, "reward_std": 0.21943408623337746, "rewards/cosine_scaled_reward": 0.16983008198440075, "rewards/format_reward": 0.9583333432674408, "step": 320 }, { "completion_length": 922.1667022705078, "epoch": 0.3668571428571429, "grad_norm": 2.8445801734924316, "kl": 0.773681640625, "learning_rate": 4.079579333738039e-07, "loss": 0.031, "reward": 0.296135351061821, "reward_std": 0.13727245666086674, "rewards/cosine_scaled_reward": 0.1043023755773902, "rewards/format_reward": 0.9166666865348816, "step": 321 }, { "completion_length": 973.7708587646484, "epoch": 0.368, "grad_norm": 6.260856628417969, "kl": 0.9462890625, "learning_rate": 4.0498043714627006e-07, "loss": 0.0378, "reward": 0.27746494114398956, "reward_std": 0.1653186201583594, "rewards/cosine_scaled_reward": 0.04642700869590044, "rewards/format_reward": 0.9583333432674408, "step": 322 }, { "completion_length": 967.7500305175781, "epoch": 0.36914285714285716, "grad_norm": 2.8763952255249023, "kl": 0.751953125, "learning_rate": 4.020100089676376e-07, "loss": 0.03, "reward": 0.21484559401869774, "reward_std": 0.1617780439555645, "rewards/cosine_scaled_reward": 0.021803006529808044, "rewards/format_reward": 0.770833358168602, "step": 323 }, { "completion_length": 1026.7500457763672, "epoch": 0.3702857142857143, "grad_norm": 3.994277000427246, "kl": 0.528564453125, "learning_rate": 3.9904679361238526e-07, "loss": 0.0211, "reward": 0.17972036078572273, "reward_std": 0.12885506637394428, "rewards/cosine_scaled_reward": -0.09272653982043266, "rewards/format_reward": 0.8750000149011612, "step": 324 }, { "completion_length": 1173.5000305175781, "epoch": 0.37142857142857144, "grad_norm": 4.228631019592285, "kl": 1.06005859375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0424, "reward": 0.2462325319647789, "reward_std": 0.14830639958381653, "rewards/cosine_scaled_reward": 0.026797104626893997, "rewards/format_reward": 0.8750000149011612, "step": 325 }, { "completion_length": 888.0208435058594, "epoch": 0.37257142857142855, "grad_norm": 4.49200963973999, "kl": 0.5068359375, "learning_rate": 3.931425787051832e-07, "loss": 0.0202, "reward": 0.25065916404128075, "reward_std": 0.13965209759771824, "rewards/cosine_scaled_reward": -0.005883699515834451, "rewards/format_reward": 0.9583333432674408, "step": 326 }, { "completion_length": 1339.3541870117188, "epoch": 0.3737142857142857, "grad_norm": 3.5500009059906006, "kl": 1.1552734375, "learning_rate": 3.902018669163384e-07, "loss": 0.0463, "reward": 0.25347794964909554, "reward_std": 0.1426910199224949, "rewards/cosine_scaled_reward": 0.10218502581119537, "rewards/format_reward": 0.7708333432674408, "step": 327 }, { "completion_length": 1180.291732788086, "epoch": 0.37485714285714283, "grad_norm": 4.94123649597168, "kl": 0.6202392578125, "learning_rate": 3.872689434630585e-07, "loss": 0.0248, "reward": 0.1488689538091421, "reward_std": 0.16009290027432144, "rewards/cosine_scaled_reward": -0.110639663413167, "rewards/format_reward": 0.7916666865348816, "step": 328 }, { "completion_length": 795.3958435058594, "epoch": 0.376, "grad_norm": 2.8720061779022217, "kl": 0.61962890625, "learning_rate": 3.843439512918949e-07, "loss": 0.0248, "reward": 0.34844836220145226, "reward_std": 0.19519308768212795, "rewards/cosine_scaled_reward": 0.19574240781366825, "rewards/format_reward": 0.9375000149011612, "step": 329 }, { "completion_length": 1049.062515258789, "epoch": 0.37714285714285717, "grad_norm": 4.559622287750244, "kl": 1.11297607421875, "learning_rate": 3.8142703296283953e-07, "loss": 0.0445, "reward": 0.17201771400868893, "reward_std": 0.13709559640847147, "rewards/cosine_scaled_reward": -0.07045471575111151, "rewards/format_reward": 0.7916666716337204, "step": 330 }, { "completion_length": 1200.6042022705078, "epoch": 0.3782857142857143, "grad_norm": 7.140752792358398, "kl": 0.95654296875, "learning_rate": 3.785183306423767e-07, "loss": 0.0382, "reward": 0.21743589686229825, "reward_std": 0.16704676859080791, "rewards/cosine_scaled_reward": 0.019605567678809166, "rewards/format_reward": 0.7708333432674408, "step": 331 }, { "completion_length": 922.0416870117188, "epoch": 0.37942857142857145, "grad_norm": 4.033151149749756, "kl": 0.5589599609375, "learning_rate": 3.7561798609655373e-07, "loss": 0.0223, "reward": 0.2475113496184349, "reward_std": 0.09849587455391884, "rewards/cosine_scaled_reward": 0.02367660589516163, "rewards/format_reward": 0.8958333432674408, "step": 332 }, { "completion_length": 1204.8125457763672, "epoch": 0.38057142857142856, "grad_norm": 5.942810535430908, "kl": 1.6142578125, "learning_rate": 3.72726140684072e-07, "loss": 0.0646, "reward": 0.21941883862018585, "reward_std": 0.16955386567860842, "rewards/cosine_scaled_reward": -0.021143442951142788, "rewards/format_reward": 0.8541666865348816, "step": 333 }, { "completion_length": 1596.3333740234375, "epoch": 0.38171428571428573, "grad_norm": 5.5197062492370605, "kl": 2.220703125, "learning_rate": 3.6984293534939737e-07, "loss": 0.0888, "reward": 0.10062389634549618, "reward_std": 0.14927823096513748, "rewards/cosine_scaled_reward": -0.1727069392800331, "rewards/format_reward": 0.7291666865348816, "step": 334 }, { "completion_length": 1187.5208587646484, "epoch": 0.38285714285714284, "grad_norm": 5.572638988494873, "kl": 1.5341796875, "learning_rate": 3.6696851061588994e-07, "loss": 0.0613, "reward": 0.22269335761666298, "reward_std": 0.21076252683997154, "rewards/cosine_scaled_reward": 0.010578749410342425, "rewards/format_reward": 0.833333358168602, "step": 335 }, { "completion_length": 1502.2708740234375, "epoch": 0.384, "grad_norm": 11.447399139404297, "kl": 2.0029296875, "learning_rate": 3.641030065789562e-07, "loss": 0.08, "reward": 0.3133535422384739, "reward_std": 0.15564018487930298, "rewards/cosine_scaled_reward": 0.16394114650029223, "rewards/format_reward": 0.8750000149011612, "step": 336 }, { "completion_length": 1438.2083435058594, "epoch": 0.3851428571428571, "grad_norm": 13.819022178649902, "kl": 2.9853515625, "learning_rate": 3.612465628992203e-07, "loss": 0.1196, "reward": 0.14050729386508465, "reward_std": 0.12717061396688223, "rewards/cosine_scaled_reward": -0.12903356552124023, "rewards/format_reward": 0.7916666716337204, "step": 337 }, { "completion_length": 967.8542175292969, "epoch": 0.3862857142857143, "grad_norm": 6.60633659362793, "kl": 0.771484375, "learning_rate": 3.5839931879571725e-07, "loss": 0.0308, "reward": 0.23312357813119888, "reward_std": 0.15779702737927437, "rewards/cosine_scaled_reward": -0.00266027613542974, "rewards/format_reward": 0.8958333432674408, "step": 338 }, { "completion_length": 1477.1042022705078, "epoch": 0.38742857142857146, "grad_norm": 4.277693271636963, "kl": 2.021484375, "learning_rate": 3.555614130391079e-07, "loss": 0.0808, "reward": 0.1290037864819169, "reward_std": 0.15569127723574638, "rewards/cosine_scaled_reward": -0.14523668447509408, "rewards/format_reward": 0.770833358168602, "step": 339 }, { "completion_length": 1283.1875610351562, "epoch": 0.38857142857142857, "grad_norm": 4.185704231262207, "kl": 1.314453125, "learning_rate": 3.5273298394491515e-07, "loss": 0.0527, "reward": 0.22219331376254559, "reward_std": 0.15621252916753292, "rewards/cosine_scaled_reward": -0.00760066881775856, "rewards/format_reward": 0.8541666865348816, "step": 340 }, { "completion_length": 1035.7083740234375, "epoch": 0.38971428571428574, "grad_norm": 3.491429328918457, "kl": 0.86773681640625, "learning_rate": 3.4991416936678276e-07, "loss": 0.0347, "reward": 0.31788708828389645, "reward_std": 0.1590380184352398, "rewards/cosine_scaled_reward": 0.181830701418221, "rewards/format_reward": 0.8541666716337204, "step": 341 }, { "completion_length": 1241.1250610351562, "epoch": 0.39085714285714285, "grad_norm": 9.60562515258789, "kl": 1.732421875, "learning_rate": 3.471051066897562e-07, "loss": 0.0695, "reward": 0.20543073303997517, "reward_std": 0.18304480239748955, "rewards/cosine_scaled_reward": -0.0165388360619545, "rewards/format_reward": 0.8125000149011612, "step": 342 }, { "completion_length": 1640.7291870117188, "epoch": 0.392, "grad_norm": 3.7616071701049805, "kl": 2.134765625, "learning_rate": 3.4430593282358777e-07, "loss": 0.0854, "reward": 0.20511594600975513, "reward_std": 0.20067058503627777, "rewards/cosine_scaled_reward": 0.04553595371544361, "rewards/format_reward": 0.6875000223517418, "step": 343 }, { "completion_length": 999.3333435058594, "epoch": 0.3931428571428571, "grad_norm": 3.4487907886505127, "kl": 1.2158203125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0487, "reward": 0.3157181851565838, "reward_std": 0.15319289080798626, "rewards/cosine_scaled_reward": 0.1548741078004241, "rewards/format_reward": 0.8958333432674408, "step": 344 }, { "completion_length": 1208.2917022705078, "epoch": 0.3942857142857143, "grad_norm": 8.388733863830566, "kl": 1.6865234375, "learning_rate": 3.387377967463493e-07, "loss": 0.0674, "reward": 0.21780548989772797, "reward_std": 0.1018645241856575, "rewards/cosine_scaled_reward": 0.01978399232029915, "rewards/format_reward": 0.7916666716337204, "step": 345 }, { "completion_length": 1449.6667175292969, "epoch": 0.3954285714285714, "grad_norm": 3.9231159687042236, "kl": 1.39453125, "learning_rate": 3.359691059183761e-07, "loss": 0.0559, "reward": 0.1776282787322998, "reward_std": 0.22602825611829758, "rewards/cosine_scaled_reward": -0.068118030205369, "rewards/format_reward": 0.8125000298023224, "step": 346 }, { "completion_length": 1282.0625610351562, "epoch": 0.3965714285714286, "grad_norm": 5.337953567504883, "kl": 1.654296875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0661, "reward": 0.1596915554255247, "reward_std": 0.15587524510920048, "rewards/cosine_scaled_reward": -0.1326259132474661, "rewards/format_reward": 0.8750000149011612, "step": 347 }, { "completion_length": 1164.6458587646484, "epoch": 0.3977142857142857, "grad_norm": 5.311245918273926, "kl": 1.1435546875, "learning_rate": 3.3046315338757026e-07, "loss": 0.0457, "reward": 0.21894944459199905, "reward_std": 0.17160159163177013, "rewards/cosine_scaled_reward": 0.0019014915451407433, "rewards/format_reward": 0.8333333432674408, "step": 348 }, { "completion_length": 1220.3333740234375, "epoch": 0.39885714285714285, "grad_norm": 3.8204071521759033, "kl": 0.9813232421875, "learning_rate": 3.2772616003709616e-07, "loss": 0.0393, "reward": 0.19654851406812668, "reward_std": 0.1623307168483734, "rewards/cosine_scaled_reward": 0.015234269667416811, "rewards/format_reward": 0.7291666865348816, "step": 349 }, { "completion_length": 848.5416870117188, "epoch": 0.4, "grad_norm": 5.6252923011779785, "kl": 0.5443115234375, "learning_rate": 3.250000000000001e-07, "loss": 0.0218, "reward": 0.21261978708207607, "reward_std": 0.12894014036282897, "rewards/cosine_scaled_reward": -0.05402694642543793, "rewards/format_reward": 0.9166666716337204, "step": 350 }, { "completion_length": 666.6458587646484, "epoch": 0.40114285714285713, "grad_norm": 3.1464481353759766, "kl": 0.19720458984375, "learning_rate": 3.222848061454764e-07, "loss": 0.0079, "reward": 0.2615892142057419, "reward_std": 0.11285886983387172, "rewards/cosine_scaled_reward": 0.08792655169963837, "rewards/format_reward": 0.8333333432674408, "step": 351 }, { "completion_length": 708.1875152587891, "epoch": 0.4022857142857143, "grad_norm": 1.8742324113845825, "kl": 0.176513671875, "learning_rate": 3.195807108082429e-07, "loss": 0.007, "reward": 0.25918786600232124, "reward_std": 0.14790157228708267, "rewards/cosine_scaled_reward": 0.02013987605459988, "rewards/format_reward": 0.9583333432674408, "step": 352 }, { "completion_length": 567.8750228881836, "epoch": 0.4034285714285714, "grad_norm": 2.343339204788208, "kl": 0.1270751953125, "learning_rate": 3.168878457820915e-07, "loss": 0.0051, "reward": 0.32720309868454933, "reward_std": 0.1316857635974884, "rewards/cosine_scaled_reward": 0.1704409494996071, "rewards/format_reward": 0.9375000149011612, "step": 353 }, { "completion_length": 771.395866394043, "epoch": 0.4045714285714286, "grad_norm": 1.972775936126709, "kl": 0.3314208984375, "learning_rate": 3.142063423134644e-07, "loss": 0.0132, "reward": 0.3329825960099697, "reward_std": 0.12802427215501666, "rewards/cosine_scaled_reward": 0.1915791854262352, "rewards/format_reward": 0.8958333432674408, "step": 354 }, { "completion_length": 702.7708435058594, "epoch": 0.4057142857142857, "grad_norm": 3.016925096511841, "kl": 0.361572265625, "learning_rate": 3.115363310950578e-07, "loss": 0.0145, "reward": 0.24151447787880898, "reward_std": 0.15231290087103844, "rewards/cosine_scaled_reward": 0.005993319675326347, "rewards/format_reward": 0.9166666865348816, "step": 355 }, { "completion_length": 743.0416870117188, "epoch": 0.40685714285714286, "grad_norm": 2.12235951423645, "kl": 0.3642578125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0146, "reward": 0.20886093750596046, "reward_std": 0.12367752194404602, "rewards/cosine_scaled_reward": -0.034687506034970284, "rewards/format_reward": 0.8750000298023224, "step": 356 }, { "completion_length": 690.1667022705078, "epoch": 0.408, "grad_norm": 2.7192955017089844, "kl": 0.541748046875, "learning_rate": 3.062313053727671e-07, "loss": 0.0217, "reward": 0.1517220288515091, "reward_std": 0.137606555595994, "rewards/cosine_scaled_reward": -0.1275198075454682, "rewards/format_reward": 0.8333333432674408, "step": 357 }, { "completion_length": 663.1875152587891, "epoch": 0.40914285714285714, "grad_norm": 1.9349286556243896, "kl": 0.3486328125, "learning_rate": 3.0359654942835247e-07, "loss": 0.0139, "reward": 0.237051859498024, "reward_std": 0.133856151252985, "rewards/cosine_scaled_reward": 0.017080800607800484, "rewards/format_reward": 0.8750000149011612, "step": 358 }, { "completion_length": 532.7708435058594, "epoch": 0.4102857142857143, "grad_norm": 1.5907737016677856, "kl": 0.326416015625, "learning_rate": 3.0097380284049523e-07, "loss": 0.013, "reward": 0.21317852661013603, "reward_std": 0.12036791862919927, "rewards/cosine_scaled_reward": -0.012709625691059045, "rewards/format_reward": 0.8541666865348816, "step": 359 }, { "completion_length": 875.8125305175781, "epoch": 0.4114285714285714, "grad_norm": 2.8240396976470947, "kl": 0.654296875, "learning_rate": 2.9836319343816397e-07, "loss": 0.0261, "reward": 0.2169089950621128, "reward_std": 0.17012035474181175, "rewards/cosine_scaled_reward": -0.006922222673892975, "rewards/format_reward": 0.8541666865348816, "step": 360 }, { "completion_length": 847.7500305175781, "epoch": 0.4125714285714286, "grad_norm": 1.8902132511138916, "kl": 0.4189453125, "learning_rate": 2.9576484845877793e-07, "loss": 0.0167, "reward": 0.1609317921102047, "reward_std": 0.10422449884936213, "rewards/cosine_scaled_reward": -0.11488345637917519, "rewards/format_reward": 0.8541666865348816, "step": 361 }, { "completion_length": 632.3750381469727, "epoch": 0.4137142857142857, "grad_norm": 2.6179628372192383, "kl": 0.35009765625, "learning_rate": 2.931788945420058e-07, "loss": 0.014, "reward": 0.3026861660182476, "reward_std": 0.1447454523295164, "rewards/cosine_scaled_reward": 0.16610896587371826, "rewards/format_reward": 0.833333358168602, "step": 362 }, { "completion_length": 511.87501525878906, "epoch": 0.41485714285714287, "grad_norm": 1.8824483156204224, "kl": 0.365234375, "learning_rate": 2.9060545772359305e-07, "loss": 0.0146, "reward": 0.37631480023264885, "reward_std": 0.11094034370034933, "rewards/cosine_scaled_reward": 0.240870613604784, "rewards/format_reward": 0.9791666716337204, "step": 363 }, { "completion_length": 709.7916717529297, "epoch": 0.416, "grad_norm": 2.7415215969085693, "kl": 0.4144287109375, "learning_rate": 2.8804466342921987e-07, "loss": 0.0166, "reward": 0.20897530019283295, "reward_std": 0.1315334120299667, "rewards/cosine_scaled_reward": -0.026804480701684952, "rewards/format_reward": 0.8541666716337204, "step": 364 }, { "completion_length": 829.6875, "epoch": 0.41714285714285715, "grad_norm": 2.784470558166504, "kl": 0.498779296875, "learning_rate": 2.854966364683872e-07, "loss": 0.0199, "reward": 0.24345804192125797, "reward_std": 0.18701540678739548, "rewards/cosine_scaled_reward": 0.04010058380663395, "rewards/format_reward": 0.8541666865348816, "step": 365 }, { "completion_length": 618.4583511352539, "epoch": 0.41828571428571426, "grad_norm": 2.6888554096221924, "kl": 0.5294189453125, "learning_rate": 2.829615010283344e-07, "loss": 0.0212, "reward": 0.3234354443848133, "reward_std": 0.10294704465195537, "rewards/cosine_scaled_reward": 0.15299547836184502, "rewards/format_reward": 0.9375, "step": 366 }, { "completion_length": 787.6458435058594, "epoch": 0.41942857142857143, "grad_norm": 4.666651725769043, "kl": 0.5, "learning_rate": 2.8043938066798645e-07, "loss": 0.02, "reward": 0.1875544860959053, "reward_std": 0.11255431175231934, "rewards/cosine_scaled_reward": -0.09769667312502861, "rewards/format_reward": 0.895833358168602, "step": 367 }, { "completion_length": 878.5833587646484, "epoch": 0.4205714285714286, "grad_norm": 4.423238277435303, "kl": 0.61328125, "learning_rate": 2.7793039831193133e-07, "loss": 0.0245, "reward": 0.23587674275040627, "reward_std": 0.2058828752487898, "rewards/cosine_scaled_reward": 0.002545563504099846, "rewards/format_reward": 0.8750000149011612, "step": 368 }, { "completion_length": 803.9166870117188, "epoch": 0.4217142857142857, "grad_norm": 2.6107845306396484, "kl": 0.5390625, "learning_rate": 2.7543467624442956e-07, "loss": 0.0216, "reward": 0.23068195581436157, "reward_std": 0.18911143392324448, "rewards/cosine_scaled_reward": 0.0068113189190626144, "rewards/format_reward": 0.8541667014360428, "step": 369 }, { "completion_length": 654.3541946411133, "epoch": 0.4228571428571429, "grad_norm": 1.7799855470657349, "kl": 0.40087890625, "learning_rate": 2.729523361034538e-07, "loss": 0.016, "reward": 0.27163012884557247, "reward_std": 0.0873602069914341, "rewards/cosine_scaled_reward": 0.050700387451797724, "rewards/format_reward": 0.9166666865348816, "step": 370 }, { "completion_length": 404.12500381469727, "epoch": 0.424, "grad_norm": 2.2593891620635986, "kl": 0.3084716796875, "learning_rate": 2.7048349887476037e-07, "loss": 0.0123, "reward": 0.42342475056648254, "reward_std": 0.12662230944260955, "rewards/cosine_scaled_reward": 0.3172451863065362, "rewards/format_reward": 0.9791666716337204, "step": 371 }, { "completion_length": 944.2083587646484, "epoch": 0.42514285714285716, "grad_norm": 2.5881381034851074, "kl": 0.47509765625, "learning_rate": 2.6802828488599294e-07, "loss": 0.019, "reward": 0.26049695909023285, "reward_std": 0.1806934680789709, "rewards/cosine_scaled_reward": 0.07411355525255203, "rewards/format_reward": 0.8541666865348816, "step": 372 }, { "completion_length": 419.6666793823242, "epoch": 0.42628571428571427, "grad_norm": 3.1281840801239014, "kl": 0.197998046875, "learning_rate": 2.655868138008171e-07, "loss": 0.0079, "reward": 0.2753082141280174, "reward_std": 0.08415311574935913, "rewards/cosine_scaled_reward": 0.04036270547658205, "rewards/format_reward": 0.9791666716337204, "step": 373 }, { "completion_length": 699.1041870117188, "epoch": 0.42742857142857144, "grad_norm": 1.5137349367141724, "kl": 0.3175048828125, "learning_rate": 2.631592046130896e-07, "loss": 0.0127, "reward": 0.3290976844727993, "reward_std": 0.046409351751208305, "rewards/cosine_scaled_reward": 0.1492198146879673, "rewards/format_reward": 0.9791666716337204, "step": 374 }, { "completion_length": 650.9583587646484, "epoch": 0.42857142857142855, "grad_norm": 1.9261364936828613, "kl": 0.3038330078125, "learning_rate": 2.6074557564105724e-07, "loss": 0.0122, "reward": 0.3337393254041672, "reward_std": 0.18936175480484962, "rewards/cosine_scaled_reward": 0.1655629649758339, "rewards/format_reward": 0.9583333432674408, "step": 375 }, { "completion_length": 634.4375305175781, "epoch": 0.4297142857142857, "grad_norm": 1.5000464916229248, "kl": 0.322509765625, "learning_rate": 2.583460445215911e-07, "loss": 0.0129, "reward": 0.26660849899053574, "reward_std": 0.1392253851518035, "rewards/cosine_scaled_reward": 0.02780479285866022, "rewards/format_reward": 0.9791666716337204, "step": 376 }, { "completion_length": 595.2083358764648, "epoch": 0.4308571428571429, "grad_norm": 3.6291823387145996, "kl": 0.236572265625, "learning_rate": 2.5596072820445254e-07, "loss": 0.0095, "reward": 0.22076689451932907, "reward_std": 0.10216541588306427, "rewards/cosine_scaled_reward": -0.029933474957942963, "rewards/format_reward": 0.9166666865348816, "step": 377 }, { "completion_length": 625.2916870117188, "epoch": 0.432, "grad_norm": 4.674941539764404, "kl": 0.37646484375, "learning_rate": 2.5358974294659373e-07, "loss": 0.015, "reward": 0.2812824919819832, "reward_std": 0.16731494944542646, "rewards/cosine_scaled_reward": 0.070025734603405, "rewards/format_reward": 0.9375000149011612, "step": 378 }, { "completion_length": 718.4791870117188, "epoch": 0.43314285714285716, "grad_norm": 2.064146041870117, "kl": 0.308837890625, "learning_rate": 2.512332043064913e-07, "loss": 0.0124, "reward": 0.24909690767526627, "reward_std": 0.19286740198731422, "rewards/cosine_scaled_reward": 0.040470815263688564, "rewards/format_reward": 0.8750000298023224, "step": 379 }, { "completion_length": 615.1041870117188, "epoch": 0.4342857142857143, "grad_norm": 1.500081181526184, "kl": 0.2486572265625, "learning_rate": 2.488912271385139e-07, "loss": 0.0099, "reward": 0.2780802324414253, "reward_std": 0.1460455246269703, "rewards/cosine_scaled_reward": 0.0755425647366792, "rewards/format_reward": 0.9166666865348816, "step": 380 }, { "completion_length": 893.1041870117188, "epoch": 0.43542857142857144, "grad_norm": 2.6146016120910645, "kl": 0.56689453125, "learning_rate": 2.465639255873246e-07, "loss": 0.0227, "reward": 0.18425273895263672, "reward_std": 0.11559674330055714, "rewards/cosine_scaled_reward": -0.10113179916515946, "rewards/format_reward": 0.9166666716337204, "step": 381 }, { "completion_length": 664.7291717529297, "epoch": 0.43657142857142855, "grad_norm": 1.732223629951477, "kl": 0.3623046875, "learning_rate": 2.4425141308231765e-07, "loss": 0.0145, "reward": 0.2202533408999443, "reward_std": 0.10179286450147629, "rewards/cosine_scaled_reward": -0.055003027722705156, "rewards/format_reward": 0.9583333432674408, "step": 382 }, { "completion_length": 685.9375152587891, "epoch": 0.4377142857142857, "grad_norm": 1.6889846324920654, "kl": 0.4630126953125, "learning_rate": 2.4195380233209006e-07, "loss": 0.0186, "reward": 0.296797938644886, "reward_std": 0.1770482063293457, "rewards/cosine_scaled_reward": 0.10568510554730892, "rewards/format_reward": 0.9375000149011612, "step": 383 }, { "completion_length": 632.2083587646484, "epoch": 0.43885714285714283, "grad_norm": 1.6566052436828613, "kl": 0.366455078125, "learning_rate": 2.3967120531894857e-07, "loss": 0.0147, "reward": 0.35000767558813095, "reward_std": 0.1569829611107707, "rewards/cosine_scaled_reward": 0.19327298738062382, "rewards/format_reward": 0.9583333432674408, "step": 384 }, { "completion_length": 780.2291870117188, "epoch": 0.44, "grad_norm": 1.7832815647125244, "kl": 0.399169921875, "learning_rate": 2.374037332934512e-07, "loss": 0.016, "reward": 0.2819976694881916, "reward_std": 0.21332971518859267, "rewards/cosine_scaled_reward": 0.09683341905474663, "rewards/format_reward": 0.895833358168602, "step": 385 }, { "completion_length": 783.395866394043, "epoch": 0.44114285714285717, "grad_norm": 3.6944339275360107, "kl": 0.5302734375, "learning_rate": 2.3515149676898552e-07, "loss": 0.0212, "reward": 0.33711181953549385, "reward_std": 0.1739945001900196, "rewards/cosine_scaled_reward": 0.18900193041190505, "rewards/format_reward": 0.9166666865348816, "step": 386 }, { "completion_length": 754.6667022705078, "epoch": 0.4422857142857143, "grad_norm": 2.154081106185913, "kl": 0.440673828125, "learning_rate": 2.3291460551638237e-07, "loss": 0.0176, "reward": 0.27119171619415283, "reward_std": 0.14445213042199612, "rewards/cosine_scaled_reward": 0.049738235771656036, "rewards/format_reward": 0.9375000149011612, "step": 387 }, { "completion_length": 504.9583435058594, "epoch": 0.44342857142857145, "grad_norm": 3.3471145629882812, "kl": 0.25335693359375, "learning_rate": 2.306931685585657e-07, "loss": 0.0101, "reward": 0.28428031131625175, "reward_std": 0.13408437930047512, "rewards/cosine_scaled_reward": 0.06798929907381535, "rewards/format_reward": 0.9583333432674408, "step": 388 }, { "completion_length": 696.9375076293945, "epoch": 0.44457142857142856, "grad_norm": 2.761345386505127, "kl": 0.4833984375, "learning_rate": 2.2848729416523859e-07, "loss": 0.0194, "reward": 0.25267717987298965, "reward_std": 0.14519160240888596, "rewards/cosine_scaled_reward": 0.0351974181830883, "rewards/format_reward": 0.9166666865348816, "step": 389 }, { "completion_length": 753.5208435058594, "epoch": 0.44571428571428573, "grad_norm": 2.958491325378418, "kl": 0.285400390625, "learning_rate": 2.2629708984760706e-07, "loss": 0.0114, "reward": 0.22106869518756866, "reward_std": 0.13949389569461346, "rewards/cosine_scaled_reward": -0.03185149934142828, "rewards/format_reward": 0.9166666865348816, "step": 390 }, { "completion_length": 537.9375152587891, "epoch": 0.44685714285714284, "grad_norm": 4.256007194519043, "kl": 0.3111572265625, "learning_rate": 2.2412266235313973e-07, "loss": 0.0124, "reward": 0.3115757443010807, "reward_std": 0.11429702304303646, "rewards/cosine_scaled_reward": 0.11512432107701898, "rewards/format_reward": 0.9583333432674408, "step": 391 }, { "completion_length": 838.3541870117188, "epoch": 0.448, "grad_norm": 2.729635715484619, "kl": 0.6220703125, "learning_rate": 2.2196411766036487e-07, "loss": 0.0249, "reward": 0.2522091865539551, "reward_std": 0.1718084905296564, "rewards/cosine_scaled_reward": 0.020234670490026474, "rewards/format_reward": 0.9166666865348816, "step": 392 }, { "completion_length": 852.6875, "epoch": 0.4491428571428571, "grad_norm": 1.6498888731002808, "kl": 0.60888671875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0243, "reward": 0.2434215433895588, "reward_std": 0.12160942330956459, "rewards/cosine_scaled_reward": 0.013085559476166964, "rewards/format_reward": 0.9166666716337204, "step": 393 }, { "completion_length": 824.3333435058594, "epoch": 0.4502857142857143, "grad_norm": 2.7091050148010254, "kl": 0.7236328125, "learning_rate": 2.1769509671835223e-07, "loss": 0.029, "reward": 0.18753256276249886, "reward_std": 0.12459465954452753, "rewards/cosine_scaled_reward": -0.08782842010259628, "rewards/format_reward": 0.895833358168602, "step": 394 }, { "completion_length": 664.4791870117188, "epoch": 0.4514285714285714, "grad_norm": 6.248814582824707, "kl": 0.5771484375, "learning_rate": 2.1558482853517253e-07, "loss": 0.0231, "reward": 0.33338726311922073, "reward_std": 0.21797305718064308, "rewards/cosine_scaled_reward": 0.2101506469771266, "rewards/format_reward": 0.8541666865348816, "step": 395 }, { "completion_length": 1013.8750610351562, "epoch": 0.45257142857142857, "grad_norm": 3.0674831867218018, "kl": 0.6494140625, "learning_rate": 2.134908592756607e-07, "loss": 0.026, "reward": 0.23558954149484634, "reward_std": 0.20473281294107437, "rewards/cosine_scaled_reward": 0.015076925046741962, "rewards/format_reward": 0.8750000149011612, "step": 396 }, { "completion_length": 855.2291793823242, "epoch": 0.45371428571428574, "grad_norm": 3.855968713760376, "kl": 0.6875, "learning_rate": 2.1141329099692406e-07, "loss": 0.0276, "reward": 0.22103890031576157, "reward_std": 0.12477581389248371, "rewards/cosine_scaled_reward": -0.0030180364847183228, "rewards/format_reward": 0.8541667014360428, "step": 397 }, { "completion_length": 520.5625152587891, "epoch": 0.45485714285714285, "grad_norm": 1.9709500074386597, "kl": 0.393798828125, "learning_rate": 2.0935222495670968e-07, "loss": 0.0158, "reward": 0.28140144422650337, "reward_std": 0.18673024326562881, "rewards/cosine_scaled_reward": 0.10547193023376167, "rewards/format_reward": 0.8750000298023224, "step": 398 }, { "completion_length": 884.4792098999023, "epoch": 0.456, "grad_norm": 1.933139443397522, "kl": 0.5419921875, "learning_rate": 2.0730776160846853e-07, "loss": 0.0217, "reward": 0.31746701896190643, "reward_std": 0.12484544701874256, "rewards/cosine_scaled_reward": 0.12474868167191744, "rewards/format_reward": 0.9791666716337204, "step": 399 }, { "completion_length": 664.6250152587891, "epoch": 0.45714285714285713, "grad_norm": 2.7360846996307373, "kl": 0.324951171875, "learning_rate": 2.0528000059645995e-07, "loss": 0.013, "reward": 0.33173611015081406, "reward_std": 0.17646266520023346, "rewards/cosine_scaled_reward": 0.1954391412436962, "rewards/format_reward": 0.895833358168602, "step": 400 }, { "completion_length": 800.0000228881836, "epoch": 0.4582857142857143, "grad_norm": 4.1139702796936035, "kl": 0.578857421875, "learning_rate": 2.032690407508949e-07, "loss": 0.0232, "reward": 0.2239944487810135, "reward_std": 0.08999339491128922, "rewards/cosine_scaled_reward": -0.02286973362788558, "rewards/format_reward": 0.9166666865348816, "step": 401 }, { "completion_length": 753.8333587646484, "epoch": 0.4594285714285714, "grad_norm": 3.397742986679077, "kl": 0.60693359375, "learning_rate": 2.0127498008311922e-07, "loss": 0.0243, "reward": 0.2723863087594509, "reward_std": 0.11056133639067411, "rewards/cosine_scaled_reward": 0.07869760692119598, "rewards/format_reward": 0.8750000298023224, "step": 402 }, { "completion_length": 881.3125228881836, "epoch": 0.4605714285714286, "grad_norm": 2.4054489135742188, "kl": 0.50732421875, "learning_rate": 1.9929791578083655e-07, "loss": 0.0203, "reward": 0.29440774768590927, "reward_std": 0.20821228995919228, "rewards/cosine_scaled_reward": 0.1350764511153102, "rewards/format_reward": 0.833333358168602, "step": 403 }, { "completion_length": 894.3125305175781, "epoch": 0.4617142857142857, "grad_norm": 1.7774850130081177, "kl": 0.4434814453125, "learning_rate": 1.9733794420337213e-07, "loss": 0.0178, "reward": 0.251909576356411, "reward_std": 0.11644712742418051, "rewards/cosine_scaled_reward": 0.03503032587468624, "rewards/format_reward": 0.8958333432674408, "step": 404 }, { "completion_length": 473.0416793823242, "epoch": 0.46285714285714286, "grad_norm": 3.206921100616455, "kl": 0.348388671875, "learning_rate": 1.9539516087697517e-07, "loss": 0.014, "reward": 0.4068675935268402, "reward_std": 0.1689185332506895, "rewards/cosine_scaled_reward": 0.3205347741022706, "rewards/format_reward": 0.9166666865348816, "step": 405 }, { "completion_length": 812.1875152587891, "epoch": 0.464, "grad_norm": 2.4138338565826416, "kl": 0.650390625, "learning_rate": 1.934696604901642e-07, "loss": 0.026, "reward": 0.2720562443137169, "reward_std": 0.15532352775335312, "rewards/cosine_scaled_reward": 0.04482491686940193, "rewards/format_reward": 0.9583333432674408, "step": 406 }, { "completion_length": 909.5625457763672, "epoch": 0.46514285714285714, "grad_norm": 1.4642677307128906, "kl": 0.3577880859375, "learning_rate": 1.915615368891117e-07, "loss": 0.0143, "reward": 0.28285788744688034, "reward_std": 0.08199168555438519, "rewards/cosine_scaled_reward": 0.06744274031370878, "rewards/format_reward": 0.9375000149011612, "step": 407 }, { "completion_length": 862.7292022705078, "epoch": 0.4662857142857143, "grad_norm": 1.6780357360839844, "kl": 0.33648681640625, "learning_rate": 1.8967088307307e-07, "loss": 0.0135, "reward": 0.3260022923350334, "reward_std": 0.14766751788556576, "rewards/cosine_scaled_reward": 0.18159106373786926, "rewards/format_reward": 0.8958333432674408, "step": 408 }, { "completion_length": 1122.0625610351562, "epoch": 0.4674285714285714, "grad_norm": 2.675428628921509, "kl": 0.5927734375, "learning_rate": 1.8779779118983867e-07, "loss": 0.0237, "reward": 0.23924916982650757, "reward_std": 0.16297158412635326, "rewards/cosine_scaled_reward": 0.010674363002181053, "rewards/format_reward": 0.8750000149011612, "step": 409 }, { "completion_length": 599.2916793823242, "epoch": 0.4685714285714286, "grad_norm": 1.1615325212478638, "kl": 0.2940673828125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0118, "reward": 0.29889997094869614, "reward_std": 0.10456442041322589, "rewards/cosine_scaled_reward": 0.109024278819561, "rewards/format_reward": 0.9375000149011612, "step": 410 }, { "completion_length": 1026.7500457763672, "epoch": 0.4697142857142857, "grad_norm": 1.6268342733383179, "kl": 0.50048828125, "learning_rate": 1.8410465752883758e-07, "loss": 0.02, "reward": 0.19551028311252594, "reward_std": 0.13933155499398708, "rewards/cosine_scaled_reward": -0.06300980318337679, "rewards/format_reward": 0.8750000149011612, "step": 411 }, { "completion_length": 582.2083587646484, "epoch": 0.47085714285714286, "grad_norm": 2.1294546127319336, "kl": 0.1552734375, "learning_rate": 1.822847957491922e-07, "loss": 0.0062, "reward": 0.30995913222432137, "reward_std": 0.1351037216372788, "rewards/cosine_scaled_reward": 0.11289607360959053, "rewards/format_reward": 0.9791666716337204, "step": 412 }, { "completion_length": 1090.7291870117188, "epoch": 0.472, "grad_norm": 2.733921527862549, "kl": 0.625, "learning_rate": 1.804828558898332e-07, "loss": 0.025, "reward": 0.2250913567841053, "reward_std": 0.20157472044229507, "rewards/cosine_scaled_reward": 0.05265781283378601, "rewards/format_reward": 0.770833358168602, "step": 413 }, { "completion_length": 1250.8125457763672, "epoch": 0.47314285714285714, "grad_norm": 2.3217246532440186, "kl": 0.6865234375, "learning_rate": 1.7869892577476722e-07, "loss": 0.0275, "reward": 0.12924479320645332, "reward_std": 0.11672937124967575, "rewards/cosine_scaled_reward": -0.15619563311338425, "rewards/format_reward": 0.8125000298023224, "step": 414 }, { "completion_length": 798.8125152587891, "epoch": 0.4742857142857143, "grad_norm": 1.6752110719680786, "kl": 0.323974609375, "learning_rate": 1.7693309235023127e-07, "loss": 0.013, "reward": 0.26490871235728264, "reward_std": 0.12728729285299778, "rewards/cosine_scaled_reward": 0.04056476056575775, "rewards/format_reward": 0.9375000149011612, "step": 415 }, { "completion_length": 762.3125152587891, "epoch": 0.4754285714285714, "grad_norm": 2.545586109161377, "kl": 0.38720703125, "learning_rate": 1.7518544168045524e-07, "loss": 0.0155, "reward": 0.3332759514451027, "reward_std": 0.20331228151917458, "rewards/cosine_scaled_reward": 0.20666369702666998, "rewards/format_reward": 0.8750000298023224, "step": 416 }, { "completion_length": 847.5208587646484, "epoch": 0.4765714285714286, "grad_norm": 4.405890941619873, "kl": 0.59912109375, "learning_rate": 1.7345605894346726e-07, "loss": 0.024, "reward": 0.2588097732514143, "reward_std": 0.10329146264120936, "rewards/cosine_scaled_reward": 0.079451072961092, "rewards/format_reward": 0.833333358168602, "step": 417 }, { "completion_length": 725.7500228881836, "epoch": 0.4777142857142857, "grad_norm": 1.1350117921829224, "kl": 0.1854248046875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0074, "reward": 0.3451504148542881, "reward_std": 0.11309620877727866, "rewards/cosine_scaled_reward": 0.22333572059869766, "rewards/format_reward": 0.8958333432674408, "step": 418 }, { "completion_length": 952.1250457763672, "epoch": 0.47885714285714287, "grad_norm": 1.628246545791626, "kl": 0.3779296875, "learning_rate": 1.7005243352409333e-07, "loss": 0.0151, "reward": 0.2661770395934582, "reward_std": 0.18611376360058784, "rewards/cosine_scaled_reward": 0.044335355050861835, "rewards/format_reward": 0.9375000149011612, "step": 419 }, { "completion_length": 625.7500305175781, "epoch": 0.48, "grad_norm": 2.212934732437134, "kl": 0.283935546875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0114, "reward": 0.2875404357910156, "reward_std": 0.06989867007359862, "rewards/cosine_scaled_reward": 0.06372192548587918, "rewards/format_reward": 0.9791666716337204, "step": 420 }, { "completion_length": 1054.8750305175781, "epoch": 0.48114285714285715, "grad_norm": 1.8930859565734863, "kl": 0.498046875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0199, "reward": 0.14944718219339848, "reward_std": 0.07058797753416002, "rewards/cosine_scaled_reward": -0.13888883031904697, "rewards/format_reward": 0.8541666716337204, "step": 421 }, { "completion_length": 692.8333511352539, "epoch": 0.48228571428571426, "grad_norm": 1.4261200428009033, "kl": 0.274169921875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0109, "reward": 0.2754727974534035, "reward_std": 0.2123397421091795, "rewards/cosine_scaled_reward": 0.08445676229894161, "rewards/format_reward": 0.895833358168602, "step": 422 }, { "completion_length": 674.4583435058594, "epoch": 0.48342857142857143, "grad_norm": 0.9602965116500854, "kl": 0.1572265625, "learning_rate": 1.6346804638120098e-07, "loss": 0.0063, "reward": 0.22294003143906593, "reward_std": 0.09419110557064414, "rewards/cosine_scaled_reward": -0.03287830948829651, "rewards/format_reward": 0.9375000149011612, "step": 423 }, { "completion_length": 977.8333587646484, "epoch": 0.4845714285714286, "grad_norm": 1.9027003049850464, "kl": 0.476806640625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0191, "reward": 0.1344720460474491, "reward_std": 0.09721486736088991, "rewards/cosine_scaled_reward": -0.14570957981050014, "rewards/format_reward": 0.8125000298023224, "step": 424 }, { "completion_length": 909.5625152587891, "epoch": 0.4857142857142857, "grad_norm": 2.298877477645874, "kl": 0.2701416015625, "learning_rate": 1.6028856829700258e-07, "loss": 0.0108, "reward": 0.36824068054556847, "reward_std": 0.2004400845617056, "rewards/cosine_scaled_reward": 0.2859506346285343, "rewards/format_reward": 0.8541666865348816, "step": 425 }, { "completion_length": 742.1458435058594, "epoch": 0.4868571428571429, "grad_norm": 1.202286720275879, "kl": 0.2359619140625, "learning_rate": 1.5872728172265146e-07, "loss": 0.0094, "reward": 0.30135222524404526, "reward_std": 0.1660460252314806, "rewards/cosine_scaled_reward": 0.09177330369129777, "rewards/format_reward": 0.9791666716337204, "step": 426 }, { "completion_length": 1044.0833587646484, "epoch": 0.488, "grad_norm": 1.643343448638916, "kl": 0.37158203125, "learning_rate": 1.5718506522858572e-07, "loss": 0.0149, "reward": 0.2385825931560248, "reward_std": 0.18827398866415024, "rewards/cosine_scaled_reward": 0.0275964867323637, "rewards/format_reward": 0.8541666716337204, "step": 427 }, { "completion_length": 675.270866394043, "epoch": 0.48914285714285716, "grad_norm": 2.0458035469055176, "kl": 0.2125244140625, "learning_rate": 1.5566199398026147e-07, "loss": 0.0085, "reward": 0.19954436272382736, "reward_std": 0.1332020191475749, "rewards/cosine_scaled_reward": -0.05004553310573101, "rewards/format_reward": 0.8750000149011612, "step": 428 }, { "completion_length": 634.6041870117188, "epoch": 0.49028571428571427, "grad_norm": 2.4598546028137207, "kl": 0.2874755859375, "learning_rate": 1.5415814221002265e-07, "loss": 0.0115, "reward": 0.2507454603910446, "reward_std": 0.12321926653385162, "rewards/cosine_scaled_reward": 0.028900128789246082, "rewards/format_reward": 0.9166666865348816, "step": 429 }, { "completion_length": 656.1041870117188, "epoch": 0.49142857142857144, "grad_norm": 1.5537323951721191, "kl": 0.3310546875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0132, "reward": 0.2351117692887783, "reward_std": 0.09419954661279917, "rewards/cosine_scaled_reward": -0.009213123470544815, "rewards/format_reward": 0.9375000149011612, "step": 430 }, { "completion_length": 759.9375305175781, "epoch": 0.49257142857142855, "grad_norm": 1.788845419883728, "kl": 0.2548828125, "learning_rate": 1.5120838934595337e-07, "loss": 0.0102, "reward": 0.2387339137494564, "reward_std": 0.07598517555743456, "rewards/cosine_scaled_reward": -0.01977238431572914, "rewards/format_reward": 0.9583333432674408, "step": 431 }, { "completion_length": 960.8958892822266, "epoch": 0.4937142857142857, "grad_norm": 2.001716136932373, "kl": 0.4464111328125, "learning_rate": 1.4976263201891613e-07, "loss": 0.0178, "reward": 0.14512689411640167, "reward_std": 0.12307741120457649, "rewards/cosine_scaled_reward": -0.1353957038372755, "rewards/format_reward": 0.833333358168602, "step": 432 }, { "completion_length": 882.3958587646484, "epoch": 0.4948571428571429, "grad_norm": 1.8085757493972778, "kl": 0.4130859375, "learning_rate": 1.483363816965435e-07, "loss": 0.0165, "reward": 0.2680644765496254, "reward_std": 0.1378046851605177, "rewards/cosine_scaled_reward": 0.06896508717909455, "rewards/format_reward": 0.8958333432674408, "step": 433 }, { "completion_length": 1028.9583740234375, "epoch": 0.496, "grad_norm": 1.627824068069458, "kl": 0.48828125, "learning_rate": 1.469297078922642e-07, "loss": 0.0195, "reward": 0.15616464614868164, "reward_std": 0.09353521838784218, "rewards/cosine_scaled_reward": -0.1369161568582058, "rewards/format_reward": 0.8750000298023224, "step": 434 }, { "completion_length": 704.1458511352539, "epoch": 0.49714285714285716, "grad_norm": 1.4600574970245361, "kl": 0.2398681640625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0096, "reward": 0.21123821288347244, "reward_std": 0.0739213996566832, "rewards/cosine_scaled_reward": -0.07106086798012257, "rewards/format_reward": 0.9583333432674408, "step": 435 }, { "completion_length": 633.7083587646484, "epoch": 0.4982857142857143, "grad_norm": 1.1855252981185913, "kl": 0.1748046875, "learning_rate": 1.4417536311769885e-07, "loss": 0.007, "reward": 0.3384803608059883, "reward_std": 0.11635629087686539, "rewards/cosine_scaled_reward": 0.17041505128145218, "rewards/format_reward": 0.9791666716337204, "step": 436 }, { "completion_length": 989.9375305175781, "epoch": 0.49942857142857144, "grad_norm": 2.1023504734039307, "kl": 0.428955078125, "learning_rate": 1.4282782639029128e-07, "loss": 0.0172, "reward": 0.21580657176673412, "reward_std": 0.1318113338202238, "rewards/cosine_scaled_reward": 2.1494925022125244e-06, "rewards/format_reward": 0.833333358168602, "step": 437 }, { "completion_length": 838.8125152587891, "epoch": 0.5005714285714286, "grad_norm": 2.6563596725463867, "kl": 0.3740234375, "learning_rate": 1.4150013466019114e-07, "loss": 0.015, "reward": 0.19345827773213387, "reward_std": 0.11686964053660631, "rewards/cosine_scaled_reward": -0.08464339142665267, "rewards/format_reward": 0.9166666865348816, "step": 438 }, { "completion_length": 849.9166870117188, "epoch": 0.5017142857142857, "grad_norm": 2.2154955863952637, "kl": 0.329345703125, "learning_rate": 1.4019235263722034e-07, "loss": 0.0132, "reward": 0.25742702558636665, "reward_std": 0.05162803456187248, "rewards/cosine_scaled_reward": 0.028857290744781494, "rewards/format_reward": 0.9375000149011612, "step": 439 }, { "completion_length": 1011.0625305175781, "epoch": 0.5028571428571429, "grad_norm": 1.8645977973937988, "kl": 0.3740234375, "learning_rate": 1.3890454406082956e-07, "loss": 0.0149, "reward": 0.20875129848718643, "reward_std": 0.1203762274235487, "rewards/cosine_scaled_reward": -0.059487443417310715, "rewards/format_reward": 0.9166666865348816, "step": 440 }, { "completion_length": 1148.1458892822266, "epoch": 0.504, "grad_norm": 2.8735504150390625, "kl": 0.6162109375, "learning_rate": 1.3763677169699217e-07, "loss": 0.0247, "reward": 0.22919971123337746, "reward_std": 0.1521160490810871, "rewards/cosine_scaled_reward": -0.01628560572862625, "rewards/format_reward": 0.9166666865348816, "step": 441 }, { "completion_length": 601.2500152587891, "epoch": 0.5051428571428571, "grad_norm": 1.896121621131897, "kl": 0.26953125, "learning_rate": 1.3638909733514452e-07, "loss": 0.0108, "reward": 0.3021908104419708, "reward_std": 0.1476821806281805, "rewards/cosine_scaled_reward": 0.08549025654792786, "rewards/format_reward": 1.0, "step": 442 }, { "completion_length": 1717.5000305175781, "epoch": 0.5062857142857143, "grad_norm": 3.0878093242645264, "kl": 0.8271484375, "learning_rate": 1.351615817851748e-07, "loss": 0.0332, "reward": 0.1658440548926592, "reward_std": 0.17402243241667747, "rewards/cosine_scaled_reward": -0.097936000674963, "rewards/format_reward": 0.8333333432674408, "step": 443 }, { "completion_length": 1006.1250305175781, "epoch": 0.5074285714285715, "grad_norm": 1.3861217498779297, "kl": 0.4755859375, "learning_rate": 1.3395428487445914e-07, "loss": 0.019, "reward": 0.1939697340130806, "reward_std": 0.16821600310504436, "rewards/cosine_scaled_reward": -0.025334704667329788, "rewards/format_reward": 0.7916666865348816, "step": 444 }, { "completion_length": 1133.9167175292969, "epoch": 0.5085714285714286, "grad_norm": 1.6720504760742188, "kl": 0.55859375, "learning_rate": 1.3276726544494571e-07, "loss": 0.0224, "reward": 0.17472673952579498, "reward_std": 0.12209436483681202, "rewards/cosine_scaled_reward": -0.07853817380964756, "rewards/format_reward": 0.8333333432674408, "step": 445 }, { "completion_length": 939.3958740234375, "epoch": 0.5097142857142857, "grad_norm": 2.431069850921631, "kl": 0.3890380859375, "learning_rate": 1.316005813502869e-07, "loss": 0.0156, "reward": 0.3032803349196911, "reward_std": 0.11415216024033725, "rewards/cosine_scaled_reward": 0.15343379974365234, "rewards/format_reward": 0.8750000149011612, "step": 446 }, { "completion_length": 711.8958511352539, "epoch": 0.5108571428571429, "grad_norm": 1.5871496200561523, "kl": 0.314208984375, "learning_rate": 1.3045428945301953e-07, "loss": 0.0126, "reward": 0.28583668172359467, "reward_std": 0.11146066524088383, "rewards/cosine_scaled_reward": 0.08133360557258129, "rewards/format_reward": 0.9375000149011612, "step": 447 }, { "completion_length": 702.833366394043, "epoch": 0.512, "grad_norm": 1.9198870658874512, "kl": 0.271240234375, "learning_rate": 1.2932844562179352e-07, "loss": 0.0108, "reward": 0.2957136631011963, "reward_std": 0.1658521294593811, "rewards/cosine_scaled_reward": 0.1130102090537548, "rewards/format_reward": 0.9166666865348816, "step": 448 }, { "completion_length": 1004.0625305175781, "epoch": 0.5131428571428571, "grad_norm": 2.9622910022735596, "kl": 0.56982421875, "learning_rate": 1.2822310472864885e-07, "loss": 0.0228, "reward": 0.15535307675600052, "reward_std": 0.09953637048602104, "rewards/cosine_scaled_reward": -0.10826568305492401, "rewards/format_reward": 0.8125000298023224, "step": 449 }, { "completion_length": 1061.1041870117188, "epoch": 0.5142857142857142, "grad_norm": 4.076004981994629, "kl": 0.4691162109375, "learning_rate": 1.2713832064634125e-07, "loss": 0.0188, "reward": 0.2389738243073225, "reward_std": 0.06195556779857725, "rewards/cosine_scaled_reward": 0.014589065685868263, "rewards/format_reward": 0.8958333432674408, "step": 450 }, { "completion_length": 773.9791717529297, "epoch": 0.5154285714285715, "grad_norm": 1.3433425426483154, "kl": 0.2930908203125, "learning_rate": 1.260741462457165e-07, "loss": 0.0117, "reward": 0.2921364903450012, "reward_std": 0.14939963072538376, "rewards/cosine_scaled_reward": 0.1167010497301817, "rewards/format_reward": 0.8958333432674408, "step": 451 }, { "completion_length": 921.0417022705078, "epoch": 0.5165714285714286, "grad_norm": 1.3134700059890747, "kl": 0.49072265625, "learning_rate": 1.2503063339313356e-07, "loss": 0.0196, "reward": 0.3404446206986904, "reward_std": 0.10839604772627354, "rewards/cosine_scaled_reward": 0.17344675585627556, "rewards/format_reward": 0.9583333432674408, "step": 452 }, { "completion_length": 861.020866394043, "epoch": 0.5177142857142857, "grad_norm": 2.2251570224761963, "kl": 0.46728515625, "learning_rate": 1.2400783294793668e-07, "loss": 0.0187, "reward": 0.2773829586803913, "reward_std": 0.15733196586370468, "rewards/cosine_scaled_reward": 0.08839295897632837, "rewards/format_reward": 0.8958333432674408, "step": 453 }, { "completion_length": 851.1041717529297, "epoch": 0.5188571428571429, "grad_norm": 1.61007559299469, "kl": 0.38916015625, "learning_rate": 1.2300579475997657e-07, "loss": 0.0156, "reward": 0.19814873486757278, "reward_std": 0.0741408159956336, "rewards/cosine_scaled_reward": -0.07693859562277794, "rewards/format_reward": 0.9166666716337204, "step": 454 }, { "completion_length": 1000.7708435058594, "epoch": 0.52, "grad_norm": 1.8291538953781128, "kl": 0.419921875, "learning_rate": 1.220245676671809e-07, "loss": 0.0168, "reward": 0.23621458560228348, "reward_std": 0.0763126676902175, "rewards/cosine_scaled_reward": -0.024763476103544235, "rewards/format_reward": 0.9583333432674408, "step": 455 }, { "completion_length": 1324.5208435058594, "epoch": 0.5211428571428571, "grad_norm": 2.513134479522705, "kl": 0.638671875, "learning_rate": 1.2106419949317388e-07, "loss": 0.0255, "reward": 0.18161046132445335, "reward_std": 0.1733924150466919, "rewards/cosine_scaled_reward": -0.03243409004062414, "rewards/format_reward": 0.770833358168602, "step": 456 }, { "completion_length": 870.0208511352539, "epoch": 0.5222857142857142, "grad_norm": 1.5032035112380981, "kl": 0.3619384765625, "learning_rate": 1.2012473704494537e-07, "loss": 0.0145, "reward": 0.2208106629550457, "reward_std": 0.11814653314650059, "rewards/cosine_scaled_reward": -0.04481527768075466, "rewards/format_reward": 0.9375000149011612, "step": 457 }, { "completion_length": 705.3541870117188, "epoch": 0.5234285714285715, "grad_norm": 1.6593698263168335, "kl": 0.306396484375, "learning_rate": 1.1920622611056974e-07, "loss": 0.0122, "reward": 0.2111760675907135, "reward_std": 0.11810605227947235, "rewards/cosine_scaled_reward": -0.050768627785146236, "rewards/format_reward": 0.9166666716337204, "step": 458 }, { "completion_length": 812.6250152587891, "epoch": 0.5245714285714286, "grad_norm": 1.5582921504974365, "kl": 0.3636474609375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0145, "reward": 0.2737140394747257, "reward_std": 0.15337855741381645, "rewards/cosine_scaled_reward": 0.060518473386764526, "rewards/format_reward": 0.9375000149011612, "step": 459 }, { "completion_length": 1277.0625305175781, "epoch": 0.5257142857142857, "grad_norm": 2.547988176345825, "kl": 0.6357421875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0254, "reward": 0.2075492013245821, "reward_std": 0.1327127330005169, "rewards/cosine_scaled_reward": -0.03499909117817879, "rewards/format_reward": 0.8750000298023224, "step": 460 }, { "completion_length": 1054.395866394043, "epoch": 0.5268571428571428, "grad_norm": 2.893979549407959, "kl": 0.411865234375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0165, "reward": 0.2216620147228241, "reward_std": 0.10644234623759985, "rewards/cosine_scaled_reward": -0.020575307309627533, "rewards/format_reward": 0.8958333432674408, "step": 461 }, { "completion_length": 892.3958587646484, "epoch": 0.528, "grad_norm": 2.313384532928467, "kl": 0.351318359375, "learning_rate": 1.1574257748745986e-07, "loss": 0.0141, "reward": 0.2483520209789276, "reward_std": 0.10706382803618908, "rewards/cosine_scaled_reward": 0.003389929741388187, "rewards/format_reward": 0.9375000149011612, "step": 462 }, { "completion_length": 1179.1458587646484, "epoch": 0.5291428571428571, "grad_norm": 1.6139668226242065, "kl": 0.47802734375, "learning_rate": 1.1492947512799328e-07, "loss": 0.0192, "reward": 0.2772967256605625, "reward_std": 0.20775877684354782, "rewards/cosine_scaled_reward": 0.08868502452969551, "rewards/format_reward": 0.895833358168602, "step": 463 }, { "completion_length": 794.0417022705078, "epoch": 0.5302857142857142, "grad_norm": 3.3112828731536865, "kl": 0.3238525390625, "learning_rate": 1.1413757749211602e-07, "loss": 0.013, "reward": 0.4120164215564728, "reward_std": 0.08271403051912785, "rewards/cosine_scaled_reward": 0.2880483344197273, "rewards/format_reward": 1.0, "step": 464 }, { "completion_length": 1033.770851135254, "epoch": 0.5314285714285715, "grad_norm": 3.2056000232696533, "kl": 0.5601806640625, "learning_rate": 1.1336692317580158e-07, "loss": 0.0224, "reward": 0.21623021364212036, "reward_std": 0.1453854814171791, "rewards/cosine_scaled_reward": -0.020927468314766884, "rewards/format_reward": 0.8750000149011612, "step": 465 }, { "completion_length": 998.7291870117188, "epoch": 0.5325714285714286, "grad_norm": 3.7151010036468506, "kl": 0.403076171875, "learning_rate": 1.1261754973965422e-07, "loss": 0.0161, "reward": 0.29429249092936516, "reward_std": 0.18617680110037327, "rewards/cosine_scaled_reward": 0.1186856310814619, "rewards/format_reward": 0.895833358168602, "step": 466 }, { "completion_length": 886.3125152587891, "epoch": 0.5337142857142857, "grad_norm": 2.900324583053589, "kl": 0.359130859375, "learning_rate": 1.1188949370707787e-07, "loss": 0.0144, "reward": 0.1838582344353199, "reward_std": 0.08407878689467907, "rewards/cosine_scaled_reward": -0.10531684756278992, "rewards/format_reward": 0.9166666865348816, "step": 467 }, { "completion_length": 1013.1250305175781, "epoch": 0.5348571428571428, "grad_norm": 4.182558536529541, "kl": 0.511962890625, "learning_rate": 1.1118279056249653e-07, "loss": 0.0205, "reward": 0.22167166136205196, "reward_std": 0.16611425578594208, "rewards/cosine_scaled_reward": -0.009507376700639725, "rewards/format_reward": 0.8750000149011612, "step": 468 }, { "completion_length": 818.8541717529297, "epoch": 0.536, "grad_norm": 2.5078601837158203, "kl": 0.44287109375, "learning_rate": 1.1049747474962444e-07, "loss": 0.0177, "reward": 0.2699749916791916, "reward_std": 0.14100558124482632, "rewards/cosine_scaled_reward": 0.04105002619326115, "rewards/format_reward": 0.9583333432674408, "step": 469 }, { "completion_length": 862.7500457763672, "epoch": 0.5371428571428571, "grad_norm": 4.434298515319824, "kl": 0.3955078125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0158, "reward": 0.2209617905318737, "reward_std": 0.12931668665260077, "rewards/cosine_scaled_reward": -0.040073212236166, "rewards/format_reward": 0.9166666865348816, "step": 470 }, { "completion_length": 815.1458435058594, "epoch": 0.5382857142857143, "grad_norm": 2.6913864612579346, "kl": 0.518310546875, "learning_rate": 1.0919113768029517e-07, "loss": 0.0207, "reward": 0.32279589399695396, "reward_std": 0.13653478678315878, "rewards/cosine_scaled_reward": 0.16807591170072556, "rewards/format_reward": 0.9166666865348816, "step": 471 }, { "completion_length": 1087.3125457763672, "epoch": 0.5394285714285715, "grad_norm": 5.872448921203613, "kl": 0.68798828125, "learning_rate": 1.0857018009286381e-07, "loss": 0.0275, "reward": 0.23642034083604813, "reward_std": 0.1380892600864172, "rewards/cosine_scaled_reward": -0.015175793319940567, "rewards/format_reward": 0.9375000149011612, "step": 472 }, { "completion_length": 912.5208892822266, "epoch": 0.5405714285714286, "grad_norm": 4.524196624755859, "kl": 0.4337158203125, "learning_rate": 1.0797073717209013e-07, "loss": 0.0174, "reward": 0.2329195737838745, "reward_std": 0.07549590291455388, "rewards/cosine_scaled_reward": -0.0402354933321476, "rewards/format_reward": 0.9791666716337204, "step": 473 }, { "completion_length": 1044.1458587646484, "epoch": 0.5417142857142857, "grad_norm": 4.339442253112793, "kl": 0.6923828125, "learning_rate": 1.0739283813397639e-07, "loss": 0.0277, "reward": 0.38777855038642883, "reward_std": 0.1328403279185295, "rewards/cosine_scaled_reward": 0.2592271640896797, "rewards/format_reward": 0.9583333432674408, "step": 474 }, { "completion_length": 1066.0625305175781, "epoch": 0.5428571428571428, "grad_norm": 4.152096748352051, "kl": 0.771484375, "learning_rate": 1.068365111445064e-07, "loss": 0.0308, "reward": 0.21325199492275715, "reward_std": 0.1212292481213808, "rewards/cosine_scaled_reward": -0.04740248993039131, "rewards/format_reward": 0.9166666716337204, "step": 475 }, { "completion_length": 723.2500152587891, "epoch": 0.544, "grad_norm": 3.75915265083313, "kl": 0.3544921875, "learning_rate": 1.063017833182728e-07, "loss": 0.0142, "reward": 0.2565084397792816, "reward_std": 0.15998263470828533, "rewards/cosine_scaled_reward": 0.02486741030588746, "rewards/format_reward": 0.9375000149011612, "step": 476 }, { "completion_length": 1109.4792022705078, "epoch": 0.5451428571428572, "grad_norm": 5.251732349395752, "kl": 0.932373046875, "learning_rate": 1.0578868071715544e-07, "loss": 0.0374, "reward": 0.33160366117954254, "reward_std": 0.1636774968355894, "rewards/cosine_scaled_reward": 0.18443169072270393, "rewards/format_reward": 0.895833358168602, "step": 477 }, { "completion_length": 891.1666717529297, "epoch": 0.5462857142857143, "grad_norm": 19.40025520324707, "kl": 1.05810546875, "learning_rate": 1.0529722834905125e-07, "loss": 0.0423, "reward": 0.27620090916752815, "reward_std": 0.09143605083227158, "rewards/cosine_scaled_reward": 0.054348638746887445, "rewards/format_reward": 0.9583333432674408, "step": 478 }, { "completion_length": 1093.7083892822266, "epoch": 0.5474285714285714, "grad_norm": 2.574824333190918, "kl": 0.957763671875, "learning_rate": 1.0482745016665526e-07, "loss": 0.0384, "reward": 0.23002658411860466, "reward_std": 0.09741083486005664, "rewards/cosine_scaled_reward": -0.05071627616416663, "rewards/format_reward": 0.9791666716337204, "step": 479 }, { "completion_length": 943.3125305175781, "epoch": 0.5485714285714286, "grad_norm": 5.030829906463623, "kl": 1.1236572265625, "learning_rate": 1.0437936906629334e-07, "loss": 0.0449, "reward": 0.25394725054502487, "reward_std": 0.09345280891284347, "rewards/cosine_scaled_reward": 0.010230874177068472, "rewards/format_reward": 0.9583333432674408, "step": 480 }, { "completion_length": 858.2916870117188, "epoch": 0.5497142857142857, "grad_norm": 9.557243347167969, "kl": 0.66064453125, "learning_rate": 1.0395300688680625e-07, "loss": 0.0264, "reward": 0.18073679506778717, "reward_std": 0.09469743259251118, "rewards/cosine_scaled_reward": -0.11896948399953544, "rewards/format_reward": 0.9375000149011612, "step": 481 }, { "completion_length": 928.4791870117188, "epoch": 0.5508571428571428, "grad_norm": 7.392604827880859, "kl": 0.98583984375, "learning_rate": 1.0354838440848501e-07, "loss": 0.0394, "reward": 0.3079821467399597, "reward_std": 0.17601807694882154, "rewards/cosine_scaled_reward": 0.12785004265606403, "rewards/format_reward": 0.9375000149011612, "step": 482 }, { "completion_length": 999.3125381469727, "epoch": 0.552, "grad_norm": 12.308825492858887, "kl": 0.9666748046875, "learning_rate": 1.0316552135205837e-07, "loss": 0.0386, "reward": 0.23309217020869255, "reward_std": 0.12896554730832577, "rewards/cosine_scaled_reward": -0.023963168496266007, "rewards/format_reward": 0.9375000149011612, "step": 483 }, { "completion_length": 532.5208587646484, "epoch": 0.5531428571428572, "grad_norm": 0.9649083018302917, "kl": 0.09912109375, "learning_rate": 1.0280443637773163e-07, "loss": 0.004, "reward": 0.26917488500475883, "reward_std": 0.1440052017569542, "rewards/cosine_scaled_reward": 0.0880583431571722, "rewards/format_reward": 0.8750000149011612, "step": 484 }, { "completion_length": 516.5625152587891, "epoch": 0.5542857142857143, "grad_norm": 2.671757936477661, "kl": 0.1514892578125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0061, "reward": 0.238181222230196, "reward_std": 0.11508800461888313, "rewards/cosine_scaled_reward": -0.026438767090439796, "rewards/format_reward": 0.9583333432674408, "step": 485 }, { "completion_length": 508.0833511352539, "epoch": 0.5554285714285714, "grad_norm": 3.1047403812408447, "kl": 0.2269287109375, "learning_rate": 1.0214767000817596e-07, "loss": 0.0091, "reward": 0.34923529624938965, "reward_std": 0.1248536161147058, "rewards/cosine_scaled_reward": 0.1781093254685402, "rewards/format_reward": 1.0, "step": 486 }, { "completion_length": 1057.6875457763672, "epoch": 0.5565714285714286, "grad_norm": 26.792709350585938, "kl": 1.476806640625, "learning_rate": 1.0185202062281336e-07, "loss": 0.0591, "reward": 0.2871815077960491, "reward_std": 0.19810123462229967, "rewards/cosine_scaled_reward": 0.1576450327411294, "rewards/format_reward": 0.7916666716337204, "step": 487 }, { "completion_length": 548.0625152587891, "epoch": 0.5577142857142857, "grad_norm": 3.7524454593658447, "kl": 0.1658935546875, "learning_rate": 1.0157821333772304e-07, "loss": 0.0067, "reward": 0.2495635598897934, "reward_std": 0.07641960121691227, "rewards/cosine_scaled_reward": 0.009819753468036652, "rewards/format_reward": 0.9375000149011612, "step": 488 }, { "completion_length": 996.6250152587891, "epoch": 0.5588571428571428, "grad_norm": 9.453791618347168, "kl": 1.5107421875, "learning_rate": 1.013262614978859e-07, "loss": 0.0605, "reward": 0.2142932452261448, "reward_std": 0.12809766456484795, "rewards/cosine_scaled_reward": -0.02317894995212555, "rewards/format_reward": 0.8750000298023224, "step": 489 }, { "completion_length": 986.8125305175781, "epoch": 0.56, "grad_norm": 7.285546779632568, "kl": 0.8212890625, "learning_rate": 1.0109617738307911e-07, "loss": 0.0329, "reward": 0.2442498840391636, "reward_std": 0.13104667328298092, "rewards/cosine_scaled_reward": 0.01568269287236035, "rewards/format_reward": 0.9166666716337204, "step": 490 }, { "completion_length": 1420.2083587646484, "epoch": 0.5611428571428572, "grad_norm": 22.101282119750977, "kl": 1.87841796875, "learning_rate": 1.0088797220727779e-07, "loss": 0.0753, "reward": 0.21484718471765518, "reward_std": 0.11170701449736953, "rewards/cosine_scaled_reward": -0.022915068548172712, "rewards/format_reward": 0.8750000149011612, "step": 491 }, { "completion_length": 910.9791984558105, "epoch": 0.5622857142857143, "grad_norm": 3.313934803009033, "kl": 0.6575927734375, "learning_rate": 1.0070165611810855e-07, "loss": 0.0263, "reward": 0.2877778969705105, "reward_std": 0.10931823030114174, "rewards/cosine_scaled_reward": 0.10159274004399776, "rewards/format_reward": 0.9166666716337204, "step": 492 }, { "completion_length": 837.1666870117188, "epoch": 0.5634285714285714, "grad_norm": 2.4096004962921143, "kl": 0.582763671875, "learning_rate": 1.005372381963547e-07, "loss": 0.0234, "reward": 0.2918214835226536, "reward_std": 0.17298330925405025, "rewards/cosine_scaled_reward": 0.06514766626060009, "rewards/format_reward": 1.0, "step": 493 }, { "completion_length": 744.5833587646484, "epoch": 0.5645714285714286, "grad_norm": 2.4392518997192383, "kl": 0.5224609375, "learning_rate": 1.0039472645551372e-07, "loss": 0.0209, "reward": 0.2916877344250679, "reward_std": 0.17531706020236015, "rewards/cosine_scaled_reward": 0.07210396369919181, "rewards/format_reward": 0.9791666716337204, "step": 494 }, { "completion_length": 989.5416870117188, "epoch": 0.5657142857142857, "grad_norm": 10.89637279510498, "kl": 1.00146484375, "learning_rate": 1.002741278414069e-07, "loss": 0.0401, "reward": 0.2885359302163124, "reward_std": 0.18854881078004837, "rewards/cosine_scaled_reward": 0.09755662828683853, "rewards/format_reward": 0.9166666865348816, "step": 495 }, { "completion_length": 645.6666793823242, "epoch": 0.5668571428571428, "grad_norm": 3.9345898628234863, "kl": 0.3958740234375, "learning_rate": 1.0017544823184055e-07, "loss": 0.0158, "reward": 0.30592789500951767, "reward_std": 0.10496470844373107, "rewards/cosine_scaled_reward": 0.11379924323409796, "rewards/format_reward": 0.9375, "step": 496 }, { "completion_length": 859.6875305175781, "epoch": 0.568, "grad_norm": 3.6183197498321533, "kl": 0.384765625, "learning_rate": 1.0009869243631952e-07, "loss": 0.0154, "reward": 0.39471010118722916, "reward_std": 0.09985048417001963, "rewards/cosine_scaled_reward": 0.25956878811120987, "rewards/format_reward": 1.0, "step": 497 }, { "completion_length": 910.5000152587891, "epoch": 0.5691428571428572, "grad_norm": 3.7000491619110107, "kl": 0.6484375, "learning_rate": 1.000438641958131e-07, "loss": 0.0259, "reward": 0.26462262123823166, "reward_std": 0.13342786859720945, "rewards/cosine_scaled_reward": 0.042564581613987684, "rewards/format_reward": 0.9375000149011612, "step": 498 }, { "completion_length": 1140.8542022705078, "epoch": 0.5702857142857143, "grad_norm": 8.791346549987793, "kl": 1.2587890625, "learning_rate": 1.0001096618257236e-07, "loss": 0.0503, "reward": 0.23132820427417755, "reward_std": 0.14284729957580566, "rewards/cosine_scaled_reward": -0.013288982212543488, "rewards/format_reward": 0.9166666865348816, "step": 499 }, { "completion_length": 724.208366394043, "epoch": 0.5714285714285714, "grad_norm": 16.891372680664062, "kl": 0.3909912109375, "learning_rate": 1e-07, "loss": 0.0156, "reward": 0.22501353546977043, "reward_std": 0.16359347105026245, "rewards/cosine_scaled_reward": -0.008939757943153381, "rewards/format_reward": 0.8750000149011612, "step": 500 }, { "completion_length": 1138.708366394043, "epoch": 0.5725714285714286, "grad_norm": 1.5487465858459473, "kl": 0.29150390625, "learning_rate": 1.0001096618257236e-07, "loss": 0.0117, "reward": 0.21772483922541142, "reward_std": 0.14172286912798882, "rewards/cosine_scaled_reward": -0.009133230894804, "rewards/format_reward": 0.8333333432674408, "step": 501 }, { "epoch": 0.5725714285714286, "step": 501, "total_flos": 0.0, "train_loss": 2.328130582016623e-05, "train_runtime": 113.9487, "train_samples_per_second": 210.621, "train_steps_per_second": 4.388 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }