{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 3001.9584350585938, "epoch": 0.001142857142857143, "grad_norm": 0.18922260403633118, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, "reward": -0.010712452232837677, "reward_std": 0.48354096710681915, "rewards/cosine_scaled_reward": -0.1928562317043543, "rewards/format_reward": 0.37500000558793545, "step": 1 }, { "completion_length": 2822.541717529297, "epoch": 0.002285714285714286, "grad_norm": 0.28424975275993347, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0, "reward": 0.4385625521535985, "reward_std": 0.8208381980657578, "rewards/cosine_scaled_reward": -0.009885392151772976, "rewards/format_reward": 0.4583333432674408, "step": 2 }, { "completion_length": 2882.4166870117188, "epoch": 0.0034285714285714284, "grad_norm": 0.18410934507846832, "kl": 3.517171717248857e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.291525443084538, "reward_std": 0.3761885389685631, "rewards/cosine_scaled_reward": -0.27076271921396255, "rewards/format_reward": 0.25, "step": 3 }, { "completion_length": 3245.416748046875, "epoch": 0.004571428571428572, "grad_norm": 0.16615347564220428, "kl": 2.9280781745910645e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.25264428183436394, "reward_std": 0.4561151713132858, "rewards/cosine_scaled_reward": -0.24090547114610672, "rewards/format_reward": 0.22916667349636555, "step": 4 }, { "completion_length": 2911.3334350585938, "epoch": 0.005714285714285714, "grad_norm": 0.21166956424713135, "kl": 3.331899642944336e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.8400040492415428, "reward_std": 0.885560505092144, "rewards/cosine_scaled_reward": 0.18041866831481457, "rewards/format_reward": 0.479166679084301, "step": 5 }, { "completion_length": 2720.89599609375, "epoch": 0.006857142857142857, "grad_norm": 0.23326514661312103, "kl": 4.035234451293945e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": 0.2041575275361538, "reward_std": 0.6658071056008339, "rewards/cosine_scaled_reward": -0.11667125090025365, "rewards/format_reward": 0.4375, "step": 6 }, { "completion_length": 2360.6458740234375, "epoch": 0.008, "grad_norm": 0.2280312329530716, "kl": 1.850724220275879e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.7341702952980995, "reward_std": 0.44598812609910965, "rewards/cosine_scaled_reward": 0.09625181555747986, "rewards/format_reward": 0.5416666865348816, "step": 7 }, { "completion_length": 2888.8750610351562, "epoch": 0.009142857142857144, "grad_norm": 0.20181676745414734, "kl": 2.9474496841430664e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": -0.007411351427435875, "reward_std": 0.6588219478726387, "rewards/cosine_scaled_reward": -0.2016223482787609, "rewards/format_reward": 0.3958333395421505, "step": 8 }, { "completion_length": 3309.8541870117188, "epoch": 0.010285714285714285, "grad_norm": 0.18089492619037628, "kl": 4.0411949157714844e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 0.08656559139490128, "reward_std": 0.7023323476314545, "rewards/cosine_scaled_reward": -0.060883864760398865, "rewards/format_reward": 0.2083333358168602, "step": 9 }, { "completion_length": 2354.729217529297, "epoch": 0.011428571428571429, "grad_norm": 0.2087497115135193, "kl": 3.822147846221924e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.43211155757308006, "reward_std": 0.7549905180931091, "rewards/cosine_scaled_reward": -0.054777566343545914, "rewards/format_reward": 0.5416666679084301, "step": 10 }, { "completion_length": 2589.041778564453, "epoch": 0.012571428571428572, "grad_norm": 0.2392556220293045, "kl": 4.547834396362305e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": 0.570576427038759, "reward_std": 1.0850151628255844, "rewards/cosine_scaled_reward": -0.006378462538123131, "rewards/format_reward": 0.5833333432674408, "step": 11 }, { "completion_length": 2472.4375610351562, "epoch": 0.013714285714285714, "grad_norm": 0.2759428322315216, "kl": 2.5950372219085693e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.7166856527328491, "reward_std": 0.7806050479412079, "rewards/cosine_scaled_reward": 0.056259457021951675, "rewards/format_reward": 0.6041666716337204, "step": 12 }, { "completion_length": 2164.854232788086, "epoch": 0.014857142857142857, "grad_norm": 0.1908554881811142, "kl": 2.492964267730713e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.7641210779547691, "reward_std": 0.6774896830320358, "rewards/cosine_scaled_reward": 0.10081052035093307, "rewards/format_reward": 0.5625, "step": 13 }, { "completion_length": 2820.2501220703125, "epoch": 0.016, "grad_norm": 0.18801091611385345, "kl": 3.5822391510009766e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": -0.07261240109801292, "reward_std": 0.6130082383751869, "rewards/cosine_scaled_reward": -0.2133895456790924, "rewards/format_reward": 0.3541666753590107, "step": 14 }, { "completion_length": 3089.104248046875, "epoch": 0.017142857142857144, "grad_norm": 0.19688484072685242, "kl": 3.3468008041381836e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.27143352539860643, "reward_std": 0.8015548288822174, "rewards/cosine_scaled_reward": 0.0003000907599925995, "rewards/format_reward": 0.2708333469927311, "step": 15 }, { "completion_length": 2362.6876220703125, "epoch": 0.018285714285714287, "grad_norm": 0.24537329375743866, "kl": 2.1159648895263672e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": 0.7649998441338539, "reward_std": 1.021081954240799, "rewards/cosine_scaled_reward": 0.11166658625006676, "rewards/format_reward": 0.5416666865348816, "step": 16 }, { "completion_length": 3128.25, "epoch": 0.019428571428571427, "grad_norm": 0.20609241724014282, "kl": 4.242360591888428e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": -0.19378644227981567, "reward_std": 0.5115947872400284, "rewards/cosine_scaled_reward": -0.21147656068205833, "rewards/format_reward": 0.2291666716337204, "step": 17 }, { "completion_length": 2980.9793090820312, "epoch": 0.02057142857142857, "grad_norm": 0.2714909315109253, "kl": 3.966689109802246e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.022395100444555283, "reward_std": 0.6723635420203209, "rewards/cosine_scaled_reward": -0.16744754649698734, "rewards/format_reward": 0.3125000037252903, "step": 18 }, { "completion_length": 3212.604248046875, "epoch": 0.021714285714285714, "grad_norm": 0.17160819470882416, "kl": 3.719329833984375e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.07102994620800018, "reward_std": 0.8850104063749313, "rewards/cosine_scaled_reward": -0.1207350417971611, "rewards/format_reward": 0.31250001303851604, "step": 19 }, { "completion_length": 2613.6041870117188, "epoch": 0.022857142857142857, "grad_norm": 0.24837209284305573, "kl": 3.3915042877197266e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.33390188589692116, "reward_std": 0.713263601064682, "rewards/cosine_scaled_reward": -0.08304904773831367, "rewards/format_reward": 0.5000000149011612, "step": 20 }, { "completion_length": 2582.041748046875, "epoch": 0.024, "grad_norm": 0.2642858326435089, "kl": 2.1037645637989044e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.2965797185897827, "reward_std": 0.5356749221682549, "rewards/cosine_scaled_reward": -0.06004347978159785, "rewards/format_reward": 0.41666667722165585, "step": 21 }, { "completion_length": 3307.6459350585938, "epoch": 0.025142857142857144, "grad_norm": 0.22419147193431854, "kl": 4.1961669921875e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.25835999101400375, "reward_std": 1.1261206567287445, "rewards/cosine_scaled_reward": -0.04790334962308407, "rewards/format_reward": 0.3541666716337204, "step": 22 }, { "completion_length": 3249.1876220703125, "epoch": 0.026285714285714287, "grad_norm": 0.19173863530158997, "kl": 4.4405460357666016e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.27271851897239685, "reward_std": 0.7990642189979553, "rewards/cosine_scaled_reward": -0.04072406329214573, "rewards/format_reward": 0.3541666828095913, "step": 23 }, { "completion_length": 2154.25, "epoch": 0.027428571428571427, "grad_norm": 0.259212851524353, "kl": 1.8768012523651123e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.5731075070798397, "reward_std": 0.8421577215194702, "rewards/cosine_scaled_reward": -0.02594624925404787, "rewards/format_reward": 0.6250000074505806, "step": 24 }, { "completion_length": 2784.7916870117188, "epoch": 0.02857142857142857, "grad_norm": 0.29162946343421936, "kl": 3.090500831604004e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.1705078724771738, "reward_std": 0.6685621440410614, "rewards/cosine_scaled_reward": -0.08141273260116577, "rewards/format_reward": 0.3333333358168602, "step": 25 }, { "completion_length": 3185.729248046875, "epoch": 0.029714285714285714, "grad_norm": 0.15754370391368866, "kl": 2.549588680267334e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.13516026688739657, "reward_std": 0.6272664293646812, "rewards/cosine_scaled_reward": -0.05741987004876137, "rewards/format_reward": 0.2500000111758709, "step": 26 }, { "completion_length": 3129.2083740234375, "epoch": 0.030857142857142857, "grad_norm": 0.16376672685146332, "kl": 2.86102294921875e-05, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.0651654414832592, "reward_std": 0.5805819556117058, "rewards/cosine_scaled_reward": -0.08200062438845634, "rewards/format_reward": 0.22916667722165585, "step": 27 }, { "completion_length": 3173.4791870117188, "epoch": 0.032, "grad_norm": 0.2187095433473587, "kl": 3.802776336669922e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.1244891807436943, "reward_std": 0.8137174546718597, "rewards/cosine_scaled_reward": -0.07317209523171186, "rewards/format_reward": 0.27083334885537624, "step": 28 }, { "completion_length": 3206.0208740234375, "epoch": 0.03314285714285714, "grad_norm": 0.15626287460327148, "kl": 1.7024576663970947e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.0882865646854043, "reward_std": 0.6182056441903114, "rewards/cosine_scaled_reward": -0.13789328234270215, "rewards/format_reward": 0.1875000111758709, "step": 29 }, { "completion_length": 3293.979248046875, "epoch": 0.03428571428571429, "grad_norm": 0.176454558968544, "kl": 2.8930604457855225e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.12017922103404999, "reward_std": 0.8002806901931763, "rewards/cosine_scaled_reward": -0.10657705180346966, "rewards/format_reward": 0.33333334885537624, "step": 30 }, { "completion_length": 2556.0625610351562, "epoch": 0.03542857142857143, "grad_norm": 0.2976699471473694, "kl": 3.6090612411499023e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": 0.13020960986614227, "reward_std": 0.5589020624756813, "rewards/cosine_scaled_reward": -0.15364519506692886, "rewards/format_reward": 0.43750001303851604, "step": 31 }, { "completion_length": 3466.125, "epoch": 0.036571428571428574, "grad_norm": 0.15761366486549377, "kl": 3.0994415283203125e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.34774322621524334, "reward_std": 0.4613388404250145, "rewards/cosine_scaled_reward": -0.22595495358109474, "rewards/format_reward": 0.1041666679084301, "step": 32 }, { "completion_length": 3078.729248046875, "epoch": 0.037714285714285714, "grad_norm": 0.17744146287441254, "kl": 1.9311904907226562e-05, "learning_rate": 6.6e-07, "loss": 0.0, "reward": 0.1523735709488392, "reward_std": 0.7702403217554092, "rewards/cosine_scaled_reward": -0.10089654847979546, "rewards/format_reward": 0.354166679084301, "step": 33 }, { "completion_length": 3068.2083740234375, "epoch": 0.038857142857142854, "grad_norm": 0.2183830887079239, "kl": 1.940131187438965e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.029434625059366226, "reward_std": 0.7817529812455177, "rewards/cosine_scaled_reward": -0.1415326923597604, "rewards/format_reward": 0.3125000111758709, "step": 34 }, { "completion_length": 3028.916748046875, "epoch": 0.04, "grad_norm": 0.1779097616672516, "kl": 2.0578503608703613e-05, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.2327469252049923, "reward_std": 0.9538670182228088, "rewards/cosine_scaled_reward": -0.09195987693965435, "rewards/format_reward": 0.41666667349636555, "step": 35 }, { "completion_length": 2689.3959350585938, "epoch": 0.04114285714285714, "grad_norm": 0.16330143809318542, "kl": 4.976987838745117e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": 0.5622920989990234, "reward_std": 0.39920446276664734, "rewards/cosine_scaled_reward": 0.07281268946826458, "rewards/format_reward": 0.4166666716337204, "step": 36 }, { "completion_length": 2801.6459045410156, "epoch": 0.04228571428571429, "grad_norm": 0.19838035106658936, "kl": 4.4226646423339844e-05, "learning_rate": 7.4e-07, "loss": 0.0, "reward": 0.41371238604187965, "reward_std": 0.575165145099163, "rewards/cosine_scaled_reward": -0.07439382094889879, "rewards/format_reward": 0.5625000149011612, "step": 37 }, { "completion_length": 3009.5000610351562, "epoch": 0.04342857142857143, "grad_norm": 0.1800134778022766, "kl": 6.097555160522461e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": 0.12262389855459332, "reward_std": 0.6652626991271973, "rewards/cosine_scaled_reward": -0.15743806213140488, "rewards/format_reward": 0.4375000149011612, "step": 38 }, { "completion_length": 3139.604248046875, "epoch": 0.044571428571428574, "grad_norm": 0.23411938548088074, "kl": 4.2323023080825806e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.24589091911911964, "reward_std": 0.8911770880222321, "rewards/cosine_scaled_reward": -0.07497121207416058, "rewards/format_reward": 0.3958333395421505, "step": 39 }, { "completion_length": 3011.8958740234375, "epoch": 0.045714285714285714, "grad_norm": 0.1625184565782547, "kl": 4.693865776062012e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.12772860191762447, "reward_std": 0.7576778829097748, "rewards/cosine_scaled_reward": -0.12363571301102638, "rewards/format_reward": 0.3750000037252903, "step": 40 }, { "completion_length": 3124.2500610351562, "epoch": 0.046857142857142854, "grad_norm": 0.17387458682060242, "kl": 1.8164515495300293e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": 0.21644378546625376, "reward_std": 0.6694483831524849, "rewards/cosine_scaled_reward": -0.06886144913733006, "rewards/format_reward": 0.354166679084301, "step": 41 }, { "completion_length": 2182.5000610351562, "epoch": 0.048, "grad_norm": 0.34351351857185364, "kl": 0.00022931396961212158, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": 0.6580724753439426, "reward_std": 0.8123672902584076, "rewards/cosine_scaled_reward": 0.016536223702132702, "rewards/format_reward": 0.6250000111758709, "step": 42 }, { "completion_length": 2828.7709350585938, "epoch": 0.04914285714285714, "grad_norm": 0.19178950786590576, "kl": 3.0837953090667725e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": 0.31538891326636076, "reward_std": 0.8877717405557632, "rewards/cosine_scaled_reward": -0.06105554662644863, "rewards/format_reward": 0.4375000149011612, "step": 43 }, { "completion_length": 3036.6250610351562, "epoch": 0.05028571428571429, "grad_norm": 0.22349753975868225, "kl": 0.0003269314765930176, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": -0.01474527781829238, "reward_std": 0.5097765326499939, "rewards/cosine_scaled_reward": -0.15320597402751446, "rewards/format_reward": 0.291666679084301, "step": 44 }, { "completion_length": 2778.5000610351562, "epoch": 0.05142857142857143, "grad_norm": 0.18280261754989624, "kl": 5.6609511375427246e-05, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.8612850233912468, "reward_std": 1.1412108689546585, "rewards/cosine_scaled_reward": 0.13897587358951569, "rewards/format_reward": 0.583333358168602, "step": 45 }, { "completion_length": 2998.229248046875, "epoch": 0.052571428571428575, "grad_norm": 0.16858145594596863, "kl": 5.383044481277466e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": 0.28540395572781563, "reward_std": 0.43213801458477974, "rewards/cosine_scaled_reward": -0.02396468259394169, "rewards/format_reward": 0.3333333358168602, "step": 46 }, { "completion_length": 2551.6458587646484, "epoch": 0.053714285714285714, "grad_norm": 0.23799559473991394, "kl": 9.156018495559692e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.44500812888145447, "reward_std": 0.783466711640358, "rewards/cosine_scaled_reward": -0.04832928255200386, "rewards/format_reward": 0.5416666865348816, "step": 47 }, { "completion_length": 2939.1041870117188, "epoch": 0.054857142857142854, "grad_norm": 0.18564291298389435, "kl": 0.00010335445404052734, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.34564049541950226, "reward_std": 0.9494538530707359, "rewards/cosine_scaled_reward": -0.02509642019867897, "rewards/format_reward": 0.3958333395421505, "step": 48 }, { "completion_length": 2282.895835876465, "epoch": 0.056, "grad_norm": 0.23054172098636627, "kl": 0.00024145841598510742, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.44885391741991043, "reward_std": 0.7631752789020538, "rewards/cosine_scaled_reward": -0.06723971478641033, "rewards/format_reward": 0.5833333358168602, "step": 49 }, { "completion_length": 2204.1041870117188, "epoch": 0.05714285714285714, "grad_norm": 0.29597678780555725, "kl": 0.0005988925695419312, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.5718545913696289, "reward_std": 0.6146985068917274, "rewards/cosine_scaled_reward": 0.02551062125712633, "rewards/format_reward": 0.5208333432674408, "step": 50 }, { "completion_length": 3201.6666870117188, "epoch": 0.05828571428571429, "grad_norm": 0.15334689617156982, "kl": 8.338689804077148e-05, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.14526839554309845, "reward_std": 0.8655073121190071, "rewards/cosine_scaled_reward": -0.07319913152605295, "rewards/format_reward": 0.2916666716337204, "step": 51 }, { "completion_length": 3215.0000610351562, "epoch": 0.05942857142857143, "grad_norm": 0.17531076073646545, "kl": 0.0001531541347503662, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": -0.015788130462169647, "reward_std": 0.7165202274918556, "rewards/cosine_scaled_reward": -0.13289407594129443, "rewards/format_reward": 0.2500000074505806, "step": 52 }, { "completion_length": 2991.8125610351562, "epoch": 0.060571428571428575, "grad_norm": 0.28014782071113586, "kl": 0.000295490026473999, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": -0.08945630304515362, "reward_std": 0.6164149194955826, "rewards/cosine_scaled_reward": -0.1905614770948887, "rewards/format_reward": 0.2916666828095913, "step": 53 }, { "completion_length": 2437.4375, "epoch": 0.061714285714285715, "grad_norm": 0.21041618287563324, "kl": 0.00014019012451171875, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.08023202046751976, "reward_std": 0.43379058688879013, "rewards/cosine_scaled_reward": -0.18905067443847656, "rewards/format_reward": 0.4583333432674408, "step": 54 }, { "completion_length": 3115.2501220703125, "epoch": 0.06285714285714286, "grad_norm": 0.18965038657188416, "kl": 0.00021153688430786133, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.14397013932466507, "reward_std": 0.7335802316665649, "rewards/cosine_scaled_reward": -0.07384827360510826, "rewards/format_reward": 0.291666679084301, "step": 55 }, { "completion_length": 3097.0625, "epoch": 0.064, "grad_norm": 0.1861303448677063, "kl": 0.0006959438323974609, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.21384139358997345, "reward_std": 0.7854617610573769, "rewards/cosine_scaled_reward": -0.038912639021873474, "rewards/format_reward": 0.29166667722165585, "step": 56 }, { "completion_length": 2959.479248046875, "epoch": 0.06514285714285714, "grad_norm": 0.17092932760715485, "kl": 0.0004864931106567383, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.0896548442542553, "reward_std": 0.597286906093359, "rewards/cosine_scaled_reward": -0.22191076539456844, "rewards/format_reward": 0.3541666716337204, "step": 57 }, { "completion_length": 3173.9583740234375, "epoch": 0.06628571428571428, "grad_norm": 0.16764329373836517, "kl": 0.0009320974349975586, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": -0.259409268386662, "reward_std": 0.41252440214157104, "rewards/cosine_scaled_reward": -0.22345462441444397, "rewards/format_reward": 0.1875, "step": 58 }, { "completion_length": 2903.4375610351562, "epoch": 0.06742857142857143, "grad_norm": 0.2438689023256302, "kl": 0.0004966259002685547, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.5037799216806889, "reward_std": 0.6180380508303642, "rewards/cosine_scaled_reward": 0.05397331342101097, "rewards/format_reward": 0.39583334513008595, "step": 59 }, { "completion_length": 2846.9375610351562, "epoch": 0.06857142857142857, "grad_norm": 0.19560052454471588, "kl": 0.0006766319274902344, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.619435504078865, "reward_std": 0.54927659034729, "rewards/cosine_scaled_reward": 0.12221772782504559, "rewards/format_reward": 0.3750000149011612, "step": 60 }, { "completion_length": 2432.729248046875, "epoch": 0.06971428571428571, "grad_norm": 0.18966424465179443, "kl": 0.00019347667694091797, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.8035399168729782, "reward_std": 0.6529746800661087, "rewards/cosine_scaled_reward": 0.12051995098590851, "rewards/format_reward": 0.5625000149011612, "step": 61 }, { "completion_length": 3356.4583740234375, "epoch": 0.07085714285714285, "grad_norm": 0.1480625420808792, "kl": 0.0005307793617248535, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.33844682574272156, "reward_std": 0.7905219346284866, "rewards/cosine_scaled_reward": 0.023390088230371475, "rewards/format_reward": 0.29166667722165585, "step": 62 }, { "completion_length": 1956.7084350585938, "epoch": 0.072, "grad_norm": 0.26979929208755493, "kl": 0.006456255912780762, "learning_rate": 9.981479793771866e-07, "loss": 0.0003, "reward": 0.7974750846624374, "reward_std": 0.7315621674060822, "rewards/cosine_scaled_reward": 0.05498753860592842, "rewards/format_reward": 0.6875000149011612, "step": 63 }, { "completion_length": 2895.8958740234375, "epoch": 0.07314285714285715, "grad_norm": 0.14358438551425934, "kl": 0.00026297569274902344, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.6833263337612152, "reward_std": 0.5810272544622421, "rewards/cosine_scaled_reward": 0.12291315197944641, "rewards/format_reward": 0.4375000149011612, "step": 64 }, { "completion_length": 2582.1875610351562, "epoch": 0.07428571428571429, "grad_norm": 0.1719641089439392, "kl": 0.0006394386291503906, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.4943835213780403, "reward_std": 0.9102021306753159, "rewards/cosine_scaled_reward": -0.002808244898915291, "rewards/format_reward": 0.5000000074505806, "step": 65 }, { "completion_length": 3066.2709350585938, "epoch": 0.07542857142857143, "grad_norm": 0.15914808213710785, "kl": 0.00035455822944641113, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.270260289311409, "reward_std": 0.6933658719062805, "rewards/cosine_scaled_reward": -0.10445320140570402, "rewards/format_reward": 0.47916667722165585, "step": 66 }, { "completion_length": 3021.1666870117188, "epoch": 0.07657142857142857, "grad_norm": 0.15889614820480347, "kl": 0.0007028579711914062, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": 0.2754221335053444, "reward_std": 0.6702268719673157, "rewards/cosine_scaled_reward": -0.028955606278032064, "rewards/format_reward": 0.33333334140479565, "step": 67 }, { "completion_length": 2520.7291870117188, "epoch": 0.07771428571428571, "grad_norm": 0.25743117928504944, "kl": 0.001796722412109375, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": 0.2481890171766281, "reward_std": 0.4365886226296425, "rewards/cosine_scaled_reward": -0.06340551376342773, "rewards/format_reward": 0.375, "step": 68 }, { "completion_length": 2519.2500610351562, "epoch": 0.07885714285714286, "grad_norm": 0.1728557050228119, "kl": 0.0004031658172607422, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": 0.4316702373325825, "reward_std": 0.455346904695034, "rewards/cosine_scaled_reward": -0.02374822273850441, "rewards/format_reward": 0.4791666716337204, "step": 69 }, { "completion_length": 3165.7708740234375, "epoch": 0.08, "grad_norm": 0.13548843562602997, "kl": 0.0004048347473144531, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": 0.46206507831811905, "reward_std": 0.8794394135475159, "rewards/cosine_scaled_reward": -0.008550799917429686, "rewards/format_reward": 0.4791666716337204, "step": 70 }, { "completion_length": 2529.8751220703125, "epoch": 0.08114285714285714, "grad_norm": 0.22490063309669495, "kl": 0.003349781036376953, "learning_rate": 9.951725498333448e-07, "loss": 0.0001, "reward": 0.5197920426726341, "reward_std": 0.823016032576561, "rewards/cosine_scaled_reward": -0.010937327519059181, "rewards/format_reward": 0.5416666865348816, "step": 71 }, { "completion_length": 2654.4791870117188, "epoch": 0.08228571428571428, "grad_norm": 0.20955628156661987, "kl": 0.0012530684471130371, "learning_rate": 9.947027716509488e-07, "loss": 0.0001, "reward": 0.7414695173501968, "reward_std": 0.5663128644227982, "rewards/cosine_scaled_reward": 0.05823474656790495, "rewards/format_reward": 0.625, "step": 72 }, { "completion_length": 1640.041748046875, "epoch": 0.08342857142857144, "grad_norm": 0.2790416479110718, "kl": 0.005096435546875, "learning_rate": 9.942113192828444e-07, "loss": 0.0002, "reward": 0.91811203956604, "reward_std": 0.8141122311353683, "rewards/cosine_scaled_reward": 0.08405601512640715, "rewards/format_reward": 0.75, "step": 73 }, { "completion_length": 2470.5833740234375, "epoch": 0.08457142857142858, "grad_norm": 0.1816757470369339, "kl": 0.001283407211303711, "learning_rate": 9.93698216681727e-07, "loss": 0.0001, "reward": 0.595103541854769, "reward_std": 0.6585821881890297, "rewards/cosine_scaled_reward": 0.01630176231265068, "rewards/format_reward": 0.5625000074505806, "step": 74 }, { "completion_length": 2660.8750915527344, "epoch": 0.08571428571428572, "grad_norm": 0.2641614079475403, "kl": 0.0019817352294921875, "learning_rate": 9.931634888554935e-07, "loss": 0.0001, "reward": 0.2930721901357174, "reward_std": 0.7745417281985283, "rewards/cosine_scaled_reward": -0.06179725006222725, "rewards/format_reward": 0.4166666679084301, "step": 75 }, { "completion_length": 2939.604248046875, "epoch": 0.08685714285714285, "grad_norm": 0.3027961552143097, "kl": 0.0008873939514160156, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": 0.3248658664524555, "reward_std": 0.909210205078125, "rewards/cosine_scaled_reward": -0.014650408178567886, "rewards/format_reward": 0.3541666828095913, "step": 76 }, { "completion_length": 3152.1666870117188, "epoch": 0.088, "grad_norm": 0.16062307357788086, "kl": 0.001007080078125, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": 0.11907588690519333, "reward_std": 0.613786868751049, "rewards/cosine_scaled_reward": -0.06546205282211304, "rewards/format_reward": 0.25000000558793545, "step": 77 }, { "completion_length": 2635.041717529297, "epoch": 0.08914285714285715, "grad_norm": 0.19166067242622375, "kl": 0.0008903741836547852, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.5825019255280495, "reward_std": 0.7261854261159897, "rewards/cosine_scaled_reward": 0.06208430230617523, "rewards/format_reward": 0.4583333507180214, "step": 78 }, { "completion_length": 2755.5209350585938, "epoch": 0.09028571428571429, "grad_norm": 0.17263904213905334, "kl": 0.0004019737243652344, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.3458161875605583, "reward_std": 0.717200756072998, "rewards/cosine_scaled_reward": -0.056258589029312134, "rewards/format_reward": 0.4583333432674408, "step": 79 }, { "completion_length": 3565.4375, "epoch": 0.09142857142857143, "grad_norm": 0.1458665281534195, "kl": 0.00030177831649780273, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.254756236448884, "reward_std": 0.5783224925398827, "rewards/cosine_scaled_reward": -0.17946145310997963, "rewards/format_reward": 0.10416666977107525, "step": 80 }, { "completion_length": 3124.291748046875, "epoch": 0.09257142857142857, "grad_norm": 0.17081034183502197, "kl": 0.0008420944213867188, "learning_rate": 9.895025252503755e-07, "loss": 0.0, "reward": 0.06758889555931091, "reward_std": 0.7439121454954147, "rewards/cosine_scaled_reward": -0.10162222757935524, "rewards/format_reward": 0.27083333767950535, "step": 81 }, { "completion_length": 3004.0833740234375, "epoch": 0.09371428571428571, "grad_norm": 0.17911851406097412, "kl": 0.0006622076034545898, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": 0.3707614615559578, "reward_std": 0.8215866684913635, "rewards/cosine_scaled_reward": -0.0437859346420737, "rewards/format_reward": 0.45833334885537624, "step": 82 }, { "completion_length": 3440.3125, "epoch": 0.09485714285714286, "grad_norm": 0.15295840799808502, "kl": 0.00017774105072021484, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.39509791135787964, "reward_std": 0.5668186843395233, "rewards/cosine_scaled_reward": -0.260048970580101, "rewards/format_reward": 0.125, "step": 83 }, { "completion_length": 2530.7083740234375, "epoch": 0.096, "grad_norm": 0.2006414830684662, "kl": 0.0003807544708251953, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.7480560662224889, "reward_std": 1.0157663226127625, "rewards/cosine_scaled_reward": 0.0927780270576477, "rewards/format_reward": 0.5625000149011612, "step": 84 }, { "completion_length": 3185.8541870117188, "epoch": 0.09714285714285714, "grad_norm": 0.17822831869125366, "kl": 0.0009975433349609375, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": 0.07128806412220001, "reward_std": 0.8152596428990364, "rewards/cosine_scaled_reward": -0.08935598330572248, "rewards/format_reward": 0.2500000111758709, "step": 85 }, { "completion_length": 2827.5625610351562, "epoch": 0.09828571428571428, "grad_norm": 0.1663668006658554, "kl": 0.0010325908660888672, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.19359283335506916, "reward_std": 0.644717700779438, "rewards/cosine_scaled_reward": -0.10112026333808899, "rewards/format_reward": 0.3958333544433117, "step": 86 }, { "completion_length": 2942.5, "epoch": 0.09942857142857142, "grad_norm": 0.1894853264093399, "kl": 0.0014657974243164062, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": 0.1782783716917038, "reward_std": 0.7724725604057312, "rewards/cosine_scaled_reward": -0.08794412622228265, "rewards/format_reward": 0.3541666716337204, "step": 87 }, { "completion_length": 2836.3959350585938, "epoch": 0.10057142857142858, "grad_norm": 0.1908150315284729, "kl": 0.0014390945434570312, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.3470733240246773, "reward_std": 0.8534664362668991, "rewards/cosine_scaled_reward": -0.024379996582865715, "rewards/format_reward": 0.3958333432674408, "step": 88 }, { "completion_length": 3278.6041870117188, "epoch": 0.10171428571428572, "grad_norm": 0.1539601981639862, "kl": 0.0004895925521850586, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": 0.30410441011190414, "reward_std": 0.6761599257588387, "rewards/cosine_scaled_reward": 0.006218895316123962, "rewards/format_reward": 0.29166667722165585, "step": 89 }, { "completion_length": 2985.3958740234375, "epoch": 0.10285714285714286, "grad_norm": 0.17722909152507782, "kl": 0.00044274330139160156, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": 0.2143753319978714, "reward_std": 0.6936175674200058, "rewards/cosine_scaled_reward": -0.0594789981842041, "rewards/format_reward": 0.3333333358168602, "step": 90 }, { "completion_length": 3058.5000610351562, "epoch": 0.104, "grad_norm": 0.19192735850811005, "kl": 0.0004374980926513672, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": 0.302869388833642, "reward_std": 0.5636695921421051, "rewards/cosine_scaled_reward": -0.04648197069764137, "rewards/format_reward": 0.3958333432674408, "step": 91 }, { "completion_length": 2699.6250610351562, "epoch": 0.10514285714285715, "grad_norm": 0.17412729561328888, "kl": 0.0012140274047851562, "learning_rate": 9.807937738894303e-07, "loss": 0.0, "reward": 0.5564358681440353, "reward_std": 0.717531181871891, "rewards/cosine_scaled_reward": 0.059467924758791924, "rewards/format_reward": 0.4375000111758709, "step": 92 }, { "completion_length": 2346.1250610351562, "epoch": 0.10628571428571429, "grad_norm": 0.2216739058494568, "kl": 0.0015277862548828125, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": 0.5152877140790224, "reward_std": 0.6053595095872879, "rewards/cosine_scaled_reward": -0.0027728192508220673, "rewards/format_reward": 0.5208333358168602, "step": 93 }, { "completion_length": 3211.8333740234375, "epoch": 0.10742857142857143, "grad_norm": 0.18879717588424683, "kl": 0.000827789306640625, "learning_rate": 9.78935800506826e-07, "loss": 0.0, "reward": -0.015797210857272148, "reward_std": 0.735307015478611, "rewards/cosine_scaled_reward": -0.13289860635995865, "rewards/format_reward": 0.2500000037252903, "step": 94 }, { "completion_length": 3198.1459350585938, "epoch": 0.10857142857142857, "grad_norm": 0.15773996710777283, "kl": 0.00040841102600097656, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.016605263575911522, "reward_std": 0.7409057542681694, "rewards/cosine_scaled_reward": -0.16455264016985893, "rewards/format_reward": 0.3125000037252903, "step": 95 }, { "completion_length": 2382.2084350585938, "epoch": 0.10971428571428571, "grad_norm": 0.2195434868335724, "kl": 0.0015625953674316406, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": 0.3714839336462319, "reward_std": 0.5286353975534439, "rewards/cosine_scaled_reward": -0.08509137481451035, "rewards/format_reward": 0.5416666865348816, "step": 96 }, { "completion_length": 2969.479248046875, "epoch": 0.11085714285714286, "grad_norm": 0.22521458566188812, "kl": 0.0011968612670898438, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": 0.023297425359487534, "reward_std": 0.6409126222133636, "rewards/cosine_scaled_reward": -0.14460130035877228, "rewards/format_reward": 0.3125000111758709, "step": 97 }, { "completion_length": 2782.479248046875, "epoch": 0.112, "grad_norm": 0.48463404178619385, "kl": 0.01540231704711914, "learning_rate": 9.749693666068663e-07, "loss": 0.0006, "reward": 0.2789543569087982, "reward_std": 0.6075774282217026, "rewards/cosine_scaled_reward": -0.05843949131667614, "rewards/format_reward": 0.39583333395421505, "step": 98 }, { "completion_length": 2909.1458435058594, "epoch": 0.11314285714285714, "grad_norm": 0.19729964435100555, "kl": 0.000751495361328125, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": 0.3117349073290825, "reward_std": 0.5036360248923302, "rewards/cosine_scaled_reward": -0.010799217969179153, "rewards/format_reward": 0.33333333395421505, "step": 99 }, { "completion_length": 3006.9375610351562, "epoch": 0.11428571428571428, "grad_norm": 0.1744341254234314, "kl": 0.0009255409240722656, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.4609271613880992, "reward_std": 0.859523817896843, "rewards/cosine_scaled_reward": 0.022130253724753857, "rewards/format_reward": 0.416666679084301, "step": 100 }, { "completion_length": 2650.979217529297, "epoch": 0.11542857142857142, "grad_norm": 0.18743358552455902, "kl": 0.0010256767272949219, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": 0.21766437217593193, "reward_std": 0.6801646202802658, "rewards/cosine_scaled_reward": -0.09950115904211998, "rewards/format_reward": 0.41666667722165585, "step": 101 }, { "completion_length": 2554.1875610351562, "epoch": 0.11657142857142858, "grad_norm": 0.18099477887153625, "kl": 0.001209259033203125, "learning_rate": 9.706715543782064e-07, "loss": 0.0, "reward": 0.29797927755862474, "reward_std": 0.4223637208342552, "rewards/cosine_scaled_reward": -0.10101036727428436, "rewards/format_reward": 0.5, "step": 102 }, { "completion_length": 2658.979248046875, "epoch": 0.11771428571428572, "grad_norm": 0.15931963920593262, "kl": 0.0010652542114257812, "learning_rate": 9.695457105469804e-07, "loss": 0.0, "reward": 0.23173093050718307, "reward_std": 0.6561538353562355, "rewards/cosine_scaled_reward": -0.10288454219698906, "rewards/format_reward": 0.4375, "step": 103 }, { "completion_length": 2505.5833740234375, "epoch": 0.11885714285714286, "grad_norm": 0.18099236488342285, "kl": 0.0005369186401367188, "learning_rate": 9.683994186497132e-07, "loss": 0.0, "reward": 0.9495851993560791, "reward_std": 0.7366478592157364, "rewards/cosine_scaled_reward": 0.19354257080703974, "rewards/format_reward": 0.5625000149011612, "step": 104 }, { "completion_length": 2778.3125, "epoch": 0.12, "grad_norm": 0.20704622566699982, "kl": 0.0013036727905273438, "learning_rate": 9.672327345550543e-07, "loss": 0.0001, "reward": 0.24978191778063774, "reward_std": 0.7690765783190727, "rewards/cosine_scaled_reward": -0.09385904669761658, "rewards/format_reward": 0.4375000074505806, "step": 105 }, { "completion_length": 2914.6250610351562, "epoch": 0.12114285714285715, "grad_norm": 0.1921072155237198, "kl": 0.001560211181640625, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.3622821723110974, "reward_std": 0.912909746170044, "rewards/cosine_scaled_reward": -0.01677557732909918, "rewards/format_reward": 0.3958333395421505, "step": 106 }, { "completion_length": 2109.354248046875, "epoch": 0.12228571428571429, "grad_norm": 0.2092333436012268, "kl": 0.0012311935424804688, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.5422526616603136, "reward_std": 0.8620414137840271, "rewards/cosine_scaled_reward": -0.062207008711993694, "rewards/format_reward": 0.6666666865348816, "step": 107 }, { "completion_length": 2214.9583587646484, "epoch": 0.12342857142857143, "grad_norm": 0.2060472071170807, "kl": 0.0030279159545898438, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": 0.6110497042536736, "reward_std": 0.7519760131835938, "rewards/cosine_scaled_reward": 0.03469152469187975, "rewards/format_reward": 0.5416666716337204, "step": 108 }, { "completion_length": 2803.2083740234375, "epoch": 0.12457142857142857, "grad_norm": 0.20012950897216797, "kl": 0.0010724067687988281, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": 0.08136957883834839, "reward_std": 0.6037605702877045, "rewards/cosine_scaled_reward": -0.15723188465926796, "rewards/format_reward": 0.3958333395421505, "step": 109 }, { "completion_length": 2464.7500610351562, "epoch": 0.12571428571428572, "grad_norm": 0.22272928059101105, "kl": 0.0024871826171875, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, "reward": 0.9461969807744026, "reward_std": 0.842521145939827, "rewards/cosine_scaled_reward": 0.1710151496808976, "rewards/format_reward": 0.604166679084301, "step": 110 }, { "completion_length": 1819.3750610351562, "epoch": 0.12685714285714286, "grad_norm": 0.24292264878749847, "kl": 0.0017080307006835938, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": 0.7363898158073425, "reward_std": 0.7160477414727211, "rewards/cosine_scaled_reward": 0.003611571155488491, "rewards/format_reward": 0.7291666716337204, "step": 111 }, { "completion_length": 2938.8333740234375, "epoch": 0.128, "grad_norm": 0.20110748708248138, "kl": 0.0015575885772705078, "learning_rate": 9.58499865339809e-07, "loss": 0.0001, "reward": 0.47850653529167175, "reward_std": 0.8684659749269485, "rewards/cosine_scaled_reward": 0.020503249019384384, "rewards/format_reward": 0.4375000149011612, "step": 112 }, { "completion_length": 2170.791717529297, "epoch": 0.12914285714285714, "grad_norm": 0.22352100908756256, "kl": 0.0018672943115234375, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.8810203373432159, "reward_std": 0.6223750859498978, "rewards/cosine_scaled_reward": 0.09676016308367252, "rewards/format_reward": 0.6875000074505806, "step": 113 }, { "completion_length": 1761.1250457763672, "epoch": 0.13028571428571428, "grad_norm": 0.22250227630138397, "kl": 0.0011968612670898438, "learning_rate": 9.55824636882301e-07, "loss": 0.0, "reward": 0.8257871624082327, "reward_std": 0.5129944495856762, "rewards/cosine_scaled_reward": 0.08997690677642822, "rewards/format_reward": 0.6458333432674408, "step": 114 }, { "completion_length": 2406.6875610351562, "epoch": 0.13142857142857142, "grad_norm": 0.18063588440418243, "kl": 0.0014677047729492188, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": 0.7925823777914047, "reward_std": 1.05247762799263, "rewards/cosine_scaled_reward": 0.08379119075834751, "rewards/format_reward": 0.6250000149011612, "step": 115 }, { "completion_length": 2417.8958435058594, "epoch": 0.13257142857142856, "grad_norm": 0.23391500115394592, "kl": 0.004772186279296875, "learning_rate": 9.530702921077358e-07, "loss": 0.0002, "reward": 0.15583854354918003, "reward_std": 0.6483651623129845, "rewards/cosine_scaled_reward": -0.14083073096117005, "rewards/format_reward": 0.4375, "step": 116 }, { "completion_length": 3263.666748046875, "epoch": 0.1337142857142857, "grad_norm": 0.1551298350095749, "kl": 0.0015048980712890625, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.017866918817162514, "reward_std": 0.6443519741296768, "rewards/cosine_scaled_reward": -0.10268345987424254, "rewards/format_reward": 0.18750000558793545, "step": 117 }, { "completion_length": 2010.8750305175781, "epoch": 0.13485714285714287, "grad_norm": 0.21352525055408478, "kl": 0.001308441162109375, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.8165969103574753, "reward_std": 0.8080126643180847, "rewards/cosine_scaled_reward": 0.012465095147490501, "rewards/format_reward": 0.7916666865348816, "step": 118 }, { "completion_length": 1884.0625610351562, "epoch": 0.136, "grad_norm": 0.27621325850486755, "kl": 0.0025424957275390625, "learning_rate": 9.487916106540465e-07, "loss": 0.0001, "reward": 0.5952838063240051, "reward_std": 0.5625797361135483, "rewards/cosine_scaled_reward": -0.046108097303658724, "rewards/format_reward": 0.6875, "step": 119 }, { "completion_length": 2755.3959350585938, "epoch": 0.13714285714285715, "grad_norm": 0.23236262798309326, "kl": 0.0017757415771484375, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": 0.063610197044909, "reward_std": 0.8038829490542412, "rewards/cosine_scaled_reward": -0.18694491172209382, "rewards/format_reward": 0.4375000074505806, "step": 120 }, { "completion_length": 2782.5833740234375, "epoch": 0.1382857142857143, "grad_norm": 0.186203733086586, "kl": 0.0014486312866210938, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.2508644163608551, "reward_std": 0.5808881223201752, "rewards/cosine_scaled_reward": -0.09331781789660454, "rewards/format_reward": 0.4375000111758709, "step": 121 }, { "completion_length": 2584.0625915527344, "epoch": 0.13942857142857143, "grad_norm": 0.2748485803604126, "kl": 0.0027008056640625, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": 0.24610598012804985, "reward_std": 0.4119979292154312, "rewards/cosine_scaled_reward": -0.11653035134077072, "rewards/format_reward": 0.47916667722165585, "step": 122 }, { "completion_length": 2821.416748046875, "epoch": 0.14057142857142857, "grad_norm": 0.27934524416923523, "kl": 0.0057964324951171875, "learning_rate": 9.428149347714143e-07, "loss": 0.0002, "reward": 0.10106497257947922, "reward_std": 0.7212768346071243, "rewards/cosine_scaled_reward": -0.10571751650422812, "rewards/format_reward": 0.31250000186264515, "step": 123 }, { "completion_length": 2463.0000915527344, "epoch": 0.1417142857142857, "grad_norm": 0.22744229435920715, "kl": 0.0014429092407226562, "learning_rate": 9.412727182773486e-07, "loss": 0.0001, "reward": 0.8311970978975296, "reward_std": 0.8409435376524925, "rewards/cosine_scaled_reward": 0.11351519823074341, "rewards/format_reward": 0.6041666716337204, "step": 124 }, { "completion_length": 3028.6459350585938, "epoch": 0.14285714285714285, "grad_norm": 0.230963334441185, "kl": 0.0016689300537109375, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": 0.08886492438614368, "reward_std": 0.5733988359570503, "rewards/cosine_scaled_reward": -0.13265088573098183, "rewards/format_reward": 0.35416666977107525, "step": 125 }, { "completion_length": 1943.6667175292969, "epoch": 0.144, "grad_norm": 0.26326608657836914, "kl": 0.0029306411743164062, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": 0.6195674315094948, "reward_std": 0.7148094028234482, "rewards/cosine_scaled_reward": -0.03396627772599459, "rewards/format_reward": 0.6875000149011612, "step": 126 }, { "completion_length": 2635.9166870117188, "epoch": 0.14514285714285713, "grad_norm": 0.2009022980928421, "kl": 0.0012750625610351562, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": 0.41267674416303635, "reward_std": 0.8958253264427185, "rewards/cosine_scaled_reward": -0.022828295826911926, "rewards/format_reward": 0.4583333395421505, "step": 127 }, { "completion_length": 2321.2500610351562, "epoch": 0.1462857142857143, "grad_norm": 0.19144511222839355, "kl": 0.00170135498046875, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.5039072521030903, "reward_std": 0.8606824576854706, "rewards/cosine_scaled_reward": -0.029296381399035454, "rewards/format_reward": 0.5625000149011612, "step": 128 }, { "completion_length": 1837.7917175292969, "epoch": 0.14742857142857144, "grad_norm": 0.22352828085422516, "kl": 0.0069904327392578125, "learning_rate": 9.332771203643714e-07, "loss": 0.0003, "reward": 0.5292131304740906, "reward_std": 0.6835447549819946, "rewards/cosine_scaled_reward": -0.09997677942737937, "rewards/format_reward": 0.7291666716337204, "step": 129 }, { "completion_length": 2737.7918090820312, "epoch": 0.14857142857142858, "grad_norm": 0.21162550151348114, "kl": 0.00159454345703125, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": 0.1000329963862896, "reward_std": 0.6349897980690002, "rewards/cosine_scaled_reward": -0.15831685485318303, "rewards/format_reward": 0.416666679084301, "step": 130 }, { "completion_length": 2639.3750610351562, "epoch": 0.14971428571428572, "grad_norm": 0.21849536895751953, "kl": 0.001689910888671875, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.3575108479708433, "reward_std": 0.7335042506456375, "rewards/cosine_scaled_reward": -0.07124457694590092, "rewards/format_reward": 0.5000000055879354, "step": 131 }, { "completion_length": 2595.791748046875, "epoch": 0.15085714285714286, "grad_norm": 0.2819630801677704, "kl": 0.0028476715087890625, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": -0.14514993596822023, "reward_std": 0.4842909276485443, "rewards/cosine_scaled_reward": -0.2704916410148144, "rewards/format_reward": 0.3958333432674408, "step": 132 }, { "completion_length": 2752.2084350585938, "epoch": 0.152, "grad_norm": 0.20234017074108124, "kl": 0.002986907958984375, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": 0.15741661936044693, "reward_std": 0.6936222016811371, "rewards/cosine_scaled_reward": -0.15045835822820663, "rewards/format_reward": 0.4583333507180214, "step": 133 }, { "completion_length": 1920.0000610351562, "epoch": 0.15314285714285714, "grad_norm": 0.30035078525543213, "kl": 0.00372314453125, "learning_rate": 9.248145583195447e-07, "loss": 0.0001, "reward": 0.7273320555686951, "reward_std": 0.7046244740486145, "rewards/cosine_scaled_reward": 0.01991601102054119, "rewards/format_reward": 0.6875, "step": 134 }, { "completion_length": 2167.8334350585938, "epoch": 0.15428571428571428, "grad_norm": 0.2100658118724823, "kl": 0.0017566680908203125, "learning_rate": 9.230669076497687e-07, "loss": 0.0001, "reward": 0.4464118145406246, "reward_std": 0.3928603231906891, "rewards/cosine_scaled_reward": -0.03721076436340809, "rewards/format_reward": 0.520833333954215, "step": 135 }, { "completion_length": 2357.2084350585938, "epoch": 0.15542857142857142, "grad_norm": 0.24143747985363007, "kl": 0.00250244140625, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": 0.6343938559293747, "reward_std": 0.7614049315452576, "rewards/cosine_scaled_reward": 0.004696924239397049, "rewards/format_reward": 0.6250000149011612, "step": 136 }, { "completion_length": 2482.1875610351562, "epoch": 0.15657142857142858, "grad_norm": 0.22769631445407867, "kl": 0.003353118896484375, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": 0.09337181597948074, "reward_std": 0.6429625153541565, "rewards/cosine_scaled_reward": -0.18248076736927032, "rewards/format_reward": 0.4583333395421505, "step": 137 }, { "completion_length": 2043.5000610351562, "epoch": 0.15771428571428572, "grad_norm": 0.2516387403011322, "kl": 0.003204345703125, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.9434101283550262, "reward_std": 0.8629068434238434, "rewards/cosine_scaled_reward": 0.13837172836065292, "rewards/format_reward": 0.6666666865348816, "step": 138 }, { "completion_length": 2047.5625, "epoch": 0.15885714285714286, "grad_norm": 0.2453654259443283, "kl": 0.0018482208251953125, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": 0.7086100317537785, "reward_std": 0.5908297449350357, "rewards/cosine_scaled_reward": 0.00013833213597536087, "rewards/format_reward": 0.7083333507180214, "step": 139 }, { "completion_length": 2375.7084350585938, "epoch": 0.16, "grad_norm": 0.1979781836271286, "kl": 0.0024309158325195312, "learning_rate": 9.140576474687263e-07, "loss": 0.0001, "reward": 0.619663898833096, "reward_std": 0.41604873538017273, "rewards/cosine_scaled_reward": 0.007748600095510483, "rewards/format_reward": 0.6041666716337204, "step": 140 }, { "completion_length": 2381.7708740234375, "epoch": 0.16114285714285714, "grad_norm": 0.23859204351902008, "kl": 0.003826141357421875, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": 0.3829444032162428, "reward_std": 0.5856426432728767, "rewards/cosine_scaled_reward": -0.05852780118584633, "rewards/format_reward": 0.5000000149011612, "step": 141 }, { "completion_length": 2265.4584045410156, "epoch": 0.16228571428571428, "grad_norm": 0.26838216185569763, "kl": 0.0041828155517578125, "learning_rate": 9.103291169269299e-07, "loss": 0.0002, "reward": 0.459966566413641, "reward_std": 0.5846913754940033, "rewards/cosine_scaled_reward": -0.030433382838964462, "rewards/format_reward": 0.5208333358168602, "step": 142 }, { "completion_length": 2539.479248046875, "epoch": 0.16342857142857142, "grad_norm": 0.18913578987121582, "kl": 0.0029649734497070312, "learning_rate": 9.084384631108882e-07, "loss": 0.0001, "reward": 0.42852520011365414, "reward_std": 0.6579816788434982, "rewards/cosine_scaled_reward": -0.05657072924077511, "rewards/format_reward": 0.5416666865348816, "step": 143 }, { "completion_length": 2627.8125610351562, "epoch": 0.16457142857142856, "grad_norm": 0.1790352761745453, "kl": 0.0033435821533203125, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": 0.7180662602186203, "reward_std": 0.9851722121238708, "rewards/cosine_scaled_reward": 0.05694979056715965, "rewards/format_reward": 0.6041666939854622, "step": 144 }, { "completion_length": 2526.541778564453, "epoch": 0.1657142857142857, "grad_norm": 0.22108972072601318, "kl": 0.0050811767578125, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.40608268324285746, "reward_std": 0.8329223841428757, "rewards/cosine_scaled_reward": -0.0573753397911787, "rewards/format_reward": 0.5208333432674408, "step": 145 }, { "completion_length": 2217.0625610351562, "epoch": 0.16685714285714287, "grad_norm": 0.37226402759552, "kl": 0.00292205810546875, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.29790709912776947, "reward_std": 0.6913768872618675, "rewards/cosine_scaled_reward": -0.19479646161198616, "rewards/format_reward": 0.6875000149011612, "step": 146 }, { "completion_length": 2100.500030517578, "epoch": 0.168, "grad_norm": 0.1791330724954605, "kl": 0.0022411346435546875, "learning_rate": 9.007020842191634e-07, "loss": 0.0001, "reward": 0.6166809126734734, "reward_std": 0.7499666661024094, "rewards/cosine_scaled_reward": -0.014576207846403122, "rewards/format_reward": 0.6458333432674408, "step": 147 }, { "completion_length": 1953.1667022705078, "epoch": 0.16914285714285715, "grad_norm": 0.26837047934532166, "kl": 0.0032806396484375, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.22773092985153198, "reward_std": 0.5684618726372719, "rewards/cosine_scaled_reward": -0.21946788486093283, "rewards/format_reward": 0.6666666679084301, "step": 148 }, { "completion_length": 2537.8958740234375, "epoch": 0.1702857142857143, "grad_norm": 0.19118516147136688, "kl": 0.0032138824462890625, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": 0.9459018707275391, "reward_std": 0.6409400217235088, "rewards/cosine_scaled_reward": 0.18128425255417824, "rewards/format_reward": 0.5833333488553762, "step": 149 }, { "completion_length": 2327.9375610351562, "epoch": 0.17142857142857143, "grad_norm": 0.22041891515254974, "kl": 0.004886627197265625, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.2033998966217041, "reward_std": 0.6696746721863747, "rewards/cosine_scaled_reward": -0.1587167321704328, "rewards/format_reward": 0.5208333488553762, "step": 150 }, { "completion_length": 2248.9375610351562, "epoch": 0.17257142857142857, "grad_norm": 0.20176808536052704, "kl": 0.0022144317626953125, "learning_rate": 8.926922383915315e-07, "loss": 0.0001, "reward": 0.2744840234518051, "reward_std": 0.46313488483428955, "rewards/cosine_scaled_reward": -0.11275799572467804, "rewards/format_reward": 0.5000000149011612, "step": 151 }, { "completion_length": 2933.0833740234375, "epoch": 0.1737142857142857, "grad_norm": 0.26598456501960754, "kl": 0.005481719970703125, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": 0.04549443535506725, "reward_std": 0.48727741837501526, "rewards/cosine_scaled_reward": -0.13350277952849865, "rewards/format_reward": 0.3125000074505806, "step": 152 }, { "completion_length": 2075.729278564453, "epoch": 0.17485714285714285, "grad_norm": 0.19798634946346283, "kl": 0.0023260116577148438, "learning_rate": 8.88586709003076e-07, "loss": 0.0001, "reward": 0.9074295610189438, "reward_std": 0.5949664637446404, "rewards/cosine_scaled_reward": 0.09954808466136456, "rewards/format_reward": 0.7083333432674408, "step": 153 }, { "completion_length": 2754.791748046875, "epoch": 0.176, "grad_norm": 0.19806884229183197, "kl": 0.0027751922607421875, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.6114984676241875, "reward_std": 0.6458263993263245, "rewards/cosine_scaled_reward": 0.08699923381209373, "rewards/format_reward": 0.4375000149011612, "step": 154 }, { "completion_length": 3066.354248046875, "epoch": 0.17714285714285713, "grad_norm": 0.29352447390556335, "kl": 0.00366973876953125, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.017834719270467758, "reward_std": 0.8173775672912598, "rewards/cosine_scaled_reward": -0.14733264222741127, "rewards/format_reward": 0.31250000558793545, "step": 155 }, { "completion_length": 2754.4375610351562, "epoch": 0.1782857142857143, "grad_norm": 0.19924919307231903, "kl": 0.005084991455078125, "learning_rate": 8.823049032816478e-07, "loss": 0.0002, "reward": 0.06147514842450619, "reward_std": 0.5927468463778496, "rewards/cosine_scaled_reward": -0.14634575322270393, "rewards/format_reward": 0.3541666828095913, "step": 156 }, { "completion_length": 2206.250030517578, "epoch": 0.17942857142857144, "grad_norm": 0.23515692353248596, "kl": 0.0025005340576171875, "learning_rate": 8.801784390262943e-07, "loss": 0.0001, "reward": 0.9370372518897057, "reward_std": 0.8283505141735077, "rewards/cosine_scaled_reward": 0.1456019375473261, "rewards/format_reward": 0.6458333507180214, "step": 157 }, { "completion_length": 2862.7708740234375, "epoch": 0.18057142857142858, "grad_norm": 0.19533918797969818, "kl": 0.004573822021484375, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": -0.053055196069180965, "reward_std": 0.6198497340083122, "rewards/cosine_scaled_reward": -0.23486093431711197, "rewards/format_reward": 0.41666667349636555, "step": 158 }, { "completion_length": 1894.8542175292969, "epoch": 0.18171428571428572, "grad_norm": 0.22211147844791412, "kl": 0.002826690673828125, "learning_rate": 8.758773376468604e-07, "loss": 0.0001, "reward": 1.2009564340114594, "reward_std": 0.7813936918973923, "rewards/cosine_scaled_reward": 0.1942282197996974, "rewards/format_reward": 0.8125000149011612, "step": 159 }, { "completion_length": 1964.9792175292969, "epoch": 0.18285714285714286, "grad_norm": 0.21944580972194672, "kl": 0.003116607666015625, "learning_rate": 8.737029101523929e-07, "loss": 0.0001, "reward": 0.9964812844991684, "reward_std": 0.7849611788988113, "rewards/cosine_scaled_reward": 0.10240732878446579, "rewards/format_reward": 0.7916666716337204, "step": 160 }, { "completion_length": 2853.4583740234375, "epoch": 0.184, "grad_norm": 0.1943131983280182, "kl": 0.00357818603515625, "learning_rate": 8.715127058347614e-07, "loss": 0.0001, "reward": 0.10683573782444, "reward_std": 0.6206659823656082, "rewards/cosine_scaled_reward": -0.1549154706299305, "rewards/format_reward": 0.41666667722165585, "step": 161 }, { "completion_length": 2725.604248046875, "epoch": 0.18514285714285714, "grad_norm": 0.18736310303211212, "kl": 0.00328826904296875, "learning_rate": 8.693068314414344e-07, "loss": 0.0001, "reward": 0.16440774500370026, "reward_std": 0.7531605362892151, "rewards/cosine_scaled_reward": -0.1782128056511283, "rewards/format_reward": 0.5208333507180214, "step": 162 }, { "completion_length": 2378.604248046875, "epoch": 0.18628571428571428, "grad_norm": 0.27985262870788574, "kl": 0.00499725341796875, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.3438632491452154, "reward_std": 0.8545256406068802, "rewards/cosine_scaled_reward": -0.06765171512961388, "rewards/format_reward": 0.479166679084301, "step": 163 }, { "completion_length": 2041.791748046875, "epoch": 0.18742857142857142, "grad_norm": 0.2726307511329651, "kl": 0.00522613525390625, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.1412298008799553, "reward_std": 0.45817675441503525, "rewards/cosine_scaled_reward": -0.18980177072808146, "rewards/format_reward": 0.520833333954215, "step": 164 }, { "completion_length": 2056.8958740234375, "epoch": 0.18857142857142858, "grad_norm": 0.24950121343135834, "kl": 0.0033512115478515625, "learning_rate": 8.625962667065487e-07, "loss": 0.0001, "reward": 0.4896044433116913, "reward_std": 0.6808345168828964, "rewards/cosine_scaled_reward": -0.057281110901385546, "rewards/format_reward": 0.6041666828095913, "step": 165 }, { "completion_length": 1701.5625, "epoch": 0.18971428571428572, "grad_norm": 0.20629195868968964, "kl": 0.0029201507568359375, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": 0.4639076357707381, "reward_std": 0.4746507927775383, "rewards/cosine_scaled_reward": -0.13262954354286194, "rewards/format_reward": 0.7291666716337204, "step": 166 }, { "completion_length": 2674.4375610351562, "epoch": 0.19085714285714286, "grad_norm": 0.19312238693237305, "kl": 0.004119873046875, "learning_rate": 8.580461976679099e-07, "loss": 0.0002, "reward": 0.26991652697324753, "reward_std": 0.8362310528755188, "rewards/cosine_scaled_reward": -0.11504174256697297, "rewards/format_reward": 0.5000000111758709, "step": 167 }, { "completion_length": 1723.5209045410156, "epoch": 0.192, "grad_norm": 0.19439440965652466, "kl": 0.002704620361328125, "learning_rate": 8.557485869176825e-07, "loss": 0.0001, "reward": 0.7088751941919327, "reward_std": 0.7652025148272514, "rewards/cosine_scaled_reward": -0.051812431775033474, "rewards/format_reward": 0.8125000149011612, "step": 168 }, { "completion_length": 2231.979248046875, "epoch": 0.19314285714285714, "grad_norm": 0.2904442250728607, "kl": 0.004784584045410156, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.9261031150817871, "reward_std": 0.9859992563724518, "rewards/cosine_scaled_reward": 0.12971824035048485, "rewards/format_reward": 0.6666666939854622, "step": 169 }, { "completion_length": 2114.0208740234375, "epoch": 0.19428571428571428, "grad_norm": 0.19766280055046082, "kl": 0.003971099853515625, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.3019937239587307, "reward_std": 0.6615323200821877, "rewards/cosine_scaled_reward": -0.1510864682495594, "rewards/format_reward": 0.6041666865348816, "step": 170 }, { "completion_length": 1614.7708435058594, "epoch": 0.19542857142857142, "grad_norm": 0.23038722574710846, "kl": 0.0032196044921875, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.5918858665972948, "reward_std": 0.47077811881899834, "rewards/cosine_scaled_reward": -0.09989039599895477, "rewards/format_reward": 0.7916666716337204, "step": 171 }, { "completion_length": 1724.5833740234375, "epoch": 0.19657142857142856, "grad_norm": 0.2515551447868347, "kl": 0.00432586669921875, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.8076295026112348, "reward_std": 0.5722271054983139, "rewards/cosine_scaled_reward": 0.04964808002114296, "rewards/format_reward": 0.7083333432674408, "step": 172 }, { "completion_length": 2433.4375610351562, "epoch": 0.1977142857142857, "grad_norm": 0.3010346591472626, "kl": 0.004909515380859375, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.34551432851003483, "reward_std": 0.7096427381038666, "rewards/cosine_scaled_reward": -0.08765951948589645, "rewards/format_reward": 0.5208333488553762, "step": 173 }, { "completion_length": 2419.979217529297, "epoch": 0.19885714285714284, "grad_norm": 0.19969363510608673, "kl": 0.005977630615234375, "learning_rate": 8.416539554784089e-07, "loss": 0.0002, "reward": 0.27888998575508595, "reward_std": 0.5231342613697052, "rewards/cosine_scaled_reward": -0.1001383513212204, "rewards/format_reward": 0.4791666716337204, "step": 174 }, { "completion_length": 2777.1250610351562, "epoch": 0.2, "grad_norm": 0.17995108664035797, "kl": 0.0071868896484375, "learning_rate": 8.392544243589427e-07, "loss": 0.0003, "reward": 0.6402685008943081, "reward_std": 0.7186409756541252, "rewards/cosine_scaled_reward": 0.0909675620496273, "rewards/format_reward": 0.4583333395421505, "step": 175 }, { "completion_length": 2093.25, "epoch": 0.20114285714285715, "grad_norm": 0.20400448143482208, "kl": 0.00519561767578125, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.5985848978161812, "reward_std": 0.7769260108470917, "rewards/cosine_scaled_reward": -0.03404088690876961, "rewards/format_reward": 0.6666666716337204, "step": 176 }, { "completion_length": 2018.1875915527344, "epoch": 0.2022857142857143, "grad_norm": 0.21771669387817383, "kl": 0.004276275634765625, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.5329161509871483, "reward_std": 0.5947398841381073, "rewards/cosine_scaled_reward": -0.0772919338196516, "rewards/format_reward": 0.6875000149011612, "step": 177 }, { "completion_length": 1553.2083892822266, "epoch": 0.20342857142857143, "grad_norm": 0.2806382477283478, "kl": 0.00424957275390625, "learning_rate": 8.319717151140072e-07, "loss": 0.0002, "reward": 0.9667995385825634, "reward_std": 0.4322159215807915, "rewards/cosine_scaled_reward": 0.09798309206962585, "rewards/format_reward": 0.770833358168602, "step": 178 }, { "completion_length": 2260.9584350585938, "epoch": 0.20457142857142857, "grad_norm": 0.19229480624198914, "kl": 0.00655364990234375, "learning_rate": 8.295165011252396e-07, "loss": 0.0003, "reward": 0.6450915709137917, "reward_std": 0.6193302199244499, "rewards/cosine_scaled_reward": 0.030879119411110878, "rewards/format_reward": 0.583333358168602, "step": 179 }, { "completion_length": 2204.104248046875, "epoch": 0.2057142857142857, "grad_norm": 0.34143543243408203, "kl": 0.00551605224609375, "learning_rate": 8.270476638965461e-07, "loss": 0.0002, "reward": 0.49652543663978577, "reward_std": 0.9920015186071396, "rewards/cosine_scaled_reward": -0.06423728261142969, "rewards/format_reward": 0.6250000111758709, "step": 180 }, { "completion_length": 2376.3333740234375, "epoch": 0.20685714285714285, "grad_norm": 0.1844940185546875, "kl": 0.002460479736328125, "learning_rate": 8.245653237555705e-07, "loss": 0.0001, "reward": 0.7340436186641455, "reward_std": 0.7672436386346817, "rewards/cosine_scaled_reward": 0.033688463270664215, "rewards/format_reward": 0.6666666865348816, "step": 181 }, { "completion_length": 1781.0208435058594, "epoch": 0.208, "grad_norm": 0.27145451307296753, "kl": 0.005458831787109375, "learning_rate": 8.220696016880687e-07, "loss": 0.0002, "reward": 0.6259329319000244, "reward_std": 0.7968147397041321, "rewards/cosine_scaled_reward": -0.062033540569245815, "rewards/format_reward": 0.7500000111758709, "step": 182 }, { "completion_length": 2343.8959045410156, "epoch": 0.20914285714285713, "grad_norm": 0.2297639399766922, "kl": 0.00566864013671875, "learning_rate": 8.195606193320136e-07, "loss": 0.0002, "reward": 0.11410272493958473, "reward_std": 0.5572097525000572, "rewards/cosine_scaled_reward": -0.2658653110265732, "rewards/format_reward": 0.6458333432674408, "step": 183 }, { "completion_length": 1841.2292175292969, "epoch": 0.2102857142857143, "grad_norm": 0.2628481388092041, "kl": 0.003875732421875, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": 1.0498279109597206, "reward_std": 0.812163732945919, "rewards/cosine_scaled_reward": 0.1394973020069301, "rewards/format_reward": 0.770833358168602, "step": 184 }, { "completion_length": 2690.666748046875, "epoch": 0.21142857142857144, "grad_norm": 0.2707008719444275, "kl": 0.006504058837890625, "learning_rate": 8.145033635316128e-07, "loss": 0.0003, "reward": 0.12935106456279755, "reward_std": 0.6062737256288528, "rewards/cosine_scaled_reward": -0.15407447703182697, "rewards/format_reward": 0.4375000149011612, "step": 185 }, { "completion_length": 2648.7083740234375, "epoch": 0.21257142857142858, "grad_norm": 0.277004599571228, "kl": 0.00493621826171875, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": 0.3933283071964979, "reward_std": 0.6029615625739098, "rewards/cosine_scaled_reward": -0.07416917383670807, "rewards/format_reward": 0.5416666865348816, "step": 186 }, { "completion_length": 2165.416778564453, "epoch": 0.21371428571428572, "grad_norm": 0.2298295795917511, "kl": 0.005840301513671875, "learning_rate": 8.093945422764069e-07, "loss": 0.0002, "reward": 0.7806095313280821, "reward_std": 0.7954358160495758, "rewards/cosine_scaled_reward": 0.025721419602632523, "rewards/format_reward": 0.7291666865348816, "step": 187 }, { "completion_length": 1475.4167022705078, "epoch": 0.21485714285714286, "grad_norm": 0.24691948294639587, "kl": 0.005809783935546875, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": 1.0945345759391785, "reward_std": 0.8786479085683823, "rewards/cosine_scaled_reward": 0.10976729169487953, "rewards/format_reward": 0.8750000149011612, "step": 188 }, { "completion_length": 1539.8333740234375, "epoch": 0.216, "grad_norm": 0.27775290608406067, "kl": 0.0064697265625, "learning_rate": 8.04235151541222e-07, "loss": 0.0003, "reward": 0.6156105473637581, "reward_std": 0.7454669773578644, "rewards/cosine_scaled_reward": -0.119278060272336, "rewards/format_reward": 0.8541666865348816, "step": 189 }, { "completion_length": 2440.979278564453, "epoch": 0.21714285714285714, "grad_norm": 0.22604604065418243, "kl": 0.00592041015625, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.4239200847223401, "reward_std": 0.8845669329166412, "rewards/cosine_scaled_reward": -0.038039978593587875, "rewards/format_reward": 0.500000013038516, "step": 190 }, { "completion_length": 1904.729248046875, "epoch": 0.21828571428571428, "grad_norm": 0.2662159204483032, "kl": 0.00586700439453125, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.4985937252640724, "reward_std": 0.7315638810396194, "rewards/cosine_scaled_reward": -0.09445315971970558, "rewards/format_reward": 0.6875000074505806, "step": 191 }, { "completion_length": 2417.1875610351562, "epoch": 0.21942857142857142, "grad_norm": 0.27427810430526733, "kl": 0.009246826171875, "learning_rate": 7.964034505716476e-07, "loss": 0.0004, "reward": 0.3124541025608778, "reward_std": 0.6425688564777374, "rewards/cosine_scaled_reward": -0.12502295151352882, "rewards/format_reward": 0.5625000074505806, "step": 192 }, { "completion_length": 1865.0208740234375, "epoch": 0.22057142857142858, "grad_norm": 0.2562018930912018, "kl": 0.01080322265625, "learning_rate": 7.93768694627233e-07, "loss": 0.0004, "reward": 0.5529625415802002, "reward_std": 0.5897716134786606, "rewards/cosine_scaled_reward": -0.0985187292098999, "rewards/format_reward": 0.7500000149011612, "step": 193 }, { "completion_length": 1062.4375305175781, "epoch": 0.22171428571428572, "grad_norm": 0.22855594754219055, "kl": 0.0037994384765625, "learning_rate": 7.911220577405484e-07, "loss": 0.0002, "reward": 1.7262530326843262, "reward_std": 0.826399639248848, "rewards/cosine_scaled_reward": 0.3631264716386795, "rewards/format_reward": 1.0, "step": 194 }, { "completion_length": 1257.5625305175781, "epoch": 0.22285714285714286, "grad_norm": 0.26371893286705017, "kl": 0.00643157958984375, "learning_rate": 7.884636689049422e-07, "loss": 0.0003, "reward": 1.151860922574997, "reward_std": 0.6702793166041374, "rewards/cosine_scaled_reward": 0.12801377475261688, "rewards/format_reward": 0.8958333432674408, "step": 195 }, { "completion_length": 2729.5000610351562, "epoch": 0.224, "grad_norm": 0.2403058111667633, "kl": 0.00676727294921875, "learning_rate": 7.857936576865356e-07, "loss": 0.0003, "reward": 0.26913030445575714, "reward_std": 0.6797884181141853, "rewards/cosine_scaled_reward": -0.08418486639857292, "rewards/format_reward": 0.4375, "step": 196 }, { "completion_length": 2123.791717529297, "epoch": 0.22514285714285714, "grad_norm": 0.22864989936351776, "kl": 0.006134033203125, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": 0.06347193196415901, "reward_std": 0.40899810940027237, "rewards/cosine_scaled_reward": -0.24951404333114624, "rewards/format_reward": 0.5625000149011612, "step": 197 }, { "completion_length": 2316.791748046875, "epoch": 0.22628571428571428, "grad_norm": 0.2166266292333603, "kl": 0.00603485107421875, "learning_rate": 7.804192891917571e-07, "loss": 0.0002, "reward": 0.4236091636121273, "reward_std": 0.794644683599472, "rewards/cosine_scaled_reward": -0.1215287372469902, "rewards/format_reward": 0.6666666865348816, "step": 198 }, { "completion_length": 2175.5416870117188, "epoch": 0.22742857142857142, "grad_norm": 0.2332044243812561, "kl": 0.005603790283203125, "learning_rate": 7.777151938545235e-07, "loss": 0.0002, "reward": 1.2629163265228271, "reward_std": 0.7567542046308517, "rewards/cosine_scaled_reward": 0.2252081297338009, "rewards/format_reward": 0.8125000298023224, "step": 199 }, { "completion_length": 1545.8125305175781, "epoch": 0.22857142857142856, "grad_norm": 0.3451651632785797, "kl": 0.006610870361328125, "learning_rate": 7.75e-07, "loss": 0.0003, "reward": 1.1884014122188091, "reward_std": 0.868816927075386, "rewards/cosine_scaled_reward": 0.1983673730865121, "rewards/format_reward": 0.7916666716337204, "step": 200 }, { "completion_length": 1524.0625610351562, "epoch": 0.2297142857142857, "grad_norm": 0.21861064434051514, "kl": 0.0051116943359375, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 0.675473814830184, "reward_std": 0.6859661787748337, "rewards/cosine_scaled_reward": -0.07892975211143494, "rewards/format_reward": 0.8333333432674408, "step": 201 }, { "completion_length": 1321.3959045410156, "epoch": 0.23085714285714284, "grad_norm": 0.24629908800125122, "kl": 0.007568359375, "learning_rate": 7.695368466124296e-07, "loss": 0.0003, "reward": 0.9415311962366104, "reward_std": 0.7775374501943588, "rewards/cosine_scaled_reward": 0.02284892648458481, "rewards/format_reward": 0.8958333432674408, "step": 202 }, { "completion_length": 1379.7709045410156, "epoch": 0.232, "grad_norm": 0.27627113461494446, "kl": 0.00753021240234375, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 1.1194992661476135, "reward_std": 0.7730197608470917, "rewards/cosine_scaled_reward": 0.09099959582090378, "rewards/format_reward": 0.9375000149011612, "step": 203 }, { "completion_length": 1120.1667022705078, "epoch": 0.23314285714285715, "grad_norm": 0.26729604601860046, "kl": 0.00640869140625, "learning_rate": 7.640308940816239e-07, "loss": 0.0003, "reward": 1.2363095879554749, "reward_std": 0.8477204591035843, "rewards/cosine_scaled_reward": 0.15982142463326454, "rewards/format_reward": 0.9166666716337204, "step": 204 }, { "completion_length": 1971.2291870117188, "epoch": 0.2342857142857143, "grad_norm": 0.2195984125137329, "kl": 0.00698089599609375, "learning_rate": 7.612622032536507e-07, "loss": 0.0003, "reward": 0.8974205702543259, "reward_std": 0.8895229697227478, "rewards/cosine_scaled_reward": 0.052876945585012436, "rewards/format_reward": 0.7916666716337204, "step": 205 }, { "completion_length": 2061.3334350585938, "epoch": 0.23542857142857143, "grad_norm": 0.268889844417572, "kl": 0.007232666015625, "learning_rate": 7.584832158039378e-07, "loss": 0.0003, "reward": 0.2433365173637867, "reward_std": 0.5611164793372154, "rewards/cosine_scaled_reward": -0.2324984148144722, "rewards/format_reward": 0.708333358168602, "step": 206 }, { "completion_length": 1904.5000305175781, "epoch": 0.23657142857142857, "grad_norm": 0.19144296646118164, "kl": 0.00501251220703125, "learning_rate": 7.556940671764124e-07, "loss": 0.0002, "reward": 0.9468748420476913, "reward_std": 0.6613385528326035, "rewards/cosine_scaled_reward": 0.05677075684070587, "rewards/format_reward": 0.8333333432674408, "step": 207 }, { "completion_length": 1996.8125610351562, "epoch": 0.2377142857142857, "grad_norm": 0.29941245913505554, "kl": 0.00930023193359375, "learning_rate": 7.528948933102438e-07, "loss": 0.0004, "reward": 0.29106441140174866, "reward_std": 0.6116437911987305, "rewards/cosine_scaled_reward": -0.17738447710871696, "rewards/format_reward": 0.6458333544433117, "step": 208 }, { "completion_length": 1285.9166870117188, "epoch": 0.23885714285714285, "grad_norm": 0.3566973805427551, "kl": 0.00695037841796875, "learning_rate": 7.500858306332172e-07, "loss": 0.0003, "reward": 0.5402148813009262, "reward_std": 0.7111145555973053, "rewards/cosine_scaled_reward": -0.14655922167003155, "rewards/format_reward": 0.8333333432674408, "step": 209 }, { "completion_length": 1275.5000305175781, "epoch": 0.24, "grad_norm": 0.2605917155742645, "kl": 0.006221771240234375, "learning_rate": 7.472670160550848e-07, "loss": 0.0002, "reward": 1.3301078528165817, "reward_std": 0.8438884019851685, "rewards/cosine_scaled_reward": 0.18588725943118334, "rewards/format_reward": 0.9583333432674408, "step": 210 }, { "completion_length": 2328.6251220703125, "epoch": 0.24114285714285713, "grad_norm": 0.1887713074684143, "kl": 0.0092010498046875, "learning_rate": 7.444385869608921e-07, "loss": 0.0004, "reward": 0.6505604535341263, "reward_std": 0.5875601321458817, "rewards/cosine_scaled_reward": -0.008053132332861423, "rewards/format_reward": 0.6666666865348816, "step": 211 }, { "completion_length": 1282.166732788086, "epoch": 0.2422857142857143, "grad_norm": 0.2532815635204315, "kl": 0.0076446533203125, "learning_rate": 7.416006812042827e-07, "loss": 0.0003, "reward": 1.1198171079158783, "reward_std": 0.8018105626106262, "rewards/cosine_scaled_reward": 0.11199186649173498, "rewards/format_reward": 0.8958333432674408, "step": 212 }, { "completion_length": 1855.0834045410156, "epoch": 0.24342857142857144, "grad_norm": 0.2252466082572937, "kl": 0.0081024169921875, "learning_rate": 7.387534371007797e-07, "loss": 0.0003, "reward": 0.2633536756038666, "reward_std": 0.42707760632038116, "rewards/cosine_scaled_reward": -0.2224898338317871, "rewards/format_reward": 0.7083333432674408, "step": 213 }, { "completion_length": 1971.8959045410156, "epoch": 0.24457142857142858, "grad_norm": 0.1935078501701355, "kl": 0.004627227783203125, "learning_rate": 7.358969934210438e-07, "loss": 0.0002, "reward": 0.7197396508418024, "reward_std": 0.6319501101970673, "rewards/cosine_scaled_reward": -0.056796859949827194, "rewards/format_reward": 0.8333333432674408, "step": 214 }, { "completion_length": 1413.3541870117188, "epoch": 0.24571428571428572, "grad_norm": 0.2069859504699707, "kl": 0.00742340087890625, "learning_rate": 7.330314893841101e-07, "loss": 0.0003, "reward": 1.0066201090812683, "reward_std": 0.9675450921058655, "rewards/cosine_scaled_reward": 0.03456003498286009, "rewards/format_reward": 0.9375000149011612, "step": 215 }, { "completion_length": 2199.2083435058594, "epoch": 0.24685714285714286, "grad_norm": 0.2034756988286972, "kl": 0.00646209716796875, "learning_rate": 7.301570646506027e-07, "loss": 0.0003, "reward": 0.47810695320367813, "reward_std": 0.7864377945661545, "rewards/cosine_scaled_reward": -0.12552986666560173, "rewards/format_reward": 0.7291666865348816, "step": 216 }, { "completion_length": 1527.8958740234375, "epoch": 0.248, "grad_norm": 0.2671460807323456, "kl": 0.006622314453125, "learning_rate": 7.27273859315928e-07, "loss": 0.0003, "reward": 0.7772237807512283, "reward_std": 0.6489354968070984, "rewards/cosine_scaled_reward": -0.038471437990665436, "rewards/format_reward": 0.8541666865348816, "step": 217 }, { "completion_length": 1419.3542175292969, "epoch": 0.24914285714285714, "grad_norm": 0.2513315677642822, "kl": 0.00665283203125, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": 1.1047292775474489, "reward_std": 0.6393595859408379, "rewards/cosine_scaled_reward": 0.1252813059836626, "rewards/format_reward": 0.8541666716337204, "step": 218 }, { "completion_length": 1275.3125305175781, "epoch": 0.2502857142857143, "grad_norm": 0.2648639380931854, "kl": 0.00737762451171875, "learning_rate": 7.214816693576234e-07, "loss": 0.0003, "reward": 0.6924525499343872, "reward_std": 0.6107815653085709, "rewards/cosine_scaled_reward": -0.13294040283653885, "rewards/format_reward": 0.9583333432674408, "step": 219 }, { "completion_length": 1247.9791717529297, "epoch": 0.25142857142857145, "grad_norm": 0.22622907161712646, "kl": 0.006031036376953125, "learning_rate": 7.185729670371604e-07, "loss": 0.0002, "reward": 0.9419594034552574, "reward_std": 0.675844706594944, "rewards/cosine_scaled_reward": 0.023063029162585735, "rewards/format_reward": 0.8958333432674408, "step": 220 }, { "completion_length": 2002.0833740234375, "epoch": 0.25257142857142856, "grad_norm": 0.218685120344162, "kl": 0.0069580078125, "learning_rate": 7.156560487081051e-07, "loss": 0.0003, "reward": 0.6575891096144915, "reward_std": 0.6497488841414452, "rewards/cosine_scaled_reward": 0.026711229234933853, "rewards/format_reward": 0.6041666716337204, "step": 221 }, { "completion_length": 2015.6250915527344, "epoch": 0.2537142857142857, "grad_norm": 0.20831483602523804, "kl": 0.0071258544921875, "learning_rate": 7.127310565369415e-07, "loss": 0.0003, "reward": 0.14067217335104942, "reward_std": 0.48574624210596085, "rewards/cosine_scaled_reward": -0.28383059799671173, "rewards/format_reward": 0.7083333432674408, "step": 222 }, { "completion_length": 1517.1875305175781, "epoch": 0.25485714285714284, "grad_norm": 0.24125142395496368, "kl": 0.005817413330078125, "learning_rate": 7.097981330836616e-07, "loss": 0.0002, "reward": 0.6348569616675377, "reward_std": 0.5405807122588158, "rewards/cosine_scaled_reward": -0.09923820104449987, "rewards/format_reward": 0.8333333432674408, "step": 223 }, { "completion_length": 1914.0000305175781, "epoch": 0.256, "grad_norm": 0.2622263431549072, "kl": 0.00728607177734375, "learning_rate": 7.068574212948169e-07, "loss": 0.0003, "reward": 0.5525996647775173, "reward_std": 0.5521951243281364, "rewards/cosine_scaled_reward": -0.04661682341247797, "rewards/format_reward": 0.6458333432674408, "step": 224 }, { "completion_length": 1250.0000305175781, "epoch": 0.2571428571428571, "grad_norm": 0.2181866317987442, "kl": 0.00460052490234375, "learning_rate": 7.039090644965509e-07, "loss": 0.0002, "reward": 1.2948355674743652, "reward_std": 0.6228364408016205, "rewards/cosine_scaled_reward": 0.16825110744684935, "rewards/format_reward": 0.9583333432674408, "step": 225 }, { "completion_length": 1461.6666870117188, "epoch": 0.2582857142857143, "grad_norm": 0.36098772287368774, "kl": 0.00760650634765625, "learning_rate": 7.009532063876148e-07, "loss": 0.0003, "reward": 0.3821214698255062, "reward_std": 0.5764878466725349, "rewards/cosine_scaled_reward": -0.20477261394262314, "rewards/format_reward": 0.7916666716337204, "step": 226 }, { "completion_length": 1328.5416870117188, "epoch": 0.25942857142857145, "grad_norm": 0.27139514684677124, "kl": 0.00934600830078125, "learning_rate": 6.979899910323624e-07, "loss": 0.0004, "reward": 0.7815765663981438, "reward_std": 0.7309335023164749, "rewards/cosine_scaled_reward": -0.03629505028948188, "rewards/format_reward": 0.8541666865348816, "step": 227 }, { "completion_length": 1626.4167175292969, "epoch": 0.26057142857142856, "grad_norm": 0.23888561129570007, "kl": 0.00714874267578125, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.6099164858460426, "reward_std": 0.7778853923082352, "rewards/cosine_scaled_reward": -0.09087510220706463, "rewards/format_reward": 0.7916666716337204, "step": 228 }, { "completion_length": 1440.6875457763672, "epoch": 0.26171428571428573, "grad_norm": 0.2864842116832733, "kl": 0.0119171142578125, "learning_rate": 6.920420666261961e-07, "loss": 0.0005, "reward": 0.7616857700049877, "reward_std": 0.7498719990253448, "rewards/cosine_scaled_reward": -0.025407111272215843, "rewards/format_reward": 0.8125000149011612, "step": 229 }, { "completion_length": 1527.8958740234375, "epoch": 0.26285714285714284, "grad_norm": 0.2429640144109726, "kl": 0.00768280029296875, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": 0.9651975035667419, "reward_std": 0.824803501367569, "rewards/cosine_scaled_reward": 0.06593206711113453, "rewards/format_reward": 0.8333333432674408, "step": 230 }, { "completion_length": 1934.2084045410156, "epoch": 0.264, "grad_norm": 0.3963330090045929, "kl": 0.0092926025390625, "learning_rate": 6.860664508377001e-07, "loss": 0.0004, "reward": 0.6242162762209773, "reward_std": 0.8598367348313332, "rewards/cosine_scaled_reward": -0.03164188005030155, "rewards/format_reward": 0.6875000149011612, "step": 231 }, { "completion_length": 1455.8333740234375, "epoch": 0.2651428571428571, "grad_norm": 0.23361533880233765, "kl": 0.00798797607421875, "learning_rate": 6.83068622519821e-07, "loss": 0.0003, "reward": 0.7301613166928291, "reward_std": 0.7596315294504166, "rewards/cosine_scaled_reward": -0.07241935143247247, "rewards/format_reward": 0.8750000149011612, "step": 232 }, { "completion_length": 2010.8333740234375, "epoch": 0.2662857142857143, "grad_norm": 0.2177191823720932, "kl": 0.0087738037109375, "learning_rate": 6.800643086250121e-07, "loss": 0.0004, "reward": 0.6955921053886414, "reward_std": 0.8746853768825531, "rewards/cosine_scaled_reward": -0.0480372947640717, "rewards/format_reward": 0.7916666865348816, "step": 233 }, { "completion_length": 1696.8541870117188, "epoch": 0.2674285714285714, "grad_norm": 0.2090214192867279, "kl": 0.008090972900390625, "learning_rate": 6.770536555792944e-07, "loss": 0.0003, "reward": 0.6760512292385101, "reward_std": 0.6585969775915146, "rewards/cosine_scaled_reward": -0.0578077242244035, "rewards/format_reward": 0.7916666716337204, "step": 234 }, { "completion_length": 1389.6667175292969, "epoch": 0.26857142857142857, "grad_norm": 0.2434709221124649, "kl": 0.006389617919921875, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.6067800773307681, "reward_std": 0.41319192945957184, "rewards/cosine_scaled_reward": -0.1028599888086319, "rewards/format_reward": 0.8125, "step": 235 }, { "completion_length": 1396.0625305175781, "epoch": 0.26971428571428574, "grad_norm": 0.23188619315624237, "kl": 0.0070953369140625, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 0.5102774500846863, "reward_std": 0.4952257424592972, "rewards/cosine_scaled_reward": -0.21361128613352776, "rewards/format_reward": 0.9375, "step": 236 }, { "completion_length": 1236.3958892822266, "epoch": 0.27085714285714285, "grad_norm": 0.20340608060359955, "kl": 0.00693511962890625, "learning_rate": 6.679851303883891e-07, "loss": 0.0003, "reward": 0.9460461437702179, "reward_std": 0.5520051866769791, "rewards/cosine_scaled_reward": -0.01656026765704155, "rewards/format_reward": 0.9791666716337204, "step": 237 }, { "completion_length": 1293.0208435058594, "epoch": 0.272, "grad_norm": 0.28505584597587585, "kl": 0.00867462158203125, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.6806494742631912, "reward_std": 0.7478103041648865, "rewards/cosine_scaled_reward": -0.11800861544907093, "rewards/format_reward": 0.9166666716337204, "step": 238 }, { "completion_length": 1973.0000915527344, "epoch": 0.27314285714285713, "grad_norm": 0.29544979333877563, "kl": 0.0121917724609375, "learning_rate": 6.619104492241847e-07, "loss": 0.0005, "reward": 0.5413463786244392, "reward_std": 0.7216374576091766, "rewards/cosine_scaled_reward": -0.10432682058308274, "rewards/format_reward": 0.7500000149011612, "step": 239 }, { "completion_length": 1372.6250305175781, "epoch": 0.2742857142857143, "grad_norm": 0.2549286484718323, "kl": 0.00878143310546875, "learning_rate": 6.588648530198504e-07, "loss": 0.0004, "reward": 1.4046210646629333, "reward_std": 0.5067310631275177, "rewards/cosine_scaled_reward": 0.2543938383460045, "rewards/format_reward": 0.8958333432674408, "step": 240 }, { "completion_length": 1410.0417175292969, "epoch": 0.2754285714285714, "grad_norm": 0.32747364044189453, "kl": 0.009124755859375, "learning_rate": 6.558139508961654e-07, "loss": 0.0004, "reward": 0.8229547590017319, "reward_std": 0.6901429891586304, "rewards/cosine_scaled_reward": -0.026022649370133877, "rewards/format_reward": 0.8750000149011612, "step": 241 }, { "completion_length": 1473.6875228881836, "epoch": 0.2765714285714286, "grad_norm": 0.3592205345630646, "kl": 0.012908935546875, "learning_rate": 6.527578915497951e-07, "loss": 0.0005, "reward": 0.7551293671131134, "reward_std": 0.663729339838028, "rewards/cosine_scaled_reward": -0.0599353089928627, "rewards/format_reward": 0.8750000149011612, "step": 242 }, { "completion_length": 1187.0625610351562, "epoch": 0.2777142857142857, "grad_norm": 0.22532759606838226, "kl": 0.0076751708984375, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.8520705178380013, "reward_std": 0.45504674315452576, "rewards/cosine_scaled_reward": -0.05313139781355858, "rewards/format_reward": 0.9583333432674408, "step": 243 }, { "completion_length": 1377.7291870117188, "epoch": 0.27885714285714286, "grad_norm": 0.2351863533258438, "kl": 0.00748443603515625, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 1.1842002123594284, "reward_std": 1.0244361460208893, "rewards/cosine_scaled_reward": 0.15460011083632708, "rewards/format_reward": 0.8750000149011612, "step": 244 }, { "completion_length": 1255.4167175292969, "epoch": 0.28, "grad_norm": 0.34309011697769165, "kl": 0.01132965087890625, "learning_rate": 6.435602608679916e-07, "loss": 0.0005, "reward": 0.8439341634511948, "reward_std": 0.6376037150621414, "rewards/cosine_scaled_reward": -0.025949583388864994, "rewards/format_reward": 0.8958333432674408, "step": 245 }, { "completion_length": 1144.6250305175781, "epoch": 0.28114285714285714, "grad_norm": 0.27464959025382996, "kl": 0.0088958740234375, "learning_rate": 6.404850645156841e-07, "loss": 0.0004, "reward": 0.9792153835296631, "reward_std": 0.5227902606129646, "rewards/cosine_scaled_reward": 2.4352222681045532e-05, "rewards/format_reward": 0.9791666716337204, "step": 246 }, { "completion_length": 985.2083587646484, "epoch": 0.2822857142857143, "grad_norm": 0.3299963176250458, "kl": 0.01165771484375, "learning_rate": 6.374054580489873e-07, "loss": 0.0005, "reward": 1.1308997794985771, "reward_std": 0.699935294687748, "rewards/cosine_scaled_reward": 0.09669988602399826, "rewards/format_reward": 0.9375, "step": 247 }, { "completion_length": 1719.0208740234375, "epoch": 0.2834285714285714, "grad_norm": 0.27527329325675964, "kl": 0.0107879638671875, "learning_rate": 6.343215915635761e-07, "loss": 0.0004, "reward": 0.808366172015667, "reward_std": 0.725908488035202, "rewards/cosine_scaled_reward": -0.012483585625886917, "rewards/format_reward": 0.8333333432674408, "step": 248 }, { "completion_length": 1970.2083740234375, "epoch": 0.2845714285714286, "grad_norm": 0.27165958285331726, "kl": 0.011810302734375, "learning_rate": 6.31233615362752e-07, "loss": 0.0005, "reward": 0.3851170837879181, "reward_std": 0.7313886731863022, "rewards/cosine_scaled_reward": -0.16160813719034195, "rewards/format_reward": 0.7083333432674408, "step": 249 }, { "completion_length": 1342.3750305175781, "epoch": 0.2857142857142857, "grad_norm": 0.2968628704547882, "kl": 0.00905609130859375, "learning_rate": 6.281416799501187e-07, "loss": 0.0004, "reward": 0.8386699110269547, "reward_std": 0.7474230378866196, "rewards/cosine_scaled_reward": -0.02858173381537199, "rewards/format_reward": 0.8958333432674408, "step": 250 }, { "completion_length": 1774.0208740234375, "epoch": 0.28685714285714287, "grad_norm": 0.39884844422340393, "kl": 0.0139312744140625, "learning_rate": 6.25045936022246e-07, "loss": 0.0006, "reward": 1.0918036848306656, "reward_std": 1.1108526289463043, "rewards/cosine_scaled_reward": 0.1917351707816124, "rewards/format_reward": 0.708333358168602, "step": 251 }, { "completion_length": 1604.7500610351562, "epoch": 0.288, "grad_norm": 0.32672712206840515, "kl": 0.0103912353515625, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": 0.7797054275870323, "reward_std": 0.6628324761986732, "rewards/cosine_scaled_reward": -0.06848062574863434, "rewards/format_reward": 0.9166666716337204, "step": 252 }, { "completion_length": 1578.9167175292969, "epoch": 0.28914285714285715, "grad_norm": 0.3023277223110199, "kl": 0.01739501953125, "learning_rate": 6.188436263278172e-07, "loss": 0.0007, "reward": 0.7126835100352764, "reward_std": 0.786981999874115, "rewards/cosine_scaled_reward": -0.039491571485996246, "rewards/format_reward": 0.7916666716337204, "step": 253 }, { "completion_length": 1142.6458435058594, "epoch": 0.29028571428571426, "grad_norm": 0.4636065363883972, "kl": 0.0141448974609375, "learning_rate": 6.157373628530852e-07, "loss": 0.0006, "reward": 1.1053122580051422, "reward_std": 0.47931085526943207, "rewards/cosine_scaled_reward": 0.16723946738056839, "rewards/format_reward": 0.7708333432674408, "step": 254 }, { "completion_length": 1286.0833435058594, "epoch": 0.2914285714285714, "grad_norm": 0.28858116269111633, "kl": 0.009735107421875, "learning_rate": 6.126278954320294e-07, "loss": 0.0004, "reward": 1.1211883053183556, "reward_std": 0.4671914726495743, "rewards/cosine_scaled_reward": 0.09184413589537144, "rewards/format_reward": 0.9375000149011612, "step": 255 }, { "completion_length": 1213.2083740234375, "epoch": 0.2925714285714286, "grad_norm": 0.32043886184692383, "kl": 0.015838623046875, "learning_rate": 6.095153756157051e-07, "loss": 0.0006, "reward": 0.823145791888237, "reward_std": 0.5544667765498161, "rewards/cosine_scaled_reward": -0.0675937756896019, "rewards/format_reward": 0.9583333432674408, "step": 256 }, { "completion_length": 1316.8541870117188, "epoch": 0.2937142857142857, "grad_norm": 0.23760062456130981, "kl": 0.0098114013671875, "learning_rate": 6.06399955103937e-07, "loss": 0.0004, "reward": 0.49779732525348663, "reward_std": 0.46498920768499374, "rewards/cosine_scaled_reward": -0.23026802763342857, "rewards/format_reward": 0.9583333432674408, "step": 257 }, { "completion_length": 1751.2708435058594, "epoch": 0.2948571428571429, "grad_norm": 0.254151314496994, "kl": 0.0131378173828125, "learning_rate": 6.032817857379256e-07, "loss": 0.0005, "reward": 0.6079542301595211, "reward_std": 0.6472693011164665, "rewards/cosine_scaled_reward": -0.10227290168404579, "rewards/format_reward": 0.8125000149011612, "step": 258 }, { "completion_length": 978.6458435058594, "epoch": 0.296, "grad_norm": 0.29362747073173523, "kl": 0.00855255126953125, "learning_rate": 6.001610194928464e-07, "loss": 0.0003, "reward": 0.7723531350493431, "reward_std": 0.7641957998275757, "rewards/cosine_scaled_reward": -0.10340679436922073, "rewards/format_reward": 0.9791666716337204, "step": 259 }, { "completion_length": 1655.1875305175781, "epoch": 0.29714285714285715, "grad_norm": 0.38929465413093567, "kl": 0.0145721435546875, "learning_rate": 5.97037808470444e-07, "loss": 0.0006, "reward": 0.37256848718971014, "reward_std": 0.6071458011865616, "rewards/cosine_scaled_reward": -0.18871578108519316, "rewards/format_reward": 0.7500000298023224, "step": 260 }, { "completion_length": 1198.8333892822266, "epoch": 0.29828571428571427, "grad_norm": 0.2716739773750305, "kl": 0.0102081298828125, "learning_rate": 5.939123048916173e-07, "loss": 0.0004, "reward": 0.633615754544735, "reward_std": 0.586112380027771, "rewards/cosine_scaled_reward": -0.1519421450793743, "rewards/format_reward": 0.9375000149011612, "step": 261 }, { "completion_length": 1484.1042175292969, "epoch": 0.29942857142857143, "grad_norm": 0.2628130316734314, "kl": 0.0106353759765625, "learning_rate": 5.907846610890011e-07, "loss": 0.0004, "reward": 0.4845110587775707, "reward_std": 0.536693274974823, "rewards/cosine_scaled_reward": -0.18482780829071999, "rewards/format_reward": 0.8541666716337204, "step": 262 }, { "completion_length": 1064.0833587646484, "epoch": 0.30057142857142854, "grad_norm": 0.36321043968200684, "kl": 0.0098876953125, "learning_rate": 5.87655029499542e-07, "loss": 0.0004, "reward": 0.9257199168205261, "reward_std": 0.8942077457904816, "rewards/cosine_scaled_reward": -0.016306710429489613, "rewards/format_reward": 0.9583333432674408, "step": 263 }, { "completion_length": 1316.1042175292969, "epoch": 0.3017142857142857, "grad_norm": 0.26243484020233154, "kl": 0.00811004638671875, "learning_rate": 5.845235626570683e-07, "loss": 0.0003, "reward": 0.6479791402816772, "reward_std": 0.6652240380644798, "rewards/cosine_scaled_reward": -0.15517710940912366, "rewards/format_reward": 0.9583333432674408, "step": 264 }, { "completion_length": 1909.6250915527344, "epoch": 0.3028571428571429, "grad_norm": 0.2833210229873657, "kl": 0.01206207275390625, "learning_rate": 5.813904131848564e-07, "loss": 0.0005, "reward": 0.7709198147058487, "reward_std": 0.6459413915872574, "rewards/cosine_scaled_reward": 4.323199391365051e-05, "rewards/format_reward": 0.7708333432674408, "step": 265 }, { "completion_length": 1645.9583740234375, "epoch": 0.304, "grad_norm": 0.3645997941493988, "kl": 0.01513671875, "learning_rate": 5.78255733788191e-07, "loss": 0.0006, "reward": 0.4407457821071148, "reward_std": 0.6104073449969292, "rewards/cosine_scaled_reward": -0.1858771131373942, "rewards/format_reward": 0.8125000149011612, "step": 266 }, { "completion_length": 1644.0209045410156, "epoch": 0.30514285714285716, "grad_norm": 0.27599290013313293, "kl": 0.014923095703125, "learning_rate": 5.751196772469237e-07, "loss": 0.0006, "reward": 0.7639178857207298, "reward_std": 0.5990442484617233, "rewards/cosine_scaled_reward": -0.013874400407075882, "rewards/format_reward": 0.7916666716337204, "step": 267 }, { "completion_length": 1681.4583587646484, "epoch": 0.3062857142857143, "grad_norm": 0.2269752472639084, "kl": 0.01216888427734375, "learning_rate": 5.71982396408026e-07, "loss": 0.0005, "reward": 0.5377051346004009, "reward_std": 0.6556981913745403, "rewards/cosine_scaled_reward": -0.14781412575393915, "rewards/format_reward": 0.8333333358168602, "step": 268 }, { "completion_length": 1047.1250457763672, "epoch": 0.30742857142857144, "grad_norm": 0.3485262095928192, "kl": 0.01123046875, "learning_rate": 5.688440441781398e-07, "loss": 0.0005, "reward": 0.68592269718647, "reward_std": 0.5478300377726555, "rewards/cosine_scaled_reward": -0.13620532862842083, "rewards/format_reward": 0.9583333432674408, "step": 269 }, { "completion_length": 1012.0208587646484, "epoch": 0.30857142857142855, "grad_norm": 0.3116324543952942, "kl": 0.007354736328125, "learning_rate": 5.657047735161255e-07, "loss": 0.0003, "reward": 0.9341000914573669, "reward_std": 0.6274480298161507, "rewards/cosine_scaled_reward": -0.03294998221099377, "rewards/format_reward": 1.0, "step": 270 }, { "completion_length": 1732.6459350585938, "epoch": 0.3097142857142857, "grad_norm": 0.42208629846572876, "kl": 0.016632080078125, "learning_rate": 5.625647374256061e-07, "loss": 0.0007, "reward": 0.7596426885575056, "reward_std": 0.77412149310112, "rewards/cosine_scaled_reward": -0.0055953278206288815, "rewards/format_reward": 0.7708333432674408, "step": 271 }, { "completion_length": 2177.270965576172, "epoch": 0.31085714285714283, "grad_norm": 0.5940233469009399, "kl": 0.02764892578125, "learning_rate": 5.594240889475106e-07, "loss": 0.0011, "reward": 0.4223189577460289, "reward_std": 0.8890358135104179, "rewards/cosine_scaled_reward": -0.10134052112698555, "rewards/format_reward": 0.6250000223517418, "step": 272 }, { "completion_length": 1954.8333740234375, "epoch": 0.312, "grad_norm": 0.3138795495033264, "kl": 0.02874755859375, "learning_rate": 5.562829811526154e-07, "loss": 0.0012, "reward": 0.3654465600848198, "reward_std": 0.5650510713458061, "rewards/cosine_scaled_reward": -0.1506100632250309, "rewards/format_reward": 0.6666666865348816, "step": 273 }, { "completion_length": 1267.4583587646484, "epoch": 0.31314285714285717, "grad_norm": 0.274538516998291, "kl": 0.01172637939453125, "learning_rate": 5.531415671340826e-07, "loss": 0.0005, "reward": 0.837937019765377, "reward_std": 0.6332506015896797, "rewards/cosine_scaled_reward": -0.03936483711004257, "rewards/format_reward": 0.9166666716337204, "step": 274 }, { "completion_length": 2021.4583740234375, "epoch": 0.3142857142857143, "grad_norm": 0.30807891488075256, "kl": 0.02382659912109375, "learning_rate": 5.5e-07, "loss": 0.001, "reward": 0.9066380485892296, "reward_std": 0.9834412485361099, "rewards/cosine_scaled_reward": 0.1095690238289535, "rewards/format_reward": 0.6875000223517418, "step": 275 }, { "completion_length": 1135.8750305175781, "epoch": 0.31542857142857145, "grad_norm": 0.2739965319633484, "kl": 0.010345458984375, "learning_rate": 5.468584328659172e-07, "loss": 0.0004, "reward": 1.0236308723688126, "reward_std": 0.5140665993094444, "rewards/cosine_scaled_reward": 0.04306542640551925, "rewards/format_reward": 0.9375000149011612, "step": 276 }, { "completion_length": 1247.2500305175781, "epoch": 0.31657142857142856, "grad_norm": 0.37156689167022705, "kl": 0.00988006591796875, "learning_rate": 5.437170188473847e-07, "loss": 0.0004, "reward": 0.8927154019474983, "reward_std": 0.8232090175151825, "rewards/cosine_scaled_reward": -0.0015590004622936249, "rewards/format_reward": 0.8958333432674408, "step": 277 }, { "completion_length": 1309.3125610351562, "epoch": 0.3177142857142857, "grad_norm": 0.2857230603694916, "kl": 0.0121002197265625, "learning_rate": 5.405759110524894e-07, "loss": 0.0005, "reward": 0.942589208483696, "reward_std": 0.6875655725598335, "rewards/cosine_scaled_reward": 0.012961250729858875, "rewards/format_reward": 0.9166666716337204, "step": 278 }, { "completion_length": 1435.4791717529297, "epoch": 0.31885714285714284, "grad_norm": 0.43713676929473877, "kl": 0.01840972900390625, "learning_rate": 5.37435262574394e-07, "loss": 0.0007, "reward": 0.37974046915769577, "reward_std": 0.5385972559452057, "rewards/cosine_scaled_reward": -0.2163797914981842, "rewards/format_reward": 0.8125, "step": 279 }, { "completion_length": 1156.062515258789, "epoch": 0.32, "grad_norm": 0.2634413242340088, "kl": 0.0093536376953125, "learning_rate": 5.342952264838747e-07, "loss": 0.0004, "reward": 0.980226680636406, "reward_std": 0.5487889721989632, "rewards/cosine_scaled_reward": 0.0005300038028508425, "rewards/format_reward": 0.9791666716337204, "step": 280 }, { "completion_length": 1187.3125457763672, "epoch": 0.3211428571428571, "grad_norm": 0.32134756445884705, "kl": 0.008182525634765625, "learning_rate": 5.311559558218603e-07, "loss": 0.0003, "reward": 0.9637440145015717, "reward_std": 0.8387380540370941, "rewards/cosine_scaled_reward": 0.0027053444646298885, "rewards/format_reward": 0.9583333432674408, "step": 281 }, { "completion_length": 1651.604232788086, "epoch": 0.3222857142857143, "grad_norm": 0.37761881947517395, "kl": 0.0242919921875, "learning_rate": 5.28017603591974e-07, "loss": 0.001, "reward": 0.5987202003598213, "reward_std": 0.6141614019870758, "rewards/cosine_scaled_reward": -0.10688992030918598, "rewards/format_reward": 0.8125000149011612, "step": 282 }, { "completion_length": 1207.1250610351562, "epoch": 0.32342857142857145, "grad_norm": 0.3965184688568115, "kl": 0.0155487060546875, "learning_rate": 5.248803227530763e-07, "loss": 0.0006, "reward": 0.6064153388142586, "reward_std": 0.6052896529436111, "rewards/cosine_scaled_reward": -0.16554233682109043, "rewards/format_reward": 0.9375000149011612, "step": 283 }, { "completion_length": 1520.5625610351562, "epoch": 0.32457142857142857, "grad_norm": 0.4802095890045166, "kl": 0.01385498046875, "learning_rate": 5.21744266211809e-07, "loss": 0.0006, "reward": 0.6298409104347229, "reward_std": 0.7485504075884819, "rewards/cosine_scaled_reward": -0.10174621269106865, "rewards/format_reward": 0.833333358168602, "step": 284 }, { "completion_length": 1631.4166870117188, "epoch": 0.32571428571428573, "grad_norm": 0.5242775678634644, "kl": 0.0250244140625, "learning_rate": 5.186095868151436e-07, "loss": 0.001, "reward": 0.4431188479065895, "reward_std": 0.6355130672454834, "rewards/cosine_scaled_reward": -0.20552390813827515, "rewards/format_reward": 0.8541666865348816, "step": 285 }, { "completion_length": 1278.2917175292969, "epoch": 0.32685714285714285, "grad_norm": 0.2374052256345749, "kl": 0.009552001953125, "learning_rate": 5.154764373429315e-07, "loss": 0.0004, "reward": 0.9486149102449417, "reward_std": 0.6938119828701019, "rewards/cosine_scaled_reward": -0.025692567229270935, "rewards/format_reward": 1.0, "step": 286 }, { "completion_length": 1549.5208740234375, "epoch": 0.328, "grad_norm": 0.5496286749839783, "kl": 0.0344696044921875, "learning_rate": 5.123449705004581e-07, "loss": 0.0014, "reward": 0.6046592518687248, "reward_std": 0.7527553886175156, "rewards/cosine_scaled_reward": -0.12475371174514294, "rewards/format_reward": 0.8541666865348816, "step": 287 }, { "completion_length": 906.6041717529297, "epoch": 0.3291428571428571, "grad_norm": 0.40935972332954407, "kl": 0.014556884765625, "learning_rate": 5.09215338910999e-07, "loss": 0.0006, "reward": 1.1515108793973923, "reward_std": 0.670855775475502, "rewards/cosine_scaled_reward": 0.07575542479753494, "rewards/format_reward": 1.0, "step": 288 }, { "completion_length": 1302.3958587646484, "epoch": 0.3302857142857143, "grad_norm": 0.33956480026245117, "kl": 0.01055145263671875, "learning_rate": 5.060876951083828e-07, "loss": 0.0004, "reward": 0.8325834274291992, "reward_std": 0.469268262386322, "rewards/cosine_scaled_reward": -0.0628749430179596, "rewards/format_reward": 0.9583333432674408, "step": 289 }, { "completion_length": 1219.1667022705078, "epoch": 0.3314285714285714, "grad_norm": 0.5517088174819946, "kl": 0.0243072509765625, "learning_rate": 5.02962191529556e-07, "loss": 0.001, "reward": 1.3289316296577454, "reward_std": 0.8555012494325638, "rewards/cosine_scaled_reward": 0.1957157626748085, "rewards/format_reward": 0.9375000149011612, "step": 290 }, { "completion_length": 1010.8541870117188, "epoch": 0.3325714285714286, "grad_norm": 0.44879335165023804, "kl": 0.020660400390625, "learning_rate": 4.998389805071536e-07, "loss": 0.0008, "reward": 0.886590301990509, "reward_std": 0.6713649779558182, "rewards/cosine_scaled_reward": 0.00579514354467392, "rewards/format_reward": 0.8750000149011612, "step": 291 }, { "completion_length": 1223.687515258789, "epoch": 0.33371428571428574, "grad_norm": 0.3632715344429016, "kl": 0.0165252685546875, "learning_rate": 4.967182142620745e-07, "loss": 0.0007, "reward": 0.8943772986531258, "reward_std": 0.8097474128007889, "rewards/cosine_scaled_reward": -0.0007280493155121803, "rewards/format_reward": 0.8958333432674408, "step": 292 }, { "completion_length": 1374.0208740234375, "epoch": 0.33485714285714285, "grad_norm": 0.4008006155490875, "kl": 0.0230865478515625, "learning_rate": 4.93600044896063e-07, "loss": 0.0009, "reward": 0.6955159157514572, "reward_std": 0.6655403971672058, "rewards/cosine_scaled_reward": -0.11057540401816368, "rewards/format_reward": 0.9166666865348816, "step": 293 }, { "completion_length": 1269.2917022705078, "epoch": 0.336, "grad_norm": 0.20706599950790405, "kl": 0.00942230224609375, "learning_rate": 4.904846243842949e-07, "loss": 0.0004, "reward": 1.2973814010620117, "reward_std": 0.7491715997457504, "rewards/cosine_scaled_reward": 0.16952402517199516, "rewards/format_reward": 0.9583333432674408, "step": 294 }, { "completion_length": 2128.9375610351562, "epoch": 0.33714285714285713, "grad_norm": 0.3372494876384735, "kl": 0.03973388671875, "learning_rate": 4.873721045679706e-07, "loss": 0.0016, "reward": 0.4113108851015568, "reward_std": 0.6322937309741974, "rewards/cosine_scaled_reward": -0.11726122908294201, "rewards/format_reward": 0.6458333432674408, "step": 295 }, { "completion_length": 938.1042022705078, "epoch": 0.3382857142857143, "grad_norm": 0.3853781521320343, "kl": 0.01088714599609375, "learning_rate": 4.842626371469149e-07, "loss": 0.0004, "reward": 1.2917412221431732, "reward_std": 0.859960287809372, "rewards/cosine_scaled_reward": 0.16670391708612442, "rewards/format_reward": 0.9583333432674408, "step": 296 }, { "completion_length": 1234.5000305175781, "epoch": 0.3394285714285714, "grad_norm": 0.32493123412132263, "kl": 0.02001953125, "learning_rate": 4.811563736721829e-07, "loss": 0.0008, "reward": 0.5709987878799438, "reward_std": 0.5739526003599167, "rewards/cosine_scaled_reward": -0.1728339404799044, "rewards/format_reward": 0.9166666865348816, "step": 297 }, { "completion_length": 1737.8750305175781, "epoch": 0.3405714285714286, "grad_norm": 0.6288060545921326, "kl": 0.02978515625, "learning_rate": 4.780534655386743e-07, "loss": 0.0012, "reward": 0.7676291763782501, "reward_std": 0.6836749911308289, "rewards/cosine_scaled_reward": -0.0328520848415792, "rewards/format_reward": 0.833333358168602, "step": 298 }, { "completion_length": 1495.9583740234375, "epoch": 0.3417142857142857, "grad_norm": 0.5687032341957092, "kl": 0.02923583984375, "learning_rate": 4.749540639777539e-07, "loss": 0.0012, "reward": 0.7650880664587021, "reward_std": 0.7526437044143677, "rewards/cosine_scaled_reward": -0.0757893230766058, "rewards/format_reward": 0.9166666716337204, "step": 299 }, { "completion_length": 1482.9167022705078, "epoch": 0.34285714285714286, "grad_norm": 0.307375431060791, "kl": 0.0286102294921875, "learning_rate": 4.7185832004988133e-07, "loss": 0.0011, "reward": 1.0939996913075447, "reward_std": 0.6226199977099895, "rewards/cosine_scaled_reward": 0.10949981957674026, "rewards/format_reward": 0.875, "step": 300 }, { "completion_length": 1668.7708740234375, "epoch": 0.344, "grad_norm": 0.3655478358268738, "kl": 0.0385589599609375, "learning_rate": 4.68766384637248e-07, "loss": 0.0015, "reward": 0.9393804222345352, "reward_std": 0.5472998023033142, "rewards/cosine_scaled_reward": 0.04260684549808502, "rewards/format_reward": 0.8541666865348816, "step": 301 }, { "completion_length": 1460.1667175292969, "epoch": 0.34514285714285714, "grad_norm": 0.5648999810218811, "kl": 0.0326690673828125, "learning_rate": 4.656784084364238e-07, "loss": 0.0013, "reward": 0.8058248609304428, "reward_std": 0.5371805727481842, "rewards/cosine_scaled_reward": -0.034587569534778595, "rewards/format_reward": 0.8750000149011612, "step": 302 }, { "completion_length": 1176.3958740234375, "epoch": 0.3462857142857143, "grad_norm": 0.2893832325935364, "kl": 0.01714324951171875, "learning_rate": 4.6259454195101267e-07, "loss": 0.0007, "reward": 0.9501863233745098, "reward_std": 0.7773154973983765, "rewards/cosine_scaled_reward": 0.006343139801174402, "rewards/format_reward": 0.9375000149011612, "step": 303 }, { "completion_length": 1049.2708587646484, "epoch": 0.3474285714285714, "grad_norm": 0.3396141231060028, "kl": 0.017486572265625, "learning_rate": 4.59514935484316e-07, "loss": 0.0007, "reward": 1.3814565241336823, "reward_std": 0.7759077772498131, "rewards/cosine_scaled_reward": 0.21156160347163677, "rewards/format_reward": 0.9583333432674408, "step": 304 }, { "completion_length": 1675.1667175292969, "epoch": 0.3485714285714286, "grad_norm": 0.32787981629371643, "kl": 0.0314483642578125, "learning_rate": 4.5643973913200837e-07, "loss": 0.0013, "reward": 0.7963635921478271, "reward_std": 0.7029048502445221, "rewards/cosine_scaled_reward": -0.018484866246581078, "rewards/format_reward": 0.8333333432674408, "step": 305 }, { "completion_length": 1478.7916870117188, "epoch": 0.3497142857142857, "grad_norm": 0.7724860310554504, "kl": 0.04205322265625, "learning_rate": 4.5336910277482155e-07, "loss": 0.0017, "reward": 0.934022843837738, "reward_std": 0.6418131068348885, "rewards/cosine_scaled_reward": 0.039928069338202477, "rewards/format_reward": 0.8541666716337204, "step": 306 }, { "completion_length": 1466.062515258789, "epoch": 0.35085714285714287, "grad_norm": 0.7901880741119385, "kl": 0.048187255859375, "learning_rate": 4.503031760712397e-07, "loss": 0.0019, "reward": 1.208159700036049, "reward_std": 0.9337977021932602, "rewards/cosine_scaled_reward": 0.16657985746860504, "rewards/format_reward": 0.8750000149011612, "step": 307 }, { "completion_length": 1011.0, "epoch": 0.352, "grad_norm": 0.43838247656822205, "kl": 0.02608489990234375, "learning_rate": 4.4724210845020494e-07, "loss": 0.001, "reward": 0.8291152790188789, "reward_std": 0.4770050719380379, "rewards/cosine_scaled_reward": -0.022942371666431427, "rewards/format_reward": 0.875, "step": 308 }, { "completion_length": 1304.812515258789, "epoch": 0.35314285714285715, "grad_norm": 0.34449389576911926, "kl": 0.027618408203125, "learning_rate": 4.441860491038345e-07, "loss": 0.0011, "reward": 0.9580995887517929, "reward_std": 0.9287254959344864, "rewards/cosine_scaled_reward": 0.020716451108455658, "rewards/format_reward": 0.9166666716337204, "step": 309 }, { "completion_length": 1587.1875610351562, "epoch": 0.35428571428571426, "grad_norm": 0.3894862234592438, "kl": 0.0413360595703125, "learning_rate": 4.4113514698014953e-07, "loss": 0.0017, "reward": 0.967779666185379, "reward_std": 0.49595198780298233, "rewards/cosine_scaled_reward": 0.04638980980962515, "rewards/format_reward": 0.875, "step": 310 }, { "completion_length": 1005.9583587646484, "epoch": 0.3554285714285714, "grad_norm": 0.5111984610557556, "kl": 0.0352020263671875, "learning_rate": 4.3808955077581546e-07, "loss": 0.0014, "reward": 0.8524645194411278, "reward_std": 0.6346431374549866, "rewards/cosine_scaled_reward": -0.042517755180597305, "rewards/format_reward": 0.9375, "step": 311 }, { "completion_length": 1219.1042175292969, "epoch": 0.3565714285714286, "grad_norm": 0.24494509398937225, "kl": 0.0164337158203125, "learning_rate": 4.350494089288943e-07, "loss": 0.0007, "reward": 1.1846633851528168, "reward_std": 0.8452874422073364, "rewards/cosine_scaled_reward": 0.09233169769868255, "rewards/format_reward": 1.0, "step": 312 }, { "completion_length": 1772.6459045410156, "epoch": 0.3577142857142857, "grad_norm": 0.32232236862182617, "kl": 0.05014801025390625, "learning_rate": 4.3201486961161093e-07, "loss": 0.002, "reward": 0.7452734671533108, "reward_std": 0.5130416378378868, "rewards/cosine_scaled_reward": -0.09611329552717507, "rewards/format_reward": 0.9375000149011612, "step": 313 }, { "completion_length": 1450.9167022705078, "epoch": 0.3588571428571429, "grad_norm": 0.7095328569412231, "kl": 0.04193115234375, "learning_rate": 4.2898608072313045e-07, "loss": 0.0017, "reward": 0.695801317691803, "reward_std": 0.6888641864061356, "rewards/cosine_scaled_reward": -0.07918267324566841, "rewards/format_reward": 0.8541666865348816, "step": 314 }, { "completion_length": 890.2083435058594, "epoch": 0.36, "grad_norm": 0.8648350238800049, "kl": 0.02002716064453125, "learning_rate": 4.2596318988235037e-07, "loss": 0.0008, "reward": 0.8261366635560989, "reward_std": 0.5876666381955147, "rewards/cosine_scaled_reward": -0.07651501428335905, "rewards/format_reward": 0.9791666716337204, "step": 315 }, { "completion_length": 1495.2917175292969, "epoch": 0.36114285714285715, "grad_norm": 0.7055426239967346, "kl": 0.0330047607421875, "learning_rate": 4.2294634442070553e-07, "loss": 0.0013, "reward": 0.5078188478946686, "reward_std": 0.6738529801368713, "rewards/cosine_scaled_reward": -0.18359058536589146, "rewards/format_reward": 0.8750000149011612, "step": 316 }, { "completion_length": 2028.3959350585938, "epoch": 0.36228571428571427, "grad_norm": 0.6188425421714783, "kl": 0.1023101806640625, "learning_rate": 4.1993569137498776e-07, "loss": 0.0041, "reward": 0.6971778050065041, "reward_std": 0.6844265758991241, "rewards/cosine_scaled_reward": 0.015255570411682129, "rewards/format_reward": 0.6666666865348816, "step": 317 }, { "completion_length": 1409.0833740234375, "epoch": 0.36342857142857143, "grad_norm": 0.4974954426288605, "kl": 0.04302978515625, "learning_rate": 4.1693137748017915e-07, "loss": 0.0017, "reward": 0.9140654609072953, "reward_std": 0.590464636683464, "rewards/cosine_scaled_reward": 0.040366058237850666, "rewards/format_reward": 0.8333333432674408, "step": 318 }, { "completion_length": 1569.2500305175781, "epoch": 0.36457142857142855, "grad_norm": 0.3885625898838043, "kl": 0.0399017333984375, "learning_rate": 4.1393354916230005e-07, "loss": 0.0016, "reward": 0.7486050575971603, "reward_std": 0.9239681512117386, "rewards/cosine_scaled_reward": -0.08403081598225981, "rewards/format_reward": 0.9166666865348816, "step": 319 }, { "completion_length": 1059.1875305175781, "epoch": 0.3657142857142857, "grad_norm": 0.5842506289482117, "kl": 0.0222625732421875, "learning_rate": 4.1094235253127374e-07, "loss": 0.0009, "reward": 0.872907280921936, "reward_std": 0.7717489525675774, "rewards/cosine_scaled_reward": -0.0635463809594512, "rewards/format_reward": 1.0, "step": 320 }, { "completion_length": 1675.8334045410156, "epoch": 0.3668571428571429, "grad_norm": 1.7648111581802368, "kl": 0.10723876953125, "learning_rate": 4.079579333738039e-07, "loss": 0.0043, "reward": 0.6553980484604836, "reward_std": 0.6528958007693291, "rewards/cosine_scaled_reward": -0.06813432276248932, "rewards/format_reward": 0.7916666865348816, "step": 321 }, { "completion_length": 1275.2708587646484, "epoch": 0.368, "grad_norm": 0.6659455895423889, "kl": 0.0338134765625, "learning_rate": 4.0498043714627006e-07, "loss": 0.0013, "reward": 1.113847702741623, "reward_std": 0.7288860827684402, "rewards/cosine_scaled_reward": 0.14025717787444592, "rewards/format_reward": 0.8333333432674408, "step": 322 }, { "completion_length": 1612.9166717529297, "epoch": 0.36914285714285716, "grad_norm": 0.6660499572753906, "kl": 0.0994873046875, "learning_rate": 4.020100089676376e-07, "loss": 0.004, "reward": 1.0578741058707237, "reward_std": 0.49694500118494034, "rewards/cosine_scaled_reward": 0.09143703989684582, "rewards/format_reward": 0.8750000149011612, "step": 323 }, { "completion_length": 1938.4584045410156, "epoch": 0.3702857142857143, "grad_norm": 1.3754856586456299, "kl": 0.1189422607421875, "learning_rate": 3.9904679361238526e-07, "loss": 0.0047, "reward": 0.27103549893945456, "reward_std": 0.5563001856207848, "rewards/cosine_scaled_reward": -0.23948227241635323, "rewards/format_reward": 0.7500000149011612, "step": 324 }, { "completion_length": 953.395881652832, "epoch": 0.37142857142857144, "grad_norm": 0.36472955346107483, "kl": 0.0133514404296875, "learning_rate": 3.9609093550344907e-07, "loss": 0.0005, "reward": 1.0393343269824982, "reward_std": 0.7779577225446701, "rewards/cosine_scaled_reward": 0.01966716069728136, "rewards/format_reward": 1.0, "step": 325 }, { "completion_length": 1773.9167175292969, "epoch": 0.37257142857142855, "grad_norm": 1.3614015579223633, "kl": 0.09857177734375, "learning_rate": 3.931425787051832e-07, "loss": 0.0039, "reward": 0.4826292358338833, "reward_std": 0.624052882194519, "rewards/cosine_scaled_reward": -0.1753520662896335, "rewards/format_reward": 0.833333358168602, "step": 326 }, { "completion_length": 1750.354232788086, "epoch": 0.3737142857142857, "grad_norm": 1.2214010953903198, "kl": 0.092681884765625, "learning_rate": 3.902018669163384e-07, "loss": 0.0037, "reward": 0.6829137187451124, "reward_std": 0.5776621401309967, "rewards/cosine_scaled_reward": -0.05437648296356201, "rewards/format_reward": 0.7916666865348816, "step": 327 }, { "completion_length": 1816.6250610351562, "epoch": 0.37485714285714283, "grad_norm": 0.6874251961708069, "kl": 0.08489990234375, "learning_rate": 3.872689434630585e-07, "loss": 0.0034, "reward": 0.796313688158989, "reward_std": 1.0243088752031326, "rewards/cosine_scaled_reward": -0.008093174546957016, "rewards/format_reward": 0.8125000298023224, "step": 328 }, { "completion_length": 2015.5000305175781, "epoch": 0.376, "grad_norm": 1.2615182399749756, "kl": 0.15081787109375, "learning_rate": 3.843439512918949e-07, "loss": 0.006, "reward": 0.6527662584558129, "reward_std": 0.7077807486057281, "rewards/cosine_scaled_reward": -0.09028353914618492, "rewards/format_reward": 0.833333358168602, "step": 329 }, { "completion_length": 2259.291717529297, "epoch": 0.37714285714285717, "grad_norm": 1.1345741748809814, "kl": 0.1787109375, "learning_rate": 3.8142703296283953e-07, "loss": 0.0071, "reward": 0.5901373848319054, "reward_std": 0.9298846423625946, "rewards/cosine_scaled_reward": -0.03826466016471386, "rewards/format_reward": 0.6666666865348816, "step": 330 }, { "completion_length": 2008.9584045410156, "epoch": 0.3782857142857143, "grad_norm": 1.42428719997406, "kl": 0.228515625, "learning_rate": 3.785183306423767e-07, "loss": 0.0092, "reward": 0.315967773552984, "reward_std": 0.7076915055513382, "rewards/cosine_scaled_reward": -0.19618277810513973, "rewards/format_reward": 0.7083333432674408, "step": 331 }, { "completion_length": 1344.7917175292969, "epoch": 0.37942857142857145, "grad_norm": 1.2130067348480225, "kl": 0.17629241943359375, "learning_rate": 3.7561798609655373e-07, "loss": 0.007, "reward": 1.3930619359016418, "reward_std": 0.7922802865505219, "rewards/cosine_scaled_reward": 0.26944761723279953, "rewards/format_reward": 0.8541666716337204, "step": 332 }, { "completion_length": 1645.3750305175781, "epoch": 0.38057142857142856, "grad_norm": 1.1035624742507935, "kl": 0.212158203125, "learning_rate": 3.72726140684072e-07, "loss": 0.0085, "reward": 0.28750851564109325, "reward_std": 0.3911990597844124, "rewards/cosine_scaled_reward": -0.252079077064991, "rewards/format_reward": 0.7916666865348816, "step": 333 }, { "completion_length": 1476.8333740234375, "epoch": 0.38171428571428573, "grad_norm": 1.2739105224609375, "kl": 0.1046142578125, "learning_rate": 3.6984293534939737e-07, "loss": 0.0042, "reward": 0.836659163236618, "reward_std": 0.8671058118343353, "rewards/cosine_scaled_reward": -0.04000374022871256, "rewards/format_reward": 0.9166666865348816, "step": 334 }, { "completion_length": 1723.3333892822266, "epoch": 0.38285714285714284, "grad_norm": 1.672484040260315, "kl": 0.21124267578125, "learning_rate": 3.6696851061588994e-07, "loss": 0.0084, "reward": 0.5473275234689936, "reward_std": 0.6023431569337845, "rewards/cosine_scaled_reward": -0.11175291612744331, "rewards/format_reward": 0.770833358168602, "step": 335 }, { "completion_length": 1291.8542175292969, "epoch": 0.384, "grad_norm": 1.1949892044067383, "kl": 0.13958740234375, "learning_rate": 3.641030065789562e-07, "loss": 0.0056, "reward": 0.43108636140823364, "reward_std": 0.42728982865810394, "rewards/cosine_scaled_reward": -0.22195683978497982, "rewards/format_reward": 0.8750000149011612, "step": 336 }, { "completion_length": 1024.3333740234375, "epoch": 0.3851428571428571, "grad_norm": 0.902114748954773, "kl": 0.11954498291015625, "learning_rate": 3.612465628992203e-07, "loss": 0.0048, "reward": 1.0072421729564667, "reward_std": 0.6713532879948616, "rewards/cosine_scaled_reward": 0.045287732034921646, "rewards/format_reward": 0.9166666865348816, "step": 337 }, { "completion_length": 1448.9375457763672, "epoch": 0.3862857142857143, "grad_norm": 1.2535922527313232, "kl": 0.1561737060546875, "learning_rate": 3.5839931879571725e-07, "loss": 0.0062, "reward": 0.5455589033663273, "reward_std": 0.535648949444294, "rewards/cosine_scaled_reward": -0.13347055204212666, "rewards/format_reward": 0.8125000149011612, "step": 338 }, { "completion_length": 2035.6459045410156, "epoch": 0.38742857142857146, "grad_norm": 0.746334969997406, "kl": 0.178955078125, "learning_rate": 3.555614130391079e-07, "loss": 0.0071, "reward": 0.593096449971199, "reward_std": 0.948510006070137, "rewards/cosine_scaled_reward": -0.08886844478547573, "rewards/format_reward": 0.770833358168602, "step": 339 }, { "completion_length": 1074.1041870117188, "epoch": 0.38857142857142857, "grad_norm": 0.5353119373321533, "kl": 0.041778564453125, "learning_rate": 3.5273298394491515e-07, "loss": 0.0017, "reward": 1.1473649591207504, "reward_std": 0.7414836436510086, "rewards/cosine_scaled_reward": 0.07368248514831066, "rewards/format_reward": 1.0, "step": 340 }, { "completion_length": 1197.7083892822266, "epoch": 0.38971428571428574, "grad_norm": 1.1976772546768188, "kl": 0.1300048828125, "learning_rate": 3.4991416936678276e-07, "loss": 0.0052, "reward": 1.3422877192497253, "reward_std": 0.6801351606845856, "rewards/cosine_scaled_reward": 0.1919771609827876, "rewards/format_reward": 0.9583333432674408, "step": 341 }, { "completion_length": 1645.479248046875, "epoch": 0.39085714285714285, "grad_norm": 1.725953459739685, "kl": 0.33575439453125, "learning_rate": 3.471051066897562e-07, "loss": 0.0135, "reward": 0.45842229574918747, "reward_std": 0.5379992946982384, "rewards/cosine_scaled_reward": -0.1457888763397932, "rewards/format_reward": 0.7500000149011612, "step": 342 }, { "completion_length": 1599.0416870117188, "epoch": 0.392, "grad_norm": 1.9615150690078735, "kl": 0.2664794921875, "learning_rate": 3.4430593282358777e-07, "loss": 0.0107, "reward": 0.8948207944631577, "reward_std": 0.6805202513933182, "rewards/cosine_scaled_reward": 0.009910388849675655, "rewards/format_reward": 0.8750000149011612, "step": 343 }, { "completion_length": 1742.8750610351562, "epoch": 0.3931428571428571, "grad_norm": 41.373931884765625, "kl": 0.8536376953125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0343, "reward": 0.8157278522849083, "reward_std": 0.7099937200546265, "rewards/cosine_scaled_reward": -0.029636098071932793, "rewards/format_reward": 0.8750000149011612, "step": 344 }, { "completion_length": 1436.8125610351562, "epoch": 0.3942857142857143, "grad_norm": 1.8816604614257812, "kl": 0.26708984375, "learning_rate": 3.387377967463493e-07, "loss": 0.0107, "reward": 0.9264494627714157, "reward_std": 0.6063774973154068, "rewards/cosine_scaled_reward": 0.015308059751987457, "rewards/format_reward": 0.8958333432674408, "step": 345 }, { "completion_length": 1363.9792022705078, "epoch": 0.3954285714285714, "grad_norm": 2.3164970874786377, "kl": 0.09549713134765625, "learning_rate": 3.359691059183761e-07, "loss": 0.0038, "reward": 1.0577785670757294, "reward_std": 0.8408964425325394, "rewards/cosine_scaled_reward": 0.07055594399571419, "rewards/format_reward": 0.9166666716337204, "step": 346 }, { "completion_length": 1337.7083892822266, "epoch": 0.3965714285714286, "grad_norm": 1.1808565855026245, "kl": 0.2160491943359375, "learning_rate": 3.3321084665422803e-07, "loss": 0.0086, "reward": 1.5314601063728333, "reward_std": 0.8420404642820358, "rewards/cosine_scaled_reward": 0.31781339878216386, "rewards/format_reward": 0.895833358168602, "step": 347 }, { "completion_length": 1250.9791870117188, "epoch": 0.3977142857142857, "grad_norm": 1.317704439163208, "kl": 0.178863525390625, "learning_rate": 3.3046315338757026e-07, "loss": 0.0072, "reward": 0.746107667684555, "reward_std": 0.5483127310872078, "rewards/cosine_scaled_reward": -0.10611284070182592, "rewards/format_reward": 0.9583333432674408, "step": 348 }, { "completion_length": 1193.000015258789, "epoch": 0.39885714285714285, "grad_norm": 1.4739476442337036, "kl": 0.249725341796875, "learning_rate": 3.2772616003709616e-07, "loss": 0.01, "reward": 1.2662739604711533, "reward_std": 0.7532782405614853, "rewards/cosine_scaled_reward": 0.18522025644779205, "rewards/format_reward": 0.895833358168602, "step": 349 }, { "completion_length": 1177.3541870117188, "epoch": 0.4, "grad_norm": 2.2546327114105225, "kl": 0.24993896484375, "learning_rate": 3.250000000000001e-07, "loss": 0.01, "reward": 1.0857526510953903, "reward_std": 0.6757391728460789, "rewards/cosine_scaled_reward": 0.09495963307563215, "rewards/format_reward": 0.8958333432674408, "step": 350 }, { "completion_length": 1196.0000457763672, "epoch": 0.40114285714285713, "grad_norm": 1.1820554733276367, "kl": 0.15570068359375, "learning_rate": 3.222848061454764e-07, "loss": 0.0062, "reward": 0.5796034894883633, "reward_std": 0.5287084579467773, "rewards/cosine_scaled_reward": -0.178948275744915, "rewards/format_reward": 0.9375000149011612, "step": 351 }, { "completion_length": 1270.5416870117188, "epoch": 0.4022857142857143, "grad_norm": 1.5620218515396118, "kl": 0.12603759765625, "learning_rate": 3.195807108082429e-07, "loss": 0.005, "reward": 0.7529177367687225, "reward_std": 0.6913501024246216, "rewards/cosine_scaled_reward": -0.09229113161563873, "rewards/format_reward": 0.9375000149011612, "step": 352 }, { "completion_length": 1443.1666870117188, "epoch": 0.4034285714285714, "grad_norm": 2.1429245471954346, "kl": 0.18780517578125, "learning_rate": 3.168878457820915e-07, "loss": 0.0075, "reward": 0.9008820652961731, "reward_std": 0.6463882178068161, "rewards/cosine_scaled_reward": -0.007892303168773651, "rewards/format_reward": 0.9166666716337204, "step": 353 }, { "completion_length": 1230.6458587646484, "epoch": 0.4045714285714286, "grad_norm": 0.9503077268600464, "kl": 0.2082061767578125, "learning_rate": 3.142063423134644e-07, "loss": 0.0083, "reward": 0.4887809455394745, "reward_std": 0.4611463025212288, "rewards/cosine_scaled_reward": -0.23477619886398315, "rewards/format_reward": 0.9583333432674408, "step": 354 }, { "completion_length": 930.0625305175781, "epoch": 0.4057142857142857, "grad_norm": 0.8462334275245667, "kl": 0.1320648193359375, "learning_rate": 3.115363310950578e-07, "loss": 0.0053, "reward": 0.8714643996208906, "reward_std": 0.35595114156603813, "rewards/cosine_scaled_reward": -0.022601131349802017, "rewards/format_reward": 0.9166666716337204, "step": 355 }, { "completion_length": 1521.2500610351562, "epoch": 0.40685714285714286, "grad_norm": 3.6062703132629395, "kl": 0.31671142578125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0126, "reward": 0.7512324824929237, "reward_std": 0.7443573772907257, "rewards/cosine_scaled_reward": -0.05146709643304348, "rewards/format_reward": 0.8541666865348816, "step": 356 }, { "completion_length": 1307.0625305175781, "epoch": 0.408, "grad_norm": 2.073002576828003, "kl": 0.2852783203125, "learning_rate": 3.062313053727671e-07, "loss": 0.0114, "reward": 0.6376607120037079, "reward_std": 0.5459994077682495, "rewards/cosine_scaled_reward": -0.13950299471616745, "rewards/format_reward": 0.9166666865348816, "step": 357 }, { "completion_length": 1298.2083587646484, "epoch": 0.40914285714285714, "grad_norm": 1.5467488765716553, "kl": 0.34130859375, "learning_rate": 3.0359654942835247e-07, "loss": 0.0136, "reward": 0.881379060447216, "reward_std": 0.8507445156574249, "rewards/cosine_scaled_reward": 0.003189507406204939, "rewards/format_reward": 0.875, "step": 358 }, { "completion_length": 1172.7916870117188, "epoch": 0.4102857142857143, "grad_norm": 1.671886682510376, "kl": 0.314453125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0126, "reward": 0.7981488406658173, "reward_std": 0.5063908323645592, "rewards/cosine_scaled_reward": -0.04884226247668266, "rewards/format_reward": 0.895833358168602, "step": 359 }, { "completion_length": 990.9792175292969, "epoch": 0.4114285714285714, "grad_norm": 2.374612808227539, "kl": 0.41845703125, "learning_rate": 2.9836319343816397e-07, "loss": 0.0168, "reward": 1.1945213824510574, "reward_std": 0.7698078602552414, "rewards/cosine_scaled_reward": 0.11809402331709862, "rewards/format_reward": 0.9583333432674408, "step": 360 }, { "completion_length": 1524.3541870117188, "epoch": 0.4125714285714286, "grad_norm": 3.4913723468780518, "kl": 1.1523895263671875, "learning_rate": 2.9576484845877793e-07, "loss": 0.0461, "reward": 0.6581477224826813, "reward_std": 0.6756026446819305, "rewards/cosine_scaled_reward": -0.05634281662059948, "rewards/format_reward": 0.7708333432674408, "step": 361 }, { "completion_length": 1185.2708587646484, "epoch": 0.4137142857142857, "grad_norm": 34.620460510253906, "kl": 0.832366943359375, "learning_rate": 2.931788945420058e-07, "loss": 0.0334, "reward": 0.8447651118040085, "reward_std": 0.7907865196466446, "rewards/cosine_scaled_reward": -0.046367482747882605, "rewards/format_reward": 0.9375000149011612, "step": 362 }, { "completion_length": 1141.0208892822266, "epoch": 0.41485714285714287, "grad_norm": 2.2585678100585938, "kl": 0.352935791015625, "learning_rate": 2.9060545772359305e-07, "loss": 0.0141, "reward": 0.5232757963240147, "reward_std": 0.5396992526948452, "rewards/cosine_scaled_reward": -0.1966954478994012, "rewards/format_reward": 0.9166666865348816, "step": 363 }, { "completion_length": 1359.1667175292969, "epoch": 0.416, "grad_norm": 3.5956501960754395, "kl": 0.715576171875, "learning_rate": 2.8804466342921987e-07, "loss": 0.0286, "reward": 0.6145341023802757, "reward_std": 0.5419690161943436, "rewards/cosine_scaled_reward": -0.17189963907003403, "rewards/format_reward": 0.9583333432674408, "step": 364 }, { "completion_length": 1670.229248046875, "epoch": 0.41714285714285715, "grad_norm": 3.5323920249938965, "kl": 1.1943359375, "learning_rate": 2.854966364683872e-07, "loss": 0.0479, "reward": 0.40414058696478605, "reward_std": 0.691382423043251, "rewards/cosine_scaled_reward": -0.17292970418930054, "rewards/format_reward": 0.7500000074505806, "step": 365 }, { "completion_length": 1720.5000305175781, "epoch": 0.41828571428571426, "grad_norm": 3.34942364692688, "kl": 1.042236328125, "learning_rate": 2.829615010283344e-07, "loss": 0.0417, "reward": 0.6996188908815384, "reward_std": 0.7247656881809235, "rewards/cosine_scaled_reward": -0.0460239015519619, "rewards/format_reward": 0.791666679084301, "step": 366 }, { "completion_length": 927.6458587646484, "epoch": 0.41942857142857143, "grad_norm": 1.461083173751831, "kl": 0.38287353515625, "learning_rate": 2.8043938066798645e-07, "loss": 0.0153, "reward": 1.2472570352256298, "reward_std": 0.6670772060751915, "rewards/cosine_scaled_reward": 0.14446185529232025, "rewards/format_reward": 0.9583333432674408, "step": 367 }, { "completion_length": 1301.9792175292969, "epoch": 0.4205714285714286, "grad_norm": 2.861830711364746, "kl": 0.337615966796875, "learning_rate": 2.7793039831193133e-07, "loss": 0.0135, "reward": 1.2516463994979858, "reward_std": 1.0124248266220093, "rewards/cosine_scaled_reward": 0.17790652811527252, "rewards/format_reward": 0.8958333432674408, "step": 368 }, { "completion_length": 1215.5625, "epoch": 0.4217142857142857, "grad_norm": 1.6978230476379395, "kl": 0.335235595703125, "learning_rate": 2.7543467624442956e-07, "loss": 0.0134, "reward": 1.1675912141799927, "reward_std": 0.5593391507863998, "rewards/cosine_scaled_reward": 0.13587890937924385, "rewards/format_reward": 0.8958333432674408, "step": 369 }, { "completion_length": 1309.5208740234375, "epoch": 0.4228571428571429, "grad_norm": 2.2963132858276367, "kl": 0.4232177734375, "learning_rate": 2.729523361034538e-07, "loss": 0.0169, "reward": 0.5871782079339027, "reward_std": 0.778529703617096, "rewards/cosine_scaled_reward": -0.14391089417040348, "rewards/format_reward": 0.8750000298023224, "step": 370 }, { "completion_length": 1090.1042022705078, "epoch": 0.424, "grad_norm": 0.9345715045928955, "kl": 0.28076171875, "learning_rate": 2.7048349887476037e-07, "loss": 0.0112, "reward": 0.6981654912233353, "reward_std": 0.6373357623815536, "rewards/cosine_scaled_reward": -0.09883392881602049, "rewards/format_reward": 0.8958333432674408, "step": 371 }, { "completion_length": 1309.9375610351562, "epoch": 0.42514285714285716, "grad_norm": 0.7092093825340271, "kl": 0.486602783203125, "learning_rate": 2.6802828488599294e-07, "loss": 0.0194, "reward": 0.8811748586595058, "reward_std": 0.5044156312942505, "rewards/cosine_scaled_reward": -0.007329270243644714, "rewards/format_reward": 0.8958333432674408, "step": 372 }, { "completion_length": 1460.7708587646484, "epoch": 0.42628571428571427, "grad_norm": 2.416278839111328, "kl": 0.5400390625, "learning_rate": 2.655868138008171e-07, "loss": 0.0216, "reward": 0.9694189727306366, "reward_std": 0.6003681719303131, "rewards/cosine_scaled_reward": 0.0472094789147377, "rewards/format_reward": 0.8750000149011612, "step": 373 }, { "completion_length": 1132.5208587646484, "epoch": 0.42742857142857144, "grad_norm": 1.0274022817611694, "kl": 0.29541015625, "learning_rate": 2.631592046130896e-07, "loss": 0.0118, "reward": 1.1347306370735168, "reward_std": 0.5300607345998287, "rewards/cosine_scaled_reward": 0.07778198271989822, "rewards/format_reward": 0.9791666716337204, "step": 374 }, { "completion_length": 1205.6042175292969, "epoch": 0.42857142857142855, "grad_norm": 1.4747833013534546, "kl": 0.336669921875, "learning_rate": 2.6074557564105724e-07, "loss": 0.0135, "reward": 0.914167582988739, "reward_std": 0.7538186013698578, "rewards/cosine_scaled_reward": -0.011666236445307732, "rewards/format_reward": 0.9375000149011612, "step": 375 }, { "completion_length": 1323.9583740234375, "epoch": 0.4297142857142857, "grad_norm": 1.5346781015396118, "kl": 0.40960693359375, "learning_rate": 2.583460445215911e-07, "loss": 0.0164, "reward": 0.9741204902529716, "reward_std": 0.8010425865650177, "rewards/cosine_scaled_reward": 0.018310231156647205, "rewards/format_reward": 0.9375000149011612, "step": 376 }, { "completion_length": 1173.8333892822266, "epoch": 0.4308571428571429, "grad_norm": 1.3860251903533936, "kl": 0.32177734375, "learning_rate": 2.5596072820445254e-07, "loss": 0.0129, "reward": 1.1005370616912842, "reward_std": 0.8687849044799805, "rewards/cosine_scaled_reward": 0.11276852712035179, "rewards/format_reward": 0.8750000149011612, "step": 377 }, { "completion_length": 1077.0208587646484, "epoch": 0.432, "grad_norm": 1.6222304105758667, "kl": 0.325439453125, "learning_rate": 2.5358974294659373e-07, "loss": 0.013, "reward": 0.7557893544435501, "reward_std": 0.46596937626600266, "rewards/cosine_scaled_reward": -0.12210530787706375, "rewards/format_reward": 1.0, "step": 378 }, { "completion_length": 1240.3333740234375, "epoch": 0.43314285714285716, "grad_norm": 1.0828309059143066, "kl": 0.356201171875, "learning_rate": 2.512332043064913e-07, "loss": 0.0142, "reward": 0.8545220792293549, "reward_std": 0.6498586684465408, "rewards/cosine_scaled_reward": -0.05190563574433327, "rewards/format_reward": 0.9583333432674408, "step": 379 }, { "completion_length": 1270.6667175292969, "epoch": 0.4342857142857143, "grad_norm": 1.8103889226913452, "kl": 0.383026123046875, "learning_rate": 2.488912271385139e-07, "loss": 0.0153, "reward": 0.6777837425470352, "reward_std": 0.7819753885269165, "rewards/cosine_scaled_reward": -0.08819146640598774, "rewards/format_reward": 0.8541666716337204, "step": 380 }, { "completion_length": 1318.7292175292969, "epoch": 0.43542857142857144, "grad_norm": 1.7676132917404175, "kl": 0.482666015625, "learning_rate": 2.465639255873246e-07, "loss": 0.0193, "reward": 0.7255931571125984, "reward_std": 0.6526116281747818, "rewards/cosine_scaled_reward": -0.10595342982560396, "rewards/format_reward": 0.9375000149011612, "step": 381 }, { "completion_length": 1141.7917022705078, "epoch": 0.43657142857142855, "grad_norm": 1.832429051399231, "kl": 0.3201904296875, "learning_rate": 2.4425141308231765e-07, "loss": 0.0128, "reward": 0.7391386441886425, "reward_std": 0.7291494160890579, "rewards/cosine_scaled_reward": -0.07834736630320549, "rewards/format_reward": 0.8958333432674408, "step": 382 }, { "completion_length": 1615.541732788086, "epoch": 0.4377142857142857, "grad_norm": 2.4247608184814453, "kl": 1.1395721435546875, "learning_rate": 2.4195380233209006e-07, "loss": 0.0456, "reward": 0.8856848031282425, "reward_std": 0.5492001250386238, "rewards/cosine_scaled_reward": 0.005342394113540649, "rewards/format_reward": 0.8750000149011612, "step": 383 }, { "completion_length": 1421.0833435058594, "epoch": 0.43885714285714283, "grad_norm": 1.442662000656128, "kl": 0.374847412109375, "learning_rate": 2.3967120531894857e-07, "loss": 0.015, "reward": 0.32889158837497234, "reward_std": 0.4332951605319977, "rewards/cosine_scaled_reward": -0.26263754442334175, "rewards/format_reward": 0.8541666865348816, "step": 384 }, { "completion_length": 1334.5208435058594, "epoch": 0.44, "grad_norm": 1.4730134010314941, "kl": 0.422088623046875, "learning_rate": 2.374037332934512e-07, "loss": 0.0169, "reward": 0.8442478328943253, "reward_std": 0.6724153012037277, "rewards/cosine_scaled_reward": -0.05704276263713837, "rewards/format_reward": 0.9583333432674408, "step": 385 }, { "completion_length": 1111.4583740234375, "epoch": 0.44114285714285717, "grad_norm": 1.538129448890686, "kl": 0.181640625, "learning_rate": 2.3515149676898552e-07, "loss": 0.0073, "reward": 0.8161994330585003, "reward_std": 0.4277452155947685, "rewards/cosine_scaled_reward": -0.06065032631158829, "rewards/format_reward": 0.9375000149011612, "step": 386 }, { "completion_length": 1218.0208435058594, "epoch": 0.4422857142857143, "grad_norm": 1.626770257949829, "kl": 0.309906005859375, "learning_rate": 2.3291460551638237e-07, "loss": 0.0124, "reward": 0.6058969795703888, "reward_std": 0.7323561906814575, "rewards/cosine_scaled_reward": -0.1658015362918377, "rewards/format_reward": 0.9375000149011612, "step": 387 }, { "completion_length": 1258.187515258789, "epoch": 0.44342857142857145, "grad_norm": 1.7491554021835327, "kl": 0.3914794921875, "learning_rate": 2.306931685585657e-07, "loss": 0.0156, "reward": 0.7627854868769646, "reward_std": 0.7414772808551788, "rewards/cosine_scaled_reward": -0.05610728543251753, "rewards/format_reward": 0.8750000149011612, "step": 388 }, { "completion_length": 1268.4375305175781, "epoch": 0.44457142857142856, "grad_norm": 1.6758959293365479, "kl": 0.440826416015625, "learning_rate": 2.2848729416523859e-07, "loss": 0.0176, "reward": 1.1525626480579376, "reward_std": 0.7809968441724777, "rewards/cosine_scaled_reward": 0.12836465798318386, "rewards/format_reward": 0.895833358168602, "step": 389 }, { "completion_length": 940.9166870117188, "epoch": 0.44571428571428573, "grad_norm": 1.7198045253753662, "kl": 0.3330078125, "learning_rate": 2.2629708984760706e-07, "loss": 0.0133, "reward": 0.9264688044786453, "reward_std": 0.6931325197219849, "rewards/cosine_scaled_reward": -0.03676560753956437, "rewards/format_reward": 1.0, "step": 390 }, { "completion_length": 978.7291717529297, "epoch": 0.44685714285714284, "grad_norm": 0.6671801805496216, "kl": 0.029083251953125, "learning_rate": 2.2412266235313973e-07, "loss": 0.0012, "reward": 0.6134733706712723, "reward_std": 0.4728550612926483, "rewards/cosine_scaled_reward": -0.1932633202522993, "rewards/format_reward": 1.0, "step": 391 }, { "completion_length": 1309.5417175292969, "epoch": 0.448, "grad_norm": 2.3814332485198975, "kl": 0.251953125, "learning_rate": 2.2196411766036487e-07, "loss": 0.0101, "reward": 1.0050799548625946, "reward_std": 0.8785246461629868, "rewards/cosine_scaled_reward": 0.07545664254575968, "rewards/format_reward": 0.8541666865348816, "step": 392 }, { "completion_length": 1097.3125305175781, "epoch": 0.4491428571428571, "grad_norm": 1.722952961921692, "kl": 0.2423095703125, "learning_rate": 2.1982156097370557e-07, "loss": 0.0097, "reward": 0.832743689417839, "reward_std": 0.8567025661468506, "rewards/cosine_scaled_reward": -0.0419615093851462, "rewards/format_reward": 0.9166666865348816, "step": 393 }, { "completion_length": 1223.0833740234375, "epoch": 0.4502857142857143, "grad_norm": 2.990572452545166, "kl": 0.4295654296875, "learning_rate": 2.1769509671835223e-07, "loss": 0.0172, "reward": 0.5410304628312588, "reward_std": 0.6248824968934059, "rewards/cosine_scaled_reward": -0.13573478162288666, "rewards/format_reward": 0.8125000149011612, "step": 394 }, { "completion_length": 1189.500015258789, "epoch": 0.4514285714285714, "grad_norm": 2.629859685897827, "kl": 0.321044921875, "learning_rate": 2.1558482853517253e-07, "loss": 0.0128, "reward": 0.5656278505921364, "reward_std": 0.7980157136917114, "rewards/cosine_scaled_reward": -0.1130194254219532, "rewards/format_reward": 0.7916667014360428, "step": 395 }, { "completion_length": 1043.1458435058594, "epoch": 0.45257142857142857, "grad_norm": 1.0702786445617676, "kl": 0.0563507080078125, "learning_rate": 2.134908592756607e-07, "loss": 0.0023, "reward": 0.7221028283238411, "reward_std": 0.5050450935959816, "rewards/cosine_scaled_reward": -0.11811527609825134, "rewards/format_reward": 0.9583333432674408, "step": 396 }, { "completion_length": 1230.7292175292969, "epoch": 0.45371428571428574, "grad_norm": 3.2597548961639404, "kl": 0.392822265625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0157, "reward": 1.312897451221943, "reward_std": 0.6480658948421478, "rewards/cosine_scaled_reward": 0.2293653730303049, "rewards/format_reward": 0.8541667014360428, "step": 397 }, { "completion_length": 1381.5625305175781, "epoch": 0.45485714285714285, "grad_norm": 1.6307339668273926, "kl": 0.20714569091796875, "learning_rate": 2.0935222495670968e-07, "loss": 0.0083, "reward": 0.7576578855514526, "reward_std": 0.7408057749271393, "rewards/cosine_scaled_reward": -0.10033774503972381, "rewards/format_reward": 0.9583333432674408, "step": 398 }, { "completion_length": 1487.0208740234375, "epoch": 0.456, "grad_norm": 2.9457855224609375, "kl": 0.426025390625, "learning_rate": 2.0730776160846853e-07, "loss": 0.0171, "reward": 0.6220748424530029, "reward_std": 0.5146789476275444, "rewards/cosine_scaled_reward": -0.09521258249878883, "rewards/format_reward": 0.8125000149011612, "step": 399 }, { "completion_length": 1041.7708587646484, "epoch": 0.45714285714285713, "grad_norm": 5.961241722106934, "kl": 0.289337158203125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0116, "reward": 0.8267193324863911, "reward_std": 0.6178570240736008, "rewards/cosine_scaled_reward": -0.04497369006276131, "rewards/format_reward": 0.9166666716337204, "step": 400 }, { "completion_length": 965.6250305175781, "epoch": 0.4582857142857143, "grad_norm": 1.4047675132751465, "kl": 0.1659698486328125, "learning_rate": 2.032690407508949e-07, "loss": 0.0066, "reward": 0.982437789440155, "reward_std": 0.7459337636828423, "rewards/cosine_scaled_reward": 0.0537188770249486, "rewards/format_reward": 0.8750000149011612, "step": 401 }, { "completion_length": 1609.729248046875, "epoch": 0.4594285714285714, "grad_norm": 2.311453342437744, "kl": 0.615234375, "learning_rate": 2.0127498008311922e-07, "loss": 0.0246, "reward": 0.3766894303262234, "reward_std": 0.658165842294693, "rewards/cosine_scaled_reward": -0.2387386392802, "rewards/format_reward": 0.8541666865348816, "step": 402 }, { "completion_length": 1466.9375610351562, "epoch": 0.4605714285714286, "grad_norm": 2.0651133060455322, "kl": 0.7806396484375, "learning_rate": 1.9929791578083655e-07, "loss": 0.0312, "reward": 0.5563420876860619, "reward_std": 0.6794729232788086, "rewards/cosine_scaled_reward": -0.1489123017527163, "rewards/format_reward": 0.8541666865348816, "step": 403 }, { "completion_length": 1289.2708740234375, "epoch": 0.4617142857142857, "grad_norm": 2.5164127349853516, "kl": 0.3046875, "learning_rate": 1.9733794420337213e-07, "loss": 0.0122, "reward": 0.8672967702150345, "reward_std": 0.7338433116674423, "rewards/cosine_scaled_reward": -0.03510164050385356, "rewards/format_reward": 0.9375000149011612, "step": 404 }, { "completion_length": 1246.5833587646484, "epoch": 0.46285714285714286, "grad_norm": 1.7397247552871704, "kl": 0.14056396484375, "learning_rate": 1.9539516087697517e-07, "loss": 0.0056, "reward": 0.7972104996442795, "reward_std": 0.6757538244128227, "rewards/cosine_scaled_reward": 0.0027718953788280487, "rewards/format_reward": 0.7916667014360428, "step": 405 }, { "completion_length": 1569.5209045410156, "epoch": 0.464, "grad_norm": 3.1697490215301514, "kl": 1.05712890625, "learning_rate": 1.934696604901642e-07, "loss": 0.0423, "reward": 0.7342821173369884, "reward_std": 0.6933150887489319, "rewards/cosine_scaled_reward": -0.05994228646159172, "rewards/format_reward": 0.8541666716337204, "step": 406 }, { "completion_length": 970.0625457763672, "epoch": 0.46514285714285714, "grad_norm": 2.7586686611175537, "kl": 0.496337890625, "learning_rate": 1.915615368891117e-07, "loss": 0.0198, "reward": 0.7943353094160557, "reward_std": 0.5273813158273697, "rewards/cosine_scaled_reward": -0.07158234342932701, "rewards/format_reward": 0.9375000149011612, "step": 407 }, { "completion_length": 954.7083435058594, "epoch": 0.4662857142857143, "grad_norm": 1.7943812608718872, "kl": 0.168701171875, "learning_rate": 1.8967088307307e-07, "loss": 0.0067, "reward": 0.9154380261898041, "reward_std": 0.63597172498703, "rewards/cosine_scaled_reward": -0.02144765853881836, "rewards/format_reward": 0.9583333432674408, "step": 408 }, { "completion_length": 1268.2291717529297, "epoch": 0.4674285714285714, "grad_norm": 0.9967200756072998, "kl": 0.53662109375, "learning_rate": 1.8779779118983867e-07, "loss": 0.0215, "reward": 0.9619596749544144, "reward_std": 0.6098195463418961, "rewards/cosine_scaled_reward": 0.022646483033895493, "rewards/format_reward": 0.9166666865348816, "step": 409 }, { "completion_length": 1628.0000610351562, "epoch": 0.4685714285714286, "grad_norm": 5.845465660095215, "kl": 0.8017578125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0321, "reward": 0.6380558162927628, "reward_std": 0.5308569446206093, "rewards/cosine_scaled_reward": -0.14972211251733825, "rewards/format_reward": 0.9375000149011612, "step": 410 }, { "completion_length": 1665.7083740234375, "epoch": 0.4697142857142857, "grad_norm": 1.869497537612915, "kl": 0.796875, "learning_rate": 1.8410465752883758e-07, "loss": 0.0319, "reward": 0.31100673973560333, "reward_std": 0.5528412610292435, "rewards/cosine_scaled_reward": -0.27157998457551, "rewards/format_reward": 0.8541666865348816, "step": 411 }, { "completion_length": 1114.604232788086, "epoch": 0.47085714285714286, "grad_norm": 1.987026333808899, "kl": 0.284912109375, "learning_rate": 1.822847957491922e-07, "loss": 0.0114, "reward": 0.7781837359070778, "reward_std": 0.5904239565134048, "rewards/cosine_scaled_reward": -0.09007478877902031, "rewards/format_reward": 0.9583333432674408, "step": 412 }, { "completion_length": 1371.5417175292969, "epoch": 0.472, "grad_norm": 2.023742198944092, "kl": 0.481689453125, "learning_rate": 1.804828558898332e-07, "loss": 0.0193, "reward": 0.7588743381202221, "reward_std": 0.7781134992837906, "rewards/cosine_scaled_reward": -0.06847950583323836, "rewards/format_reward": 0.8958333432674408, "step": 413 }, { "completion_length": 1710.7292175292969, "epoch": 0.47314285714285714, "grad_norm": 3.7236738204956055, "kl": 1.0467529296875, "learning_rate": 1.7869892577476722e-07, "loss": 0.042, "reward": 0.6331272795796394, "reward_std": 0.7422880977392197, "rewards/cosine_scaled_reward": -0.06885303813032806, "rewards/format_reward": 0.770833358168602, "step": 414 }, { "completion_length": 1409.2708740234375, "epoch": 0.4742857142857143, "grad_norm": 2.454448938369751, "kl": 0.506103515625, "learning_rate": 1.7693309235023127e-07, "loss": 0.0202, "reward": 0.9847299754619598, "reward_std": 0.6669813543558121, "rewards/cosine_scaled_reward": 0.023614969104528427, "rewards/format_reward": 0.9375000149011612, "step": 415 }, { "completion_length": 1729.4791870117188, "epoch": 0.4754285714285714, "grad_norm": 2.803572416305542, "kl": 0.96875, "learning_rate": 1.7518544168045524e-07, "loss": 0.0387, "reward": 0.4970005638897419, "reward_std": 0.8149459362030029, "rewards/cosine_scaled_reward": -0.10566640645265579, "rewards/format_reward": 0.7083333432674408, "step": 416 }, { "completion_length": 1810.1875457763672, "epoch": 0.4765714285714286, "grad_norm": 5.9517717361450195, "kl": 1.576904296875, "learning_rate": 1.7345605894346726e-07, "loss": 0.0632, "reward": 0.5241810567677021, "reward_std": 0.64292823523283, "rewards/cosine_scaled_reward": -0.08165947627276182, "rewards/format_reward": 0.6875000074505806, "step": 417 }, { "completion_length": 1492.8958740234375, "epoch": 0.4777142857142857, "grad_norm": 4.798679828643799, "kl": 0.8544921875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0342, "reward": 0.4330623224377632, "reward_std": 0.5981302931904793, "rewards/cosine_scaled_reward": -0.22096885181963444, "rewards/format_reward": 0.8750000149011612, "step": 418 }, { "completion_length": 1555.2708435058594, "epoch": 0.47885714285714287, "grad_norm": 4.279539585113525, "kl": 0.86328125, "learning_rate": 1.7005243352409333e-07, "loss": 0.0346, "reward": 0.5117529258131981, "reward_std": 0.6584911718964577, "rewards/cosine_scaled_reward": -0.1399568784981966, "rewards/format_reward": 0.7916666865348816, "step": 419 }, { "completion_length": 1216.3542175292969, "epoch": 0.48, "grad_norm": 2.101778030395508, "kl": 0.41534423828125, "learning_rate": 1.6837835672960831e-07, "loss": 0.0167, "reward": 0.896861981600523, "reward_std": 0.856042891740799, "rewards/cosine_scaled_reward": 0.0005143135786056519, "rewards/format_reward": 0.8958333432674408, "step": 420 }, { "completion_length": 1162.3750457763672, "epoch": 0.48114285714285715, "grad_norm": 2.8854591846466064, "kl": 0.33807373046875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0135, "reward": 0.8688920065760612, "reward_std": 0.7225290387868881, "rewards/cosine_scaled_reward": -0.013470660895109177, "rewards/format_reward": 0.895833358168602, "step": 421 }, { "completion_length": 1370.8541870117188, "epoch": 0.48228571428571426, "grad_norm": 1.7124054431915283, "kl": 0.51220703125, "learning_rate": 1.6508608292777203e-07, "loss": 0.0205, "reward": 0.7653668001294136, "reward_std": 0.7626539617776871, "rewards/cosine_scaled_reward": -0.07564992923289537, "rewards/format_reward": 0.9166666865348816, "step": 422 }, { "completion_length": 1183.708396911621, "epoch": 0.48342857142857143, "grad_norm": 1.421221137046814, "kl": 0.293243408203125, "learning_rate": 1.6346804638120098e-07, "loss": 0.0117, "reward": 0.9164831042289734, "reward_std": 0.533292543143034, "rewards/cosine_scaled_reward": -0.01050846092402935, "rewards/format_reward": 0.9375000149011612, "step": 423 }, { "completion_length": 1333.6250305175781, "epoch": 0.4845714285714286, "grad_norm": 2.020719289779663, "kl": 0.547119140625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0219, "reward": 0.5903327092528343, "reward_std": 0.5696832239627838, "rewards/cosine_scaled_reward": -0.11108366213738918, "rewards/format_reward": 0.8125000298023224, "step": 424 }, { "completion_length": 1426.3542175292969, "epoch": 0.4857142857142857, "grad_norm": 2.2707631587982178, "kl": 0.79052734375, "learning_rate": 1.6028856829700258e-07, "loss": 0.0316, "reward": 0.5219599902629852, "reward_std": 0.5760443955659866, "rewards/cosine_scaled_reward": -0.14527002349495888, "rewards/format_reward": 0.8125000149011612, "step": 425 }, { "completion_length": 1259.3333740234375, "epoch": 0.4868571428571429, "grad_norm": 2.267915725708008, "kl": 0.395263671875, "learning_rate": 1.5872728172265146e-07, "loss": 0.0158, "reward": 0.8012158274650574, "reward_std": 0.667873740196228, "rewards/cosine_scaled_reward": -0.06814211048185825, "rewards/format_reward": 0.9375000149011612, "step": 426 }, { "completion_length": 1615.0416870117188, "epoch": 0.488, "grad_norm": 2.34485125541687, "kl": 0.85791015625, "learning_rate": 1.5718506522858572e-07, "loss": 0.0343, "reward": 0.7913745269179344, "reward_std": 0.7925111949443817, "rewards/cosine_scaled_reward": -0.00014608167111873627, "rewards/format_reward": 0.7916666865348816, "step": 427 }, { "completion_length": 1747.8333740234375, "epoch": 0.48914285714285716, "grad_norm": 2.9694297313690186, "kl": 0.9736328125, "learning_rate": 1.5566199398026147e-07, "loss": 0.0389, "reward": 0.55228191614151, "reward_std": 0.6319544315338135, "rewards/cosine_scaled_reward": -0.1509423702955246, "rewards/format_reward": 0.8541666865348816, "step": 428 }, { "completion_length": 1631.5417175292969, "epoch": 0.49028571428571427, "grad_norm": 3.960716724395752, "kl": 1.03509521484375, "learning_rate": 1.5415814221002265e-07, "loss": 0.0414, "reward": 0.8934581205248833, "reward_std": 0.5868038833141327, "rewards/cosine_scaled_reward": 0.03006240352988243, "rewards/format_reward": 0.833333358168602, "step": 429 }, { "completion_length": 1621.4375610351562, "epoch": 0.49142857142857144, "grad_norm": 8.593777656555176, "kl": 1.48388671875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0594, "reward": 0.48539859987795353, "reward_std": 0.7700510919094086, "rewards/cosine_scaled_reward": -0.12188405683264136, "rewards/format_reward": 0.7291666716337204, "step": 430 }, { "completion_length": 1201.0208587646484, "epoch": 0.49257142857142855, "grad_norm": 1.7287312746047974, "kl": 0.67041015625, "learning_rate": 1.5120838934595337e-07, "loss": 0.0268, "reward": 0.9445644542574883, "reward_std": 0.5076880529522896, "rewards/cosine_scaled_reward": 0.0035321786999702454, "rewards/format_reward": 0.9375000149011612, "step": 431 }, { "completion_length": 1444.2084045410156, "epoch": 0.4937142857142857, "grad_norm": 3.417004346847534, "kl": 0.6171875, "learning_rate": 1.4976263201891613e-07, "loss": 0.0247, "reward": 0.6378493383526802, "reward_std": 0.7811735272407532, "rewards/cosine_scaled_reward": -0.09774199151434004, "rewards/format_reward": 0.8333333432674408, "step": 432 }, { "completion_length": 1373.5417175292969, "epoch": 0.4948571428571429, "grad_norm": 1.7710994482040405, "kl": 0.712890625, "learning_rate": 1.483363816965435e-07, "loss": 0.0286, "reward": 0.6063527911901474, "reward_std": 0.6604797914624214, "rewards/cosine_scaled_reward": -0.144740279763937, "rewards/format_reward": 0.8958333432674408, "step": 433 }, { "completion_length": 977.5208435058594, "epoch": 0.496, "grad_norm": 1.736846685409546, "kl": 0.294708251953125, "learning_rate": 1.469297078922642e-07, "loss": 0.0118, "reward": 1.2148061096668243, "reward_std": 0.5557400360703468, "rewards/cosine_scaled_reward": 0.11781970039010048, "rewards/format_reward": 0.9791666716337204, "step": 434 }, { "completion_length": 1433.8542175292969, "epoch": 0.49714285714285716, "grad_norm": 2.2994189262390137, "kl": 1.136810302734375, "learning_rate": 1.4554267916537495e-07, "loss": 0.0454, "reward": 0.4324219524860382, "reward_std": 0.8103198558092117, "rewards/cosine_scaled_reward": -0.1900390349328518, "rewards/format_reward": 0.8125000149011612, "step": 435 }, { "completion_length": 1499.3750305175781, "epoch": 0.4982857142857143, "grad_norm": 2.9299046993255615, "kl": 0.6876220703125, "learning_rate": 1.4417536311769885e-07, "loss": 0.0275, "reward": 0.6081612259149551, "reward_std": 0.8525863587856293, "rewards/cosine_scaled_reward": -0.1334194028750062, "rewards/format_reward": 0.8750000149011612, "step": 436 }, { "completion_length": 1131.1458892822266, "epoch": 0.49942857142857144, "grad_norm": 1.970673680305481, "kl": 0.24951171875, "learning_rate": 1.4282782639029128e-07, "loss": 0.01, "reward": 0.8882552683353424, "reward_std": 0.7682344913482666, "rewards/cosine_scaled_reward": -0.02462236536666751, "rewards/format_reward": 0.9375000149011612, "step": 437 }, { "completion_length": 1025.5208740234375, "epoch": 0.5005714285714286, "grad_norm": 1.1531387567520142, "kl": 0.20703125, "learning_rate": 1.4150013466019114e-07, "loss": 0.0083, "reward": 0.9214688986539841, "reward_std": 0.6478733271360397, "rewards/cosine_scaled_reward": -0.03926557023078203, "rewards/format_reward": 1.0, "step": 438 }, { "completion_length": 1004.7708740234375, "epoch": 0.5017142857142857, "grad_norm": 1.5415436029434204, "kl": 0.22320556640625, "learning_rate": 1.4019235263722034e-07, "loss": 0.0089, "reward": 0.6858842521905899, "reward_std": 0.581585705280304, "rewards/cosine_scaled_reward": -0.1466412227600813, "rewards/format_reward": 0.9791666716337204, "step": 439 }, { "completion_length": 1667.5000610351562, "epoch": 0.5028571428571429, "grad_norm": 1.8554582595825195, "kl": 0.833984375, "learning_rate": 1.3890454406082956e-07, "loss": 0.0334, "reward": 0.6555005759000778, "reward_std": 0.8219043761491776, "rewards/cosine_scaled_reward": -0.0784997058508452, "rewards/format_reward": 0.8125000298023224, "step": 440 }, { "completion_length": 1507.8542175292969, "epoch": 0.504, "grad_norm": 3.88932466506958, "kl": 0.70849609375, "learning_rate": 1.3763677169699217e-07, "loss": 0.0284, "reward": 0.7302692234516144, "reward_std": 0.6611650586128235, "rewards/cosine_scaled_reward": -0.07236538827419281, "rewards/format_reward": 0.8750000298023224, "step": 441 }, { "completion_length": 1471.1250305175781, "epoch": 0.5051428571428571, "grad_norm": 3.0517144203186035, "kl": 0.61572265625, "learning_rate": 1.3638909733514452e-07, "loss": 0.0247, "reward": 0.6592699624598026, "reward_std": 0.7461230009794235, "rewards/cosine_scaled_reward": -0.11828170018270612, "rewards/format_reward": 0.8958333432674408, "step": 442 }, { "completion_length": 1453.1458740234375, "epoch": 0.5062857142857143, "grad_norm": 2.053025722503662, "kl": 0.724609375, "learning_rate": 1.351615817851748e-07, "loss": 0.029, "reward": 0.5722346976399422, "reward_std": 0.643949382007122, "rewards/cosine_scaled_reward": -0.1305493265390396, "rewards/format_reward": 0.833333358168602, "step": 443 }, { "completion_length": 1375.6875305175781, "epoch": 0.5074285714285715, "grad_norm": 1.6717604398727417, "kl": 0.594390869140625, "learning_rate": 1.3395428487445914e-07, "loss": 0.0237, "reward": 1.0815926790237427, "reward_std": 0.7383088618516922, "rewards/cosine_scaled_reward": 0.08246299810707569, "rewards/format_reward": 0.9166666865348816, "step": 444 }, { "completion_length": 1785.2708740234375, "epoch": 0.5085714285714286, "grad_norm": 3.6348721981048584, "kl": 1.021484375, "learning_rate": 1.3276726544494571e-07, "loss": 0.0408, "reward": 0.6278277039527893, "reward_std": 0.6637818515300751, "rewards/cosine_scaled_reward": -0.10275283083319664, "rewards/format_reward": 0.8333333432674408, "step": 445 }, { "completion_length": 1080.1458587646484, "epoch": 0.5097142857142857, "grad_norm": 1.7892571687698364, "kl": 0.1240234375, "learning_rate": 1.316005813502869e-07, "loss": 0.005, "reward": 0.8095807060599327, "reward_std": 0.6449108496308327, "rewards/cosine_scaled_reward": -0.053542979061603546, "rewards/format_reward": 0.9166666865348816, "step": 446 }, { "completion_length": 1139.9166870117188, "epoch": 0.5108571428571429, "grad_norm": 1.82309091091156, "kl": 0.305145263671875, "learning_rate": 1.3045428945301953e-07, "loss": 0.0122, "reward": 0.9205853343009949, "reward_std": 0.7828942686319351, "rewards/cosine_scaled_reward": -0.02929066913202405, "rewards/format_reward": 0.9791666716337204, "step": 447 }, { "completion_length": 1278.1250305175781, "epoch": 0.512, "grad_norm": 2.259813070297241, "kl": 0.251708984375, "learning_rate": 1.2932844562179352e-07, "loss": 0.0101, "reward": 0.8140432685613632, "reward_std": 0.658391922712326, "rewards/cosine_scaled_reward": -0.05131170805543661, "rewards/format_reward": 0.9166666865348816, "step": 448 }, { "completion_length": 1173.3125305175781, "epoch": 0.5131428571428571, "grad_norm": 1.361348032951355, "kl": 0.3782958984375, "learning_rate": 1.2822310472864885e-07, "loss": 0.0151, "reward": 0.9565356224775314, "reward_std": 0.7290000915527344, "rewards/cosine_scaled_reward": 0.019934438169002533, "rewards/format_reward": 0.9166666716337204, "step": 449 }, { "completion_length": 1371.1667175292969, "epoch": 0.5142857142857142, "grad_norm": 3.834918737411499, "kl": 0.513427734375, "learning_rate": 1.2713832064634125e-07, "loss": 0.0205, "reward": 0.7457224242389202, "reward_std": 0.5297245979309082, "rewards/cosine_scaled_reward": -0.04380548745393753, "rewards/format_reward": 0.8333333432674408, "step": 450 }, { "completion_length": 1255.6250305175781, "epoch": 0.5154285714285715, "grad_norm": 2.335192918777466, "kl": 0.6416015625, "learning_rate": 1.260741462457165e-07, "loss": 0.0257, "reward": 1.0948645919561386, "reward_std": 0.7944803088903427, "rewards/cosine_scaled_reward": 0.10993227222934365, "rewards/format_reward": 0.8750000298023224, "step": 451 }, { "completion_length": 1190.6666870117188, "epoch": 0.5165714285714286, "grad_norm": 1.7608739137649536, "kl": 0.500244140625, "learning_rate": 1.2503063339313356e-07, "loss": 0.02, "reward": 0.5270581915974617, "reward_std": 0.6462119966745377, "rewards/cosine_scaled_reward": -0.1843875776976347, "rewards/format_reward": 0.895833358168602, "step": 452 }, { "completion_length": 1610.3125610351562, "epoch": 0.5177142857142857, "grad_norm": 2.1905384063720703, "kl": 0.96484375, "learning_rate": 1.2400783294793668e-07, "loss": 0.0386, "reward": 0.5264739021658897, "reward_std": 0.707670621573925, "rewards/cosine_scaled_reward": -0.1325964080169797, "rewards/format_reward": 0.7916666865348816, "step": 453 }, { "completion_length": 1178.0000305175781, "epoch": 0.5188571428571429, "grad_norm": 1.7162837982177734, "kl": 0.55322265625, "learning_rate": 1.2300579475997657e-07, "loss": 0.0221, "reward": 0.7971140295267105, "reward_std": 0.5913056135177612, "rewards/cosine_scaled_reward": -0.059776326175779104, "rewards/format_reward": 0.9166666865348816, "step": 454 }, { "completion_length": 1480.6250305175781, "epoch": 0.52, "grad_norm": 4.159846305847168, "kl": 1.05126953125, "learning_rate": 1.220245676671809e-07, "loss": 0.0421, "reward": 0.6366595476865768, "reward_std": 0.44621995836496353, "rewards/cosine_scaled_reward": -0.11917022056877613, "rewards/format_reward": 0.8750000149011612, "step": 455 }, { "completion_length": 1499.1458740234375, "epoch": 0.5211428571428571, "grad_norm": 3.277935743331909, "kl": 0.708251953125, "learning_rate": 1.2106419949317388e-07, "loss": 0.0284, "reward": 0.7240877486765385, "reward_std": 0.5471076965332031, "rewards/cosine_scaled_reward": -0.05462279508356005, "rewards/format_reward": 0.8333333432674408, "step": 456 }, { "completion_length": 1379.7292175292969, "epoch": 0.5222857142857142, "grad_norm": 2.5969083309173584, "kl": 0.6796875, "learning_rate": 1.2012473704494537e-07, "loss": 0.0272, "reward": 0.869850842282176, "reward_std": 0.700515478849411, "rewards/cosine_scaled_reward": 0.02867540717124939, "rewards/format_reward": 0.8125000298023224, "step": 457 }, { "completion_length": 1159.9791870117188, "epoch": 0.5234285714285715, "grad_norm": 2.355196952819824, "kl": 0.31744384765625, "learning_rate": 1.1920622611056974e-07, "loss": 0.0127, "reward": 0.923922210931778, "reward_std": 0.5514720380306244, "rewards/cosine_scaled_reward": -0.017205584794282913, "rewards/format_reward": 0.9583333432674408, "step": 458 }, { "completion_length": 1273.5833892822266, "epoch": 0.5245714285714286, "grad_norm": 4.148768901824951, "kl": 0.59765625, "learning_rate": 1.1830871145697412e-07, "loss": 0.0239, "reward": 0.574098750948906, "reward_std": 0.7631915658712387, "rewards/cosine_scaled_reward": -0.16086730360984802, "rewards/format_reward": 0.895833358168602, "step": 459 }, { "completion_length": 1453.9792175292969, "epoch": 0.5257142857142857, "grad_norm": 3.0608623027801514, "kl": 0.8349609375, "learning_rate": 1.1743223682775649e-07, "loss": 0.0335, "reward": 1.0540032014250755, "reward_std": 0.7098206132650375, "rewards/cosine_scaled_reward": 0.07908494677394629, "rewards/format_reward": 0.8958333432674408, "step": 460 }, { "completion_length": 1280.1667175292969, "epoch": 0.5268571428571428, "grad_norm": 1.6031488180160522, "kl": 0.51483154296875, "learning_rate": 1.1657684494105386e-07, "loss": 0.0206, "reward": 1.0345441699028015, "reward_std": 0.464998334646225, "rewards/cosine_scaled_reward": 0.05893874540925026, "rewards/format_reward": 0.9166666865348816, "step": 461 }, { "completion_length": 1348.2083740234375, "epoch": 0.528, "grad_norm": 2.087766647338867, "kl": 0.288330078125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0115, "reward": 0.5173667185008526, "reward_std": 0.5679958164691925, "rewards/cosine_scaled_reward": -0.18923332425765693, "rewards/format_reward": 0.8958333432674408, "step": 462 }, { "completion_length": 1184.8333892822266, "epoch": 0.5291428571428571, "grad_norm": 2.3618013858795166, "kl": 0.67828369140625, "learning_rate": 1.1492947512799328e-07, "loss": 0.027, "reward": 0.8587629348039627, "reward_std": 0.5902151763439178, "rewards/cosine_scaled_reward": 0.002298124134540558, "rewards/format_reward": 0.8541666865348816, "step": 463 }, { "completion_length": 1381.3750610351562, "epoch": 0.5302857142857142, "grad_norm": 2.5598762035369873, "kl": 0.4290771484375, "learning_rate": 1.1413757749211602e-07, "loss": 0.0172, "reward": 0.8569861799478531, "reward_std": 0.5644854605197906, "rewards/cosine_scaled_reward": -0.04025692865252495, "rewards/format_reward": 0.9375000149011612, "step": 464 }, { "completion_length": 1754.0000610351562, "epoch": 0.5314285714285715, "grad_norm": 3.649667739868164, "kl": 1.2236328125, "learning_rate": 1.1336692317580158e-07, "loss": 0.049, "reward": 0.6691107526421547, "reward_std": 0.7533622533082962, "rewards/cosine_scaled_reward": -0.050861308351159096, "rewards/format_reward": 0.770833358168602, "step": 465 }, { "completion_length": 1626.979248046875, "epoch": 0.5325714285714286, "grad_norm": 4.014570713043213, "kl": 1.1494140625, "learning_rate": 1.1261754973965422e-07, "loss": 0.046, "reward": 0.39536508079618216, "reward_std": 0.5428843200206757, "rewards/cosine_scaled_reward": -0.18773413076996803, "rewards/format_reward": 0.7708333432674408, "step": 466 }, { "completion_length": 1410.1875305175781, "epoch": 0.5337142857142857, "grad_norm": 6.032597064971924, "kl": 0.89404296875, "learning_rate": 1.1188949370707787e-07, "loss": 0.0358, "reward": 0.7157600894570351, "reward_std": 0.6754554212093353, "rewards/cosine_scaled_reward": -0.07961997389793396, "rewards/format_reward": 0.8750000298023224, "step": 467 }, { "completion_length": 1505.0208740234375, "epoch": 0.5348571428571428, "grad_norm": 3.0937063694000244, "kl": 1.083251953125, "learning_rate": 1.1118279056249653e-07, "loss": 0.0434, "reward": 1.0479508265852928, "reward_std": 0.7750177532434464, "rewards/cosine_scaled_reward": 0.08647541608661413, "rewards/format_reward": 0.8750000298023224, "step": 468 }, { "completion_length": 1116.0208435058594, "epoch": 0.536, "grad_norm": 2.7102174758911133, "kl": 0.68310546875, "learning_rate": 1.1049747474962444e-07, "loss": 0.0273, "reward": 0.8038362823426723, "reward_std": 0.7213334441184998, "rewards/cosine_scaled_reward": -0.04599856585264206, "rewards/format_reward": 0.8958333432674408, "step": 469 }, { "completion_length": 1321.1458740234375, "epoch": 0.5371428571428571, "grad_norm": 6.9517316818237305, "kl": 1.054931640625, "learning_rate": 1.0983357966978745e-07, "loss": 0.0422, "reward": 0.6858988218009472, "reward_std": 0.6414925083518028, "rewards/cosine_scaled_reward": -0.0424672719091177, "rewards/format_reward": 0.7708333432674408, "step": 470 }, { "completion_length": 1012.8125457763672, "epoch": 0.5382857142857143, "grad_norm": 15.012682914733887, "kl": 2.00054931640625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0798, "reward": 0.7205886021256447, "reward_std": 0.697634182870388, "rewards/cosine_scaled_reward": -0.1084557194262743, "rewards/format_reward": 0.9375000149011612, "step": 471 }, { "completion_length": 1118.0000305175781, "epoch": 0.5394285714285715, "grad_norm": 1.921370506286621, "kl": 0.70703125, "learning_rate": 1.0857018009286381e-07, "loss": 0.0282, "reward": 0.8347459137439728, "reward_std": 0.7502800822257996, "rewards/cosine_scaled_reward": -0.03054371359758079, "rewards/format_reward": 0.895833358168602, "step": 472 }, { "completion_length": 1343.3333740234375, "epoch": 0.5405714285714286, "grad_norm": 1.8898627758026123, "kl": 0.716796875, "learning_rate": 1.0797073717209013e-07, "loss": 0.0287, "reward": 0.3787691295146942, "reward_std": 0.4741926044225693, "rewards/cosine_scaled_reward": -0.2585321292281151, "rewards/format_reward": 0.8958333432674408, "step": 473 }, { "completion_length": 1074.5417022705078, "epoch": 0.5417142857142857, "grad_norm": 1.3952125310897827, "kl": 0.275390625, "learning_rate": 1.0739283813397639e-07, "loss": 0.011, "reward": 1.0772456228733063, "reward_std": 0.8115750551223755, "rewards/cosine_scaled_reward": 0.09070614166557789, "rewards/format_reward": 0.895833358168602, "step": 474 }, { "completion_length": 1536.7708740234375, "epoch": 0.5428571428571428, "grad_norm": 1.9422410726547241, "kl": 0.89697265625, "learning_rate": 1.068365111445064e-07, "loss": 0.036, "reward": 0.7740568369626999, "reward_std": 0.7255712598562241, "rewards/cosine_scaled_reward": -0.06088825827464461, "rewards/format_reward": 0.8958333432674408, "step": 475 }, { "completion_length": 1238.6250305175781, "epoch": 0.544, "grad_norm": 1.791479468345642, "kl": 0.49365234375, "learning_rate": 1.063017833182728e-07, "loss": 0.0198, "reward": 1.1282098963856697, "reward_std": 0.7448728978633881, "rewards/cosine_scaled_reward": 0.1266049058176577, "rewards/format_reward": 0.8750000298023224, "step": 476 }, { "completion_length": 926.4167175292969, "epoch": 0.5451428571428572, "grad_norm": 1.0713694095611572, "kl": 0.20794677734375, "learning_rate": 1.0578868071715544e-07, "loss": 0.0083, "reward": 1.242761254310608, "reward_std": 0.48385217040777206, "rewards/cosine_scaled_reward": 0.13179726898670197, "rewards/format_reward": 0.9791666716337204, "step": 477 }, { "completion_length": 1389.8750457763672, "epoch": 0.5462857142857143, "grad_norm": 1.6024447679519653, "kl": 0.315399169921875, "learning_rate": 1.0529722834905125e-07, "loss": 0.0126, "reward": 0.5958605632185936, "reward_std": 0.5410900861024857, "rewards/cosine_scaled_reward": -0.17081973887979984, "rewards/format_reward": 0.9375000149011612, "step": 478 }, { "completion_length": 1060.0000305175781, "epoch": 0.5474285714285714, "grad_norm": 1.9674124717712402, "kl": 0.351318359375, "learning_rate": 1.0482745016665526e-07, "loss": 0.0141, "reward": 0.9066205322742462, "reward_std": 0.8546933829784393, "rewards/cosine_scaled_reward": -0.036273106932640076, "rewards/format_reward": 0.9791666716337204, "step": 479 }, { "completion_length": 1312.5208740234375, "epoch": 0.5485714285714286, "grad_norm": 1.8547768592834473, "kl": 0.3245697021484375, "learning_rate": 1.0437936906629334e-07, "loss": 0.013, "reward": 1.0583224594593048, "reward_std": 0.5566529557108879, "rewards/cosine_scaled_reward": 0.04999455437064171, "rewards/format_reward": 0.9583333432674408, "step": 480 }, { "completion_length": 1382.7500610351562, "epoch": 0.5497142857142857, "grad_norm": 1.5025635957717896, "kl": 0.718505859375, "learning_rate": 1.0395300688680625e-07, "loss": 0.0287, "reward": 0.6311604380607605, "reward_std": 0.6715650781989098, "rewards/cosine_scaled_reward": -0.1323364470154047, "rewards/format_reward": 0.8958333432674408, "step": 481 }, { "completion_length": 1061.8750305175781, "epoch": 0.5508571428571428, "grad_norm": 2.283987522125244, "kl": 0.368408203125, "learning_rate": 1.0354838440848501e-07, "loss": 0.0148, "reward": 0.9338921532034874, "reward_std": 0.663320891559124, "rewards/cosine_scaled_reward": -0.022637249901890755, "rewards/format_reward": 0.9791666716337204, "step": 482 }, { "completion_length": 1664.0000305175781, "epoch": 0.552, "grad_norm": 3.247492551803589, "kl": 1.43994140625, "learning_rate": 1.0316552135205837e-07, "loss": 0.0576, "reward": 0.4382214695215225, "reward_std": 0.6629593223333359, "rewards/cosine_scaled_reward": -0.1454725954681635, "rewards/format_reward": 0.7291666865348816, "step": 483 }, { "completion_length": 1166.9375457763672, "epoch": 0.5531428571428572, "grad_norm": 1.635184407234192, "kl": 0.59814453125, "learning_rate": 1.0280443637773163e-07, "loss": 0.0239, "reward": 1.209791585803032, "reward_std": 0.48193909227848053, "rewards/cosine_scaled_reward": 0.12572911009192467, "rewards/format_reward": 0.9583333432674408, "step": 484 }, { "completion_length": 1562.9167022705078, "epoch": 0.5542857142857143, "grad_norm": 1.5177675485610962, "kl": 1.32794189453125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0532, "reward": 0.6973680332303047, "reward_std": 0.6229738295078278, "rewards/cosine_scaled_reward": -0.07839931827038527, "rewards/format_reward": 0.8541666716337204, "step": 485 }, { "completion_length": 1258.0416870117188, "epoch": 0.5554285714285714, "grad_norm": 1.6634312868118286, "kl": 0.598846435546875, "learning_rate": 1.0214767000817596e-07, "loss": 0.0239, "reward": 0.9936239048838615, "reward_std": 0.7174008414149284, "rewards/cosine_scaled_reward": 0.038478586822748184, "rewards/format_reward": 0.9166666865348816, "step": 486 }, { "completion_length": 1263.3125610351562, "epoch": 0.5565714285714286, "grad_norm": 3.9874322414398193, "kl": 0.56591796875, "learning_rate": 1.0185202062281336e-07, "loss": 0.0226, "reward": 0.9569487422704697, "reward_std": 0.776068776845932, "rewards/cosine_scaled_reward": 0.009724359028041363, "rewards/format_reward": 0.9375000149011612, "step": 487 }, { "completion_length": 1440.8333435058594, "epoch": 0.5577142857142857, "grad_norm": 3.2814691066741943, "kl": 0.67236328125, "learning_rate": 1.0157821333772304e-07, "loss": 0.0269, "reward": 0.4271909072995186, "reward_std": 0.5705131366848946, "rewards/cosine_scaled_reward": -0.2447379156947136, "rewards/format_reward": 0.9166666865348816, "step": 488 }, { "completion_length": 983.0625457763672, "epoch": 0.5588571428571428, "grad_norm": 1.347419261932373, "kl": 0.39642333984375, "learning_rate": 1.013262614978859e-07, "loss": 0.0158, "reward": 1.1842274367809296, "reward_std": 0.5909858122467995, "rewards/cosine_scaled_reward": 0.11294705420732498, "rewards/format_reward": 0.9583333432674408, "step": 489 }, { "completion_length": 1493.3958740234375, "epoch": 0.56, "grad_norm": 1.7785853147506714, "kl": 0.92236328125, "learning_rate": 1.0109617738307911e-07, "loss": 0.0368, "reward": 0.5224965363740921, "reward_std": 0.5668257884681225, "rewards/cosine_scaled_reward": -0.1450017336755991, "rewards/format_reward": 0.8125000298023224, "step": 490 }, { "completion_length": 1626.2916870117188, "epoch": 0.5611428571428572, "grad_norm": 1.94631028175354, "kl": 1.02734375, "learning_rate": 1.0088797220727779e-07, "loss": 0.0411, "reward": 0.6662824004888535, "reward_std": 0.6974209845066071, "rewards/cosine_scaled_reward": -0.07310881093144417, "rewards/format_reward": 0.8125000298023224, "step": 491 }, { "completion_length": 1049.5833740234375, "epoch": 0.5622857142857143, "grad_norm": 2.7264091968536377, "kl": 0.43701171875, "learning_rate": 1.0070165611810855e-07, "loss": 0.0175, "reward": 0.8967996649444103, "reward_std": 0.40792329236865044, "rewards/cosine_scaled_reward": -0.02035021036863327, "rewards/format_reward": 0.9375, "step": 492 }, { "completion_length": 1251.0000610351562, "epoch": 0.5634285714285714, "grad_norm": 1.9115831851959229, "kl": 0.535888671875, "learning_rate": 1.005372381963547e-07, "loss": 0.0215, "reward": 0.7575453743338585, "reward_std": 0.6094752550125122, "rewards/cosine_scaled_reward": -0.11081065610051155, "rewards/format_reward": 0.9791666716337204, "step": 493 }, { "completion_length": 1035.9375305175781, "epoch": 0.5645714285714286, "grad_norm": 3.7157654762268066, "kl": 0.62744140625, "learning_rate": 1.0039472645551372e-07, "loss": 0.0251, "reward": 0.49045146629214287, "reward_std": 0.4288835674524307, "rewards/cosine_scaled_reward": -0.21310760331107304, "rewards/format_reward": 0.9166666865348816, "step": 494 }, { "completion_length": 1336.2291870117188, "epoch": 0.5657142857142857, "grad_norm": 1.2452759742736816, "kl": 0.7421875, "learning_rate": 1.002741278414069e-07, "loss": 0.0296, "reward": 1.002578854560852, "reward_std": 0.5317458659410477, "rewards/cosine_scaled_reward": 0.05337274447083473, "rewards/format_reward": 0.895833358168602, "step": 495 }, { "completion_length": 1363.2500305175781, "epoch": 0.5668571428571428, "grad_norm": 1.7971941232681274, "kl": 0.7275390625, "learning_rate": 1.0017544823184055e-07, "loss": 0.0291, "reward": 1.0207700282335281, "reward_std": 0.8502290099859238, "rewards/cosine_scaled_reward": 0.09371834748890251, "rewards/format_reward": 0.833333358168602, "step": 496 }, { "completion_length": 1141.7500305175781, "epoch": 0.568, "grad_norm": 2.350130796432495, "kl": 0.56689453125, "learning_rate": 1.0009869243631952e-07, "loss": 0.0227, "reward": 1.0808594226837158, "reward_std": 0.7858606725931168, "rewards/cosine_scaled_reward": 0.14459637086838484, "rewards/format_reward": 0.7916666865348816, "step": 497 }, { "completion_length": 1354.0625610351562, "epoch": 0.5691428571428572, "grad_norm": 2.9282283782958984, "kl": 0.8212890625, "learning_rate": 1.000438641958131e-07, "loss": 0.0329, "reward": 0.7300854474306107, "reward_std": 0.6824060678482056, "rewards/cosine_scaled_reward": -0.08287395909428596, "rewards/format_reward": 0.895833358168602, "step": 498 }, { "completion_length": 1752.0625610351562, "epoch": 0.5702857142857143, "grad_norm": 1.9427164793014526, "kl": 0.99267578125, "learning_rate": 1.0001096618257236e-07, "loss": 0.0397, "reward": 0.6041746586561203, "reward_std": 0.5943188220262527, "rewards/cosine_scaled_reward": -0.12499601114541292, "rewards/format_reward": 0.8541666865348816, "step": 499 }, { "completion_length": 1389.7292175292969, "epoch": 0.5714285714285714, "grad_norm": 3.8614306449890137, "kl": 0.90966796875, "learning_rate": 1e-07, "loss": 0.0364, "reward": 0.8081704080104828, "reward_std": 0.7910114228725433, "rewards/cosine_scaled_reward": -0.022998109459877014, "rewards/format_reward": 0.8541666716337204, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.007592763372914685, "train_runtime": 38120.128, "train_samples_per_second": 0.63, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }