{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 3001.9584350585938, "epoch": 0.001142857142857143, "grad_norm": 0.1892756074666977, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, "reward": -0.05305394693277776, "reward_std": 0.17984477430582047, "rewards/cosine_scaled_reward": -0.1928562317043543, "rewards/format_reward": 0.37500000558793545, "step": 1 }, { "completion_length": 2822.541717529297, "epoch": 0.002285714285714286, "grad_norm": 0.264278769493103, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0, "reward": 0.11233407678082585, "reward_std": 0.2779897153377533, "rewards/cosine_scaled_reward": -0.009885392151772976, "rewards/format_reward": 0.4583333432674408, "step": 2 }, { "completion_length": 2963.6875610351562, "epoch": 0.0034285714285714284, "grad_norm": 0.19026866555213928, "kl": 3.454089164733887e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.16433760710060596, "reward_std": 0.12963848933577538, "rewards/cosine_scaled_reward": -0.30146655440330505, "rewards/format_reward": 0.27083333395421505, "step": 3 }, { "completion_length": 3083.52099609375, "epoch": 0.004571428571428572, "grad_norm": 0.18971964716911316, "kl": 2.5644898414611816e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.06517243073903956, "reward_std": 0.20367862656712532, "rewards/cosine_scaled_reward": -0.19678325578570366, "rewards/format_reward": 0.33333334140479565, "step": 4 }, { "completion_length": 2776.6875610351562, "epoch": 0.005714285714285714, "grad_norm": 0.19452892243862152, "kl": 3.6656856536865234e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.2221625093370676, "reward_std": 0.30077361315488815, "rewards/cosine_scaled_reward": 0.1190804042853415, "rewards/format_reward": 0.5000000204890966, "step": 5 }, { "completion_length": 2797.916748046875, "epoch": 0.006857142857142857, "grad_norm": 0.2365235984325409, "kl": 3.871321678161621e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": 0.11705643311142921, "reward_std": 0.27914606034755707, "rewards/cosine_scaled_reward": -0.020885439589619637, "rewards/format_reward": 0.5208333507180214, "step": 6 }, { "completion_length": 2599.1250610351562, "epoch": 0.008, "grad_norm": 0.19415141642093658, "kl": 2.275407314300537e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.1348338108509779, "reward_std": 0.2222888581454754, "rewards/cosine_scaled_reward": 0.006776571273803711, "rewards/format_reward": 0.5, "step": 7 }, { "completion_length": 2841.3750610351562, "epoch": 0.009142857142857144, "grad_norm": 0.18534274399280548, "kl": 3.381073474884033e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": -0.04779001139104366, "reward_std": 0.2098814733326435, "rewards/cosine_scaled_reward": -0.19928531628102064, "rewards/format_reward": 0.4166666716337204, "step": 8 }, { "completion_length": 3102.0208740234375, "epoch": 0.010285714285714285, "grad_norm": 0.20473794639110565, "kl": 4.3511390686035156e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 0.029760856181383133, "reward_std": 0.2512381225824356, "rewards/cosine_scaled_reward": -0.053303517401218414, "rewards/format_reward": 0.2708333432674408, "step": 9 }, { "completion_length": 2247.4376220703125, "epoch": 0.011428571428571429, "grad_norm": 0.225472092628479, "kl": 3.329664468765259e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.13082122802734375, "reward_std": 0.2447206862270832, "rewards/cosine_scaled_reward": -0.02737809531390667, "rewards/format_reward": 0.5833333358168602, "step": 10 }, { "completion_length": 2586.8334350585938, "epoch": 0.012571428571428572, "grad_norm": 0.2847403287887573, "kl": 3.662705421447754e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": 0.11637254571542144, "reward_std": 0.3530980758368969, "rewards/cosine_scaled_reward": -0.021436475217342377, "rewards/format_reward": 0.5208333432674408, "step": 11 }, { "completion_length": 2676.1251220703125, "epoch": 0.013714285714285714, "grad_norm": 0.1856570988893509, "kl": 2.3916363716125488e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.15559198334813118, "reward_std": 0.27254677191376686, "rewards/cosine_scaled_reward": 0.00797954574227333, "rewards/format_reward": 0.5833333432674408, "step": 12 }, { "completion_length": 2201.8959197998047, "epoch": 0.014857142857142857, "grad_norm": 0.21396547555923462, "kl": 2.2470951080322266e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.19927317276597023, "reward_std": 0.2963304966688156, "rewards/cosine_scaled_reward": 0.04285899642854929, "rewards/format_reward": 0.6458333432674408, "step": 13 }, { "completion_length": 2980.9584350585938, "epoch": 0.016, "grad_norm": 0.20101284980773926, "kl": 4.291534423828125e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.06336207129061222, "reward_std": 0.31606999784708023, "rewards/cosine_scaled_reward": -0.04676309134811163, "rewards/format_reward": 0.3750000074505806, "step": 14 }, { "completion_length": 3176.7500610351562, "epoch": 0.017142857142857144, "grad_norm": 0.19117361307144165, "kl": 5.060434341430664e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": -0.06747908261604607, "reward_std": 0.307153781875968, "rewards/cosine_scaled_reward": -0.15347641706466675, "rewards/format_reward": 0.20833333395421505, "step": 15 }, { "completion_length": 2186.104248046875, "epoch": 0.018285714285714287, "grad_norm": 0.256155788898468, "kl": 1.3686716556549072e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": 0.23619388276711106, "reward_std": 0.26740751788020134, "rewards/cosine_scaled_reward": 0.1003237534314394, "rewards/format_reward": 0.6041666865348816, "step": 16 }, { "completion_length": 3134.7916870117188, "epoch": 0.019428571428571427, "grad_norm": 0.17361606657505035, "kl": 4.1812658309936523e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": -0.10287437587976456, "reward_std": 0.14985607378184795, "rewards/cosine_scaled_reward": -0.20927279442548752, "rewards/format_reward": 0.22916667722165585, "step": 17 }, { "completion_length": 3086.3959350585938, "epoch": 0.02057142857142857, "grad_norm": 0.23153060674667358, "kl": 5.2809715270996094e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.048283468931913376, "reward_std": 0.28065528720617294, "rewards/cosine_scaled_reward": -0.15300771407783031, "rewards/format_reward": 0.27083333395421505, "step": 18 }, { "completion_length": 3197.229248046875, "epoch": 0.021714285714285714, "grad_norm": 0.15150664746761322, "kl": 2.8431415557861328e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": -0.04134747572243214, "reward_std": 0.25655536726117134, "rewards/cosine_scaled_reward": -0.1307651773095131, "rewards/format_reward": 0.2291666679084301, "step": 19 }, { "completion_length": 2551.729217529297, "epoch": 0.022857142857142857, "grad_norm": 0.23382136225700378, "kl": 3.3795833587646484e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.06840577768161893, "reward_std": 0.2175830602645874, "rewards/cosine_scaled_reward": -0.08142943307757378, "rewards/format_reward": 0.5000000149011612, "step": 20 }, { "completion_length": 2715.0626220703125, "epoch": 0.024, "grad_norm": 0.2693728506565094, "kl": 2.7894973754882812e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.09808723395690322, "reward_std": 0.2364928014576435, "rewards/cosine_scaled_reward": -0.02434827946126461, "rewards/format_reward": 0.45833333395421505, "step": 21 }, { "completion_length": 3276.0625610351562, "epoch": 0.025142857142857144, "grad_norm": 0.19304966926574707, "kl": 3.9517879486083984e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": -0.08807891490869224, "reward_std": 0.2701016888022423, "rewards/cosine_scaled_reward": -0.1670425608754158, "rewards/format_reward": 0.1666666716337204, "step": 22 }, { "completion_length": 3304.3959350585938, "epoch": 0.026285714285714287, "grad_norm": 0.15851277112960815, "kl": 5.078315734863281e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": -0.04373410594416782, "reward_std": 0.24115629866719246, "rewards/cosine_scaled_reward": -0.1398780420422554, "rewards/format_reward": 0.2500000111758709, "step": 23 }, { "completion_length": 2280.3333740234375, "epoch": 0.027428571428571427, "grad_norm": 0.21461711823940277, "kl": 8.79727303981781e-06, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.2311121616512537, "reward_std": 0.23989838548004627, "rewards/cosine_scaled_reward": 0.09470203705132008, "rewards/format_reward": 0.6041666772216558, "step": 24 }, { "completion_length": 2743.0416870117188, "epoch": 0.02857142857142857, "grad_norm": 0.2246316522359848, "kl": 2.2739171981811523e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.10195200517773628, "reward_std": 0.2438894398510456, "rewards/cosine_scaled_reward": 0.006860721856355667, "rewards/format_reward": 0.3750000037252903, "step": 25 }, { "completion_length": 3110.2500610351562, "epoch": 0.029714285714285714, "grad_norm": 0.17850153148174286, "kl": 2.816319465637207e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.12602470815181732, "reward_std": 0.3109820708632469, "rewards/cosine_scaled_reward": 0.0360703282058239, "rewards/format_reward": 0.3750000149011612, "step": 26 }, { "completion_length": 3011.854248046875, "epoch": 0.030857142857142857, "grad_norm": 0.1936245560646057, "kl": 3.886222839355469e-05, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.0031234845519065857, "reward_std": 0.15648237243294716, "rewards/cosine_scaled_reward": -0.07742472551763058, "rewards/format_reward": 0.25, "step": 27 }, { "completion_length": 3133.7708740234375, "epoch": 0.032, "grad_norm": 0.19556812942028046, "kl": 4.5865774154663086e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.05877143098041415, "reward_std": 0.32328512519598007, "rewards/cosine_scaled_reward": -0.043721283320337534, "rewards/format_reward": 0.35416667722165585, "step": 28 }, { "completion_length": 3163.6666870117188, "epoch": 0.03314285714285714, "grad_norm": 0.16339358687400818, "kl": 2.5033950805664062e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": 0.027387002483010292, "reward_std": 0.337462954223156, "rewards/cosine_scaled_reward": -0.07489126734435558, "rewards/format_reward": 0.3333333469927311, "step": 29 }, { "completion_length": 3104.3959350585938, "epoch": 0.03428571428571429, "grad_norm": 0.20371638238430023, "kl": 3.0308961868286133e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.08465440198779106, "reward_std": 0.2969431169331074, "rewards/cosine_scaled_reward": -0.006935901008546352, "rewards/format_reward": 0.3541666828095913, "step": 30 }, { "completion_length": 2491.2708740234375, "epoch": 0.03542857142857143, "grad_norm": 0.3123537600040436, "kl": 4.607439041137695e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.005427001044154167, "reward_std": 0.19760142639279366, "rewards/cosine_scaled_reward": -0.1529179997742176, "rewards/format_reward": 0.4375000074505806, "step": 31 }, { "completion_length": 3482.5208740234375, "epoch": 0.036571428571428574, "grad_norm": 0.15471217036247253, "kl": 3.0040740966796875e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.12795986607670784, "reward_std": 0.22705752402544022, "rewards/cosine_scaled_reward": -0.20094125345349312, "rewards/format_reward": 0.1041666679084301, "step": 32 }, { "completion_length": 2918.4166870117188, "epoch": 0.037714285714285714, "grad_norm": 0.1735975742340088, "kl": 2.60770320892334e-05, "learning_rate": 6.6e-07, "loss": 0.0, "reward": 0.13466424029320478, "reward_std": 0.30455923825502396, "rewards/cosine_scaled_reward": 0.016905852127820253, "rewards/format_reward": 0.4791666828095913, "step": 33 }, { "completion_length": 3103.1875610351562, "epoch": 0.038857142857142854, "grad_norm": 0.20565363764762878, "kl": 2.7477741241455078e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": -0.0920376293361187, "reward_std": 0.22466492280364037, "rewards/cosine_scaled_reward": -0.19513528188690543, "rewards/format_reward": 0.2291666716337204, "step": 34 }, { "completion_length": 2871.791748046875, "epoch": 0.04, "grad_norm": 0.16891056299209595, "kl": 3.1381845474243164e-05, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.09445883147418499, "reward_std": 0.32046519964933395, "rewards/cosine_scaled_reward": -0.028155114501714706, "rewards/format_reward": 0.43750001303851604, "step": 35 }, { "completion_length": 2533.5625915527344, "epoch": 0.04114285714285714, "grad_norm": 0.19785140454769135, "kl": 2.5272369384765625e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": 0.19070486724376678, "reward_std": 0.13970058038830757, "rewards/cosine_scaled_reward": 0.08542170003056526, "rewards/format_reward": 0.47916667722165585, "step": 36 }, { "completion_length": 2755.2083740234375, "epoch": 0.04228571428571429, "grad_norm": 0.21290314197540283, "kl": 2.9027462005615234e-05, "learning_rate": 7.4e-07, "loss": 0.0, "reward": 0.02860638126730919, "reward_std": 0.2937544248998165, "rewards/cosine_scaled_reward": -0.1261284868232906, "rewards/format_reward": 0.4791666865348816, "step": 37 }, { "completion_length": 3023.4376220703125, "epoch": 0.04342857142857143, "grad_norm": 0.19873568415641785, "kl": 3.618001937866211e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.013171980390325189, "reward_std": 0.20296454429626465, "rewards/cosine_scaled_reward": -0.1557461880147457, "rewards/format_reward": 0.416666679084301, "step": 38 }, { "completion_length": 2770.166748046875, "epoch": 0.044571428571428574, "grad_norm": 0.21600858867168427, "kl": 3.2961368560791016e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.04621700069401413, "reward_std": 0.3573795333504677, "rewards/cosine_scaled_reward": -0.0878668837249279, "rewards/format_reward": 0.4375, "step": 39 }, { "completion_length": 2921.8334350585938, "epoch": 0.045714285714285714, "grad_norm": 0.1665465086698532, "kl": 1.576542854309082e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": -0.08147826488129795, "reward_std": 0.2727391682565212, "rewards/cosine_scaled_reward": -0.23862353712320328, "rewards/format_reward": 0.3958333432674408, "step": 40 }, { "completion_length": 3219.4375, "epoch": 0.046857142857142854, "grad_norm": 0.15641310811042786, "kl": 2.47955322265625e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": 0.04313786077545956, "reward_std": 0.2788317874073982, "rewards/cosine_scaled_reward": -0.045081598684191704, "rewards/format_reward": 0.2916666716337204, "step": 41 }, { "completion_length": 2030.6250610351562, "epoch": 0.048, "grad_norm": 0.3318580389022827, "kl": 5.9932470321655273e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": 0.1814209260046482, "reward_std": 0.24633333832025528, "rewards/cosine_scaled_reward": -0.004054427146911621, "rewards/format_reward": 0.7083333507180214, "step": 42 }, { "completion_length": 2986.7084350585938, "epoch": 0.04914285714285714, "grad_norm": 0.18334151804447174, "kl": 2.3711472749710083e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": 0.022090405225753784, "reward_std": 0.3203391507267952, "rewards/cosine_scaled_reward": -0.10132655501365662, "rewards/format_reward": 0.39583334140479565, "step": 43 }, { "completion_length": 2783.3959350585938, "epoch": 0.05028571428571429, "grad_norm": 0.2206653207540512, "kl": 0.00010889768600463867, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.13657060265541077, "reward_std": 0.29330166801810265, "rewards/cosine_scaled_reward": 0.028908416628837585, "rewards/format_reward": 0.4375000260770321, "step": 44 }, { "completion_length": 2881.1250610351562, "epoch": 0.05142857142857143, "grad_norm": 0.17664436995983124, "kl": 2.9489398002624512e-05, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.2449277015402913, "reward_std": 0.40640798956155777, "rewards/cosine_scaled_reward": 0.17139457043958828, "rewards/format_reward": 0.4375000111758709, "step": 45 }, { "completion_length": 3004.812530517578, "epoch": 0.052571428571428575, "grad_norm": 0.17481741309165955, "kl": 1.0464340448379517e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": 0.054059505462646484, "reward_std": 0.1310178069397807, "rewards/cosine_scaled_reward": -0.04911454766988754, "rewards/format_reward": 0.3541666716337204, "step": 46 }, { "completion_length": 2466.6458740234375, "epoch": 0.053714285714285714, "grad_norm": 0.2526704668998718, "kl": 2.4911249056458473e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.14142334461212158, "reward_std": 0.30394549667835236, "rewards/cosine_scaled_reward": -0.018136408179998398, "rewards/format_reward": 0.6041666865348816, "step": 47 }, { "completion_length": 2828.0833740234375, "epoch": 0.054857142857142854, "grad_norm": 0.1843711882829666, "kl": 5.0902366638183594e-05, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.17658406496047974, "reward_std": 0.41882630810141563, "rewards/cosine_scaled_reward": 0.07139459624886513, "rewards/format_reward": 0.4583333432674408, "step": 48 }, { "completion_length": 2165.8541870117188, "epoch": 0.056, "grad_norm": 0.2018611580133438, "kl": 8.557736873626709e-05, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.2922453209757805, "reward_std": 0.29717790707945824, "rewards/cosine_scaled_reward": 0.1657434180378914, "rewards/format_reward": 0.625, "step": 49 }, { "completion_length": 2188.0834350585938, "epoch": 0.05714285714285714, "grad_norm": 0.34193405508995056, "kl": 0.00023797806352376938, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.2388845265377313, "reward_std": 0.29164354503154755, "rewards/cosine_scaled_reward": 0.10962018743157387, "rewards/format_reward": 0.583333358168602, "step": 50 }, { "completion_length": 3217.541748046875, "epoch": 0.05828571428571429, "grad_norm": 0.16055454313755035, "kl": 5.5164098739624023e-05, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": -0.053118061856366694, "reward_std": 0.2539052776992321, "rewards/cosine_scaled_reward": -0.1391819417476654, "rewards/format_reward": 0.20833334513008595, "step": 51 }, { "completion_length": 3114.9584350585938, "epoch": 0.05942857142857143, "grad_norm": 0.16836906969547272, "kl": 0.00012135505676269531, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": -0.03695523925125599, "reward_std": 0.2873278111219406, "rewards/cosine_scaled_reward": -0.15903650410473347, "rewards/format_reward": 0.3333333507180214, "step": 52 }, { "completion_length": 2896.8541870117188, "epoch": 0.060571428571428575, "grad_norm": 0.2661495506763458, "kl": 0.00020489096641540527, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": -0.06146936025470495, "reward_std": 0.17227579653263092, "rewards/cosine_scaled_reward": -0.18547281250357628, "rewards/format_reward": 0.3125, "step": 53 }, { "completion_length": 2592.7708740234375, "epoch": 0.061714285714285715, "grad_norm": 0.18709887564182281, "kl": 7.784366607666016e-05, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": -0.08744588773697615, "reward_std": 0.13443115912377834, "rewards/cosine_scaled_reward": -0.2370000034570694, "rewards/format_reward": 0.375, "step": 54 }, { "completion_length": 3108.9583740234375, "epoch": 0.06285714285714286, "grad_norm": 0.17392441630363464, "kl": 0.00017219781875610352, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": -0.04910636320710182, "reward_std": 0.23171953111886978, "rewards/cosine_scaled_reward": -0.15009244764223695, "rewards/format_reward": 0.25000000558793545, "step": 55 }, { "completion_length": 3085.729248046875, "epoch": 0.064, "grad_norm": 0.1863940954208374, "kl": 0.00041985511779785156, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.0016605854034423828, "reward_std": 0.26457204297184944, "rewards/cosine_scaled_reward": -0.09207657910883427, "rewards/format_reward": 0.2916666716337204, "step": 56 }, { "completion_length": 2939.916717529297, "epoch": 0.06514285714285714, "grad_norm": 0.21433454751968384, "kl": 0.0003230571746826172, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.0446708258241415, "reward_std": 0.2267843373119831, "rewards/cosine_scaled_reward": -0.16660560760647058, "rewards/format_reward": 0.3333333395421505, "step": 57 }, { "completion_length": 3045.0626220703125, "epoch": 0.06628571428571428, "grad_norm": 0.2040744423866272, "kl": 0.0006006956100463867, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": -0.0063889771699905396, "reward_std": 0.32527972012758255, "rewards/cosine_scaled_reward": -0.1043020358774811, "rewards/format_reward": 0.2916666716337204, "step": 58 }, { "completion_length": 2990.0000610351562, "epoch": 0.06742857142857143, "grad_norm": 0.21800561249256134, "kl": 0.0008556842803955078, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.11553543619811535, "reward_std": 0.2411040961742401, "rewards/cosine_scaled_reward": 0.03165358677506447, "rewards/format_reward": 0.35416667722165585, "step": 59 }, { "completion_length": 2764.4376220703125, "epoch": 0.06857142857142857, "grad_norm": 0.19482655823230743, "kl": 0.0002484321594238281, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.11085247556911781, "reward_std": 0.19889794662594795, "rewards/cosine_scaled_reward": 0.03661669138818979, "rewards/format_reward": 0.31250000186264515, "step": 60 }, { "completion_length": 2435.7084045410156, "epoch": 0.06971428571428571, "grad_norm": 0.19535353779792786, "kl": 0.0001939535140991211, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.20854798704385757, "reward_std": 0.2243325486779213, "rewards/cosine_scaled_reward": 0.0953704472631216, "rewards/format_reward": 0.5208333432674408, "step": 61 }, { "completion_length": 3321.3959350585938, "epoch": 0.07085714285714285, "grad_norm": 0.17011719942092896, "kl": 0.0002614259719848633, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.07449256628751755, "reward_std": 0.320609487593174, "rewards/cosine_scaled_reward": -0.013772495090961456, "rewards/format_reward": 0.3333333432674408, "step": 62 }, { "completion_length": 2038.0000610351562, "epoch": 0.072, "grad_norm": 0.22595028579235077, "kl": 0.002492666244506836, "learning_rate": 9.981479793771866e-07, "loss": 0.0001, "reward": 0.1877956110984087, "reward_std": 0.2516198009252548, "rewards/cosine_scaled_reward": 0.009264111518859863, "rewards/format_reward": 0.6875000074505806, "step": 63 }, { "completion_length": 3070.6666870117188, "epoch": 0.07314285714285715, "grad_norm": 0.15192179381847382, "kl": 0.00020563602447509766, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.14650224149227142, "reward_std": 0.3490786999464035, "rewards/cosine_scaled_reward": 0.059755959548056126, "rewards/format_reward": 0.39583334885537624, "step": 64 }, { "completion_length": 2546.2709350585938, "epoch": 0.07428571428571429, "grad_norm": 0.18594862520694733, "kl": 0.0012459754943847656, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": 0.11259761109249666, "reward_std": 0.30045806616544724, "rewards/cosine_scaled_reward": -0.027794551104307175, "rewards/format_reward": 0.5208333432674408, "step": 65 }, { "completion_length": 2988.0, "epoch": 0.07542857142857143, "grad_norm": 0.16415195167064667, "kl": 0.0010949969291687012, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": -0.034010959789156914, "reward_std": 0.27774763107299805, "rewards/cosine_scaled_reward": -0.16830014809966087, "rewards/format_reward": 0.37500000186264515, "step": 66 }, { "completion_length": 3065.8333740234375, "epoch": 0.07657142857142857, "grad_norm": 0.1508760303258896, "kl": 0.0005504488945007324, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": 0.17344105429947376, "reward_std": 0.3407328389585018, "rewards/cosine_scaled_reward": 0.0939273051917553, "rewards/format_reward": 0.39583334885537624, "step": 67 }, { "completion_length": 2521.9583740234375, "epoch": 0.07771428571428571, "grad_norm": 0.2780892252922058, "kl": 0.0031766891479492188, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": -0.006004307419061661, "reward_std": 0.21513504162430763, "rewards/cosine_scaled_reward": -0.142730213701725, "rewards/format_reward": 0.3958333432674408, "step": 68 }, { "completion_length": 2483.6875610351562, "epoch": 0.07885714285714286, "grad_norm": 0.1821538656949997, "kl": 0.000514984130859375, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": 0.13737047230824828, "reward_std": 0.2913679927587509, "rewards/cosine_scaled_reward": 0.00387752428650856, "rewards/format_reward": 0.5208333432674408, "step": 69 }, { "completion_length": 3145.9791870117188, "epoch": 0.08, "grad_norm": 0.1587502807378769, "kl": 0.0017343759536743164, "learning_rate": 9.956206309337066e-07, "loss": 0.0001, "reward": 0.10869743674993515, "reward_std": 0.26583526469767094, "rewards/cosine_scaled_reward": 0.0119447261095047, "rewards/format_reward": 0.3958333432674408, "step": 70 }, { "completion_length": 2392.4584045410156, "epoch": 0.08114285714285714, "grad_norm": 0.2353833019733429, "kl": 0.0014793872833251953, "learning_rate": 9.951725498333448e-07, "loss": 0.0001, "reward": 0.06978913443163037, "reward_std": 0.23073862865567207, "rewards/cosine_scaled_reward": -0.08474473655223846, "rewards/format_reward": 0.5208333432674408, "step": 71 }, { "completion_length": 2445.25, "epoch": 0.08228571428571428, "grad_norm": 0.36222851276397705, "kl": 0.007985234260559082, "learning_rate": 9.947027716509488e-07, "loss": 0.0003, "reward": 0.28675128147006035, "reward_std": 0.26310405880212784, "rewards/cosine_scaled_reward": 0.15304938424378633, "rewards/format_reward": 0.645833358168602, "step": 72 }, { "completion_length": 2111.916717529297, "epoch": 0.08342857142857144, "grad_norm": 0.2483416646718979, "kl": 0.0014190673828125, "learning_rate": 9.942113192828444e-07, "loss": 0.0001, "reward": 0.2418079525232315, "reward_std": 0.23962385952472687, "rewards/cosine_scaled_reward": 0.0939856581389904, "rewards/format_reward": 0.6458333432674408, "step": 73 }, { "completion_length": 2321.0625915527344, "epoch": 0.08457142857142858, "grad_norm": 0.20480823516845703, "kl": 0.0015668869018554688, "learning_rate": 9.93698216681727e-07, "loss": 0.0001, "reward": 0.1605784334242344, "reward_std": 0.3177975155413151, "rewards/cosine_scaled_reward": 0.00783238559961319, "rewards/format_reward": 0.6041666865348816, "step": 74 }, { "completion_length": 2604.187530517578, "epoch": 0.08571428571428572, "grad_norm": 0.2458542287349701, "kl": 0.0021820068359375, "learning_rate": 9.931634888554935e-07, "loss": 0.0001, "reward": 0.02317179925739765, "reward_std": 0.24085771292448044, "rewards/cosine_scaled_reward": -0.10294242203235626, "rewards/format_reward": 0.3958333395421505, "step": 75 }, { "completion_length": 2759.1459350585938, "epoch": 0.08685714285714285, "grad_norm": 0.19815924763679504, "kl": 0.0006747245788574219, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": 0.17977549135684967, "reward_std": 0.22920313104987144, "rewards/cosine_scaled_reward": 0.060693852603435516, "rewards/format_reward": 0.5000000074505806, "step": 76 }, { "completion_length": 3166.2500610351562, "epoch": 0.088, "grad_norm": 0.17464476823806763, "kl": 0.000640869140625, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": 0.0053346361964941025, "reward_std": 0.2872692942619324, "rewards/cosine_scaled_reward": -0.07941475044935942, "rewards/format_reward": 0.2500000037252903, "step": 77 }, { "completion_length": 2879.6875610351562, "epoch": 0.08914285714285715, "grad_norm": 0.17484599351882935, "kl": 0.0008184909820556641, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.031344905495643616, "reward_std": 0.2594750449061394, "rewards/cosine_scaled_reward": -0.07576533406972885, "rewards/format_reward": 0.35416667722165585, "step": 78 }, { "completion_length": 2861.7083740234375, "epoch": 0.09028571428571429, "grad_norm": 0.18350628018379211, "kl": 0.0004169940948486328, "learning_rate": 9.908088623197048e-07, "loss": 0.0, "reward": 0.13247722014784813, "reward_std": 0.2174980491399765, "rewards/cosine_scaled_reward": 0.024736488237977028, "rewards/format_reward": 0.4375000074505806, "step": 79 }, { "completion_length": 3313.1041870117188, "epoch": 0.09142857142857143, "grad_norm": 0.15940068662166595, "kl": 0.00030362606048583984, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.07443897030316293, "reward_std": 0.20227612555027008, "rewards/cosine_scaled_reward": -0.16378207132220268, "rewards/format_reward": 0.2083333395421505, "step": 80 }, { "completion_length": 3127.8750610351562, "epoch": 0.09257142857142857, "grad_norm": 0.17740540206432343, "kl": 0.0021224021911621094, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": 0.025630776770412922, "reward_std": 0.31113822758197784, "rewards/cosine_scaled_reward": -0.04427819326519966, "rewards/format_reward": 0.22916666977107525, "step": 81 }, { "completion_length": 2915.4791870117188, "epoch": 0.09371428571428571, "grad_norm": 0.17501351237297058, "kl": 0.0007612705230712891, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": -0.0019456367008388042, "reward_std": 0.26062510162591934, "rewards/cosine_scaled_reward": -0.13531204964965582, "rewards/format_reward": 0.3958333432674408, "step": 82 }, { "completion_length": 3508.1875610351562, "epoch": 0.09485714285714286, "grad_norm": 0.15616776049137115, "kl": 0.0002772808074951172, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.13871120987460017, "reward_std": 0.21259715221822262, "rewards/cosine_scaled_reward": -0.20505784079432487, "rewards/format_reward": 0.0833333358168602, "step": 83 }, { "completion_length": 2658.7708740234375, "epoch": 0.096, "grad_norm": 0.19438283145427704, "kl": 0.000476837158203125, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.2181803850689903, "reward_std": 0.32128985971212387, "rewards/cosine_scaled_reward": 0.09941665828227997, "rewards/format_reward": 0.5416666716337204, "step": 84 }, { "completion_length": 3100.2083740234375, "epoch": 0.09714285714285714, "grad_norm": 0.17281532287597656, "kl": 0.0006203651428222656, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": 0.0023652464151382446, "reward_std": 0.27234018221497536, "rewards/cosine_scaled_reward": -0.0952380420640111, "rewards/format_reward": 0.2916666753590107, "step": 85 }, { "completion_length": 2836.354248046875, "epoch": 0.09828571428571428, "grad_norm": 0.16363169252872467, "kl": 0.0010099411010742188, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": -0.03766683675348759, "reward_std": 0.21276579797267914, "rewards/cosine_scaled_reward": -0.1654567075893283, "rewards/format_reward": 0.3541666716337204, "step": 86 }, { "completion_length": 3115.354248046875, "epoch": 0.09942857142857142, "grad_norm": 0.18644234538078308, "kl": 0.002384185791015625, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": -0.032251302152872086, "reward_std": 0.24311810731887817, "rewards/cosine_scaled_reward": -0.1445614593103528, "rewards/format_reward": 0.3125000074505806, "step": 87 }, { "completion_length": 2530.5833740234375, "epoch": 0.10057142857142858, "grad_norm": 0.19104546308517456, "kl": 0.0012140274047851562, "learning_rate": 9.8425742251254e-07, "loss": 0.0, "reward": 0.20276049276435515, "reward_std": 0.26876165717840195, "rewards/cosine_scaled_reward": 0.0905617168173194, "rewards/format_reward": 0.5208333432674408, "step": 88 }, { "completion_length": 3106.9583740234375, "epoch": 0.10171428571428572, "grad_norm": 0.15616898238658905, "kl": 0.0005943775177001953, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": 0.14823118224740028, "reward_std": 0.25535859540104866, "rewards/cosine_scaled_reward": 0.07057950645685196, "rewards/format_reward": 0.3541666716337204, "step": 89 }, { "completion_length": 3029.5625610351562, "epoch": 0.10285714285714286, "grad_norm": 0.16313032805919647, "kl": 0.0008497238159179688, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": 0.10649214684963226, "reward_std": 0.3150200583040714, "rewards/cosine_scaled_reward": 0.01097804619348608, "rewards/format_reward": 0.3750000149011612, "step": 90 }, { "completion_length": 3242.4583740234375, "epoch": 0.104, "grad_norm": 0.1812453716993332, "kl": 0.0005056858062744141, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": -0.016676221042871475, "reward_std": 0.2188626378774643, "rewards/cosine_scaled_reward": -0.1111946739256382, "rewards/format_reward": 0.27083333395421505, "step": 91 }, { "completion_length": 2669.2709350585938, "epoch": 0.10514285714285715, "grad_norm": 0.1882462203502655, "kl": 0.0012416839599609375, "learning_rate": 9.807937738894303e-07, "loss": 0.0, "reward": 0.12547802366316319, "reward_std": 0.21191112510859966, "rewards/cosine_scaled_reward": 0.0004324503242969513, "rewards/format_reward": 0.47916667722165585, "step": 92 }, { "completion_length": 2442.0000610351562, "epoch": 0.10628571428571429, "grad_norm": 0.2229626625776291, "kl": 0.001369476318359375, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": 0.15711925993673503, "reward_std": 0.2689987123012543, "rewards/cosine_scaled_reward": 0.030423451215028763, "rewards/format_reward": 0.5208333432674408, "step": 93 }, { "completion_length": 3294.2500610351562, "epoch": 0.10742857142857143, "grad_norm": 0.16960301995277405, "kl": 0.0009059906005859375, "learning_rate": 9.78935800506826e-07, "loss": 0.0, "reward": -0.04972950741648674, "reward_std": 0.20636767707765102, "rewards/cosine_scaled_reward": -0.11113300547003746, "rewards/format_reward": 0.14583333395421505, "step": 94 }, { "completion_length": 3159.791748046875, "epoch": 0.10857142857142857, "grad_norm": 0.14427055418491364, "kl": 0.00042057037353515625, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.11810954473912716, "reward_std": 0.1571381390094757, "rewards/cosine_scaled_reward": -0.23719513043761253, "rewards/format_reward": 0.2500000037252903, "step": 95 }, { "completion_length": 2514.312530517578, "epoch": 0.10971428571428571, "grad_norm": 0.24939848482608795, "kl": 0.0015869140625, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": 0.07279813662171364, "reward_std": 0.17102814465761185, "rewards/cosine_scaled_reward": -0.06484721228480339, "rewards/format_reward": 0.479166679084301, "step": 96 }, { "completion_length": 2968.104248046875, "epoch": 0.11085714285714286, "grad_norm": 0.2483038604259491, "kl": 0.0010707378387451172, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": 0.08805497021785413, "reward_std": 0.24433772545307875, "rewards/cosine_scaled_reward": -0.018354391679167747, "rewards/format_reward": 0.3958333395421505, "step": 97 }, { "completion_length": 2667.791717529297, "epoch": 0.112, "grad_norm": 0.20968908071517944, "kl": 0.0012552738189697266, "learning_rate": 9.749693666068663e-07, "loss": 0.0001, "reward": 0.050092536956071854, "reward_std": 0.19893871247768402, "rewards/cosine_scaled_reward": -0.0555708110332489, "rewards/format_reward": 0.35416667722165585, "step": 98 }, { "completion_length": 2831.875030517578, "epoch": 0.11314285714285714, "grad_norm": 0.20566846430301666, "kl": 0.0006589889526367188, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": 0.14856231398880482, "reward_std": 0.24846229702234268, "rewards/cosine_scaled_reward": 0.042768311919644475, "rewards/format_reward": 0.4375000223517418, "step": 99 }, { "completion_length": 3068.729248046875, "epoch": 0.11428571428571428, "grad_norm": 0.19074195623397827, "kl": 0.0006690025329589844, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.1303813187405467, "reward_std": 0.2905452437698841, "rewards/cosine_scaled_reward": 0.04835892841219902, "rewards/format_reward": 0.3750000149011612, "step": 100 }, { "completion_length": 2701.229248046875, "epoch": 0.11542857142857142, "grad_norm": 0.22680342197418213, "kl": 0.00116729736328125, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": 0.10164168104529381, "reward_std": 0.29300505481660366, "rewards/cosine_scaled_reward": 0.005210310220718384, "rewards/format_reward": 0.375, "step": 101 }, { "completion_length": 2577.4168090820312, "epoch": 0.11657142857142858, "grad_norm": 0.16007407009601593, "kl": 0.0012121200561523438, "learning_rate": 9.706715543782064e-07, "loss": 0.0, "reward": 0.08228157926350832, "reward_std": 0.20707003772258759, "rewards/cosine_scaled_reward": -0.04624825902283192, "rewards/format_reward": 0.4583333432674408, "step": 102 }, { "completion_length": 2794.8541870117188, "epoch": 0.11771428571428572, "grad_norm": 0.17102256417274475, "kl": 0.0010852813720703125, "learning_rate": 9.695457105469804e-07, "loss": 0.0, "reward": 0.08034474775195122, "reward_std": 0.23851236328482628, "rewards/cosine_scaled_reward": -0.034777700901031494, "rewards/format_reward": 0.416666679084301, "step": 103 }, { "completion_length": 2514.0626220703125, "epoch": 0.11885714285714286, "grad_norm": 0.18136081099510193, "kl": 0.0005865097045898438, "learning_rate": 9.683994186497132e-07, "loss": 0.0, "reward": 0.25316645577549934, "reward_std": 0.23620141111314297, "rewards/cosine_scaled_reward": 0.16903768852353096, "rewards/format_reward": 0.4791666716337204, "step": 104 }, { "completion_length": 2782.4584350585938, "epoch": 0.12, "grad_norm": 0.20108939707279205, "kl": 0.001331329345703125, "learning_rate": 9.672327345550543e-07, "loss": 0.0001, "reward": 0.046333261765539646, "reward_std": 0.253768190741539, "rewards/cosine_scaled_reward": -0.08485639095306396, "rewards/format_reward": 0.4166666716337204, "step": 105 }, { "completion_length": 2904.1251220703125, "epoch": 0.12114285714285715, "grad_norm": 0.19156429171562195, "kl": 0.0013818740844726562, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.15791430696845055, "reward_std": 0.28573495149612427, "rewards/cosine_scaled_reward": 0.0755169503390789, "rewards/format_reward": 0.3958333395421505, "step": 106 }, { "completion_length": 2129.6459045410156, "epoch": 0.12228571428571429, "grad_norm": 0.19605104625225067, "kl": 0.0010166168212890625, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.0870693230535835, "reward_std": 0.3030298389494419, "rewards/cosine_scaled_reward": -0.10336481593549252, "rewards/format_reward": 0.6458333432674408, "step": 107 }, { "completion_length": 2147.6041870117188, "epoch": 0.12342857142857143, "grad_norm": 0.1981632560491562, "kl": 0.0015201568603515625, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": 0.28709910926409066, "reward_std": 0.2012571543455124, "rewards/cosine_scaled_reward": 0.1342942900955677, "rewards/format_reward": 0.6875000149011612, "step": 108 }, { "completion_length": 3073.666748046875, "epoch": 0.12457142857142857, "grad_norm": 0.17615145444869995, "kl": 0.0011663436889648438, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": -0.021478969603776932, "reward_std": 0.189506895840168, "rewards/cosine_scaled_reward": -0.13986263051629066, "rewards/format_reward": 0.3333333358168602, "step": 109 }, { "completion_length": 2523.3125915527344, "epoch": 0.12571428571428572, "grad_norm": 0.19854386150836945, "kl": 0.0023508071899414062, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, "reward": 0.4017367544583976, "reward_std": 0.22793768532574177, "rewards/cosine_scaled_reward": 0.2740056961774826, "rewards/format_reward": 0.7291666865348816, "step": 110 }, { "completion_length": 1766.6459197998047, "epoch": 0.12685714285714286, "grad_norm": 0.2702421545982361, "kl": 0.001590728759765625, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": 0.251003984361887, "reward_std": 0.2790553644299507, "rewards/cosine_scaled_reward": 0.07046932214871049, "rewards/format_reward": 0.7500000149011612, "step": 111 }, { "completion_length": 3026.041748046875, "epoch": 0.128, "grad_norm": 0.1819557249546051, "kl": 0.0011844635009765625, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": 0.05430050566792488, "reward_std": 0.2979404255747795, "rewards/cosine_scaled_reward": -0.052672116085886955, "rewards/format_reward": 0.3750000074505806, "step": 112 }, { "completion_length": 2138.541717529297, "epoch": 0.12914285714285714, "grad_norm": 0.2193819135427475, "kl": 0.0021915435791015625, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.20246881432831287, "reward_std": 0.2825229614973068, "rewards/cosine_scaled_reward": 0.05767783522605896, "rewards/format_reward": 0.6041666697710752, "step": 113 }, { "completion_length": 1764.7083740234375, "epoch": 0.13028571428571428, "grad_norm": 0.21886524558067322, "kl": 0.001209259033203125, "learning_rate": 9.55824636882301e-07, "loss": 0.0, "reward": 0.2999228276312351, "reward_std": 0.28240448236465454, "rewards/cosine_scaled_reward": 0.13528712838888168, "rewards/format_reward": 0.7291666716337204, "step": 114 }, { "completion_length": 2939.8958740234375, "epoch": 0.13142857142857142, "grad_norm": 0.16435398161411285, "kl": 0.0017871856689453125, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": 0.1701457817107439, "reward_std": 0.26418206468224525, "rewards/cosine_scaled_reward": 0.06508506275713444, "rewards/format_reward": 0.4583333432674408, "step": 115 }, { "completion_length": 2467.791748046875, "epoch": 0.13257142857142856, "grad_norm": 0.21436309814453125, "kl": 0.004275321960449219, "learning_rate": 9.530702921077358e-07, "loss": 0.0002, "reward": 0.05696129985153675, "reward_std": 0.3307487964630127, "rewards/cosine_scaled_reward": -0.08862798800691962, "rewards/format_reward": 0.47916667722165585, "step": 116 }, { "completion_length": 3445.6875610351562, "epoch": 0.1337142857142857, "grad_norm": 0.14990511536598206, "kl": 0.0016841888427734375, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.1102401977404952, "reward_std": 0.20692161843180656, "rewards/cosine_scaled_reward": -0.18584159016609192, "rewards/format_reward": 0.12500000186264515, "step": 117 }, { "completion_length": 1913.9792175292969, "epoch": 0.13485714285714287, "grad_norm": 0.2434745579957962, "kl": 0.0030345916748046875, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.27154221711680293, "reward_std": 0.22354764118790627, "rewards/cosine_scaled_reward": 0.09904532507061958, "rewards/format_reward": 0.7500000149011612, "step": 118 }, { "completion_length": 1981.0833740234375, "epoch": 0.136, "grad_norm": 0.29008790850639343, "kl": 0.004032135009765625, "learning_rate": 9.487916106540465e-07, "loss": 0.0002, "reward": 0.03949427045881748, "reward_std": 0.11318811401724815, "rewards/cosine_scaled_reward": -0.14376981928944588, "rewards/format_reward": 0.5833333432674408, "step": 119 }, { "completion_length": 2934.166748046875, "epoch": 0.13714285714285715, "grad_norm": 0.22969283163547516, "kl": 0.0017414093017578125, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": 0.04561608098447323, "reward_std": 0.4027886167168617, "rewards/cosine_scaled_reward": -0.09728806652128696, "rewards/format_reward": 0.4583333507180214, "step": 120 }, { "completion_length": 2774.750030517578, "epoch": 0.1382857142857143, "grad_norm": 0.1892664134502411, "kl": 0.0012845993041992188, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.10860130004584789, "reward_std": 0.23990362137556076, "rewards/cosine_scaled_reward": 0.002404959872364998, "rewards/format_reward": 0.41666667722165585, "step": 121 }, { "completion_length": 2878.8333740234375, "epoch": 0.13942857142857143, "grad_norm": 0.2213921993970871, "kl": 0.0037384033203125, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": -0.07514401245862246, "reward_std": 0.19457928091287613, "rewards/cosine_scaled_reward": -0.1945231556892395, "rewards/format_reward": 0.29166667349636555, "step": 122 }, { "completion_length": 2914.5000610351562, "epoch": 0.14057142857142857, "grad_norm": 1.7548092603683472, "kl": 0.0517425537109375, "learning_rate": 9.428149347714143e-07, "loss": 0.0021, "reward": -0.013575404649600387, "reward_std": 0.22872992418706417, "rewards/cosine_scaled_reward": -0.1315056849271059, "rewards/format_reward": 0.3333333395421505, "step": 123 }, { "completion_length": 2563.5625915527344, "epoch": 0.1417142857142857, "grad_norm": 0.2274925261735916, "kl": 0.00177001953125, "learning_rate": 9.412727182773486e-07, "loss": 0.0001, "reward": 0.17911872267723083, "reward_std": 0.3024190813302994, "rewards/cosine_scaled_reward": 0.028859375044703484, "rewards/format_reward": 0.6041666865348816, "step": 124 }, { "completion_length": 2916.166748046875, "epoch": 0.14285714285714285, "grad_norm": 0.19302645325660706, "kl": 0.0012264251708984375, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "reward": 0.04521351866424084, "reward_std": 0.183570247143507, "rewards/cosine_scaled_reward": -0.0881996639072895, "rewards/format_reward": 0.4375000111758709, "step": 125 }, { "completion_length": 2168.3958740234375, "epoch": 0.144, "grad_norm": 0.3510819375514984, "kl": 0.0034084320068359375, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": 0.07385214976966381, "reward_std": 0.2289694994688034, "rewards/cosine_scaled_reward": -0.10627560317516327, "rewards/format_reward": 0.6041666716337204, "step": 126 }, { "completion_length": 2812.9583740234375, "epoch": 0.14514285714285713, "grad_norm": 0.1580205112695694, "kl": 0.0012226104736328125, "learning_rate": 9.36531953618799e-07, "loss": 0.0, "reward": 0.07576266676187515, "reward_std": 0.22248199209570885, "rewards/cosine_scaled_reward": -0.026177331805229187, "rewards/format_reward": 0.37500000558793545, "step": 127 }, { "completion_length": 2719.9583435058594, "epoch": 0.1462857142857143, "grad_norm": 0.1932969093322754, "kl": 0.001880645751953125, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.12211918644607067, "reward_std": 0.22235066816210747, "rewards/cosine_scaled_reward": 0.022739887237548828, "rewards/format_reward": 0.3958333395421505, "step": 128 }, { "completion_length": 1889.4375457763672, "epoch": 0.14742857142857144, "grad_norm": 0.27860602736473083, "kl": 0.00513458251953125, "learning_rate": 9.332771203643714e-07, "loss": 0.0002, "reward": 0.191393559332937, "reward_std": 0.29455113410949707, "rewards/cosine_scaled_reward": 0.012313703075051308, "rewards/format_reward": 0.6875000074505806, "step": 129 }, { "completion_length": 3178.979248046875, "epoch": 0.14857142857142858, "grad_norm": 0.1898622065782547, "kl": 0.0014190673828125, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.03939210996031761, "reward_std": 0.2819094732403755, "rewards/cosine_scaled_reward": -0.1552355084568262, "rewards/format_reward": 0.3125000111758709, "step": 130 }, { "completion_length": 2920.6875610351562, "epoch": 0.14971428571428572, "grad_norm": 0.21076896786689758, "kl": 0.00131988525390625, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.07247342355549335, "reward_std": 0.2505417540669441, "rewards/cosine_scaled_reward": -0.04478706605732441, "rewards/format_reward": 0.4166666828095913, "step": 131 }, { "completion_length": 2784.8750610351562, "epoch": 0.15085714285714286, "grad_norm": 0.19709761440753937, "kl": 0.0018377304077148438, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.018537950702011585, "reward_std": 0.27707668393850327, "rewards/cosine_scaled_reward": -0.10952132218517363, "rewards/format_reward": 0.3958333395421505, "step": 132 }, { "completion_length": 2956.979248046875, "epoch": 0.152, "grad_norm": 0.23451754450798035, "kl": 0.002567291259765625, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.04261648189276457, "reward_std": 0.2717365212738514, "rewards/cosine_scaled_reward": -0.18468540161848068, "rewards/format_reward": 0.3958333544433117, "step": 133 }, { "completion_length": 1903.354232788086, "epoch": 0.15314285714285714, "grad_norm": 0.30252185463905334, "kl": 0.0040454864501953125, "learning_rate": 9.248145583195447e-07, "loss": 0.0002, "reward": 0.1737231626175344, "reward_std": 0.2514979690313339, "rewards/cosine_scaled_reward": -0.008977729827165604, "rewards/format_reward": 0.6875000149011612, "step": 134 }, { "completion_length": 2347.416778564453, "epoch": 0.15428571428571428, "grad_norm": 0.1927800178527832, "kl": 0.001873016357421875, "learning_rate": 9.230669076497687e-07, "loss": 0.0001, "reward": 0.09690875932574272, "reward_std": 0.20963543467223644, "rewards/cosine_scaled_reward": -0.05220409855246544, "rewards/format_reward": 0.5208333432674408, "step": 135 }, { "completion_length": 2465.5834350585938, "epoch": 0.15542857142857142, "grad_norm": 0.29496052861213684, "kl": 0.002719879150390625, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": 0.04496379243209958, "reward_std": 0.30501750111579895, "rewards/cosine_scaled_reward": -0.12493768334388733, "rewards/format_reward": 0.5416666828095913, "step": 136 }, { "completion_length": 2833.5000610351562, "epoch": 0.15657142857142858, "grad_norm": 0.20566634833812714, "kl": 0.003387451171875, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.012183592654764652, "reward_std": 0.2676853369921446, "rewards/cosine_scaled_reward": -0.12493430450558662, "rewards/format_reward": 0.31250002048909664, "step": 137 }, { "completion_length": 2088.8125610351562, "epoch": 0.15771428571428572, "grad_norm": 0.2401277720928192, "kl": 0.0029354095458984375, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.31860150722786784, "reward_std": 0.2921822927892208, "rewards/cosine_scaled_reward": 0.1930521186441183, "rewards/format_reward": 0.6458333432674408, "step": 138 }, { "completion_length": 2617.1251220703125, "epoch": 0.15885714285714286, "grad_norm": 0.19433732330799103, "kl": 0.0014438629150390625, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": 0.09779603406786919, "reward_std": 0.28393205627799034, "rewards/cosine_scaled_reward": -0.0396699383854866, "rewards/format_reward": 0.47916667349636555, "step": 139 }, { "completion_length": 2479.6876220703125, "epoch": 0.16, "grad_norm": 0.2356812208890915, "kl": 0.00270843505859375, "learning_rate": 9.140576474687263e-07, "loss": 0.0001, "reward": 0.16233161371201277, "reward_std": 0.21454461477696896, "rewards/cosine_scaled_reward": 0.020634490996599197, "rewards/format_reward": 0.5625000149011612, "step": 140 }, { "completion_length": 2655.8125, "epoch": 0.16114285714285714, "grad_norm": 0.21176272630691528, "kl": 0.003421783447265625, "learning_rate": 9.122022088101613e-07, "loss": 0.0001, "reward": 0.14447204023599625, "reward_std": 0.23380305618047714, "rewards/cosine_scaled_reward": 0.04617850482463837, "rewards/format_reward": 0.4166666865348816, "step": 141 }, { "completion_length": 2357.5000610351562, "epoch": 0.16228571428571428, "grad_norm": 0.23890313506126404, "kl": 0.0028333663940429688, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.1481823343783617, "reward_std": 0.2089283987879753, "rewards/cosine_scaled_reward": 0.023761034943163395, "rewards/format_reward": 0.5000000149011612, "step": 142 }, { "completion_length": 2750.7500610351562, "epoch": 0.16342857142857142, "grad_norm": 0.17377115786075592, "kl": 0.0030498504638671875, "learning_rate": 9.084384631108882e-07, "loss": 0.0001, "reward": 0.19583595544099808, "reward_std": 0.2811162993311882, "rewards/cosine_scaled_reward": 0.0775480642914772, "rewards/format_reward": 0.5208333432674408, "step": 143 }, { "completion_length": 2684.3334350585938, "epoch": 0.16457142857142856, "grad_norm": 0.17148694396018982, "kl": 0.0034637451171875, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": 0.18883249908685684, "reward_std": 0.3707999885082245, "rewards/cosine_scaled_reward": 0.0525783970952034, "rewards/format_reward": 0.5625000260770321, "step": 144 }, { "completion_length": 2799.6876220703125, "epoch": 0.1657142857142857, "grad_norm": 0.25570377707481384, "kl": 0.00708770751953125, "learning_rate": 9.046048391230247e-07, "loss": 0.0003, "reward": 0.08236825000494719, "reward_std": 0.2996014729142189, "rewards/cosine_scaled_reward": -0.02843803663563449, "rewards/format_reward": 0.3958333507180214, "step": 145 }, { "completion_length": 2574.7083740234375, "epoch": 0.16685714285714287, "grad_norm": 0.19306336343288422, "kl": 0.003398895263671875, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": -0.035824403166770935, "reward_std": 0.19786337018013, "rewards/cosine_scaled_reward": -0.23294785246253014, "rewards/format_reward": 0.5625000223517418, "step": 146 }, { "completion_length": 2272.7709045410156, "epoch": 0.168, "grad_norm": 0.21024219691753387, "kl": 0.0027141571044921875, "learning_rate": 9.007020842191634e-07, "loss": 0.0001, "reward": 0.16781320981681347, "reward_std": 0.19364098459482193, "rewards/cosine_scaled_reward": 0.029284225136507303, "rewards/format_reward": 0.5625000074505806, "step": 147 }, { "completion_length": 2210.791717529297, "epoch": 0.16914285714285715, "grad_norm": 0.3196452856063843, "kl": 0.004055023193359375, "learning_rate": 8.987250199168808e-07, "loss": 0.0002, "reward": -0.05866700294427574, "reward_std": 0.2012275978922844, "rewards/cosine_scaled_reward": -0.26218296214938164, "rewards/format_reward": 0.5625000111758709, "step": 148 }, { "completion_length": 2778.4375610351562, "epoch": 0.1702857142857143, "grad_norm": 0.19716207683086395, "kl": 0.0034618377685546875, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": 0.11828946322202682, "reward_std": 0.3204687051475048, "rewards/cosine_scaled_reward": 0.00033649057149887085, "rewards/format_reward": 0.45833334885537624, "step": 149 }, { "completion_length": 2391.8125610351562, "epoch": 0.17142857142857143, "grad_norm": 0.19943048059940338, "kl": 0.004062652587890625, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": -0.03545256704092026, "reward_std": 0.1607068907469511, "rewards/cosine_scaled_reward": -0.21197712421417236, "rewards/format_reward": 0.5000000204890966, "step": 150 }, { "completion_length": 2911.4583740234375, "epoch": 0.17257142857142857, "grad_norm": 0.16765278577804565, "kl": 0.0033435821533203125, "learning_rate": 8.926922383915315e-07, "loss": 0.0001, "reward": 0.005968112964183092, "reward_std": 0.13167849741876125, "rewards/cosine_scaled_reward": -0.11089767143130302, "rewards/format_reward": 0.35416667722165585, "step": 151 }, { "completion_length": 3065.5834350585938, "epoch": 0.1737142857142857, "grad_norm": 0.17622444033622742, "kl": 0.006145477294921875, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": 0.008522989228367805, "reward_std": 0.2847393900156021, "rewards/cosine_scaled_reward": -0.07492140308022499, "rewards/format_reward": 0.25000000186264515, "step": 152 }, { "completion_length": 2494.6459045410156, "epoch": 0.17485714285714285, "grad_norm": 0.21263031661510468, "kl": 0.0029783248901367188, "learning_rate": 8.88586709003076e-07, "loss": 0.0001, "reward": 0.17565679177641869, "reward_std": 0.28186945989727974, "rewards/cosine_scaled_reward": 0.05296103097498417, "rewards/format_reward": 0.5208333432674408, "step": 153 }, { "completion_length": 2867.6459045410156, "epoch": 0.176, "grad_norm": 0.16246166825294495, "kl": 0.0023956298828125, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.29472967982292175, "reward_std": 0.23086620680987835, "rewards/cosine_scaled_reward": 0.21928169950842857, "rewards/format_reward": 0.4791666716337204, "step": 154 }, { "completion_length": 3258.6250610351562, "epoch": 0.17714285714285713, "grad_norm": 0.24034130573272705, "kl": 0.0034580230712890625, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": -0.12162286974489689, "reward_std": 0.20457392558455467, "rewards/cosine_scaled_reward": -0.2198924496769905, "rewards/format_reward": 0.18750000558793545, "step": 155 }, { "completion_length": 2978.7708740234375, "epoch": 0.1782857142857143, "grad_norm": 0.17035257816314697, "kl": 0.0047149658203125, "learning_rate": 8.823049032816478e-07, "loss": 0.0002, "reward": -0.09747781977057457, "reward_std": 0.16208519786596298, "rewards/cosine_scaled_reward": -0.21525413170456886, "rewards/format_reward": 0.2708333432674408, "step": 156 }, { "completion_length": 2397.541717529297, "epoch": 0.17942857142857144, "grad_norm": 0.22931884229183197, "kl": 0.0050029754638671875, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": 0.11597155407071114, "reward_std": 0.28428420424461365, "rewards/cosine_scaled_reward": -0.011876031756401062, "rewards/format_reward": 0.47916667722165585, "step": 157 }, { "completion_length": 3140.5625610351562, "epoch": 0.18057142857142858, "grad_norm": 0.19685107469558716, "kl": 0.00397491455078125, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": -0.08563036471605301, "reward_std": 0.16759349219501019, "rewards/cosine_scaled_reward": -0.19394809566438198, "rewards/format_reward": 0.25, "step": 158 }, { "completion_length": 2392.500030517578, "epoch": 0.18171428571428572, "grad_norm": 0.21569089591503143, "kl": 0.0052642822265625, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": 0.3384667945356341, "reward_std": 0.27638140693306923, "rewards/cosine_scaled_reward": 0.17951813712716103, "rewards/format_reward": 0.7708333432674408, "step": 159 }, { "completion_length": 3030.6458740234375, "epoch": 0.18285714285714286, "grad_norm": 0.2277865707874298, "kl": 0.004344940185546875, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": 0.20380683988332748, "reward_std": 0.2774137519299984, "rewards/cosine_scaled_reward": 0.09686106536537409, "rewards/format_reward": 0.5000000149011612, "step": 160 }, { "completion_length": 3296.9791870117188, "epoch": 0.184, "grad_norm": 0.15664881467819214, "kl": 0.0045928955078125, "learning_rate": 8.715127058347614e-07, "loss": 0.0002, "reward": -0.07018525979947299, "reward_std": 0.25186169147491455, "rewards/cosine_scaled_reward": -0.17596752382814884, "rewards/format_reward": 0.2500000074505806, "step": 161 }, { "completion_length": 3012.541748046875, "epoch": 0.18514285714285714, "grad_norm": 0.194331094622612, "kl": 0.0036983489990234375, "learning_rate": 8.693068314414344e-07, "loss": 0.0001, "reward": 0.06777064688503742, "reward_std": 0.35786426812410355, "rewards/cosine_scaled_reward": -0.06025504320859909, "rewards/format_reward": 0.4375000149011612, "step": 162 }, { "completion_length": 2512.3750915527344, "epoch": 0.18628571428571428, "grad_norm": 0.277127206325531, "kl": 0.00452423095703125, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.026186905801296234, "reward_std": 0.3294079564511776, "rewards/cosine_scaled_reward": -0.11063731834292412, "rewards/format_reward": 0.4375000111758709, "step": 163 }, { "completion_length": 2525.187530517578, "epoch": 0.18742857142857142, "grad_norm": 0.3039884567260742, "kl": 0.00612640380859375, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": -0.005198385566473007, "reward_std": 0.21642658859491348, "rewards/cosine_scaled_reward": -0.1315268948674202, "rewards/format_reward": 0.3750000149011612, "step": 164 }, { "completion_length": 2689.1459350585938, "epoch": 0.18857142857142858, "grad_norm": 0.18826548755168915, "kl": 0.00409698486328125, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": 0.1363761046086438, "reward_std": 0.29318901523947716, "rewards/cosine_scaled_reward": 0.008730031549930573, "rewards/format_reward": 0.5000000223517418, "step": 165 }, { "completion_length": 2415.8125915527344, "epoch": 0.18971428571428572, "grad_norm": 0.2114183008670807, "kl": 0.00537872314453125, "learning_rate": 8.603287946810513e-07, "loss": 0.0002, "reward": 0.01206995639950037, "reward_std": 0.1482153832912445, "rewards/cosine_scaled_reward": -0.14565513283014297, "rewards/format_reward": 0.47916668467223644, "step": 166 }, { "completion_length": 2857.7084350585938, "epoch": 0.19085714285714286, "grad_norm": 0.16645163297653198, "kl": 0.0044403076171875, "learning_rate": 8.580461976679099e-07, "loss": 0.0002, "reward": -0.04430033452808857, "reward_std": 0.2540858667343855, "rewards/cosine_scaled_reward": -0.18336289003491402, "rewards/format_reward": 0.37500000186264515, "step": 167 }, { "completion_length": 2432.104248046875, "epoch": 0.192, "grad_norm": 0.18748867511749268, "kl": 0.00420379638671875, "learning_rate": 8.557485869176825e-07, "loss": 0.0002, "reward": 0.10138805268798023, "reward_std": 0.2685610204935074, "rewards/cosine_scaled_reward": -0.07673629373311996, "rewards/format_reward": 0.6250000149011612, "step": 168 }, { "completion_length": 2429.7709045410156, "epoch": 0.19314285714285714, "grad_norm": 0.2035347819328308, "kl": 0.0036020278930664062, "learning_rate": 8.534360744126753e-07, "loss": 0.0001, "reward": 0.4033849276602268, "reward_std": 0.3233870640397072, "rewards/cosine_scaled_reward": 0.28906378895044327, "rewards/format_reward": 0.6875000149011612, "step": 169 }, { "completion_length": 2623.6250610351562, "epoch": 0.19428571428571428, "grad_norm": 0.18528129160404205, "kl": 0.004909515380859375, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.061040711123496294, "reward_std": 0.23592132702469826, "rewards/cosine_scaled_reward": -0.05934416502714157, "rewards/format_reward": 0.4166666716337204, "step": 170 }, { "completion_length": 2023.8750457763672, "epoch": 0.19542857142857142, "grad_norm": 0.2365913987159729, "kl": 0.004756927490234375, "learning_rate": 8.487667956935087e-07, "loss": 0.0002, "reward": 0.08947444148361683, "reward_std": 0.19351089000701904, "rewards/cosine_scaled_reward": -0.08239659294486046, "rewards/format_reward": 0.5833333432674408, "step": 171 }, { "completion_length": 2301.8125610351562, "epoch": 0.19657142857142856, "grad_norm": 0.23605789244174957, "kl": 0.00450897216796875, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.19074868597090244, "reward_std": 0.3016255050897598, "rewards/cosine_scaled_reward": 0.04505336657166481, "rewards/format_reward": 0.583333333954215, "step": 172 }, { "completion_length": 2721.6459045410156, "epoch": 0.1977142857142857, "grad_norm": 0.24565771222114563, "kl": 0.0044689178466796875, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.10209671594202518, "reward_std": 0.20993922092020512, "rewards/cosine_scaled_reward": -0.0017293132841587067, "rewards/format_reward": 0.3958333432674408, "step": 173 }, { "completion_length": 2793.6250610351562, "epoch": 0.19885714285714284, "grad_norm": 0.214183509349823, "kl": 0.00710296630859375, "learning_rate": 8.416539554784089e-07, "loss": 0.0003, "reward": 0.0948445312678814, "reward_std": 0.22529449313879013, "rewards/cosine_scaled_reward": -0.006497794296592474, "rewards/format_reward": 0.3958333432674408, "step": 174 }, { "completion_length": 2957.0001220703125, "epoch": 0.2, "grad_norm": 0.21723315119743347, "kl": 0.0075054168701171875, "learning_rate": 8.392544243589427e-07, "loss": 0.0003, "reward": 0.16869353270158172, "reward_std": 0.274255458265543, "rewards/cosine_scaled_reward": 0.047592975199222565, "rewards/format_reward": 0.5000000074505806, "step": 175 }, { "completion_length": 2953.979248046875, "epoch": 0.20114285714285715, "grad_norm": 0.1722651869058609, "kl": 0.0059967041015625, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.13873707130551338, "reward_std": 0.29712219163775444, "rewards/cosine_scaled_reward": 0.04709775559604168, "rewards/format_reward": 0.39583334140479565, "step": 176 }, { "completion_length": 2266.5000610351562, "epoch": 0.2022857142857143, "grad_norm": 0.19328881800174713, "kl": 0.00374603271484375, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.06314084958285093, "reward_std": 0.22284647449851036, "rewards/cosine_scaled_reward": -0.12701627984642982, "rewards/format_reward": 0.625, "step": 177 }, { "completion_length": 2629.979248046875, "epoch": 0.20342857142857143, "grad_norm": 0.20043830573558807, "kl": 0.00550079345703125, "learning_rate": 8.319717151140072e-07, "loss": 0.0002, "reward": 0.24929668940603733, "reward_std": 0.2040734402835369, "rewards/cosine_scaled_reward": 0.14905891939997673, "rewards/format_reward": 0.520833358168602, "step": 178 }, { "completion_length": 2652.1875610351562, "epoch": 0.20457142857142857, "grad_norm": 0.1805621236562729, "kl": 0.005802154541015625, "learning_rate": 8.295165011252396e-07, "loss": 0.0002, "reward": 0.09193149022758007, "reward_std": 0.20314152538776398, "rewards/cosine_scaled_reward": -0.03507839888334274, "rewards/format_reward": 0.4583333432674408, "step": 179 }, { "completion_length": 2596.375, "epoch": 0.2057142857142857, "grad_norm": 0.249373197555542, "kl": 0.00714111328125, "learning_rate": 8.270476638965461e-07, "loss": 0.0003, "reward": 0.11438339785672724, "reward_std": 0.3229541666805744, "rewards/cosine_scaled_reward": -0.010897724889218807, "rewards/format_reward": 0.4791666753590107, "step": 180 }, { "completion_length": 3172.5625, "epoch": 0.20685714285714285, "grad_norm": 0.15354838967323303, "kl": 0.00362396240234375, "learning_rate": 8.245653237555705e-07, "loss": 0.0001, "reward": 0.14108785180724226, "reward_std": 0.25704972073435783, "rewards/cosine_scaled_reward": 0.046188078820705414, "rewards/format_reward": 0.4166666865348816, "step": 181 }, { "completion_length": 2754.1458435058594, "epoch": 0.208, "grad_norm": 0.20978248119354248, "kl": 0.006561279296875, "learning_rate": 8.220696016880687e-07, "loss": 0.0003, "reward": 0.08122844994068146, "reward_std": 0.3094419874250889, "rewards/cosine_scaled_reward": -0.03756958991289139, "rewards/format_reward": 0.41666666977107525, "step": 182 }, { "completion_length": 2732.2708740234375, "epoch": 0.20914285714285713, "grad_norm": 0.1716933399438858, "kl": 0.006378173828125, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": -0.040448714047670364, "reward_std": 0.20272203534841537, "rewards/cosine_scaled_reward": -0.21793191507458687, "rewards/format_reward": 0.5, "step": 183 }, { "completion_length": 2339.041748046875, "epoch": 0.2102857142857143, "grad_norm": 0.2090311050415039, "kl": 0.004291534423828125, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": 0.3101842775940895, "reward_std": 0.32960085570812225, "rewards/cosine_scaled_reward": 0.19724036287516356, "rewards/format_reward": 0.6041666679084301, "step": 184 }, { "completion_length": 3183.229248046875, "epoch": 0.21142857142857144, "grad_norm": 0.1650100201368332, "kl": 0.006046295166015625, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": -0.016166038811206818, "reward_std": 0.183053657412529, "rewards/cosine_scaled_reward": -0.11014915257692337, "rewards/format_reward": 0.27083333395421505, "step": 185 }, { "completion_length": 2894.291717529297, "epoch": 0.21257142857142858, "grad_norm": 0.1902281790971756, "kl": 0.003826141357421875, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": 0.037780286045745015, "reward_std": 0.23388531804084778, "rewards/cosine_scaled_reward": -0.09303178638219833, "rewards/format_reward": 0.4166666716337204, "step": 186 }, { "completion_length": 2756.3959350585938, "epoch": 0.21371428571428572, "grad_norm": 0.19003655016422272, "kl": 0.0063323974609375, "learning_rate": 8.093945422764069e-07, "loss": 0.0003, "reward": 0.23986343410797417, "reward_std": 0.294619157910347, "rewards/cosine_scaled_reward": 0.1449273396283388, "rewards/format_reward": 0.5, "step": 187 }, { "completion_length": 2735.4166870117188, "epoch": 0.21485714285714286, "grad_norm": 0.20645277202129364, "kl": 0.006011962890625, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": 0.18498247489333153, "reward_std": 0.38456569984555244, "rewards/cosine_scaled_reward": 0.05256199091672897, "rewards/format_reward": 0.5625000149011612, "step": 188 }, { "completion_length": 2607.7916870117188, "epoch": 0.216, "grad_norm": 0.20103368163108826, "kl": 0.009521484375, "learning_rate": 8.04235151541222e-07, "loss": 0.0004, "reward": 0.032526164315640926, "reward_std": 0.23300225287675858, "rewards/cosine_scaled_reward": -0.11186909675598145, "rewards/format_reward": 0.4583333395421505, "step": 189 }, { "completion_length": 2812.5833740234375, "epoch": 0.21714285714285714, "grad_norm": 0.1999751627445221, "kl": 0.00589752197265625, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.0322457030415535, "reward_std": 0.20498248934745789, "rewards/cosine_scaled_reward": -0.07093745563179255, "rewards/format_reward": 0.3333333432674408, "step": 190 }, { "completion_length": 3061.2709350585938, "epoch": 0.21828571428571428, "grad_norm": 0.24069836735725403, "kl": 0.008758544921875, "learning_rate": 7.990261971595048e-07, "loss": 0.0004, "reward": 0.003465789370238781, "reward_std": 0.2975287064909935, "rewards/cosine_scaled_reward": -0.10922680050134659, "rewards/format_reward": 0.3333333544433117, "step": 191 }, { "completion_length": 3110.6458740234375, "epoch": 0.21942857142857142, "grad_norm": 0.18123462796211243, "kl": 0.0098114013671875, "learning_rate": 7.964034505716476e-07, "loss": 0.0004, "reward": -0.00519880885258317, "reward_std": 0.23952285945415497, "rewards/cosine_scaled_reward": -0.09877765458077192, "rewards/format_reward": 0.27083333767950535, "step": 192 }, { "completion_length": 2421.7916870117188, "epoch": 0.22057142857142858, "grad_norm": 0.2296593189239502, "kl": 0.00974273681640625, "learning_rate": 7.93768694627233e-07, "loss": 0.0004, "reward": 0.07602208573371172, "reward_std": 0.19795489311218262, "rewards/cosine_scaled_reward": -0.09206605609506369, "rewards/format_reward": 0.5625000111758709, "step": 193 }, { "completion_length": 1688.0625610351562, "epoch": 0.22171428571428572, "grad_norm": 0.24463699758052826, "kl": 0.00638580322265625, "learning_rate": 7.911220577405484e-07, "loss": 0.0003, "reward": 0.46088459342718124, "reward_std": 0.39439669996500015, "rewards/cosine_scaled_reward": 0.29850273206830025, "rewards/format_reward": 0.8541666716337204, "step": 194 }, { "completion_length": 2056.791717529297, "epoch": 0.22285714285714286, "grad_norm": 0.18850582838058472, "kl": 0.007778167724609375, "learning_rate": 7.884636689049422e-07, "loss": 0.0003, "reward": 0.21396406181156635, "reward_std": 0.24946607649326324, "rewards/cosine_scaled_reward": 0.07859774492681026, "rewards/format_reward": 0.5833333358168602, "step": 195 }, { "completion_length": 3171.6875610351562, "epoch": 0.224, "grad_norm": 0.2034706175327301, "kl": 0.005645751953125, "learning_rate": 7.857936576865356e-07, "loss": 0.0002, "reward": -0.05634847842156887, "reward_std": 0.2312827706336975, "rewards/cosine_scaled_reward": -0.1469667460769415, "rewards/format_reward": 0.2291666679084301, "step": 196 }, { "completion_length": 2725.854217529297, "epoch": 0.22514285714285714, "grad_norm": 0.18927797675132751, "kl": 0.00467681884765625, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": -0.00976499728858471, "reward_std": 0.3078451603651047, "rewards/cosine_scaled_reward": -0.14113588817417622, "rewards/format_reward": 0.37500001676380634, "step": 197 }, { "completion_length": 3027.479248046875, "epoch": 0.22628571428571428, "grad_norm": 0.21629135310649872, "kl": 0.0053081512451171875, "learning_rate": 7.804192891917571e-07, "loss": 0.0002, "reward": 0.06900226790457964, "reward_std": 0.2766474634408951, "rewards/cosine_scaled_reward": -0.037792665883898735, "rewards/format_reward": 0.3750000074505806, "step": 198 }, { "completion_length": 2622.0418090820312, "epoch": 0.22742857142857142, "grad_norm": 0.20776836574077606, "kl": 0.0056915283203125, "learning_rate": 7.777151938545235e-07, "loss": 0.0002, "reward": 0.15501756500452757, "reward_std": 0.3029539883136749, "rewards/cosine_scaled_reward": 0.03127571474760771, "rewards/format_reward": 0.5208333432674408, "step": 199 }, { "completion_length": 2262.979217529297, "epoch": 0.22857142857142856, "grad_norm": 0.222330704331398, "kl": 0.0066375732421875, "learning_rate": 7.75e-07, "loss": 0.0003, "reward": 0.26917505636811256, "reward_std": 0.32473092526197433, "rewards/cosine_scaled_reward": 0.12532378360629082, "rewards/format_reward": 0.645833333954215, "step": 200 }, { "completion_length": 2621.6666870117188, "epoch": 0.2297142857142857, "grad_norm": 0.19183361530303955, "kl": 0.006618499755859375, "learning_rate": 7.72273839962904e-07, "loss": 0.0003, "reward": 0.06654832325875759, "reward_std": 0.23418250493705273, "rewards/cosine_scaled_reward": -0.06533646397292614, "rewards/format_reward": 0.4583333395421505, "step": 201 }, { "completion_length": 2380.0, "epoch": 0.23085714285714284, "grad_norm": 0.21531449258327484, "kl": 0.0079803466796875, "learning_rate": 7.695368466124296e-07, "loss": 0.0003, "reward": 0.09720260743051767, "reward_std": 0.27152542024850845, "rewards/cosine_scaled_reward": -0.07546245865523815, "rewards/format_reward": 0.6041666865348816, "step": 202 }, { "completion_length": 2418.4375610351562, "epoch": 0.232, "grad_norm": 0.18618948757648468, "kl": 0.00667572021484375, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.14104736736044288, "reward_std": 0.23524043709039688, "rewards/cosine_scaled_reward": -0.011097889393568039, "rewards/format_reward": 0.5833333358168602, "step": 203 }, { "completion_length": 2163.5208435058594, "epoch": 0.23314285714285715, "grad_norm": 0.30891525745391846, "kl": 0.006256103515625, "learning_rate": 7.640308940816239e-07, "loss": 0.0003, "reward": 0.2278699278831482, "reward_std": 0.3605262115597725, "rewards/cosine_scaled_reward": 0.12358852848410606, "rewards/format_reward": 0.47916667722165585, "step": 204 }, { "completion_length": 2758.729217529297, "epoch": 0.2342857142857143, "grad_norm": 0.1707957684993744, "kl": 0.00688934326171875, "learning_rate": 7.612622032536507e-07, "loss": 0.0003, "reward": 0.13417839724570513, "reward_std": 0.3422238156199455, "rewards/cosine_scaled_reward": 0.006505465134978294, "rewards/format_reward": 0.5000000204890966, "step": 205 }, { "completion_length": 3193.3125610351562, "epoch": 0.23542857142857143, "grad_norm": 0.17781589925289154, "kl": 0.00615692138671875, "learning_rate": 7.584832158039378e-07, "loss": 0.0002, "reward": 0.015708591789007187, "reward_std": 0.2507881745696068, "rewards/cosine_scaled_reward": -0.09797392599284649, "rewards/format_reward": 0.3541666716337204, "step": 206 }, { "completion_length": 2658.0, "epoch": 0.23657142857142857, "grad_norm": 0.18779563903808594, "kl": 0.00539398193359375, "learning_rate": 7.556940671764124e-07, "loss": 0.0002, "reward": 0.14627259969711304, "reward_std": 0.3163585066795349, "rewards/cosine_scaled_reward": 0.029968852177262306, "rewards/format_reward": 0.4791666716337204, "step": 207 }, { "completion_length": 3152.1458740234375, "epoch": 0.2377142857142857, "grad_norm": 0.17349769175052643, "kl": 0.0095672607421875, "learning_rate": 7.528948933102438e-07, "loss": 0.0004, "reward": -0.13337488658726215, "reward_std": 0.16799047589302063, "rewards/cosine_scaled_reward": -0.24787230044603348, "rewards/format_reward": 0.229166679084301, "step": 208 }, { "completion_length": 2363.229217529297, "epoch": 0.23885714285714285, "grad_norm": 0.2527102828025818, "kl": 0.0085296630859375, "learning_rate": 7.500858306332172e-07, "loss": 0.0003, "reward": 0.13303392380475998, "reward_std": 0.22437193989753723, "rewards/cosine_scaled_reward": -0.0008901059627532959, "rewards/format_reward": 0.5, "step": 209 }, { "completion_length": 1666.6459045410156, "epoch": 0.24, "grad_norm": 0.2683139145374298, "kl": 0.006725311279296875, "learning_rate": 7.472670160550848e-07, "loss": 0.0003, "reward": 0.2863995414227247, "reward_std": 0.15318151377141476, "rewards/cosine_scaled_reward": 0.1167343258857727, "rewards/format_reward": 0.75, "step": 210 }, { "completion_length": 3162.1876220703125, "epoch": 0.24114285714285713, "grad_norm": 0.17517906427383423, "kl": 0.0062408447265625, "learning_rate": 7.444385869608921e-07, "loss": 0.0002, "reward": -0.13221720606088638, "reward_std": 0.19711453840136528, "rewards/cosine_scaled_reward": -0.2389248050749302, "rewards/format_reward": 0.2083333358168602, "step": 211 }, { "completion_length": 2667.0, "epoch": 0.2422857142857143, "grad_norm": 0.214364692568779, "kl": 0.0103759765625, "learning_rate": 7.416006812042827e-07, "loss": 0.0004, "reward": 0.144077368080616, "reward_std": 0.30216942727565765, "rewards/cosine_scaled_reward": 0.024308504071086645, "rewards/format_reward": 0.4791666828095913, "step": 212 }, { "completion_length": 3007.2291870117188, "epoch": 0.24342857142857144, "grad_norm": 0.18140621483325958, "kl": 0.00598907470703125, "learning_rate": 7.387534371007797e-07, "loss": 0.0002, "reward": -0.014454755932092667, "reward_std": 0.13768145814538002, "rewards/cosine_scaled_reward": -0.12324847280979156, "rewards/format_reward": 0.3125000111758709, "step": 213 }, { "completion_length": 2616.979248046875, "epoch": 0.24457142857142858, "grad_norm": 0.15950439870357513, "kl": 0.0040435791015625, "learning_rate": 7.358969934210438e-07, "loss": 0.0002, "reward": 0.09677460975944996, "reward_std": 0.24564701318740845, "rewards/cosine_scaled_reward": -0.07848542928695679, "rewards/format_reward": 0.604166679084301, "step": 214 }, { "completion_length": 3177.3126220703125, "epoch": 0.24571428571428572, "grad_norm": 0.1937507539987564, "kl": 0.0090789794921875, "learning_rate": 7.330314893841101e-07, "loss": 0.0004, "reward": -0.012199342250823975, "reward_std": 0.25811920315027237, "rewards/cosine_scaled_reward": -0.11902385950088501, "rewards/format_reward": 0.3125000111758709, "step": 215 }, { "completion_length": 2959.0000610351562, "epoch": 0.24685714285714286, "grad_norm": 0.17855827510356903, "kl": 0.0058441162109375, "learning_rate": 7.301570646506027e-07, "loss": 0.0002, "reward": -0.07763024140149355, "reward_std": 0.22440576925873756, "rewards/cosine_scaled_reward": -0.2029955154284835, "rewards/format_reward": 0.3125, "step": 216 }, { "completion_length": 2965.291748046875, "epoch": 0.248, "grad_norm": 0.1786404252052307, "kl": 0.0071868896484375, "learning_rate": 7.27273859315928e-07, "loss": 0.0003, "reward": 0.09465384157374501, "reward_std": 0.27161500602960587, "rewards/cosine_scaled_reward": -0.010866086289752275, "rewards/format_reward": 0.3958333432674408, "step": 217 }, { "completion_length": 2379.7709045410156, "epoch": 0.24914285714285714, "grad_norm": 0.19728457927703857, "kl": 0.00604248046875, "learning_rate": 7.243820139034464e-07, "loss": 0.0002, "reward": 0.19949759542942047, "reward_std": 0.25249316543340683, "rewards/cosine_scaled_reward": 0.07985289907082915, "rewards/format_reward": 0.5208333432674408, "step": 218 }, { "completion_length": 2062.8750762939453, "epoch": 0.2502857142857143, "grad_norm": 0.24292118847370148, "kl": 0.0070343017578125, "learning_rate": 7.214816693576234e-07, "loss": 0.0003, "reward": 0.17538858950138092, "reward_std": 0.29175762832164764, "rewards/cosine_scaled_reward": 0.010988058522343636, "rewards/format_reward": 0.6250000204890966, "step": 219 }, { "completion_length": 2316.2083435058594, "epoch": 0.25142857142857145, "grad_norm": 0.23384706676006317, "kl": 0.0063934326171875, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": 0.1457906775176525, "reward_std": 0.3087628111243248, "rewards/cosine_scaled_reward": 0.0019795289263129234, "rewards/format_reward": 0.5625000055879354, "step": 220 }, { "completion_length": 2980.979217529297, "epoch": 0.25257142857142856, "grad_norm": 0.16741648316383362, "kl": 0.00563812255859375, "learning_rate": 7.156560487081051e-07, "loss": 0.0002, "reward": 0.06579325348138809, "reward_std": 0.22489114850759506, "rewards/cosine_scaled_reward": -0.004787076264619827, "rewards/format_reward": 0.2500000111758709, "step": 221 }, { "completion_length": 3021.2291870117188, "epoch": 0.2537142857142857, "grad_norm": 0.16808848083019257, "kl": 0.0062408447265625, "learning_rate": 7.127310565369415e-07, "loss": 0.0002, "reward": -0.12883399985730648, "reward_std": 0.161459781229496, "rewards/cosine_scaled_reward": -0.2722409665584564, "rewards/format_reward": 0.31250000558793545, "step": 222 }, { "completion_length": 2279.0000610351562, "epoch": 0.25485714285714284, "grad_norm": 0.22214451432228088, "kl": 0.006351470947265625, "learning_rate": 7.097981330836616e-07, "loss": 0.0003, "reward": 0.1543485686997883, "reward_std": 0.3306116834282875, "rewards/cosine_scaled_reward": -0.030070938169956207, "rewards/format_reward": 0.6875000298023224, "step": 223 }, { "completion_length": 2914.041748046875, "epoch": 0.256, "grad_norm": 0.19999858736991882, "kl": 0.006381988525390625, "learning_rate": 7.068574212948169e-07, "loss": 0.0003, "reward": 0.12693990394473076, "reward_std": 0.26520421355962753, "rewards/cosine_scaled_reward": 0.027581635862588882, "rewards/format_reward": 0.39583334140479565, "step": 224 }, { "completion_length": 1888.7083435058594, "epoch": 0.2571428571428571, "grad_norm": 0.2063429355621338, "kl": 0.004634857177734375, "learning_rate": 7.039090644965509e-07, "loss": 0.0002, "reward": 0.2952297478914261, "reward_std": 0.3628021404147148, "rewards/cosine_scaled_reward": 0.12950634956359863, "rewards/format_reward": 0.7500000149011612, "step": 225 }, { "completion_length": 2112.5833740234375, "epoch": 0.2582857142857143, "grad_norm": 0.25479966402053833, "kl": 0.0068817138671875, "learning_rate": 7.009532063876148e-07, "loss": 0.0003, "reward": 0.016783216036856174, "reward_std": 0.17216098122298717, "rewards/cosine_scaled_reward": -0.17684979364275932, "rewards/format_reward": 0.6041666716337204, "step": 226 }, { "completion_length": 2380.8958740234375, "epoch": 0.25942857142857145, "grad_norm": 0.26156362891197205, "kl": 0.009124755859375, "learning_rate": 6.979899910323624e-07, "loss": 0.0004, "reward": 0.15587822534143925, "reward_std": 0.19110089540481567, "rewards/cosine_scaled_reward": 0.041788652539253235, "rewards/format_reward": 0.47916667722165585, "step": 227 }, { "completion_length": 2745.1458740234375, "epoch": 0.26057142857142856, "grad_norm": 0.21084943413734436, "kl": 0.008335113525390625, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": -0.011233052238821983, "reward_std": 0.2842288613319397, "rewards/cosine_scaled_reward": -0.13671706430613995, "rewards/format_reward": 0.375, "step": 228 }, { "completion_length": 2820.2084045410156, "epoch": 0.26171428571428573, "grad_norm": 0.2139682024717331, "kl": 0.00830078125, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": -0.028370358049869537, "reward_std": 0.22242224216461182, "rewards/cosine_scaled_reward": -0.14231275767087936, "rewards/format_reward": 0.31250000558793545, "step": 229 }, { "completion_length": 2517.6458435058594, "epoch": 0.26285714285714284, "grad_norm": 0.24873493611812592, "kl": 0.005519866943359375, "learning_rate": 6.890576474687263e-07, "loss": 0.0002, "reward": 0.07593339681625366, "reward_std": 0.2139127440750599, "rewards/cosine_scaled_reward": -0.06714182905852795, "rewards/format_reward": 0.4791666716337204, "step": 230 }, { "completion_length": 3043.354248046875, "epoch": 0.264, "grad_norm": 0.21457058191299438, "kl": 0.005893707275390625, "learning_rate": 6.860664508377001e-07, "loss": 0.0002, "reward": 0.051117491617333144, "reward_std": 0.3278818652033806, "rewards/cosine_scaled_reward": -0.0638924278318882, "rewards/format_reward": 0.3750000074505806, "step": 231 }, { "completion_length": 2739.3959350585938, "epoch": 0.2651428571428571, "grad_norm": 0.17329609394073486, "kl": 0.00769805908203125, "learning_rate": 6.83068622519821e-07, "loss": 0.0003, "reward": 0.021644635125994682, "reward_std": 0.2958267964422703, "rewards/cosine_scaled_reward": -0.11065343767404556, "rewards/format_reward": 0.4166666679084301, "step": 232 }, { "completion_length": 2905.9584350585938, "epoch": 0.2662857142857143, "grad_norm": 0.19478535652160645, "kl": 0.00627899169921875, "learning_rate": 6.800643086250121e-07, "loss": 0.0003, "reward": 0.04925770778208971, "reward_std": 0.27384280413389206, "rewards/cosine_scaled_reward": -0.07401277404278517, "rewards/format_reward": 0.4166666828095913, "step": 233 }, { "completion_length": 2620.4791870117188, "epoch": 0.2674285714285714, "grad_norm": 0.2395622432231903, "kl": 0.005405426025390625, "learning_rate": 6.770536555792944e-07, "loss": 0.0002, "reward": 0.14990946277976036, "reward_std": 0.24759047105908394, "rewards/cosine_scaled_reward": 0.008792880922555923, "rewards/format_reward": 0.5625000149011612, "step": 234 }, { "completion_length": 2266.6458435058594, "epoch": 0.26857142857142857, "grad_norm": 0.22145743668079376, "kl": 0.00689697265625, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.1657513678073883, "reward_std": 0.20570813119411469, "rewards/cosine_scaled_reward": 0.020839732140302658, "rewards/format_reward": 0.5625000074505806, "step": 235 }, { "completion_length": 3213.0834350585938, "epoch": 0.26971428571428574, "grad_norm": 0.16639220714569092, "kl": 0.00777435302734375, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": -0.11140311323106289, "reward_std": 0.2293899804353714, "rewards/cosine_scaled_reward": -0.21164199709892273, "rewards/format_reward": 0.20833333395421505, "step": 236 }, { "completion_length": 2067.041717529297, "epoch": 0.27085714285714285, "grad_norm": 0.22358138859272003, "kl": 0.00661468505859375, "learning_rate": 6.679851303883891e-07, "loss": 0.0003, "reward": 0.17607398540712893, "reward_std": 0.2522607706487179, "rewards/cosine_scaled_reward": 0.010511822998523712, "rewards/format_reward": 0.6458333432674408, "step": 237 }, { "completion_length": 2907.1250610351562, "epoch": 0.272, "grad_norm": 0.1881432682275772, "kl": 0.0087432861328125, "learning_rate": 6.649505910711058e-07, "loss": 0.0004, "reward": 0.05860769626451656, "reward_std": 0.27075210958719254, "rewards/cosine_scaled_reward": -0.06318384176120162, "rewards/format_reward": 0.39583334513008595, "step": 238 }, { "completion_length": 2931.2709350585938, "epoch": 0.27314285714285713, "grad_norm": 0.1813727766275406, "kl": 0.00714111328125, "learning_rate": 6.619104492241847e-07, "loss": 0.0003, "reward": -0.06584363710135221, "reward_std": 0.169918742030859, "rewards/cosine_scaled_reward": -0.18867192044854164, "rewards/format_reward": 0.3125000111758709, "step": 239 }, { "completion_length": 2581.291717529297, "epoch": 0.2742857142857143, "grad_norm": 0.19227683544158936, "kl": 0.00748443603515625, "learning_rate": 6.588648530198504e-07, "loss": 0.0003, "reward": 0.35164642333984375, "reward_std": 0.21003012545406818, "rewards/cosine_scaled_reward": 0.28084333799779415, "rewards/format_reward": 0.5208333414047956, "step": 240 }, { "completion_length": 2414.0000610351562, "epoch": 0.2754285714285714, "grad_norm": 0.2242266684770584, "kl": 0.0083160400390625, "learning_rate": 6.558139508961654e-07, "loss": 0.0003, "reward": 0.0253978269174695, "reward_std": 0.23190999776124954, "rewards/cosine_scaled_reward": -0.15006050281226635, "rewards/format_reward": 0.541666679084301, "step": 241 }, { "completion_length": 2462.979217529297, "epoch": 0.2765714285714286, "grad_norm": 0.2282198965549469, "kl": 0.00994873046875, "learning_rate": 6.527578915497951e-07, "loss": 0.0004, "reward": 0.10085699986666441, "reward_std": 0.30299263074994087, "rewards/cosine_scaled_reward": -0.061745526269078255, "rewards/format_reward": 0.5625000149011612, "step": 242 }, { "completion_length": 2193.8750610351562, "epoch": 0.2777142857142857, "grad_norm": 0.2123919129371643, "kl": 0.00720977783203125, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.05669710412621498, "reward_std": 0.2819325216114521, "rewards/cosine_scaled_reward": -0.12275011092424393, "rewards/format_reward": 0.5833333432674408, "step": 243 }, { "completion_length": 2698.104248046875, "epoch": 0.27885714285714286, "grad_norm": 0.16919925808906555, "kl": 0.005615234375, "learning_rate": 6.466308972251785e-07, "loss": 0.0002, "reward": 0.12239578366279602, "reward_std": 0.3395903930068016, "rewards/cosine_scaled_reward": -0.010242638178169727, "rewards/format_reward": 0.5, "step": 244 }, { "completion_length": 2342.8958740234375, "epoch": 0.28, "grad_norm": 0.22528305649757385, "kl": 0.00743865966796875, "learning_rate": 6.435602608679916e-07, "loss": 0.0003, "reward": 0.0906514166854322, "reward_std": 0.2340186908841133, "rewards/cosine_scaled_reward": -0.07136806473135948, "rewards/format_reward": 0.5625000111758709, "step": 245 }, { "completion_length": 2473.6250610351562, "epoch": 0.28114285714285714, "grad_norm": 0.2235722541809082, "kl": 0.00926971435546875, "learning_rate": 6.404850645156841e-07, "loss": 0.0004, "reward": 0.08842453034594655, "reward_std": 0.2991355210542679, "rewards/cosine_scaled_reward": -0.0602152943611145, "rewards/format_reward": 0.520833358168602, "step": 246 }, { "completion_length": 1907.9792175292969, "epoch": 0.2822857142857143, "grad_norm": 0.24586226046085358, "kl": 0.00750732421875, "learning_rate": 6.374054580489873e-07, "loss": 0.0003, "reward": 0.39153139293193817, "reward_std": 0.43457452207803726, "rewards/cosine_scaled_reward": 0.23923185467720032, "rewards/format_reward": 0.7708333507180214, "step": 247 }, { "completion_length": 2732.729278564453, "epoch": 0.2834285714285714, "grad_norm": 0.35658103227615356, "kl": 0.0088958740234375, "learning_rate": 6.343215915635761e-07, "loss": 0.0004, "reward": 0.06365766655653715, "reward_std": 0.344327948987484, "rewards/cosine_scaled_reward": -0.06142286770045757, "rewards/format_reward": 0.41666667349636555, "step": 248 }, { "completion_length": 3000.3125610351562, "epoch": 0.2845714285714286, "grad_norm": 0.17208239436149597, "kl": 0.006988525390625, "learning_rate": 6.31233615362752e-07, "loss": 0.0003, "reward": 0.09354665782302618, "reward_std": 0.30788997933268547, "rewards/cosine_scaled_reward": 0.0037511512637138367, "rewards/format_reward": 0.33333334513008595, "step": 249 }, { "completion_length": 2316.8333740234375, "epoch": 0.2857142857142857, "grad_norm": 0.20622476935386658, "kl": 0.00716400146484375, "learning_rate": 6.281416799501187e-07, "loss": 0.0003, "reward": 0.21881220489740372, "reward_std": 0.42582786083221436, "rewards/cosine_scaled_reward": 0.07656998513266444, "rewards/format_reward": 0.604166692122817, "step": 250 }, { "completion_length": 3048.8750610351562, "epoch": 0.28685714285714287, "grad_norm": 0.18378515541553497, "kl": 0.008056640625, "learning_rate": 6.25045936022246e-07, "loss": 0.0003, "reward": 0.03178184013813734, "reward_std": 0.3299577981233597, "rewards/cosine_scaled_reward": -0.07074778061360121, "rewards/format_reward": 0.3333333395421505, "step": 251 }, { "completion_length": 2436.041717529297, "epoch": 0.288, "grad_norm": 0.2138858288526535, "kl": 0.007293701171875, "learning_rate": 6.219465344613258e-07, "loss": 0.0003, "reward": 0.16548656299710274, "reward_std": 0.24069152027368546, "rewards/cosine_scaled_reward": 0.003951352089643478, "rewards/format_reward": 0.6041666865348816, "step": 252 }, { "completion_length": 2785.041748046875, "epoch": 0.28914285714285715, "grad_norm": 0.36376261711120605, "kl": 0.01300048828125, "learning_rate": 6.188436263278172e-07, "loss": 0.0005, "reward": -0.04214626643806696, "reward_std": 0.27475107461214066, "rewards/cosine_scaled_reward": -0.18151451274752617, "rewards/format_reward": 0.3750000074505806, "step": 253 }, { "completion_length": 1763.5000610351562, "epoch": 0.29028571428571426, "grad_norm": 0.24696381390094757, "kl": 0.00769805908203125, "learning_rate": 6.157373628530852e-07, "loss": 0.0003, "reward": 0.36900394409894943, "reward_std": 0.284480482339859, "rewards/cosine_scaled_reward": 0.23359035700559616, "rewards/format_reward": 0.7291666939854622, "step": 254 }, { "completion_length": 2458.1459045410156, "epoch": 0.2914285714285714, "grad_norm": 0.21684472262859344, "kl": 0.00672149658203125, "learning_rate": 6.126278954320294e-07, "loss": 0.0003, "reward": 0.18040073942393064, "reward_std": 0.19080796465277672, "rewards/cosine_scaled_reward": 0.06468204781413078, "rewards/format_reward": 0.5000000298023224, "step": 255 }, { "completion_length": 1878.9375305175781, "epoch": 0.2925714285714286, "grad_norm": 0.20044869184494019, "kl": 0.00609588623046875, "learning_rate": 6.095153756157051e-07, "loss": 0.0002, "reward": 0.34711842332035303, "reward_std": 0.1859663426876068, "rewards/cosine_scaled_reward": 0.14590951800346375, "rewards/format_reward": 0.9166666716337204, "step": 256 }, { "completion_length": 2303.916778564453, "epoch": 0.2937142857142857, "grad_norm": 0.2247123122215271, "kl": 0.00698089599609375, "learning_rate": 6.06399955103937e-07, "loss": 0.0003, "reward": -0.01890793489292264, "reward_std": 0.17758683115243912, "rewards/cosine_scaled_reward": -0.22834208607673645, "rewards/format_reward": 0.604166679084301, "step": 257 }, { "completion_length": 2148.8958740234375, "epoch": 0.2948571428571429, "grad_norm": 0.20504705607891083, "kl": 0.0059661865234375, "learning_rate": 6.032817857379256e-07, "loss": 0.0002, "reward": 0.2120364010334015, "reward_std": 0.29609010368585587, "rewards/cosine_scaled_reward": 0.03294338658452034, "rewards/format_reward": 0.7291666716337204, "step": 258 }, { "completion_length": 1559.3542175292969, "epoch": 0.296, "grad_norm": 0.2891344428062439, "kl": 0.00691986083984375, "learning_rate": 6.001610194928464e-07, "loss": 0.0003, "reward": 0.09913755720481277, "reward_std": 0.2612408846616745, "rewards/cosine_scaled_reward": -0.12094379169866443, "rewards/format_reward": 0.7500000149011612, "step": 259 }, { "completion_length": 2554.3959350585938, "epoch": 0.29714285714285715, "grad_norm": 0.20883695781230927, "kl": 0.00881195068359375, "learning_rate": 5.97037808470444e-07, "loss": 0.0004, "reward": 0.01960124378092587, "reward_std": 0.26494671776890755, "rewards/cosine_scaled_reward": -0.14566705003380775, "rewards/format_reward": 0.5, "step": 260 }, { "completion_length": 1605.2083740234375, "epoch": 0.29828571428571427, "grad_norm": 0.23563401401042938, "kl": 0.0057525634765625, "learning_rate": 5.939123048916173e-07, "loss": 0.0002, "reward": 0.05563092790544033, "reward_std": 0.22800205647945404, "rewards/cosine_scaled_reward": -0.18945505563169718, "rewards/format_reward": 0.7708333432674408, "step": 261 }, { "completion_length": 2781.1458740234375, "epoch": 0.29942857142857143, "grad_norm": 0.1892234981060028, "kl": 0.01024627685546875, "learning_rate": 5.907846610890011e-07, "loss": 0.0004, "reward": -0.13614844344556332, "reward_std": 0.15351427905261517, "rewards/cosine_scaled_reward": -0.29414647817611694, "rewards/format_reward": 0.3541666679084301, "step": 262 }, { "completion_length": 1101.1667175292969, "epoch": 0.30057142857142854, "grad_norm": 0.29607102274894714, "kl": 0.0075836181640625, "learning_rate": 5.87655029499542e-07, "loss": 0.0003, "reward": 0.22132008988410234, "reward_std": 0.20786399394273758, "rewards/cosine_scaled_reward": -0.02792854979634285, "rewards/format_reward": 0.9375000149011612, "step": 263 }, { "completion_length": 2106.3333740234375, "epoch": 0.3017142857142857, "grad_norm": 0.22846685349941254, "kl": 0.00617218017578125, "learning_rate": 5.845235626570683e-07, "loss": 0.0002, "reward": 0.17461178824305534, "reward_std": 0.27014725655317307, "rewards/cosine_scaled_reward": -0.04605349525809288, "rewards/format_reward": 0.791666679084301, "step": 264 }, { "completion_length": 2958.354248046875, "epoch": 0.3028571428571429, "grad_norm": 0.18784429132938385, "kl": 0.006103515625, "learning_rate": 5.813904131848564e-07, "loss": 0.0002, "reward": 0.024637079797685146, "reward_std": 0.23631998896598816, "rewards/cosine_scaled_reward": -0.08748724311590195, "rewards/format_reward": 0.3541666679084301, "step": 265 }, { "completion_length": 2931.3750610351562, "epoch": 0.304, "grad_norm": 0.1994905024766922, "kl": 0.0108184814453125, "learning_rate": 5.78255733788191e-07, "loss": 0.0004, "reward": -0.10243229102343321, "reward_std": 0.1996695175766945, "rewards/cosine_scaled_reward": -0.24480824172496796, "rewards/format_reward": 0.33333334513008595, "step": 266 }, { "completion_length": 2326.854248046875, "epoch": 0.30514285714285716, "grad_norm": 0.20324274897575378, "kl": 0.0056915283203125, "learning_rate": 5.751196772469237e-07, "loss": 0.0002, "reward": 0.18746926821768284, "reward_std": 0.24190597608685493, "rewards/cosine_scaled_reward": 0.026176687330007553, "rewards/format_reward": 0.6458333432674408, "step": 267 }, { "completion_length": 2314.5625610351562, "epoch": 0.3062857142857143, "grad_norm": 0.19881677627563477, "kl": 0.0066680908203125, "learning_rate": 5.71982396408026e-07, "loss": 0.0003, "reward": 0.10610839445143938, "reward_std": 0.23754945397377014, "rewards/cosine_scaled_reward": -0.07855316065251827, "rewards/format_reward": 0.6458333432674408, "step": 268 }, { "completion_length": 1667.7708740234375, "epoch": 0.30742857142857144, "grad_norm": 0.2681999206542969, "kl": 0.00818634033203125, "learning_rate": 5.688440441781398e-07, "loss": 0.0003, "reward": 0.13219817238859832, "reward_std": 0.17853804863989353, "rewards/cosine_scaled_reward": -0.08864733949303627, "rewards/format_reward": 0.7708333432674408, "step": 269 }, { "completion_length": 1063.7291717529297, "epoch": 0.30857142857142855, "grad_norm": 0.2545178532600403, "kl": 0.0053863525390625, "learning_rate": 5.657047735161255e-07, "loss": 0.0002, "reward": 0.23251443728804588, "reward_std": 0.2816649228334427, "rewards/cosine_scaled_reward": -0.02202252857387066, "rewards/format_reward": 0.9583333432674408, "step": 270 }, { "completion_length": 2213.3958740234375, "epoch": 0.3097142857142857, "grad_norm": 0.186069056391716, "kl": 0.00629425048828125, "learning_rate": 5.625647374256061e-07, "loss": 0.0003, "reward": 0.2779443562030792, "reward_std": 0.22687088139355183, "rewards/cosine_scaled_reward": 0.13549616560339928, "rewards/format_reward": 0.6666666772216558, "step": 271 }, { "completion_length": 3206.4375610351562, "epoch": 0.31085714285714283, "grad_norm": 0.17066144943237305, "kl": 0.0094451904296875, "learning_rate": 5.594240889475106e-07, "loss": 0.0004, "reward": -0.0621743812225759, "reward_std": 0.26253468357026577, "rewards/cosine_scaled_reward": -0.15979204699397087, "rewards/format_reward": 0.2500000037252903, "step": 272 }, { "completion_length": 2643.3125610351562, "epoch": 0.312, "grad_norm": 0.2409026324748993, "kl": 0.00916290283203125, "learning_rate": 5.562829811526154e-07, "loss": 0.0004, "reward": -0.04340513329952955, "reward_std": 0.22447111830115318, "rewards/cosine_scaled_reward": -0.21603597328066826, "rewards/format_reward": 0.479166679084301, "step": 273 }, { "completion_length": 2134.4583740234375, "epoch": 0.31314285714285717, "grad_norm": 0.22451116144657135, "kl": 0.0064697265625, "learning_rate": 5.531415671340826e-07, "loss": 0.0003, "reward": 0.15485219238325953, "reward_std": 0.27546945214271545, "rewards/cosine_scaled_reward": -0.03998738154768944, "rewards/format_reward": 0.7083333432674408, "step": 274 }, { "completion_length": 2609.3333740234375, "epoch": 0.3142857142857143, "grad_norm": 0.2192443460226059, "kl": 0.0079498291015625, "learning_rate": 5.5e-07, "loss": 0.0003, "reward": 0.07758530229330063, "reward_std": 0.25556471943855286, "rewards/cosine_scaled_reward": -0.09019988495856524, "rewards/format_reward": 0.5625000149011612, "step": 275 }, { "completion_length": 1755.5625915527344, "epoch": 0.31542857142857145, "grad_norm": 0.2460157871246338, "kl": 0.0076751708984375, "learning_rate": 5.468584328659172e-07, "loss": 0.0003, "reward": 0.211357356980443, "reward_std": 0.2688931114971638, "rewards/cosine_scaled_reward": 0.01429477147758007, "rewards/format_reward": 0.7708333432674408, "step": 276 }, { "completion_length": 1832.6250305175781, "epoch": 0.31657142857142856, "grad_norm": 0.2850809395313263, "kl": 0.00719451904296875, "learning_rate": 5.437170188473847e-07, "loss": 0.0003, "reward": 0.1353502692654729, "reward_std": 0.28056155517697334, "rewards/cosine_scaled_reward": -0.0737249106168747, "rewards/format_reward": 0.7291666865348816, "step": 277 }, { "completion_length": 2390.979217529297, "epoch": 0.3177142857142857, "grad_norm": 0.2092747986316681, "kl": 0.0069122314453125, "learning_rate": 5.405759110524894e-07, "loss": 0.0003, "reward": 0.24532984290271997, "reward_std": 0.2878542058169842, "rewards/cosine_scaled_reward": 0.0971972830593586, "rewards/format_reward": 0.6458333544433117, "step": 278 }, { "completion_length": 2098.854248046875, "epoch": 0.31885714285714284, "grad_norm": 0.235019713640213, "kl": 0.00823211669921875, "learning_rate": 5.37435262574394e-07, "loss": 0.0003, "reward": 0.10473778727464378, "reward_std": 0.26891491189599037, "rewards/cosine_scaled_reward": -0.07466491125524044, "rewards/format_reward": 0.6250000074505806, "step": 279 }, { "completion_length": 1949.9584350585938, "epoch": 0.32, "grad_norm": 0.16544176638126373, "kl": 0.005886077880859375, "learning_rate": 5.342952264838747e-07, "loss": 0.0002, "reward": 0.25803492963314056, "reward_std": 0.22913503646850586, "rewards/cosine_scaled_reward": 0.03127211146056652, "rewards/format_reward": 0.8958333432674408, "step": 280 }, { "completion_length": 1686.1875305175781, "epoch": 0.3211428571428571, "grad_norm": 0.231206014752388, "kl": 0.007114410400390625, "learning_rate": 5.311559558218603e-07, "loss": 0.0003, "reward": 0.29051591642200947, "reward_std": 0.301223948597908, "rewards/cosine_scaled_reward": 0.09080295264720917, "rewards/format_reward": 0.8541666716337204, "step": 281 }, { "completion_length": 2663.0000915527344, "epoch": 0.3222857142857143, "grad_norm": 0.16389000415802002, "kl": 0.0096282958984375, "learning_rate": 5.28017603591974e-07, "loss": 0.0004, "reward": 0.10802310952567495, "reward_std": 0.37506720423698425, "rewards/cosine_scaled_reward": -0.0454124566167593, "rewards/format_reward": 0.5625000149011612, "step": 282 }, { "completion_length": 1539.3959045410156, "epoch": 0.32342857142857145, "grad_norm": 0.2511042654514313, "kl": 0.0092620849609375, "learning_rate": 5.248803227530763e-07, "loss": 0.0004, "reward": 0.06443386664614081, "reward_std": 0.17517686262726784, "rewards/cosine_scaled_reward": -0.1600881894119084, "rewards/format_reward": 0.7291666716337204, "step": 283 }, { "completion_length": 1875.6458435058594, "epoch": 0.32457142857142857, "grad_norm": 0.2905344069004059, "kl": 0.00885009765625, "learning_rate": 5.21744266211809e-07, "loss": 0.0004, "reward": 0.15612581057939678, "reward_std": 0.26716162264347076, "rewards/cosine_scaled_reward": -0.07781252264976501, "rewards/format_reward": 0.8333333432674408, "step": 284 }, { "completion_length": 2329.666717529297, "epoch": 0.32571428571428573, "grad_norm": 0.1955825239419937, "kl": 0.00830841064453125, "learning_rate": 5.186095868151436e-07, "loss": 0.0003, "reward": -0.014332971069961786, "reward_std": 0.2384210340678692, "rewards/cosine_scaled_reward": -0.22018385492265224, "rewards/format_reward": 0.6041666716337204, "step": 285 }, { "completion_length": 1986.8126220703125, "epoch": 0.32685714285714285, "grad_norm": 0.2045590579509735, "kl": 0.00740814208984375, "learning_rate": 5.154764373429315e-07, "loss": 0.0003, "reward": 0.2143438160419464, "reward_std": 0.33449699729681015, "rewards/cosine_scaled_reward": 0.02582629583775997, "rewards/format_reward": 0.7500000149011612, "step": 286 }, { "completion_length": 2453.8333740234375, "epoch": 0.328, "grad_norm": 0.1958751082420349, "kl": 0.0113067626953125, "learning_rate": 5.123449705004581e-07, "loss": 0.0005, "reward": -0.0014798734337091446, "reward_std": 0.20798438042402267, "rewards/cosine_scaled_reward": -0.1620903555303812, "rewards/format_reward": 0.4791666865348816, "step": 287 }, { "completion_length": 1948.229248046875, "epoch": 0.3291428571428571, "grad_norm": 0.27305203676223755, "kl": 0.011627197265625, "learning_rate": 5.09215338910999e-07, "loss": 0.0005, "reward": 0.1596199981868267, "reward_std": 0.16709502786397934, "rewards/cosine_scaled_reward": -0.02836191887035966, "rewards/format_reward": 0.708333358168602, "step": 288 }, { "completion_length": 1896.1250305175781, "epoch": 0.3302857142857143, "grad_norm": 0.2106921523809433, "kl": 0.00792694091796875, "learning_rate": 5.060876951083828e-07, "loss": 0.0003, "reward": 0.1784962136298418, "reward_std": 0.19687382131814957, "rewards/cosine_scaled_reward": -0.04321554955095053, "rewards/format_reward": 0.8125000149011612, "step": 289 }, { "completion_length": 1664.1458587646484, "epoch": 0.3314285714285714, "grad_norm": 0.21843799948692322, "kl": 0.0081329345703125, "learning_rate": 5.02962191529556e-07, "loss": 0.0003, "reward": 0.5732791125774384, "reward_std": 0.17374157533049583, "rewards/cosine_scaled_reward": 0.41034475713968277, "rewards/format_reward": 0.9791666716337204, "step": 290 }, { "completion_length": 1389.666732788086, "epoch": 0.3325714285714286, "grad_norm": 0.31224629282951355, "kl": 0.00982666015625, "learning_rate": 4.998389805071536e-07, "loss": 0.0004, "reward": 0.2626770590431988, "reward_std": 0.22773029655218124, "rewards/cosine_scaled_reward": 0.07469475641846657, "rewards/format_reward": 0.7916666716337204, "step": 291 }, { "completion_length": 1609.6458740234375, "epoch": 0.33371428571428574, "grad_norm": 0.27222633361816406, "kl": 0.009613037109375, "learning_rate": 4.967182142620745e-07, "loss": 0.0004, "reward": 0.10958927869796753, "reward_std": 0.2859853506088257, "rewards/cosine_scaled_reward": -0.1261476818472147, "rewards/format_reward": 0.7916666716337204, "step": 292 }, { "completion_length": 2273.5834350585938, "epoch": 0.33485714285714285, "grad_norm": 0.2254333794116974, "kl": 0.01031494140625, "learning_rate": 4.93600044896063e-07, "loss": 0.0004, "reward": 0.10152808134444058, "reward_std": 0.29285991936922073, "rewards/cosine_scaled_reward": -0.08520433306694031, "rewards/format_reward": 0.6458333432674408, "step": 293 }, { "completion_length": 1862.7917175292969, "epoch": 0.336, "grad_norm": 0.186554417014122, "kl": 0.00743865966796875, "learning_rate": 4.904846243842949e-07, "loss": 0.0003, "reward": 0.2801675386726856, "reward_std": 0.3269800990819931, "rewards/cosine_scaled_reward": 0.0798236517002806, "rewards/format_reward": 0.8333333432674408, "step": 294 }, { "completion_length": 2408.729248046875, "epoch": 0.33714285714285713, "grad_norm": 0.1892344355583191, "kl": 0.0077056884765625, "learning_rate": 4.873721045679706e-07, "loss": 0.0003, "reward": 0.1762567274272442, "reward_std": 0.2879105731844902, "rewards/cosine_scaled_reward": 0.032621614169329405, "rewards/format_reward": 0.5833333432674408, "step": 295 }, { "completion_length": 1477.7708892822266, "epoch": 0.3382857142857143, "grad_norm": 0.3810408413410187, "kl": 0.007415771484375, "learning_rate": 4.842626371469149e-07, "loss": 0.0003, "reward": 0.30762497521936893, "reward_std": 0.30592041462659836, "rewards/cosine_scaled_reward": 0.09398248294746736, "rewards/format_reward": 0.8958333432674408, "step": 296 }, { "completion_length": 2005.8333740234375, "epoch": 0.3394285714285714, "grad_norm": 0.23766450583934784, "kl": 0.0084686279296875, "learning_rate": 4.811563736721829e-07, "loss": 0.0003, "reward": 0.10483757872134447, "reward_std": 0.23863688111305237, "rewards/cosine_scaled_reward": -0.0952923595905304, "rewards/format_reward": 0.6875000149011612, "step": 297 }, { "completion_length": 1948.2709045410156, "epoch": 0.3405714285714286, "grad_norm": 0.22391702234745026, "kl": 0.0073699951171875, "learning_rate": 4.780534655386743e-07, "loss": 0.0003, "reward": 0.1197497546672821, "reward_std": 0.2647099569439888, "rewards/cosine_scaled_reward": -0.10869306535460055, "rewards/format_reward": 0.7916666716337204, "step": 298 }, { "completion_length": 1697.2084045410156, "epoch": 0.3417142857142857, "grad_norm": 0.2413056641817093, "kl": 0.00830078125, "learning_rate": 4.749540639777539e-07, "loss": 0.0003, "reward": 0.18542808666825294, "reward_std": 0.27115339785814285, "rewards/cosine_scaled_reward": -0.01068168506026268, "rewards/format_reward": 0.7500000149011612, "step": 299 }, { "completion_length": 1569.3125457763672, "epoch": 0.34285714285714286, "grad_norm": 0.21255984902381897, "kl": 0.00722503662109375, "learning_rate": 4.7185832004988133e-07, "loss": 0.0003, "reward": 0.3450528532266617, "reward_std": 0.2651615794748068, "rewards/cosine_scaled_reward": 0.15506408301735064, "rewards/format_reward": 0.8541666716337204, "step": 300 }, { "completion_length": 2331.666717529297, "epoch": 0.344, "grad_norm": 0.27719616889953613, "kl": 0.0101776123046875, "learning_rate": 4.68766384637248e-07, "loss": 0.0004, "reward": 0.16701876651495695, "reward_std": 0.26342786848545074, "rewards/cosine_scaled_reward": 0.03490189649164677, "rewards/format_reward": 0.5416666902601719, "step": 301 }, { "completion_length": 2015.4584045410156, "epoch": 0.34514285714285714, "grad_norm": 0.27819642424583435, "kl": 0.00850677490234375, "learning_rate": 4.656784084364238e-07, "loss": 0.0003, "reward": 0.09659364819526672, "reward_std": 0.12537105567753315, "rewards/cosine_scaled_reward": -0.09112071990966797, "rewards/format_reward": 0.6458333432674408, "step": 302 }, { "completion_length": 1485.9166870117188, "epoch": 0.3462857142857143, "grad_norm": 0.26828280091285706, "kl": 0.00751495361328125, "learning_rate": 4.6259454195101267e-07, "loss": 0.0003, "reward": 0.25270497193560004, "reward_std": 0.2928587533533573, "rewards/cosine_scaled_reward": 0.027904170332476497, "rewards/format_reward": 0.8750000149011612, "step": 303 }, { "completion_length": 1409.7917175292969, "epoch": 0.3474285714285714, "grad_norm": 0.2540818452835083, "kl": 0.006877899169921875, "learning_rate": 4.59514935484316e-07, "loss": 0.0003, "reward": 0.5040274113416672, "reward_std": 0.35688790678977966, "rewards/cosine_scaled_reward": 0.34280968457460403, "rewards/format_reward": 0.8958333432674408, "step": 304 }, { "completion_length": 2382.0208740234375, "epoch": 0.3485714285714286, "grad_norm": 0.19963636994361877, "kl": 0.009735107421875, "learning_rate": 4.5643973913200837e-07, "loss": 0.0004, "reward": 0.11577347759157419, "reward_std": 0.30082143656909466, "rewards/cosine_scaled_reward": -0.03267804719507694, "rewards/format_reward": 0.5416666865348816, "step": 305 }, { "completion_length": 1835.479248046875, "epoch": 0.3497142857142857, "grad_norm": 0.28124022483825684, "kl": 0.009246826171875, "learning_rate": 4.5336910277482155e-07, "loss": 0.0004, "reward": 0.24207957834005356, "reward_std": 0.2189928311854601, "rewards/cosine_scaled_reward": 0.0470082089304924, "rewards/format_reward": 0.7916666865348816, "step": 306 }, { "completion_length": 1760.8334197998047, "epoch": 0.35085714285714287, "grad_norm": 0.22857794165611267, "kl": 0.00923919677734375, "learning_rate": 4.503031760712397e-07, "loss": 0.0004, "reward": 0.4835302531719208, "reward_std": 0.3119688630104065, "rewards/cosine_scaled_reward": 0.32135529443621635, "rewards/format_reward": 0.895833358168602, "step": 307 }, { "completion_length": 1406.291732788086, "epoch": 0.352, "grad_norm": 0.28863319754600525, "kl": 0.00827789306640625, "learning_rate": 4.4724210845020494e-07, "loss": 0.0003, "reward": 0.2202434539794922, "reward_std": 0.25928834080696106, "rewards/cosine_scaled_reward": 0.013711273670196533, "rewards/format_reward": 0.8125000074505806, "step": 308 }, { "completion_length": 1427.7708587646484, "epoch": 0.35314285714285715, "grad_norm": 0.2582438886165619, "kl": 0.00743865966796875, "learning_rate": 4.441860491038345e-07, "loss": 0.0003, "reward": 0.38336939364671707, "reward_std": 0.34063655138015747, "rewards/cosine_scaled_reward": 0.17991949617862701, "rewards/format_reward": 0.9375000149011612, "step": 309 }, { "completion_length": 1809.1250610351562, "epoch": 0.35428571428571426, "grad_norm": 0.2695036828517914, "kl": 0.0096435546875, "learning_rate": 4.4113514698014953e-07, "loss": 0.0004, "reward": 0.21986294770613313, "reward_std": 0.18356493674218655, "rewards/cosine_scaled_reward": 0.044077259954065084, "rewards/format_reward": 0.7083333432674408, "step": 310 }, { "completion_length": 1884.6459197998047, "epoch": 0.3554285714285714, "grad_norm": 0.29340508580207825, "kl": 0.0126800537109375, "learning_rate": 4.3808955077581546e-07, "loss": 0.0005, "reward": 0.1723873894661665, "reward_std": 0.268467765301466, "rewards/cosine_scaled_reward": -0.022497177124023438, "rewards/format_reward": 0.7291666865348816, "step": 311 }, { "completion_length": 1841.6666717529297, "epoch": 0.3565714285714286, "grad_norm": 0.24835428595542908, "kl": 0.007354736328125, "learning_rate": 4.350494089288943e-07, "loss": 0.0003, "reward": 0.16523393616080284, "reward_std": 0.24913058057427406, "rewards/cosine_scaled_reward": -0.044235333101823926, "rewards/format_reward": 0.7708333432674408, "step": 312 }, { "completion_length": 1764.6042175292969, "epoch": 0.3577142857142857, "grad_norm": 0.2532818615436554, "kl": 0.00913238525390625, "learning_rate": 4.3201486961161093e-07, "loss": 0.0004, "reward": 0.2583326920866966, "reward_std": 0.26906657963991165, "rewards/cosine_scaled_reward": 0.05645761638879776, "rewards/format_reward": 0.8125000149011612, "step": 313 }, { "completion_length": 1522.0833435058594, "epoch": 0.3588571428571429, "grad_norm": 0.23159317672252655, "kl": 0.0091400146484375, "learning_rate": 4.2898608072313045e-07, "loss": 0.0004, "reward": 0.23400770500302315, "reward_std": 0.28054384142160416, "rewards/cosine_scaled_reward": 0.020135470665991306, "rewards/format_reward": 0.8333333432674408, "step": 314 }, { "completion_length": 1458.8958587646484, "epoch": 0.36, "grad_norm": 0.3017834722995758, "kl": 0.0092010498046875, "learning_rate": 4.2596318988235037e-07, "loss": 0.0004, "reward": 0.1311683924868703, "reward_std": 0.282433345913887, "rewards/cosine_scaled_reward": -0.08330600336194038, "rewards/format_reward": 0.7500000149011612, "step": 315 }, { "completion_length": 2081.9584045410156, "epoch": 0.36114285714285715, "grad_norm": 0.22785058617591858, "kl": 0.0100555419921875, "learning_rate": 4.2294634442070553e-07, "loss": 0.0004, "reward": 0.02211561892181635, "reward_std": 0.21739701926708221, "rewards/cosine_scaled_reward": -0.1856558918952942, "rewards/format_reward": 0.645833358168602, "step": 316 }, { "completion_length": 2084.8958435058594, "epoch": 0.36228571428571427, "grad_norm": 0.23225082457065582, "kl": 0.0111846923828125, "learning_rate": 4.1993569137498776e-07, "loss": 0.0004, "reward": 0.24845389276742935, "reward_std": 0.3764058109372854, "rewards/cosine_scaled_reward": 0.08984526246786118, "rewards/format_reward": 0.6875000149011612, "step": 317 }, { "completion_length": 1409.4584045410156, "epoch": 0.36342857142857143, "grad_norm": 0.29590070247650146, "kl": 0.0090789794921875, "learning_rate": 4.1693137748017915e-07, "loss": 0.0004, "reward": 0.24361606128513813, "reward_std": 0.24939386546611786, "rewards/cosine_scaled_reward": 0.04053226858377457, "rewards/format_reward": 0.8125000149011612, "step": 318 }, { "completion_length": 2240.375030517578, "epoch": 0.36457142857142855, "grad_norm": 0.18136313557624817, "kl": 0.00847625732421875, "learning_rate": 4.1393354916230005e-07, "loss": 0.0003, "reward": 0.08488270919770002, "reward_std": 0.22877753525972366, "rewards/cosine_scaled_reward": -0.10751745477318764, "rewards/format_reward": 0.6458333432674408, "step": 319 }, { "completion_length": 1544.0833740234375, "epoch": 0.3657142857142857, "grad_norm": 0.20819739997386932, "kl": 0.00759124755859375, "learning_rate": 4.1094235253127374e-07, "loss": 0.0003, "reward": 0.2654244042932987, "reward_std": 0.38539183512330055, "rewards/cosine_scaled_reward": 0.039359224028885365, "rewards/format_reward": 0.8958333432674408, "step": 320 }, { "completion_length": 1928.1042175292969, "epoch": 0.3668571428571429, "grad_norm": 0.24936559796333313, "kl": 0.010467529296875, "learning_rate": 4.079579333738039e-07, "loss": 0.0004, "reward": 0.2094899509102106, "reward_std": 0.2675232719630003, "rewards/cosine_scaled_reward": 0.017967309337109327, "rewards/format_reward": 0.7500000149011612, "step": 321 }, { "completion_length": 1580.6875915527344, "epoch": 0.368, "grad_norm": 0.20408599078655243, "kl": 0.00743865966796875, "learning_rate": 4.0498043714627006e-07, "loss": 0.0003, "reward": 0.26650879299268126, "reward_std": 0.291965126991272, "rewards/cosine_scaled_reward": 0.060018595308065414, "rewards/format_reward": 0.8541666865348816, "step": 322 }, { "completion_length": 1619.5833740234375, "epoch": 0.36914285714285716, "grad_norm": 0.32702451944351196, "kl": 0.01053619384765625, "learning_rate": 4.020100089676376e-07, "loss": 0.0004, "reward": 0.2938319506647531, "reward_std": 0.23641689121723175, "rewards/cosine_scaled_reward": 0.10801427066326141, "rewards/format_reward": 0.8125000149011612, "step": 323 }, { "completion_length": 2247.1876220703125, "epoch": 0.3702857142857143, "grad_norm": 0.17617079615592957, "kl": 0.01004791259765625, "learning_rate": 3.9904679361238526e-07, "loss": 0.0004, "reward": 0.06403969042003155, "reward_std": 0.18300126865506172, "rewards/cosine_scaled_reward": -0.14668823406100273, "rewards/format_reward": 0.6875, "step": 324 }, { "completion_length": 1287.2916717529297, "epoch": 0.37142857142857144, "grad_norm": 0.2974833548069, "kl": 0.0090179443359375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0004, "reward": 0.2224448136985302, "reward_std": 0.23644054681062698, "rewards/cosine_scaled_reward": -0.036079500801861286, "rewards/format_reward": 0.9583333432674408, "step": 325 }, { "completion_length": 2086.7709045410156, "epoch": 0.37257142857142855, "grad_norm": 0.2205517441034317, "kl": 0.0098724365234375, "learning_rate": 3.931425787051832e-07, "loss": 0.0004, "reward": 0.09755693469196558, "reward_std": 0.21148419380187988, "rewards/cosine_scaled_reward": -0.13864601170644164, "rewards/format_reward": 0.7916666865348816, "step": 326 }, { "completion_length": 1831.3958740234375, "epoch": 0.3737142857142857, "grad_norm": 0.21228300034999847, "kl": 0.00830078125, "learning_rate": 3.902018669163384e-07, "loss": 0.0003, "reward": 0.33440493792295456, "reward_std": 0.3173820227384567, "rewards/cosine_scaled_reward": 0.1545264758169651, "rewards/format_reward": 0.833333358168602, "step": 327 }, { "completion_length": 1920.5000610351562, "epoch": 0.37485714285714283, "grad_norm": 0.2396845668554306, "kl": 0.009307861328125, "learning_rate": 3.872689434630585e-07, "loss": 0.0004, "reward": 0.2613999950699508, "reward_std": 0.3687175065279007, "rewards/cosine_scaled_reward": 0.07327798567712307, "rewards/format_reward": 0.7916666716337204, "step": 328 }, { "completion_length": 1893.8125610351562, "epoch": 0.376, "grad_norm": 0.20392858982086182, "kl": 0.008724212646484375, "learning_rate": 3.843439512918949e-07, "loss": 0.0003, "reward": 0.19521827809512615, "reward_std": 0.17398589849472046, "rewards/cosine_scaled_reward": -0.021663442254066467, "rewards/format_reward": 0.8125000074505806, "step": 329 }, { "completion_length": 1882.9583740234375, "epoch": 0.37714285714285717, "grad_norm": 0.21649308502674103, "kl": 0.00913238525390625, "learning_rate": 3.8142703296283953e-07, "loss": 0.0004, "reward": 0.13386597856879234, "reward_std": 0.27085599303245544, "rewards/cosine_scaled_reward": -0.09174733608961105, "rewards/format_reward": 0.7916666865348816, "step": 330 }, { "completion_length": 2221.0833740234375, "epoch": 0.3782857142857143, "grad_norm": 0.19338715076446533, "kl": 0.0114288330078125, "learning_rate": 3.785183306423767e-07, "loss": 0.0005, "reward": 0.03328318754211068, "reward_std": 0.2135927379131317, "rewards/cosine_scaled_reward": -0.1791954692453146, "rewards/format_reward": 0.6666666716337204, "step": 331 }, { "completion_length": 1648.7708587646484, "epoch": 0.37942857142857145, "grad_norm": 0.2684015929698944, "kl": 0.0113983154296875, "learning_rate": 3.7561798609655373e-07, "loss": 0.0005, "reward": 0.27905246056616306, "reward_std": 0.2478124052286148, "rewards/cosine_scaled_reward": 0.08956634253263474, "rewards/format_reward": 0.791666679084301, "step": 332 }, { "completion_length": 1530.3750610351562, "epoch": 0.38057142857142856, "grad_norm": 0.2717783451080322, "kl": 0.00899505615234375, "learning_rate": 3.72726140684072e-07, "loss": 0.0004, "reward": 0.05967093911021948, "reward_std": 0.20809690654277802, "rewards/cosine_scaled_reward": -0.20973167195916176, "rewards/format_reward": 0.8541666865348816, "step": 333 }, { "completion_length": 2227.5625610351562, "epoch": 0.38171428571428573, "grad_norm": 0.23560327291488647, "kl": 0.01085662841796875, "learning_rate": 3.6984293534939737e-07, "loss": 0.0004, "reward": 0.11222782591357827, "reward_std": 0.2779008559882641, "rewards/cosine_scaled_reward": -0.09507040795870125, "rewards/format_reward": 0.708333358168602, "step": 334 }, { "completion_length": 1943.4375915527344, "epoch": 0.38285714285714284, "grad_norm": 0.24016280472278595, "kl": 0.011260986328125, "learning_rate": 3.6696851061588994e-07, "loss": 0.0005, "reward": 0.19086962472647429, "reward_std": 0.2430224046111107, "rewards/cosine_scaled_reward": -0.0018318742513656616, "rewards/format_reward": 0.7291666716337204, "step": 335 }, { "completion_length": 1407.2708740234375, "epoch": 0.384, "grad_norm": 0.28007882833480835, "kl": 0.00952911376953125, "learning_rate": 3.641030065789562e-07, "loss": 0.0004, "reward": 0.05958843324333429, "reward_std": 0.1869518756866455, "rewards/cosine_scaled_reward": -0.20403875783085823, "rewards/format_reward": 0.833333358168602, "step": 336 }, { "completion_length": 1033.5625305175781, "epoch": 0.3851428571428571, "grad_norm": 0.3786531686782837, "kl": 0.0092010498046875, "learning_rate": 3.612465628992203e-07, "loss": 0.0004, "reward": 0.20698048546910286, "reward_std": 0.29393167048692703, "rewards/cosine_scaled_reward": -0.0597881693392992, "rewards/format_reward": 0.9791666716337204, "step": 337 }, { "completion_length": 2073.812530517578, "epoch": 0.3862857142857143, "grad_norm": 0.23650962114334106, "kl": 0.0127410888671875, "learning_rate": 3.5839931879571725e-07, "loss": 0.0005, "reward": 0.12870828062295914, "reward_std": 0.20198489911854267, "rewards/cosine_scaled_reward": -0.028040427714586258, "rewards/format_reward": 0.5833333432674408, "step": 338 }, { "completion_length": 2329.0833740234375, "epoch": 0.38742857142857146, "grad_norm": 0.18778569996356964, "kl": 0.0103607177734375, "learning_rate": 3.555614130391079e-07, "loss": 0.0004, "reward": 0.18835976673290133, "reward_std": 0.3419927656650543, "rewards/cosine_scaled_reward": -0.004078354686498642, "rewards/format_reward": 0.7500000223517418, "step": 339 }, { "completion_length": 1800.1042175292969, "epoch": 0.38857142857142857, "grad_norm": 0.24106545746326447, "kl": 0.0111236572265625, "learning_rate": 3.5273298394491515e-07, "loss": 0.0004, "reward": 0.32668111473321915, "reward_std": 0.3139052540063858, "rewards/cosine_scaled_reward": 0.1498567252419889, "rewards/format_reward": 0.7916666865348816, "step": 340 }, { "completion_length": 2232.354248046875, "epoch": 0.38971428571428574, "grad_norm": 0.29757753014564514, "kl": 0.01445770263671875, "learning_rate": 3.4991416936678276e-07, "loss": 0.0006, "reward": 0.284415390342474, "reward_std": 0.431053563952446, "rewards/cosine_scaled_reward": 0.15291668381541967, "rewards/format_reward": 0.6250000298023224, "step": 341 }, { "completion_length": 1537.2500610351562, "epoch": 0.39085714285714285, "grad_norm": 0.2549525797367096, "kl": 0.009307861328125, "learning_rate": 3.471051066897562e-07, "loss": 0.0004, "reward": 0.12320299912244081, "reward_std": 0.18919622898101807, "rewards/cosine_scaled_reward": -0.12590695265680552, "rewards/format_reward": 0.8541666865348816, "step": 342 }, { "completion_length": 2050.4166870117188, "epoch": 0.392, "grad_norm": 0.233432337641716, "kl": 0.01062774658203125, "learning_rate": 3.4430593282358777e-07, "loss": 0.0004, "reward": 0.43845948576927185, "reward_std": 0.2883150465786457, "rewards/cosine_scaled_reward": 0.28409258276224136, "rewards/format_reward": 0.833333358168602, "step": 343 }, { "completion_length": 1527.2500610351562, "epoch": 0.3931428571428571, "grad_norm": 0.28435570001602173, "kl": 0.0095977783203125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0004, "reward": 0.2925238283351064, "reward_std": 0.2560243997722864, "rewards/cosine_scaled_reward": 0.07367146760225296, "rewards/format_reward": 0.8958333432674408, "step": 344 }, { "completion_length": 1440.1666870117188, "epoch": 0.3942857142857143, "grad_norm": 0.23948732018470764, "kl": 0.00936126708984375, "learning_rate": 3.387377967463493e-07, "loss": 0.0004, "reward": 0.1968644427252002, "reward_std": 0.1854616329073906, "rewards/cosine_scaled_reward": -0.031224748119711876, "rewards/format_reward": 0.8541666716337204, "step": 345 }, { "completion_length": 1666.041748046875, "epoch": 0.3954285714285714, "grad_norm": 0.2333936244249344, "kl": 0.0108184814453125, "learning_rate": 3.359691059183761e-07, "loss": 0.0004, "reward": 0.14479913003742695, "reward_std": 0.2107578031718731, "rewards/cosine_scaled_reward": -0.074419766664505, "rewards/format_reward": 0.7916666716337204, "step": 346 }, { "completion_length": 1407.9167175292969, "epoch": 0.3965714285714286, "grad_norm": 0.2971218228340149, "kl": 0.00939178466796875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0004, "reward": 0.38189948722720146, "reward_std": 0.325411569327116, "rewards/cosine_scaled_reward": 0.20823876559734344, "rewards/format_reward": 0.8541666865348816, "step": 347 }, { "completion_length": 1421.041732788086, "epoch": 0.3977142857142857, "grad_norm": 0.2317306399345398, "kl": 0.00820159912109375, "learning_rate": 3.3046315338757026e-07, "loss": 0.0003, "reward": 0.1995458109304309, "reward_std": 0.2409229427576065, "rewards/cosine_scaled_reward": -0.03460083529353142, "rewards/format_reward": 0.875, "step": 348 }, { "completion_length": 1281.4375305175781, "epoch": 0.39885714285714285, "grad_norm": 0.24120782315731049, "kl": 0.007274627685546875, "learning_rate": 3.2772616003709616e-07, "loss": 0.0003, "reward": 0.45311590284109116, "reward_std": 0.2731959819793701, "rewards/cosine_scaled_reward": 0.25743514811620116, "rewards/format_reward": 0.9583333432674408, "step": 349 }, { "completion_length": 1245.7291870117188, "epoch": 0.4, "grad_norm": 0.22964408993721008, "kl": 0.009246826171875, "learning_rate": 3.250000000000001e-07, "loss": 0.0004, "reward": 0.3433077558875084, "reward_std": 0.2860936261713505, "rewards/cosine_scaled_reward": 0.15224721981212497, "rewards/format_reward": 0.8750000149011612, "step": 350 }, { "completion_length": 1456.7083587646484, "epoch": 0.40114285714285713, "grad_norm": 0.28671830892562866, "kl": 0.008087158203125, "learning_rate": 3.222848061454764e-07, "loss": 0.0003, "reward": 0.11195625504478812, "reward_std": 0.2711613141000271, "rewards/cosine_scaled_reward": -0.12700296379625797, "rewards/format_reward": 0.7916666716337204, "step": 351 }, { "completion_length": 1435.0416717529297, "epoch": 0.4022857142857143, "grad_norm": 0.2130868285894394, "kl": 0.0080718994140625, "learning_rate": 3.195807108082429e-07, "loss": 0.0003, "reward": 0.17259665206074715, "reward_std": 0.2315747793763876, "rewards/cosine_scaled_reward": -0.09190645813941956, "rewards/format_reward": 0.9375000149011612, "step": 352 }, { "completion_length": 1706.9583740234375, "epoch": 0.4034285714285714, "grad_norm": 0.21334676444530487, "kl": 0.01042938232421875, "learning_rate": 3.168878457820915e-07, "loss": 0.0004, "reward": 0.18855836428701878, "reward_std": 0.26317035034298897, "rewards/cosine_scaled_reward": -0.05418684799224138, "rewards/format_reward": 0.8958333432674408, "step": 353 }, { "completion_length": 1464.6875305175781, "epoch": 0.4045714285714286, "grad_norm": 0.22004835307598114, "kl": 0.009002685546875, "learning_rate": 3.142063423134644e-07, "loss": 0.0004, "reward": -0.002447195933200419, "reward_std": 0.11259411834180355, "rewards/cosine_scaled_reward": -0.30102576315402985, "rewards/format_reward": 0.8958333432674408, "step": 354 }, { "completion_length": 1175.0208435058594, "epoch": 0.4057142857142857, "grad_norm": 0.3179510831832886, "kl": 0.0092315673828125, "learning_rate": 3.115363310950578e-07, "loss": 0.0004, "reward": 0.20959803275763988, "reward_std": 0.12544530257582664, "rewards/cosine_scaled_reward": -0.0459451749920845, "rewards/format_reward": 0.9375, "step": 355 }, { "completion_length": 1753.0625610351562, "epoch": 0.40685714285714286, "grad_norm": 0.26044961810112, "kl": 0.01105499267578125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0004, "reward": 0.11193252541124821, "reward_std": 0.17821012996137142, "rewards/cosine_scaled_reward": -0.1084582656621933, "rewards/format_reward": 0.7500000149011612, "step": 356 }, { "completion_length": 1933.3750915527344, "epoch": 0.408, "grad_norm": 0.32032617926597595, "kl": 0.0153656005859375, "learning_rate": 3.062313053727671e-07, "loss": 0.0006, "reward": 0.0698192942654714, "reward_std": 0.1654464676976204, "rewards/cosine_scaled_reward": -0.14839602261781693, "rewards/format_reward": 0.7083333432674408, "step": 357 }, { "completion_length": 1849.5625, "epoch": 0.40914285714285714, "grad_norm": 0.21455638110637665, "kl": 0.01059722900390625, "learning_rate": 3.0359654942835247e-07, "loss": 0.0004, "reward": 0.23701468179933727, "reward_std": 0.28458721190690994, "rewards/cosine_scaled_reward": 0.05581399705260992, "rewards/format_reward": 0.7500000074505806, "step": 358 }, { "completion_length": 1714.3333435058594, "epoch": 0.4102857142857143, "grad_norm": 0.23594503104686737, "kl": 0.0106353759765625, "learning_rate": 3.0097380284049523e-07, "loss": 0.0004, "reward": 0.17001938103931025, "reward_std": 0.2589455880224705, "rewards/cosine_scaled_reward": -0.047670695930719376, "rewards/format_reward": 0.7916667014360428, "step": 359 }, { "completion_length": 869.5416870117188, "epoch": 0.4114285714285714, "grad_norm": 0.2539467513561249, "kl": 0.00698089599609375, "learning_rate": 2.9836319343816397e-07, "loss": 0.0003, "reward": 0.38226132094860077, "reward_std": 0.2698897160589695, "rewards/cosine_scaled_reward": 0.1609980291686952, "rewards/format_reward": 0.9791666716337204, "step": 360 }, { "completion_length": 1785.3333587646484, "epoch": 0.4125714285714286, "grad_norm": 0.23149755597114563, "kl": 0.01030731201171875, "learning_rate": 2.9576484845877793e-07, "loss": 0.0004, "reward": 0.09932729974389076, "reward_std": 0.19216609373688698, "rewards/cosine_scaled_reward": -0.08814284577965736, "rewards/format_reward": 0.6458333432674408, "step": 361 }, { "completion_length": 1765.2708435058594, "epoch": 0.4137142857142857, "grad_norm": 0.25148525834083557, "kl": 0.01117706298828125, "learning_rate": 2.931788945420058e-07, "loss": 0.0004, "reward": 0.28765634074807167, "reward_std": 0.28115685284137726, "rewards/cosine_scaled_reward": 0.10722193028777838, "rewards/format_reward": 0.7916666716337204, "step": 362 }, { "completion_length": 1581.7916870117188, "epoch": 0.41485714285714287, "grad_norm": 0.2328980267047882, "kl": 0.0112457275390625, "learning_rate": 2.9060545772359305e-07, "loss": 0.0005, "reward": 0.15995342750102282, "reward_std": 0.3252422660589218, "rewards/cosine_scaled_reward": -0.09024253115057945, "rewards/format_reward": 0.8958333432674408, "step": 363 }, { "completion_length": 1506.9375457763672, "epoch": 0.416, "grad_norm": 0.28264883160591125, "kl": 0.0095977783203125, "learning_rate": 2.8804466342921987e-07, "loss": 0.0004, "reward": 0.08763573569012806, "reward_std": 0.21415090188384056, "rewards/cosine_scaled_reward": -0.16532962024211884, "rewards/format_reward": 0.833333358168602, "step": 364 }, { "completion_length": 2367.4166870117188, "epoch": 0.41714285714285715, "grad_norm": 0.19307875633239746, "kl": 0.014556884765625, "learning_rate": 2.854966364683872e-07, "loss": 0.0006, "reward": 0.20070525258779526, "reward_std": 0.26848191022872925, "rewards/cosine_scaled_reward": 0.022052939981222153, "rewards/format_reward": 0.7083333488553762, "step": 365 }, { "completion_length": 1509.9584045410156, "epoch": 0.41828571428571426, "grad_norm": 0.2306293249130249, "kl": 0.010223388671875, "learning_rate": 2.829615010283344e-07, "loss": 0.0004, "reward": 0.25371203664690256, "reward_std": 0.2659039720892906, "rewards/cosine_scaled_reward": 0.02641349472105503, "rewards/format_reward": 0.8958333432674408, "step": 366 }, { "completion_length": 1335.9167022705078, "epoch": 0.41942857142857143, "grad_norm": 0.24042809009552002, "kl": 0.00992584228515625, "learning_rate": 2.8043938066798645e-07, "loss": 0.0004, "reward": 0.3599996566772461, "reward_std": 0.19493130780756474, "rewards/cosine_scaled_reward": 0.13548098504543304, "rewards/format_reward": 0.9791666716337204, "step": 367 }, { "completion_length": 1371.1666870117188, "epoch": 0.4205714285714286, "grad_norm": 0.22017300128936768, "kl": 0.00868988037109375, "learning_rate": 2.7793039831193133e-07, "loss": 0.0003, "reward": 0.2111358353868127, "reward_std": 0.31247158348560333, "rewards/cosine_scaled_reward": -0.023564637638628483, "rewards/format_reward": 0.875, "step": 368 }, { "completion_length": 1303.375015258789, "epoch": 0.4217142857142857, "grad_norm": 0.27260446548461914, "kl": 0.009395599365234375, "learning_rate": 2.7543467624442956e-07, "loss": 0.0004, "reward": 0.39653632789850235, "reward_std": 0.1950348112732172, "rewards/cosine_scaled_reward": 0.21199024934321642, "rewards/format_reward": 0.8958333432674408, "step": 369 }, { "completion_length": 1707.4584045410156, "epoch": 0.4228571428571429, "grad_norm": 0.2327386736869812, "kl": 0.012054443359375, "learning_rate": 2.729523361034538e-07, "loss": 0.0005, "reward": 0.2546226102858782, "reward_std": 0.27051765471696854, "rewards/cosine_scaled_reward": 0.015558794140815735, "rewards/format_reward": 0.9375000149011612, "step": 370 }, { "completion_length": 1338.8125, "epoch": 0.424, "grad_norm": 0.2991894781589508, "kl": 0.0108642578125, "learning_rate": 2.7048349887476037e-07, "loss": 0.0004, "reward": 0.2581311799585819, "reward_std": 0.3103417530655861, "rewards/cosine_scaled_reward": 0.03514588810503483, "rewards/format_reward": 0.8958333432674408, "step": 371 }, { "completion_length": 1596.3958587646484, "epoch": 0.42514285714285716, "grad_norm": 0.20261621475219727, "kl": 0.00960540771484375, "learning_rate": 2.6802828488599294e-07, "loss": 0.0004, "reward": 0.23256312776356936, "reward_std": 0.182934682816267, "rewards/cosine_scaled_reward": 0.03540460020303726, "rewards/format_reward": 0.7916666865348816, "step": 372 }, { "completion_length": 1823.7917175292969, "epoch": 0.42628571428571427, "grad_norm": 0.2511698305606842, "kl": 0.01041412353515625, "learning_rate": 2.655868138008171e-07, "loss": 0.0004, "reward": 0.13830948527902365, "reward_std": 0.16817106679081917, "rewards/cosine_scaled_reward": -0.08003572002053261, "rewards/format_reward": 0.7708333432674408, "step": 373 }, { "completion_length": 1590.8750305175781, "epoch": 0.42742857142857144, "grad_norm": 0.24217580258846283, "kl": 0.0106048583984375, "learning_rate": 2.631592046130896e-07, "loss": 0.0004, "reward": 0.21486383816227317, "reward_std": 0.2895674481987953, "rewards/cosine_scaled_reward": -0.010343782603740692, "rewards/format_reward": 0.8541666865348816, "step": 374 }, { "completion_length": 1353.1458435058594, "epoch": 0.42857142857142855, "grad_norm": 0.2372012883424759, "kl": 0.01015472412109375, "learning_rate": 2.6074557564105724e-07, "loss": 0.0004, "reward": 0.3424642861355096, "reward_std": 0.25616780295968056, "rewards/cosine_scaled_reward": 0.1377451792359352, "rewards/format_reward": 0.8958333432674408, "step": 375 }, { "completion_length": 1862.2500305175781, "epoch": 0.4297142857142857, "grad_norm": 0.2163545936346054, "kl": 0.01139068603515625, "learning_rate": 2.583460445215911e-07, "loss": 0.0005, "reward": 0.29940345184877515, "reward_std": 0.2624346353113651, "rewards/cosine_scaled_reward": 0.14119264110922813, "rewards/format_reward": 0.7500000298023224, "step": 376 }, { "completion_length": 1338.4375305175781, "epoch": 0.4308571428571429, "grad_norm": 0.25990864634513855, "kl": 0.0098114013671875, "learning_rate": 2.5596072820445254e-07, "loss": 0.0004, "reward": 0.37805385142564774, "reward_std": 0.33201449923217297, "rewards/cosine_scaled_reward": 0.16646008286625147, "rewards/format_reward": 0.9375000149011612, "step": 377 }, { "completion_length": 1609.9583740234375, "epoch": 0.432, "grad_norm": 0.33538058400154114, "kl": 0.0113525390625, "learning_rate": 2.5358974294659373e-07, "loss": 0.0005, "reward": 0.07824116386473179, "reward_std": 0.23690377175807953, "rewards/cosine_scaled_reward": -0.17136050947010517, "rewards/format_reward": 0.8125000149011612, "step": 378 }, { "completion_length": 1537.2916870117188, "epoch": 0.43314285714285716, "grad_norm": 0.20783498883247375, "kl": 0.00994873046875, "learning_rate": 2.512332043064913e-07, "loss": 0.0004, "reward": 0.2669680919498205, "reward_std": 0.3232973664999008, "rewards/cosine_scaled_reward": 0.06161441095173359, "rewards/format_reward": 0.8333333432674408, "step": 379 }, { "completion_length": 1752.5000457763672, "epoch": 0.4342857142857143, "grad_norm": 0.3683948218822479, "kl": 0.014739990234375, "learning_rate": 2.488912271385139e-07, "loss": 0.0006, "reward": 0.08518872899003327, "reward_std": 0.26048473827540874, "rewards/cosine_scaled_reward": -0.12987486645579338, "rewards/format_reward": 0.708333358168602, "step": 380 }, { "completion_length": 1513.437515258789, "epoch": 0.43542857142857144, "grad_norm": 0.25296124815940857, "kl": 0.011993408203125, "learning_rate": 2.465639255873246e-07, "loss": 0.0005, "reward": 0.29362980648875237, "reward_std": 0.2426050491631031, "rewards/cosine_scaled_reward": 0.06906034797430038, "rewards/format_reward": 0.9166666716337204, "step": 381 }, { "completion_length": 1786.2917022705078, "epoch": 0.43657142857142855, "grad_norm": 0.23917359113693237, "kl": 0.0137481689453125, "learning_rate": 2.4425141308231765e-07, "loss": 0.0006, "reward": 0.14519746601581573, "reward_std": 0.25404126197099686, "rewards/cosine_scaled_reward": -0.06414372939616442, "rewards/format_reward": 0.7500000149011612, "step": 382 }, { "completion_length": 2034.9583740234375, "epoch": 0.4377142857142857, "grad_norm": 0.23685619235038757, "kl": 0.012542724609375, "learning_rate": 2.4195380233209006e-07, "loss": 0.0005, "reward": 0.17537298426032066, "reward_std": 0.19519681856036186, "rewards/cosine_scaled_reward": -0.002057701349258423, "rewards/format_reward": 0.6875000149011612, "step": 383 }, { "completion_length": 1755.5000305175781, "epoch": 0.43885714285714283, "grad_norm": 0.2406085878610611, "kl": 0.0112762451171875, "learning_rate": 2.3967120531894857e-07, "loss": 0.0005, "reward": 0.044221414253115654, "reward_std": 0.18394669890403748, "rewards/cosine_scaled_reward": -0.2134968489408493, "rewards/format_reward": 0.8125000149011612, "step": 384 }, { "completion_length": 1586.6459197998047, "epoch": 0.44, "grad_norm": 0.2748856544494629, "kl": 0.011138916015625, "learning_rate": 2.374037332934512e-07, "loss": 0.0004, "reward": 0.3660757765173912, "reward_std": 0.2511321157217026, "rewards/cosine_scaled_reward": 0.1509570861235261, "rewards/format_reward": 0.9583333432674408, "step": 385 }, { "completion_length": 1456.5625610351562, "epoch": 0.44114285714285717, "grad_norm": 0.22892530262470245, "kl": 0.0084381103515625, "learning_rate": 2.3515149676898552e-07, "loss": 0.0003, "reward": 0.16375851293560117, "reward_std": 0.22477589175105095, "rewards/cosine_scaled_reward": -0.07266581058502197, "rewards/format_reward": 0.8541666865348816, "step": 386 }, { "completion_length": 1412.1459045410156, "epoch": 0.4422857142857143, "grad_norm": 0.21265709400177002, "kl": 0.0097198486328125, "learning_rate": 2.3291460551638237e-07, "loss": 0.0004, "reward": 0.160978349391371, "reward_std": 0.25059961900115013, "rewards/cosine_scaled_reward": -0.10482261097058654, "rewards/format_reward": 0.9375, "step": 387 }, { "completion_length": 1464.2916870117188, "epoch": 0.44342857142857145, "grad_norm": 0.23378108441829681, "kl": 0.009490966796875, "learning_rate": 2.306931685585657e-07, "loss": 0.0004, "reward": 0.33504685014486313, "reward_std": 0.32544615119695663, "rewards/cosine_scaled_reward": 0.12704317644238472, "rewards/format_reward": 0.9166666865348816, "step": 388 }, { "completion_length": 1371.0416870117188, "epoch": 0.44457142857142856, "grad_norm": 0.3262147009372711, "kl": 0.01122283935546875, "learning_rate": 2.2848729416523859e-07, "loss": 0.0004, "reward": 0.393025110475719, "reward_std": 0.290339432656765, "rewards/cosine_scaled_reward": 0.21821050345897675, "rewards/format_reward": 0.8750000149011612, "step": 389 }, { "completion_length": 1398.8958892822266, "epoch": 0.44571428571428573, "grad_norm": 0.2878376841545105, "kl": 0.013916015625, "learning_rate": 2.2629708984760706e-07, "loss": 0.0006, "reward": 0.21481858170591295, "reward_std": 0.16455775126814842, "rewards/cosine_scaled_reward": 0.0046413615345954895, "rewards/format_reward": 0.8125000149011612, "step": 390 }, { "completion_length": 1230.7916870117188, "epoch": 0.44685714285714284, "grad_norm": 0.3208393156528473, "kl": 0.01119232177734375, "learning_rate": 2.2412266235313973e-07, "loss": 0.0004, "reward": 0.08701863046735525, "reward_std": 0.215634036809206, "rewards/cosine_scaled_reward": -0.20002751424908638, "rewards/format_reward": 0.9375000149011612, "step": 391 }, { "completion_length": 1943.3125610351562, "epoch": 0.448, "grad_norm": 0.20683653652668, "kl": 0.01094818115234375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0004, "reward": 0.406577079789713, "reward_std": 0.31823114305734634, "rewards/cosine_scaled_reward": 0.25175799429416656, "rewards/format_reward": 0.8125, "step": 392 }, { "completion_length": 1398.9375457763672, "epoch": 0.4491428571428571, "grad_norm": 0.2916996479034424, "kl": 0.0109405517578125, "learning_rate": 2.1982156097370557e-07, "loss": 0.0004, "reward": 0.33830785006284714, "reward_std": 0.30660923570394516, "rewards/cosine_scaled_reward": 0.12406410835683346, "rewards/format_reward": 0.9166666865348816, "step": 393 }, { "completion_length": 1487.3334045410156, "epoch": 0.4502857142857143, "grad_norm": 0.29143252968788147, "kl": 0.01137542724609375, "learning_rate": 2.1769509671835223e-07, "loss": 0.0005, "reward": 0.06870951503515244, "reward_std": 0.18425077386200428, "rewards/cosine_scaled_reward": -0.2029971145093441, "rewards/format_reward": 0.8750000149011612, "step": 394 }, { "completion_length": 1707.5209045410156, "epoch": 0.4514285714285714, "grad_norm": 0.354472279548645, "kl": 0.0127716064453125, "learning_rate": 2.1558482853517253e-07, "loss": 0.0005, "reward": 0.16543664713390172, "reward_std": 0.2276711966842413, "rewards/cosine_scaled_reward": -0.03153465378272813, "rewards/format_reward": 0.7291666865348816, "step": 395 }, { "completion_length": 1278.7292022705078, "epoch": 0.45257142857142857, "grad_norm": 0.25319093465805054, "kl": 0.010589599609375, "learning_rate": 2.134908592756607e-07, "loss": 0.0004, "reward": 0.19444908574223518, "reward_std": 0.12517918460071087, "rewards/cosine_scaled_reward": -0.06452729552984238, "rewards/format_reward": 0.9375000149011612, "step": 396 }, { "completion_length": 1797.2500457763672, "epoch": 0.45371428571428574, "grad_norm": 0.30418673157691956, "kl": 0.017242431640625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0007, "reward": 0.3547268435359001, "reward_std": 0.17775312066078186, "rewards/cosine_scaled_reward": 0.19013852439820766, "rewards/format_reward": 0.7916666716337204, "step": 397 }, { "completion_length": 1636.1875610351562, "epoch": 0.45485714285714285, "grad_norm": 0.21280616521835327, "kl": 0.0079193115234375, "learning_rate": 2.0935222495670968e-07, "loss": 0.0003, "reward": 0.32367637380957603, "reward_std": 0.2834791075438261, "rewards/cosine_scaled_reward": 0.12400428391993046, "rewards/format_reward": 0.8750000149011612, "step": 398 }, { "completion_length": 1898.1666870117188, "epoch": 0.456, "grad_norm": 0.232693612575531, "kl": 0.0127410888671875, "learning_rate": 2.0730776160846853e-07, "loss": 0.0005, "reward": 0.17925062775611877, "reward_std": 0.2557571642100811, "rewards/cosine_scaled_reward": -0.0575912781059742, "rewards/format_reward": 0.8541666865348816, "step": 399 }, { "completion_length": 1323.5208740234375, "epoch": 0.45714285714285713, "grad_norm": 0.3167197108268738, "kl": 0.0113067626953125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0005, "reward": 0.22988794930279255, "reward_std": 0.1990835703909397, "rewards/cosine_scaled_reward": -0.005974147003144026, "rewards/format_reward": 0.8958333432674408, "step": 400 }, { "completion_length": 1017.5833740234375, "epoch": 0.4582857142857143, "grad_norm": 0.23701179027557373, "kl": 0.00727081298828125, "learning_rate": 2.032690407508949e-07, "loss": 0.0003, "reward": 0.2984974943101406, "reward_std": 0.20315679907798767, "rewards/cosine_scaled_reward": 0.05098311696201563, "rewards/format_reward": 0.9791666716337204, "step": 401 }, { "completion_length": 1881.5000610351562, "epoch": 0.4594285714285714, "grad_norm": 0.2918482720851898, "kl": 0.0158843994140625, "learning_rate": 2.0127498008311922e-07, "loss": 0.0006, "reward": 0.05151970311999321, "reward_std": 0.21448798850178719, "rewards/cosine_scaled_reward": -0.19786715414375067, "rewards/format_reward": 0.7916666716337204, "step": 402 }, { "completion_length": 1883.6250610351562, "epoch": 0.4605714285714286, "grad_norm": 0.18074828386306763, "kl": 0.01107025146484375, "learning_rate": 1.9929791578083655e-07, "loss": 0.0004, "reward": 0.18776276428252459, "reward_std": 0.3380716070532799, "rewards/cosine_scaled_reward": -0.027764197438955307, "rewards/format_reward": 0.8125000149011612, "step": 403 }, { "completion_length": 1436.1458740234375, "epoch": 0.4617142857142857, "grad_norm": 0.29037535190582275, "kl": 0.0111846923828125, "learning_rate": 1.9733794420337213e-07, "loss": 0.0004, "reward": 0.28416240587830544, "reward_std": 0.2827350050210953, "rewards/cosine_scaled_reward": 0.052190789952874184, "rewards/format_reward": 0.9375000149011612, "step": 404 }, { "completion_length": 1326.7084045410156, "epoch": 0.46285714285714286, "grad_norm": 0.21668171882629395, "kl": 0.0085601806640625, "learning_rate": 1.9539516087697517e-07, "loss": 0.0003, "reward": 0.34238045662641525, "reward_std": 0.2986739482730627, "rewards/cosine_scaled_reward": 0.1431681104004383, "rewards/format_reward": 0.8958333432674408, "step": 405 }, { "completion_length": 1571.479248046875, "epoch": 0.464, "grad_norm": 0.24584002792835236, "kl": 0.01373291015625, "learning_rate": 1.934696604901642e-07, "loss": 0.0005, "reward": 0.15168224461376667, "reward_std": 0.2871460150927305, "rewards/cosine_scaled_reward": -0.07575275376439095, "rewards/format_reward": 0.8125000149011612, "step": 406 }, { "completion_length": 1145.8541870117188, "epoch": 0.46514285714285714, "grad_norm": 0.38704681396484375, "kl": 0.0112457275390625, "learning_rate": 1.915615368891117e-07, "loss": 0.0004, "reward": 0.26085772132501006, "reward_std": 0.1329988930374384, "rewards/cosine_scaled_reward": 0.019920717924833298, "rewards/format_reward": 0.9375000149011612, "step": 407 }, { "completion_length": 1228.9583740234375, "epoch": 0.4662857142857143, "grad_norm": 0.28655827045440674, "kl": 0.01071929931640625, "learning_rate": 1.8967088307307e-07, "loss": 0.0004, "reward": 0.20901603996753693, "reward_std": 0.2725791148841381, "rewards/cosine_scaled_reward": -0.05561388377100229, "rewards/format_reward": 0.9583333432674408, "step": 408 }, { "completion_length": 1253.9375305175781, "epoch": 0.4674285714285714, "grad_norm": 0.23769833147525787, "kl": 0.00876617431640625, "learning_rate": 1.8779779118983867e-07, "loss": 0.0004, "reward": 0.3615655303001404, "reward_std": 0.27604348957538605, "rewards/cosine_scaled_reward": 0.15084571414627135, "rewards/format_reward": 0.9375, "step": 409 }, { "completion_length": 1575.5208435058594, "epoch": 0.4685714285714286, "grad_norm": 0.23627842962741852, "kl": 0.0123291015625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0005, "reward": 0.12202017195522785, "reward_std": 0.2631267737597227, "rewards/cosine_scaled_reward": -0.13389795599505305, "rewards/format_reward": 0.8750000149011612, "step": 410 }, { "completion_length": 1910.4792175292969, "epoch": 0.4697142857142857, "grad_norm": 0.2608468234539032, "kl": 0.014801025390625, "learning_rate": 1.8410465752883758e-07, "loss": 0.0006, "reward": 0.08060921769356355, "reward_std": 0.21683987975120544, "rewards/cosine_scaled_reward": -0.14538796059787273, "rewards/format_reward": 0.7500000149011612, "step": 411 }, { "completion_length": 1346.1041717529297, "epoch": 0.47085714285714286, "grad_norm": 0.2915378212928772, "kl": 0.00960540771484375, "learning_rate": 1.822847957491922e-07, "loss": 0.0004, "reward": 0.1485822768881917, "reward_std": 0.23674238845705986, "rewards/cosine_scaled_reward": -0.1019090199843049, "rewards/format_reward": 0.8750000149011612, "step": 412 }, { "completion_length": 1397.8333740234375, "epoch": 0.472, "grad_norm": 0.2737977206707001, "kl": 0.0107574462890625, "learning_rate": 1.804828558898332e-07, "loss": 0.0004, "reward": 0.14930985309183598, "reward_std": 0.2523481696844101, "rewards/cosine_scaled_reward": -0.11631065257824957, "rewards/format_reward": 0.9166666716337204, "step": 413 }, { "completion_length": 1642.3958740234375, "epoch": 0.47314285714285714, "grad_norm": 0.20642879605293274, "kl": 0.01294708251953125, "learning_rate": 1.7869892577476722e-07, "loss": 0.0005, "reward": 0.1973862424492836, "reward_std": 0.18661364912986755, "rewards/cosine_scaled_reward": -0.008776968927122653, "rewards/format_reward": 0.7916666865348816, "step": 414 }, { "completion_length": 1843.4375305175781, "epoch": 0.4742857142857143, "grad_norm": 0.2866264581680298, "kl": 0.0138397216796875, "learning_rate": 1.7693309235023127e-07, "loss": 0.0006, "reward": 0.20638167671859264, "reward_std": 0.2785125896334648, "rewards/cosine_scaled_reward": -0.01411110907793045, "rewards/format_reward": 0.8333333432674408, "step": 415 }, { "completion_length": 1825.0625915527344, "epoch": 0.4754285714285714, "grad_norm": 0.2105662226676941, "kl": 0.0146026611328125, "learning_rate": 1.7518544168045524e-07, "loss": 0.0006, "reward": 0.1510641649365425, "reward_std": 0.33063989877700806, "rewards/cosine_scaled_reward": -0.07132141478359699, "rewards/format_reward": 0.7916666865348816, "step": 416 }, { "completion_length": 1851.4583892822266, "epoch": 0.4765714285714286, "grad_norm": 0.2664584815502167, "kl": 0.0132293701171875, "learning_rate": 1.7345605894346726e-07, "loss": 0.0005, "reward": 0.17481167800724506, "reward_std": 0.3025582581758499, "rewards/cosine_scaled_reward": -0.03878038749098778, "rewards/format_reward": 0.7916666865348816, "step": 417 }, { "completion_length": 1656.7916870117188, "epoch": 0.4777142857142857, "grad_norm": 0.33390721678733826, "kl": 0.011993408203125, "learning_rate": 1.7174502842694212e-07, "loss": 0.0005, "reward": 0.10759061248973012, "reward_std": 0.19447487592697144, "rewards/cosine_scaled_reward": -0.15984330605715513, "rewards/format_reward": 0.895833358168602, "step": 418 }, { "completion_length": 1700.5625762939453, "epoch": 0.47885714285714287, "grad_norm": 0.27000099420547485, "kl": 0.0132293701171875, "learning_rate": 1.7005243352409333e-07, "loss": 0.0005, "reward": 0.16961859352886677, "reward_std": 0.3320206254720688, "rewards/cosine_scaled_reward": -0.06891552917659283, "rewards/format_reward": 0.8541666865348816, "step": 419 }, { "completion_length": 1130.6250457763672, "epoch": 0.48, "grad_norm": 0.32490241527557373, "kl": 0.010345458984375, "learning_rate": 1.6837835672960831e-07, "loss": 0.0004, "reward": 0.16695750327198766, "reward_std": 0.22090371325612068, "rewards/cosine_scaled_reward": -0.09449976186442655, "rewards/format_reward": 0.9166666716337204, "step": 420 }, { "completion_length": 1289.3542175292969, "epoch": 0.48114285714285715, "grad_norm": 0.2906915247440338, "kl": 0.00969696044921875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0004, "reward": 0.34764280915260315, "reward_std": 0.21982120722532272, "rewards/cosine_scaled_reward": 0.12819887697696686, "rewards/format_reward": 0.9375000149011612, "step": 421 }, { "completion_length": 1322.8750610351562, "epoch": 0.48228571428571426, "grad_norm": 0.21419624984264374, "kl": 0.00823974609375, "learning_rate": 1.6508608292777203e-07, "loss": 0.0003, "reward": 0.17968564480543137, "reward_std": 0.2470797523856163, "rewards/cosine_scaled_reward": -0.0866120639257133, "rewards/format_reward": 0.9583333432674408, "step": 422 }, { "completion_length": 1337.8958358764648, "epoch": 0.48342857142857143, "grad_norm": 0.3047991693019867, "kl": 0.0133056640625, "learning_rate": 1.6346804638120098e-07, "loss": 0.0005, "reward": 0.1798581276088953, "reward_std": 0.20611854176968336, "rewards/cosine_scaled_reward": -0.062100378796458244, "rewards/format_reward": 0.8750000149011612, "step": 423 }, { "completion_length": 1513.2708740234375, "epoch": 0.4845714285714286, "grad_norm": 0.2347906529903412, "kl": 0.01027679443359375, "learning_rate": 1.6186884885673413e-07, "loss": 0.0004, "reward": 0.16043103858828545, "reward_std": 0.24068767204880714, "rewards/cosine_scaled_reward": -0.09871538355946541, "rewards/format_reward": 0.9166666716337204, "step": 424 }, { "completion_length": 1886.8541870117188, "epoch": 0.4857142857142857, "grad_norm": 0.2786445915699005, "kl": 0.015716552734375, "learning_rate": 1.6028856829700258e-07, "loss": 0.0006, "reward": 0.1020525568164885, "reward_std": 0.2838766872882843, "rewards/cosine_scaled_reward": -0.14031901117414236, "rewards/format_reward": 0.8125, "step": 425 }, { "completion_length": 1222.5625305175781, "epoch": 0.4868571428571429, "grad_norm": 0.26595538854599, "kl": 0.0072479248046875, "learning_rate": 1.5872728172265146e-07, "loss": 0.0003, "reward": 0.3471098020672798, "reward_std": 0.32343775779008865, "rewards/cosine_scaled_reward": 0.12477114424109459, "rewards/format_reward": 0.9583333432674408, "step": 426 }, { "completion_length": 1624.166748046875, "epoch": 0.488, "grad_norm": 0.285559743642807, "kl": 0.01145172119140625, "learning_rate": 1.5718506522858572e-07, "loss": 0.0005, "reward": 0.3360773101449013, "reward_std": 0.2637428678572178, "rewards/cosine_scaled_reward": 0.16991135850548744, "rewards/format_reward": 0.7916666865348816, "step": 427 }, { "completion_length": 2089.6459045410156, "epoch": 0.48914285714285716, "grad_norm": 0.29123032093048096, "kl": 0.01389312744140625, "learning_rate": 1.5566199398026147e-07, "loss": 0.0006, "reward": 0.07632875768467784, "reward_std": 0.21408921107649803, "rewards/cosine_scaled_reward": -0.15921480767428875, "rewards/format_reward": 0.7708333432674408, "step": 428 }, { "completion_length": 1952.9376220703125, "epoch": 0.49028571428571427, "grad_norm": 0.2803601920604706, "kl": 0.018218994140625, "learning_rate": 1.5415814221002265e-07, "loss": 0.0007, "reward": 0.1619062926620245, "reward_std": 0.21095814555883408, "rewards/cosine_scaled_reward": -0.0001234300434589386, "rewards/format_reward": 0.6250000298023224, "step": 429 }, { "completion_length": 2041.8334350585938, "epoch": 0.49142857142857144, "grad_norm": 0.35983848571777344, "kl": 0.018157958984375, "learning_rate": 1.5267358321348285e-07, "loss": 0.0007, "reward": 0.018218230456113815, "reward_std": 0.22815853729844093, "rewards/cosine_scaled_reward": -0.2138291234150529, "rewards/format_reward": 0.708333358168602, "step": 430 }, { "completion_length": 1465.6042022705078, "epoch": 0.49257142857142855, "grad_norm": 0.38481760025024414, "kl": 0.018310546875, "learning_rate": 1.5120838934595337e-07, "loss": 0.0007, "reward": 0.25773806248616893, "reward_std": 0.24154233559966087, "rewards/cosine_scaled_reward": 0.06686290970537812, "rewards/format_reward": 0.7916666865348816, "step": 431 }, { "completion_length": 1565.5416870117188, "epoch": 0.4937142857142857, "grad_norm": 0.28354302048683167, "kl": 0.01001739501953125, "learning_rate": 1.4976263201891613e-07, "loss": 0.0004, "reward": 0.17629767209291458, "reward_std": 0.2845665439963341, "rewards/cosine_scaled_reward": -0.09245277754962444, "rewards/format_reward": 0.9583333432674408, "step": 432 }, { "completion_length": 1747.5417175292969, "epoch": 0.4948571428571429, "grad_norm": 0.28452399373054504, "kl": 0.0126953125, "learning_rate": 1.483363816965435e-07, "loss": 0.0005, "reward": 0.18188877776265144, "reward_std": 0.23814743384718895, "rewards/cosine_scaled_reward": -0.05783041566610336, "rewards/format_reward": 0.8750000149011612, "step": 433 }, { "completion_length": 1306.6875457763672, "epoch": 0.496, "grad_norm": 0.4342614412307739, "kl": 0.0144195556640625, "learning_rate": 1.469297078922642e-07, "loss": 0.0006, "reward": 0.4066113345324993, "reward_std": 0.24682630226016045, "rewards/cosine_scaled_reward": 0.19056972488760948, "rewards/format_reward": 0.9791666716337204, "step": 434 }, { "completion_length": 1489.3750305175781, "epoch": 0.49714285714285716, "grad_norm": 0.420890212059021, "kl": 0.0156402587890625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0006, "reward": 0.1405728820245713, "reward_std": 0.242431428283453, "rewards/cosine_scaled_reward": -0.11428239196538925, "rewards/format_reward": 0.8750000298023224, "step": 435 }, { "completion_length": 1935.479248046875, "epoch": 0.4982857142857143, "grad_norm": 0.3561446964740753, "kl": 0.0189971923828125, "learning_rate": 1.4417536311769885e-07, "loss": 0.0008, "reward": 0.1535082533955574, "reward_std": 0.30283405259251595, "rewards/cosine_scaled_reward": -0.058026134967803955, "rewards/format_reward": 0.770833358168602, "step": 436 }, { "completion_length": 1268.5834045410156, "epoch": 0.49942857142857144, "grad_norm": 0.2628932297229767, "kl": 0.00823974609375, "learning_rate": 1.4282782639029128e-07, "loss": 0.0003, "reward": 0.26019781455397606, "reward_std": 0.22013483569025993, "rewards/cosine_scaled_reward": 0.04217514023184776, "rewards/format_reward": 0.8750000149011612, "step": 437 }, { "completion_length": 1237.3333892822266, "epoch": 0.5005714285714286, "grad_norm": 0.22354252636432648, "kl": 0.008880615234375, "learning_rate": 1.4150013466019114e-07, "loss": 0.0004, "reward": 0.18773654848337173, "reward_std": 0.22358889505267143, "rewards/cosine_scaled_reward": -0.07216969225555658, "rewards/format_reward": 0.9375000149011612, "step": 438 }, { "completion_length": 1286.6250305175781, "epoch": 0.5017142857142857, "grad_norm": 0.46097424626350403, "kl": 0.01509857177734375, "learning_rate": 1.4019235263722034e-07, "loss": 0.0006, "reward": 0.17482924927026033, "reward_std": 0.20090174488723278, "rewards/cosine_scaled_reward": -0.06697515025734901, "rewards/format_reward": 0.875, "step": 439 }, { "completion_length": 1781.8334045410156, "epoch": 0.5028571428571429, "grad_norm": 0.2967330515384674, "kl": 0.0180206298828125, "learning_rate": 1.3890454406082956e-07, "loss": 0.0007, "reward": 0.2402921486645937, "reward_std": 0.2830519266426563, "rewards/cosine_scaled_reward": 0.024581193923950195, "rewards/format_reward": 0.833333358168602, "step": 440 }, { "completion_length": 1721.9583740234375, "epoch": 0.504, "grad_norm": 0.28784990310668945, "kl": 0.01604461669921875, "learning_rate": 1.3763677169699217e-07, "loss": 0.0006, "reward": 0.22011593729257584, "reward_std": 0.19908079877495766, "rewards/cosine_scaled_reward": 0.01261664368212223, "rewards/format_reward": 0.8125000149011612, "step": 441 }, { "completion_length": 1423.8750610351562, "epoch": 0.5051428571428571, "grad_norm": 0.27160194516181946, "kl": 0.01059722900390625, "learning_rate": 1.3638909733514452e-07, "loss": 0.0004, "reward": 0.16875937953591347, "reward_std": 0.25345410220324993, "rewards/cosine_scaled_reward": -0.06454695202410221, "rewards/format_reward": 0.8541666716337204, "step": 442 }, { "completion_length": 1274.6250457763672, "epoch": 0.5062857142857143, "grad_norm": 0.22251670062541962, "kl": 0.00830841064453125, "learning_rate": 1.351615817851748e-07, "loss": 0.0003, "reward": 0.2590952720493078, "reward_std": 0.2291867546737194, "rewards/cosine_scaled_reward": 0.002004489302635193, "rewards/format_reward": 0.9791666716337204, "step": 443 }, { "completion_length": 1842.4583740234375, "epoch": 0.5074285714285715, "grad_norm": 0.27504411339759827, "kl": 0.01201629638671875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0005, "reward": 0.3361460082232952, "reward_std": 0.22568980604410172, "rewards/cosine_scaled_reward": 0.13691299222409725, "rewards/format_reward": 0.895833358168602, "step": 444 }, { "completion_length": 1845.2500915527344, "epoch": 0.5085714285714286, "grad_norm": 0.20490147173404694, "kl": 0.01392364501953125, "learning_rate": 1.3276726544494571e-07, "loss": 0.0006, "reward": 0.24170970544219017, "reward_std": 0.23725956492125988, "rewards/cosine_scaled_reward": 0.05702884867787361, "rewards/format_reward": 0.7708333432674408, "step": 445 }, { "completion_length": 1136.5625305175781, "epoch": 0.5097142857142857, "grad_norm": 0.288634717464447, "kl": 0.00978851318359375, "learning_rate": 1.316005813502869e-07, "loss": 0.0004, "reward": 0.2657099813222885, "reward_std": 0.2308104708790779, "rewards/cosine_scaled_reward": 0.04564391728490591, "rewards/format_reward": 0.8750000149011612, "step": 446 }, { "completion_length": 1182.6041870117188, "epoch": 0.5108571428571429, "grad_norm": 0.251402884721756, "kl": 0.0087432861328125, "learning_rate": 1.3045428945301953e-07, "loss": 0.0003, "reward": 0.3128407448530197, "reward_std": 0.2510318458080292, "rewards/cosine_scaled_reward": 0.06458367872983217, "rewards/format_reward": 1.0, "step": 447 }, { "completion_length": 1315.2292175292969, "epoch": 0.512, "grad_norm": 0.22989130020141602, "kl": 0.007293701171875, "learning_rate": 1.2932844562179352e-07, "loss": 0.0003, "reward": 0.3753640539944172, "reward_std": 0.2728557586669922, "rewards/cosine_scaled_reward": 0.15566639928147197, "rewards/format_reward": 0.9791666716337204, "step": 448 }, { "completion_length": 1326.7917022705078, "epoch": 0.5131428571428571, "grad_norm": 0.28850504755973816, "kl": 0.0123291015625, "learning_rate": 1.2822310472864885e-07, "loss": 0.0005, "reward": 0.28183421678841114, "reward_std": 0.280775286257267, "rewards/cosine_scaled_reward": 0.07325122132897377, "rewards/format_reward": 0.8541666865348816, "step": 449 }, { "completion_length": 1575.5625457763672, "epoch": 0.5142857142857142, "grad_norm": 0.297370046377182, "kl": 0.01831817626953125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0007, "reward": 0.19772536493837833, "reward_std": 0.20653804764151573, "rewards/cosine_scaled_reward": 0.010708175599575043, "rewards/format_reward": 0.7291666828095913, "step": 450 }, { "completion_length": 1457.0209045410156, "epoch": 0.5154285714285715, "grad_norm": 0.3056337237358093, "kl": 0.01351165771484375, "learning_rate": 1.260741462457165e-07, "loss": 0.0005, "reward": 0.2743101455271244, "reward_std": 0.32591526955366135, "rewards/cosine_scaled_reward": 0.0601138137280941, "rewards/format_reward": 0.8750000149011612, "step": 451 }, { "completion_length": 1302.5417175292969, "epoch": 0.5165714285714286, "grad_norm": 0.3352566063404083, "kl": 0.0123291015625, "learning_rate": 1.2503063339313356e-07, "loss": 0.0005, "reward": 0.19081053417176008, "reward_std": 0.3360443636775017, "rewards/cosine_scaled_reward": -0.04602043516933918, "rewards/format_reward": 0.8750000149011612, "step": 452 }, { "completion_length": 1659.9166870117188, "epoch": 0.5177142857142857, "grad_norm": 0.2861032783985138, "kl": 0.0186004638671875, "learning_rate": 1.2400783294793668e-07, "loss": 0.0007, "reward": 0.2529549226164818, "reward_std": 0.3462323397397995, "rewards/cosine_scaled_reward": 0.051917532458901405, "rewards/format_reward": 0.8125000149011612, "step": 453 }, { "completion_length": 1056.2083740234375, "epoch": 0.5188571428571429, "grad_norm": 0.23395924270153046, "kl": 0.009063720703125, "learning_rate": 1.2300579475997657e-07, "loss": 0.0004, "reward": 0.23422837536782026, "reward_std": 0.21533742547035217, "rewards/cosine_scaled_reward": -0.026755507104098797, "rewards/format_reward": 0.9791666716337204, "step": 454 }, { "completion_length": 1479.3333435058594, "epoch": 0.52, "grad_norm": 0.26352420449256897, "kl": 0.0125732421875, "learning_rate": 1.220245676671809e-07, "loss": 0.0005, "reward": 0.1705159144476056, "reward_std": 0.21411826461553574, "rewards/cosine_scaled_reward": -0.0704607660882175, "rewards/format_reward": 0.8750000149011612, "step": 455 }, { "completion_length": 1522.0209045410156, "epoch": 0.5211428571428571, "grad_norm": 0.29312658309936523, "kl": 0.01091766357421875, "learning_rate": 1.2106419949317388e-07, "loss": 0.0004, "reward": 0.24019585805945098, "reward_std": 0.14318828657269478, "rewards/cosine_scaled_reward": 0.02774641290307045, "rewards/format_reward": 0.833333358168602, "step": 456 }, { "completion_length": 1300.7084197998047, "epoch": 0.5222857142857142, "grad_norm": 0.23177440464496613, "kl": 0.0074005126953125, "learning_rate": 1.2012473704494537e-07, "loss": 0.0003, "reward": 0.2534996699541807, "reward_std": 0.2041100263595581, "rewards/cosine_scaled_reward": 0.003321884199976921, "rewards/format_reward": 0.9583333432674408, "step": 457 }, { "completion_length": 1500.3542175292969, "epoch": 0.5234285714285715, "grad_norm": 0.2277519702911377, "kl": 0.0123138427734375, "learning_rate": 1.1920622611056974e-07, "loss": 0.0005, "reward": 0.2181018777191639, "reward_std": 0.274239718914032, "rewards/cosine_scaled_reward": -0.008808528073132038, "rewards/format_reward": 0.8750000149011612, "step": 458 }, { "completion_length": 1216.4791870117188, "epoch": 0.5245714285714286, "grad_norm": 0.25994110107421875, "kl": 0.0108489990234375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0004, "reward": 0.20663647167384624, "reward_std": 0.2075389288365841, "rewards/cosine_scaled_reward": -0.046165828593075275, "rewards/format_reward": 0.9375000149011612, "step": 459 }, { "completion_length": 1461.3750305175781, "epoch": 0.5257142857142857, "grad_norm": 0.27901095151901245, "kl": 0.012786865234375, "learning_rate": 1.1743223682775649e-07, "loss": 0.0005, "reward": 0.24853161070495844, "reward_std": 0.27109961211681366, "rewards/cosine_scaled_reward": 0.06439046189188957, "rewards/format_reward": 0.7708333432674408, "step": 460 }, { "completion_length": 1535.4584045410156, "epoch": 0.5268571428571428, "grad_norm": 0.2198079377412796, "kl": 0.0126800537109375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0005, "reward": 0.27444493025541306, "reward_std": 0.287446316331625, "rewards/cosine_scaled_reward": 0.050167519599199295, "rewards/format_reward": 0.8958333432674408, "step": 461 }, { "completion_length": 1596.6667175292969, "epoch": 0.528, "grad_norm": 0.3223518133163452, "kl": 0.01030731201171875, "learning_rate": 1.1574257748745986e-07, "loss": 0.0004, "reward": 0.16964650806039572, "reward_std": 0.31031525135040283, "rewards/cosine_scaled_reward": -0.0880960263311863, "rewards/format_reward": 0.9166666865348816, "step": 462 }, { "completion_length": 1469.3750610351562, "epoch": 0.5291428571428571, "grad_norm": 0.33473342657089233, "kl": 0.0157318115234375, "learning_rate": 1.1492947512799328e-07, "loss": 0.0006, "reward": 0.25250214617699385, "reward_std": 0.20332858338952065, "rewards/cosine_scaled_reward": 0.07766706869006157, "rewards/format_reward": 0.7291666865348816, "step": 463 }, { "completion_length": 1470.5000305175781, "epoch": 0.5302857142857142, "grad_norm": 0.26016151905059814, "kl": 0.01175689697265625, "learning_rate": 1.1413757749211602e-07, "loss": 0.0005, "reward": 0.16552529204636812, "reward_std": 0.2517518103122711, "rewards/cosine_scaled_reward": -0.05065063759684563, "rewards/format_reward": 0.7916666716337204, "step": 464 }, { "completion_length": 1567.0000610351562, "epoch": 0.5314285714285715, "grad_norm": 0.2934165298938751, "kl": 0.0146026611328125, "learning_rate": 1.1336692317580158e-07, "loss": 0.0006, "reward": 0.2490998711436987, "reward_std": 0.24030263721942902, "rewards/cosine_scaled_reward": 0.02980668656527996, "rewards/format_reward": 0.8541666716337204, "step": 465 }, { "completion_length": 1849.2500610351562, "epoch": 0.5325714285714286, "grad_norm": 0.2325139194726944, "kl": 0.01602935791015625, "learning_rate": 1.1261754973965422e-07, "loss": 0.0006, "reward": 0.07940827216953039, "reward_std": 0.18165934830904007, "rewards/cosine_scaled_reward": -0.17715821415185928, "rewards/format_reward": 0.8333333432674408, "step": 466 }, { "completion_length": 1414.5625610351562, "epoch": 0.5337142857142857, "grad_norm": 0.2346559762954712, "kl": 0.00949859619140625, "learning_rate": 1.1188949370707787e-07, "loss": 0.0004, "reward": 0.21573650650680065, "reward_std": 0.20873811468482018, "rewards/cosine_scaled_reward": -0.05660676956176758, "rewards/format_reward": 1.0, "step": 467 }, { "completion_length": 1912.9376220703125, "epoch": 0.5348571428571428, "grad_norm": 0.3037513196468353, "kl": 0.01833343505859375, "learning_rate": 1.1118279056249653e-07, "loss": 0.0007, "reward": 0.2879499550908804, "reward_std": 0.28192097321152687, "rewards/cosine_scaled_reward": 0.11339114978909492, "rewards/format_reward": 0.7708333432674408, "step": 468 }, { "completion_length": 1271.7083587646484, "epoch": 0.536, "grad_norm": 0.3204426169395447, "kl": 0.014739990234375, "learning_rate": 1.1049747474962444e-07, "loss": 0.0006, "reward": 0.24237569607794285, "reward_std": 0.25575844570994377, "rewards/cosine_scaled_reward": 0.004833955317735672, "rewards/format_reward": 0.9166666716337204, "step": 469 }, { "completion_length": 1590.3125610351562, "epoch": 0.5371428571428571, "grad_norm": 0.2925020456314087, "kl": 0.0179595947265625, "learning_rate": 1.0983357966978745e-07, "loss": 0.0007, "reward": 0.3055003173649311, "reward_std": 0.19016432389616966, "rewards/cosine_scaled_reward": 0.11489966697990894, "rewards/format_reward": 0.8333333432674408, "step": 470 }, { "completion_length": 1118.3958587646484, "epoch": 0.5382857142857143, "grad_norm": 0.2672974467277527, "kl": 0.01080322265625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0004, "reward": 0.05892130918800831, "reward_std": 0.20794951915740967, "rewards/cosine_scaled_reward": -0.2085135094821453, "rewards/format_reward": 0.8541666865348816, "step": 471 }, { "completion_length": 906.6458587646484, "epoch": 0.5394285714285715, "grad_norm": 0.2568436861038208, "kl": 0.0084228515625, "learning_rate": 1.0857018009286381e-07, "loss": 0.0003, "reward": 0.2956396248191595, "reward_std": 0.2609308622777462, "rewards/cosine_scaled_reward": 0.06847019493579865, "rewards/format_reward": 0.9375, "step": 472 }, { "completion_length": 1430.8333740234375, "epoch": 0.5405714285714286, "grad_norm": 0.30554234981536865, "kl": 0.01314544677734375, "learning_rate": 1.0797073717209013e-07, "loss": 0.0005, "reward": 0.05116685951361433, "reward_std": 0.2041519582271576, "rewards/cosine_scaled_reward": -0.20498012378811836, "rewards/format_reward": 0.8125000149011612, "step": 473 }, { "completion_length": 1448.8333587646484, "epoch": 0.5417142857142857, "grad_norm": 0.33508339524269104, "kl": 0.00983428955078125, "learning_rate": 1.0739283813397639e-07, "loss": 0.0004, "reward": 0.35246733389794827, "reward_std": 0.2351733185350895, "rewards/cosine_scaled_reward": 0.12773746624588966, "rewards/format_reward": 0.9791666716337204, "step": 474 }, { "completion_length": 1581.5834045410156, "epoch": 0.5428571428571428, "grad_norm": 0.42422112822532654, "kl": 0.016143798828125, "learning_rate": 1.068365111445064e-07, "loss": 0.0006, "reward": 0.22443121299147606, "reward_std": 0.31280872225761414, "rewards/cosine_scaled_reward": -0.00971287488937378, "rewards/format_reward": 0.8750000149011612, "step": 475 }, { "completion_length": 1366.2500457763672, "epoch": 0.544, "grad_norm": 0.3073312044143677, "kl": 0.0132598876953125, "learning_rate": 1.063017833182728e-07, "loss": 0.0005, "reward": 0.29938487708568573, "reward_std": 0.14425114821642637, "rewards/cosine_scaled_reward": 0.09217657893896103, "rewards/format_reward": 0.8750000149011612, "step": 476 }, { "completion_length": 965.2708435058594, "epoch": 0.5451428571428572, "grad_norm": 0.30544164776802063, "kl": 0.0073089599609375, "learning_rate": 1.0578868071715544e-07, "loss": 0.0003, "reward": 0.39359666407108307, "reward_std": 0.25950491055846214, "rewards/cosine_scaled_reward": 0.17642678815172985, "rewards/format_reward": 0.9791666716337204, "step": 477 }, { "completion_length": 1636.5209045410156, "epoch": 0.5462857142857143, "grad_norm": 0.2438599169254303, "kl": 0.011138916015625, "learning_rate": 1.0529722834905125e-07, "loss": 0.0004, "reward": 0.1690983809530735, "reward_std": 0.265783216804266, "rewards/cosine_scaled_reward": -0.06696726009249687, "rewards/format_reward": 0.8541666865348816, "step": 478 }, { "completion_length": 1457.8333740234375, "epoch": 0.5474285714285714, "grad_norm": 0.36753106117248535, "kl": 0.02089691162109375, "learning_rate": 1.0482745016665526e-07, "loss": 0.0008, "reward": 0.24042591266334057, "reward_std": 0.28960849717259407, "rewards/cosine_scaled_reward": 0.03149300068616867, "rewards/format_reward": 0.833333358168602, "step": 479 }, { "completion_length": 1351.2916870117188, "epoch": 0.5485714285714286, "grad_norm": 0.18641342222690582, "kl": 0.00542449951171875, "learning_rate": 1.0437936906629334e-07, "loss": 0.0002, "reward": 0.35221418365836143, "reward_std": 0.2596045136451721, "rewards/cosine_scaled_reward": 0.13672377169132233, "rewards/format_reward": 0.9583333432674408, "step": 480 }, { "completion_length": 1663.6458587646484, "epoch": 0.5497142857142857, "grad_norm": 0.2585221230983734, "kl": 0.019439697265625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0008, "reward": 0.08225089451298118, "reward_std": 0.24548032693564892, "rewards/cosine_scaled_reward": -0.11919093690812588, "rewards/format_reward": 0.6666666716337204, "step": 481 }, { "completion_length": 1130.1250305175781, "epoch": 0.5508571428571428, "grad_norm": 0.30442705750465393, "kl": 0.0112762451171875, "learning_rate": 1.0354838440848501e-07, "loss": 0.0005, "reward": 0.2443934567272663, "reward_std": 0.24509599804878235, "rewards/cosine_scaled_reward": -0.007126476150006056, "rewards/format_reward": 0.9583333432674408, "step": 482 }, { "completion_length": 1528.354248046875, "epoch": 0.552, "grad_norm": 0.4626905024051666, "kl": 0.0183868408203125, "learning_rate": 1.0316552135205837e-07, "loss": 0.0007, "reward": 0.23332804813981056, "reward_std": 0.3575605973601341, "rewards/cosine_scaled_reward": 0.03512163204140961, "rewards/format_reward": 0.7916666716337204, "step": 483 }, { "completion_length": 1363.7500610351562, "epoch": 0.5531428571428572, "grad_norm": 0.30249202251434326, "kl": 0.01682281494140625, "learning_rate": 1.0280443637773163e-07, "loss": 0.0007, "reward": 0.35776749555952847, "reward_std": 0.2735111825168133, "rewards/cosine_scaled_reward": 0.18149008601903915, "rewards/format_reward": 0.833333358168602, "step": 484 }, { "completion_length": 1521.4584350585938, "epoch": 0.5542857142857143, "grad_norm": 0.32728955149650574, "kl": 0.017608642578125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0007, "reward": 0.2742725261487067, "reward_std": 0.23011274076998234, "rewards/cosine_scaled_reward": 0.037165215238928795, "rewards/format_reward": 0.9375000149011612, "step": 485 }, { "completion_length": 1431.0625305175781, "epoch": 0.5554285714285714, "grad_norm": 0.3946894407272339, "kl": 0.01422882080078125, "learning_rate": 1.0214767000817596e-07, "loss": 0.0006, "reward": 0.3760902900248766, "reward_std": 0.2089155912399292, "rewards/cosine_scaled_reward": 0.1899992600083351, "rewards/format_reward": 0.8958333432674408, "step": 486 }, { "completion_length": 1597.7708435058594, "epoch": 0.5565714285714286, "grad_norm": 0.2671118974685669, "kl": 0.01332855224609375, "learning_rate": 1.0185202062281336e-07, "loss": 0.0005, "reward": 0.22492020577192307, "reward_std": 0.26971185952425003, "rewards/cosine_scaled_reward": -0.019520113710314035, "rewards/format_reward": 0.9166666865348816, "step": 487 }, { "completion_length": 1713.3333740234375, "epoch": 0.5577142857142857, "grad_norm": 0.2240956574678421, "kl": 0.01678466796875, "learning_rate": 1.0157821333772304e-07, "loss": 0.0007, "reward": 0.13526278641074896, "reward_std": 0.2893335670232773, "rewards/cosine_scaled_reward": -0.10202349070459604, "rewards/format_reward": 0.8333333432674408, "step": 488 }, { "completion_length": 1029.1875381469727, "epoch": 0.5588571428571428, "grad_norm": 0.31260091066360474, "kl": 0.01323699951171875, "learning_rate": 1.013262614978859e-07, "loss": 0.0005, "reward": 0.4309612764045596, "reward_std": 0.253094045445323, "rewards/cosine_scaled_reward": 0.2355925627052784, "rewards/format_reward": 0.9375000149011612, "step": 489 }, { "completion_length": 1255.187515258789, "epoch": 0.56, "grad_norm": 0.3444691300392151, "kl": 0.01079559326171875, "learning_rate": 1.0109617738307911e-07, "loss": 0.0004, "reward": 0.2048361971974373, "reward_std": 0.21078473329544067, "rewards/cosine_scaled_reward": -0.05001578852534294, "rewards/format_reward": 0.9375000149011612, "step": 490 }, { "completion_length": 1938.979248046875, "epoch": 0.5611428571428572, "grad_norm": 0.29255354404449463, "kl": 0.0247802734375, "learning_rate": 1.0088797220727779e-07, "loss": 0.001, "reward": 0.21013529412448406, "reward_std": 0.19084220007061958, "rewards/cosine_scaled_reward": 0.013043895363807678, "rewards/format_reward": 0.7708333432674408, "step": 491 }, { "completion_length": 1133.6875457763672, "epoch": 0.5622857142857143, "grad_norm": 0.4209684431552887, "kl": 0.0138702392578125, "learning_rate": 1.0070165611810855e-07, "loss": 0.0006, "reward": 0.16102333180606365, "reward_std": 0.17116187885403633, "rewards/cosine_scaled_reward": -0.0879567414522171, "rewards/format_reward": 0.8750000149011612, "step": 492 }, { "completion_length": 1594.7083740234375, "epoch": 0.5634285714285714, "grad_norm": 0.31390249729156494, "kl": 0.0111083984375, "learning_rate": 1.005372381963547e-07, "loss": 0.0004, "reward": 0.16091343411244452, "reward_std": 0.22968443483114243, "rewards/cosine_scaled_reward": -0.09001005440950394, "rewards/format_reward": 0.8958333432674408, "step": 493 }, { "completion_length": 1279.25, "epoch": 0.5645714285714286, "grad_norm": 0.33988407254219055, "kl": 0.01303863525390625, "learning_rate": 1.0039472645551372e-07, "loss": 0.0005, "reward": 0.11660805623978376, "reward_std": 0.20437689498066902, "rewards/cosine_scaled_reward": -0.14967265352606773, "rewards/format_reward": 0.8958333432674408, "step": 494 }, { "completion_length": 1453.0209045410156, "epoch": 0.5657142857142857, "grad_norm": 0.43056032061576843, "kl": 0.0152130126953125, "learning_rate": 1.002741278414069e-07, "loss": 0.0006, "reward": 0.2999148261733353, "reward_std": 0.21086948364973068, "rewards/cosine_scaled_reward": 0.07477198913693428, "rewards/format_reward": 0.9375000149011612, "step": 495 }, { "completion_length": 1207.75, "epoch": 0.5668571428571428, "grad_norm": 0.28147271275520325, "kl": 0.01139068603515625, "learning_rate": 1.0017544823184055e-07, "loss": 0.0005, "reward": 0.37165018916130066, "reward_std": 0.2604399360716343, "rewards/cosine_scaled_reward": 0.16403314599301666, "rewards/format_reward": 0.9166666865348816, "step": 496 }, { "completion_length": 1213.7708435058594, "epoch": 0.568, "grad_norm": 0.36225390434265137, "kl": 0.0146331787109375, "learning_rate": 1.0009869243631952e-07, "loss": 0.0006, "reward": 0.41349948197603226, "reward_std": 0.18646394088864326, "rewards/cosine_scaled_reward": 0.23145748116075993, "rewards/format_reward": 0.895833358168602, "step": 497 }, { "completion_length": 1448.729232788086, "epoch": 0.5691428571428572, "grad_norm": 0.3582679331302643, "kl": 0.015716552734375, "learning_rate": 1.000438641958131e-07, "loss": 0.0006, "reward": 0.09725892543792725, "reward_std": 0.24059875309467316, "rewards/cosine_scaled_reward": -0.1641225963830948, "rewards/format_reward": 0.8541666865348816, "step": 498 }, { "completion_length": 1700.8959045410156, "epoch": 0.5702857142857143, "grad_norm": 0.2774902582168579, "kl": 0.01750946044921875, "learning_rate": 1.0001096618257236e-07, "loss": 0.0007, "reward": 0.11731046251952648, "reward_std": 0.2501941677182913, "rewards/cosine_scaled_reward": -0.11795766558498144, "rewards/format_reward": 0.8125000149011612, "step": 499 }, { "completion_length": 1141.7917022705078, "epoch": 0.5714285714285714, "grad_norm": 0.32746848464012146, "kl": 0.0137176513671875, "learning_rate": 1e-07, "loss": 0.0005, "reward": 0.23398451320827007, "reward_std": 0.2770588528364897, "rewards/cosine_scaled_reward": 0.03950834833085537, "rewards/format_reward": 0.7916666865348816, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.00026818006123357916, "train_runtime": 27510.2217, "train_samples_per_second": 0.872, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }