{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.856898029134533, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 2770.8472290039062, "epoch": 0.001713796058269066, "grad_norm": 0.15192405879497528, "kl": 0.0, "learning_rate": 0.0, "loss": 0.014, "reward": -0.06689765583723783, "reward_std": 0.505804143846035, "rewards/cosine_scaled_reward": -0.03344883490353823, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2785.013916015625, "epoch": 0.003427592116538132, "grad_norm": 0.1657538264989853, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0211, "reward": -0.4646243788301945, "reward_std": 0.39301297068595886, "rewards/cosine_scaled_reward": -0.23231217823922634, "rewards/format_reward": 0.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 2713.027801513672, "epoch": 0.005141388174807198, "grad_norm": 0.1747598648071289, "kl": 3.5196542739868164e-05, "learning_rate": 4e-08, "loss": -0.0275, "reward": -0.23865782655775547, "reward_std": 0.4481763616204262, "rewards/cosine_scaled_reward": -0.11932891746982932, "rewards/format_reward": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2938.5277709960938, "epoch": 0.006855184233076264, "grad_norm": 0.16107600927352905, "kl": 3.7282705307006836e-05, "learning_rate": 6e-08, "loss": -0.0289, "reward": 0.06913903169333935, "reward_std": 0.6892540901899338, "rewards/cosine_scaled_reward": 0.03456950932741165, "rewards/format_reward": 0.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2532.7222290039062, "epoch": 0.00856898029134533, "grad_norm": 0.15964782238006592, "kl": 2.065300941467285e-05, "learning_rate": 8e-08, "loss": -0.0052, "reward": -0.15601756004616618, "reward_std": 0.5161308571696281, "rewards/cosine_scaled_reward": -0.07800877187401056, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 3131.25, "epoch": 0.010282776349614395, "grad_norm": 0.13910692930221558, "kl": 4.1961669921875e-05, "learning_rate": 1e-07, "loss": 0.029, "reward": -0.13883829297265038, "reward_std": 0.5291023775935173, "rewards/cosine_scaled_reward": -0.06941914733033627, "rewards/format_reward": 0.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 2258.6944885253906, "epoch": 0.011996572407883462, "grad_norm": 0.21499329805374146, "kl": 3.059208393096924e-05, "learning_rate": 1.2e-07, "loss": -0.0297, "reward": -0.22816578298807144, "reward_std": 0.5721099078655243, "rewards/cosine_scaled_reward": -0.11408288218080997, "rewards/format_reward": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 3106.65283203125, "epoch": 0.013710368466152529, "grad_norm": 0.15807782113552094, "kl": 3.281235694885254e-05, "learning_rate": 1.4e-07, "loss": 0.0518, "reward": -0.1028524599969387, "reward_std": 0.7277905195951462, "rewards/cosine_scaled_reward": -0.051426228135824203, "rewards/format_reward": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2652.2777709960938, "epoch": 0.015424164524421594, "grad_norm": 0.14988838136196136, "kl": 3.746151924133301e-05, "learning_rate": 1.6e-07, "loss": -0.0052, "reward": -0.04764566984522389, "reward_std": 0.6422684416174889, "rewards/cosine_scaled_reward": -0.023822834249585867, "rewards/format_reward": 0.0, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 2956.250030517578, "epoch": 0.01713796058269066, "grad_norm": 0.15577340126037598, "kl": 3.62396240234375e-05, "learning_rate": 1.8e-07, "loss": 0.0369, "reward": -0.09274669736623764, "reward_std": 0.6059432476758957, "rewards/cosine_scaled_reward": -0.046373344492167234, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2610.430633544922, "epoch": 0.018851756640959727, "grad_norm": 0.18031956255435944, "kl": 2.753734588623047e-05, "learning_rate": 2e-07, "loss": 0.0126, "reward": 0.17614622993642115, "reward_std": 0.7455325201153755, "rewards/cosine_scaled_reward": 0.08807311341661261, "rewards/format_reward": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2977.2638549804688, "epoch": 0.02056555269922879, "grad_norm": 0.15254004299640656, "kl": 3.084540367126465e-05, "learning_rate": 2.1999999999999998e-07, "loss": -0.0238, "reward": -0.2835669822525233, "reward_std": 0.6270563155412674, "rewards/cosine_scaled_reward": -0.14178348786663264, "rewards/format_reward": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 2601.7916870117188, "epoch": 0.022279348757497857, "grad_norm": 0.1897689402103424, "kl": 4.309415817260742e-05, "learning_rate": 2.4e-07, "loss": -0.008, "reward": -0.08701697085052729, "reward_std": 0.6209904551506042, "rewards/cosine_scaled_reward": -0.04350848635658622, "rewards/format_reward": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2891.3472290039062, "epoch": 0.023993144815766924, "grad_norm": 0.17451944947242737, "kl": 3.2007694244384766e-05, "learning_rate": 2.6e-07, "loss": 0.0134, "reward": -0.11856314726173878, "reward_std": 0.5714613646268845, "rewards/cosine_scaled_reward": -0.059281568974256516, "rewards/format_reward": 0.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 3376.9444580078125, "epoch": 0.02570694087403599, "grad_norm": 0.19522128999233246, "kl": 4.00543212890625e-05, "learning_rate": 2.8e-07, "loss": 0.0625, "reward": -0.3375568427145481, "reward_std": 0.5690607726573944, "rewards/cosine_scaled_reward": -0.16877843253314495, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 2385.9861450195312, "epoch": 0.027420736932305057, "grad_norm": 0.17156164348125458, "kl": 3.203749656677246e-05, "learning_rate": 3e-07, "loss": 0.0479, "reward": 0.31096187606453896, "reward_std": 0.719051368534565, "rewards/cosine_scaled_reward": 0.15548093989491463, "rewards/format_reward": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 2834.4166870117188, "epoch": 0.02913453299057412, "grad_norm": 0.1916762739419937, "kl": 3.910064697265625e-05, "learning_rate": 3.2e-07, "loss": 0.0288, "reward": -0.1371638989658095, "reward_std": 0.43335365504026413, "rewards/cosine_scaled_reward": -0.06858194415690377, "rewards/format_reward": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 3107.9166870117188, "epoch": 0.030848329048843187, "grad_norm": 0.20290644466876984, "kl": 3.56137752532959e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0182, "reward": -0.2907893192023039, "reward_std": 0.43716832995414734, "rewards/cosine_scaled_reward": -0.14539465866982937, "rewards/format_reward": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 3065.611083984375, "epoch": 0.032562125107112254, "grad_norm": 0.1492234468460083, "kl": 4.0084123611450195e-05, "learning_rate": 3.6e-07, "loss": 0.0216, "reward": -0.19093798706308007, "reward_std": 0.7698801159858704, "rewards/cosine_scaled_reward": -0.09546899236738682, "rewards/format_reward": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 3355.7222900390625, "epoch": 0.03427592116538132, "grad_norm": 0.14321106672286987, "kl": 3.36766242980957e-05, "learning_rate": 3.7999999999999996e-07, "loss": -0.0048, "reward": -0.2757381685078144, "reward_std": 0.5536239072680473, "rewards/cosine_scaled_reward": -0.1378690842539072, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 2938.125, "epoch": 0.03598971722365039, "grad_norm": 0.20512644946575165, "kl": 4.1961669921875e-05, "learning_rate": 4e-07, "loss": 0.0577, "reward": -0.1858626427128911, "reward_std": 0.6686508804559708, "rewards/cosine_scaled_reward": -0.09293132461607456, "rewards/format_reward": 0.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 3192.2361450195312, "epoch": 0.037703513281919454, "grad_norm": 0.13245940208435059, "kl": 3.49879264831543e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0372, "reward": -0.186855623498559, "reward_std": 0.5942067578434944, "rewards/cosine_scaled_reward": -0.09342780988663435, "rewards/format_reward": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 3075.02783203125, "epoch": 0.03941730934018852, "grad_norm": 0.14223958551883698, "kl": 2.8640031814575195e-05, "learning_rate": 4.3999999999999997e-07, "loss": -0.0208, "reward": -0.4465179964900017, "reward_std": 0.36973506212234497, "rewards/cosine_scaled_reward": -0.223259000107646, "rewards/format_reward": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 2707.6250610351562, "epoch": 0.04113110539845758, "grad_norm": 0.20090773701667786, "kl": 2.9414892196655273e-05, "learning_rate": 4.6e-07, "loss": 0.0292, "reward": 0.08563654706813395, "reward_std": 0.4666801244020462, "rewards/cosine_scaled_reward": 0.04281827830709517, "rewards/format_reward": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 2578.9443969726562, "epoch": 0.04284490145672665, "grad_norm": 0.19762183725833893, "kl": 2.6911497116088867e-05, "learning_rate": 4.8e-07, "loss": 0.0547, "reward": -0.15825002267956734, "reward_std": 0.6721501722931862, "rewards/cosine_scaled_reward": -0.07912501133978367, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 3199.1111450195312, "epoch": 0.044558697514995714, "grad_norm": 0.14947673678398132, "kl": 3.1381845474243164e-05, "learning_rate": 5e-07, "loss": 0.0771, "reward": -0.3339938232675195, "reward_std": 0.5660227835178375, "rewards/cosine_scaled_reward": -0.16699691163375974, "rewards/format_reward": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 3103.3193969726562, "epoch": 0.04627249357326478, "grad_norm": 0.12868770956993103, "kl": 2.6017427444458008e-05, "learning_rate": 5.2e-07, "loss": 0.0118, "reward": -0.2791058011353016, "reward_std": 0.49328897148370743, "rewards/cosine_scaled_reward": -0.13955289125442505, "rewards/format_reward": 0.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 2378.2222595214844, "epoch": 0.04798628963153385, "grad_norm": 0.2462579607963562, "kl": 2.7805566787719727e-05, "learning_rate": 5.4e-07, "loss": 0.0596, "reward": 0.03218653332442045, "reward_std": 0.6807225868105888, "rewards/cosine_scaled_reward": 0.016093265498057008, "rewards/format_reward": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 2971.291748046875, "epoch": 0.049700085689802914, "grad_norm": 0.16591639816761017, "kl": 3.515183925628662e-05, "learning_rate": 5.6e-07, "loss": 0.0141, "reward": 0.011478596366941929, "reward_std": 0.7397755682468414, "rewards/cosine_scaled_reward": 0.005739298183470964, "rewards/format_reward": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 2913.3611450195312, "epoch": 0.05141388174807198, "grad_norm": 0.13886681199073792, "kl": 3.2573938369750977e-05, "learning_rate": 5.8e-07, "loss": 0.0258, "reward": 0.05036446265876293, "reward_std": 0.6957473307847977, "rewards/cosine_scaled_reward": 0.025182233192026615, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2665.041748046875, "epoch": 0.05312767780634105, "grad_norm": 0.16625739634037018, "kl": 2.000480890274048e-05, "learning_rate": 6e-07, "loss": 0.0045, "reward": -0.044122666819021106, "reward_std": 0.4255269840359688, "rewards/cosine_scaled_reward": -0.022061329917050898, "rewards/format_reward": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2951.8611450195312, "epoch": 0.054841473864610114, "grad_norm": 0.15594074130058289, "kl": 1.9147992134094238e-05, "learning_rate": 6.2e-07, "loss": 0.0942, "reward": -0.3072533793747425, "reward_std": 0.4980456754565239, "rewards/cosine_scaled_reward": -0.15362668968737125, "rewards/format_reward": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 2260.0833435058594, "epoch": 0.056555269922879174, "grad_norm": 0.21370142698287964, "kl": 3.5665929317474365e-05, "learning_rate": 6.4e-07, "loss": 0.0063, "reward": 0.06617816537618637, "reward_std": 0.5614925771951675, "rewards/cosine_scaled_reward": 0.033089087810367346, "rewards/format_reward": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2807.013916015625, "epoch": 0.05826906598114824, "grad_norm": 0.20051412284374237, "kl": 1.2192875146865845e-05, "learning_rate": 6.6e-07, "loss": 0.0328, "reward": -0.17473484575748444, "reward_std": 0.6600858569145203, "rewards/cosine_scaled_reward": -0.08736742846667767, "rewards/format_reward": 0.0, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 3120.7500610351562, "epoch": 0.05998286203941731, "grad_norm": 0.13361996412277222, "kl": 3.407895565032959e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0472, "reward": -0.4979929216206074, "reward_std": 0.39260104298591614, "rewards/cosine_scaled_reward": -0.2489964533597231, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2625.0694885253906, "epoch": 0.061696658097686374, "grad_norm": 0.16467803716659546, "kl": 2.7239322662353516e-05, "learning_rate": 7e-07, "loss": -0.0168, "reward": -0.35937849269248545, "reward_std": 0.45373768359422684, "rewards/cosine_scaled_reward": -0.1796892363927327, "rewards/format_reward": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3042.1806030273438, "epoch": 0.06341045415595545, "grad_norm": 0.15104345977306366, "kl": 2.9146671295166016e-05, "learning_rate": 7.2e-07, "loss": 0.0068, "reward": -0.37954360246658325, "reward_std": 0.5432159453630447, "rewards/cosine_scaled_reward": -0.18977180123329163, "rewards/format_reward": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 3193.2083740234375, "epoch": 0.06512425021422451, "grad_norm": 0.20619741082191467, "kl": 1.7097219824790955e-05, "learning_rate": 7.4e-07, "loss": 0.0389, "reward": -0.29821273358538747, "reward_std": 0.5581861883401871, "rewards/cosine_scaled_reward": -0.1491063602734357, "rewards/format_reward": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 3018.5834045410156, "epoch": 0.06683804627249357, "grad_norm": 0.12940338253974915, "kl": 3.90857458114624e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0162, "reward": -0.25728118792176247, "reward_std": 0.34478260576725006, "rewards/cosine_scaled_reward": -0.12864059768617153, "rewards/format_reward": 0.0, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 2860.7500610351562, "epoch": 0.06855184233076264, "grad_norm": 0.25654301047325134, "kl": 0.0001112818717956543, "learning_rate": 7.799999999999999e-07, "loss": 0.0545, "reward": 0.13069207593798637, "reward_std": 0.5447051227092743, "rewards/cosine_scaled_reward": 0.06534605007618666, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2696.3611450195312, "epoch": 0.0702656383890317, "grad_norm": 0.19896458089351654, "kl": 4.2378902435302734e-05, "learning_rate": 8e-07, "loss": 0.0826, "reward": 0.2564197585452348, "reward_std": 0.6877201497554779, "rewards/cosine_scaled_reward": 0.12820987740997225, "rewards/format_reward": 0.0, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2642.3333740234375, "epoch": 0.07197943444730077, "grad_norm": 0.1658892035484314, "kl": 0.00020813941955566406, "learning_rate": 8.199999999999999e-07, "loss": 0.0149, "reward": -0.03526473790407181, "reward_std": 0.6603178381919861, "rewards/cosine_scaled_reward": -0.017632372677326202, "rewards/format_reward": 0.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 2897.388916015625, "epoch": 0.07369323050556983, "grad_norm": 0.2002326250076294, "kl": 6.079673767089844e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.055, "reward": 0.08917492628097534, "reward_std": 0.4714968279004097, "rewards/cosine_scaled_reward": 0.04458745941519737, "rewards/format_reward": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 2802.9583740234375, "epoch": 0.07540702656383891, "grad_norm": 0.14357756078243256, "kl": 0.00016063451766967773, "learning_rate": 8.599999999999999e-07, "loss": 0.0109, "reward": -0.2601087912917137, "reward_std": 0.5872670859098434, "rewards/cosine_scaled_reward": -0.13005439937114716, "rewards/format_reward": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 3034.8194580078125, "epoch": 0.07712082262210797, "grad_norm": 0.23196536302566528, "kl": 0.00012412667274475098, "learning_rate": 8.799999999999999e-07, "loss": 0.0428, "reward": -0.2070726901292801, "reward_std": 0.5877418145537376, "rewards/cosine_scaled_reward": -0.10353635251522064, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 2306.2083435058594, "epoch": 0.07883461868037704, "grad_norm": 0.2409650981426239, "kl": 0.0003217458724975586, "learning_rate": 9e-07, "loss": 0.0337, "reward": -0.01094321720302105, "reward_std": 0.6599317938089371, "rewards/cosine_scaled_reward": -0.00547160767018795, "rewards/format_reward": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2936.1388549804688, "epoch": 0.0805484147386461, "grad_norm": 0.1777871698141098, "kl": 0.0003833882510662079, "learning_rate": 9.2e-07, "loss": -0.0387, "reward": -0.12989605404436588, "reward_std": 0.6336122080683708, "rewards/cosine_scaled_reward": -0.0649480305146426, "rewards/format_reward": 0.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2661.6806030273438, "epoch": 0.08226221079691516, "grad_norm": 0.3158990442752838, "kl": 0.00035144388675689697, "learning_rate": 9.399999999999999e-07, "loss": 0.1047, "reward": 0.12476684269495308, "reward_std": 0.5184459760785103, "rewards/cosine_scaled_reward": 0.06238342053256929, "rewards/format_reward": 0.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2053.8611450195312, "epoch": 0.08397600685518423, "grad_norm": 0.19082558155059814, "kl": 0.0006046295166015625, "learning_rate": 9.6e-07, "loss": -0.0144, "reward": 0.012501850724220276, "reward_std": 0.603157639503479, "rewards/cosine_scaled_reward": 0.006250927224755287, "rewards/format_reward": 0.0, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 2731.6527404785156, "epoch": 0.0856898029134533, "grad_norm": 0.22663110494613647, "kl": 0.0009310245513916016, "learning_rate": 9.8e-07, "loss": 0.0409, "reward": -0.30116934701800346, "reward_std": 0.6284962445497513, "rewards/cosine_scaled_reward": -0.1505846632644534, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2839.6944885253906, "epoch": 0.08740359897172237, "grad_norm": 0.17685562372207642, "kl": 0.0002518892288208008, "learning_rate": 1e-06, "loss": 0.0272, "reward": -0.16751686483621597, "reward_std": 0.5093529745936394, "rewards/cosine_scaled_reward": -0.08375842124223709, "rewards/format_reward": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 3141.4583740234375, "epoch": 0.08911739502999143, "grad_norm": 0.14409120380878448, "kl": 0.0003941059112548828, "learning_rate": 9.999890338174275e-07, "loss": -0.0079, "reward": -0.19580290652811527, "reward_std": 0.589723251760006, "rewards/cosine_scaled_reward": -0.09790145605802536, "rewards/format_reward": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 3054.9445190429688, "epoch": 0.0908311910882605, "grad_norm": 0.13203154504299164, "kl": 0.0002570152282714844, "learning_rate": 9.999561358041868e-07, "loss": 0.0455, "reward": -0.2164551168680191, "reward_std": 0.6407450139522552, "rewards/cosine_scaled_reward": -0.10822756588459015, "rewards/format_reward": 0.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 3393.8055419921875, "epoch": 0.09254498714652956, "grad_norm": 0.11958733946084976, "kl": 0.0005993843078613281, "learning_rate": 9.999013075636804e-07, "loss": -0.007, "reward": -0.27613697946071625, "reward_std": 0.5631539821624756, "rewards/cosine_scaled_reward": -0.13806848879903555, "rewards/format_reward": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 3430.3055419921875, "epoch": 0.09425878320479864, "grad_norm": 0.13475047051906586, "kl": 0.0003286600112915039, "learning_rate": 9.998245517681593e-07, "loss": 0.0301, "reward": -0.2911250814795494, "reward_std": 0.5787934809923172, "rewards/cosine_scaled_reward": -0.145562544465065, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3075.1527709960938, "epoch": 0.0959725792630677, "grad_norm": 0.14396199584007263, "kl": 0.0008380413055419922, "learning_rate": 9.997258721585931e-07, "loss": 0.0481, "reward": -0.058986596763134, "reward_std": 0.5793360769748688, "rewards/cosine_scaled_reward": -0.029493287205696106, "rewards/format_reward": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 3232.9722900390625, "epoch": 0.09768637532133675, "grad_norm": 0.14357316493988037, "kl": 0.0003731250762939453, "learning_rate": 9.996052735444862e-07, "loss": 0.0542, "reward": -0.08436356298625469, "reward_std": 0.4788799285888672, "rewards/cosine_scaled_reward": -0.042181783355772495, "rewards/format_reward": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 3087.3194580078125, "epoch": 0.09940017137960583, "grad_norm": 0.15331892669200897, "kl": 0.0012726783752441406, "learning_rate": 9.994627618036452e-07, "loss": 0.0529, "reward": -0.29565126448869705, "reward_std": 0.5033575221896172, "rewards/cosine_scaled_reward": -0.1478256327100098, "rewards/format_reward": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 3110.6111450195312, "epoch": 0.10111396743787489, "grad_norm": 0.14103592932224274, "kl": 0.0015869140625, "learning_rate": 9.992983438818915e-07, "loss": 0.0384, "reward": 0.018456660211086273, "reward_std": 0.8149007856845856, "rewards/cosine_scaled_reward": 0.009228323586285114, "rewards/format_reward": 0.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3305.236083984375, "epoch": 0.10282776349614396, "grad_norm": 0.12172071635723114, "kl": 0.00035071372985839844, "learning_rate": 9.991120277927223e-07, "loss": 0.0086, "reward": -0.27341870963573456, "reward_std": 0.7006796821951866, "rewards/cosine_scaled_reward": -0.13670935295522213, "rewards/format_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 3224.0555419921875, "epoch": 0.10454155955441302, "grad_norm": 0.13248133659362793, "kl": 0.0005254745483398438, "learning_rate": 9.989038226169207e-07, "loss": -0.0068, "reward": -0.2998387850821018, "reward_std": 0.3452136740088463, "rewards/cosine_scaled_reward": -0.14991939440369606, "rewards/format_reward": 0.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 2643.5833740234375, "epoch": 0.1062553556126821, "grad_norm": 0.17902526259422302, "kl": 0.0021648406982421875, "learning_rate": 9.98673738502114e-07, "loss": 0.057, "reward": 0.017559568164870143, "reward_std": 0.5955966338515282, "rewards/cosine_scaled_reward": 0.008779789437539876, "rewards/format_reward": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 3496.375, "epoch": 0.10796915167095116, "grad_norm": 0.1432785838842392, "kl": 0.00047206878662109375, "learning_rate": 9.98421786662277e-07, "loss": 0.0277, "reward": -0.17097678780555725, "reward_std": 0.6070086807012558, "rewards/cosine_scaled_reward": -0.08548840321600437, "rewards/format_reward": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 2792.486114501953, "epoch": 0.10968294772922023, "grad_norm": 0.16470499336719513, "kl": 0.0011835098266601562, "learning_rate": 9.981479793771866e-07, "loss": 0.0207, "reward": -0.26402536034584045, "reward_std": 0.43254173547029495, "rewards/cosine_scaled_reward": -0.13201268389821053, "rewards/format_reward": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3128.4861450195312, "epoch": 0.11139674378748929, "grad_norm": 0.1882910132408142, "kl": 0.006333351135253906, "learning_rate": 9.97852329991824e-07, "loss": 0.0385, "reward": -0.0892822165042162, "reward_std": 0.6130652017891407, "rewards/cosine_scaled_reward": -0.04464110638946295, "rewards/format_reward": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 2806.75, "epoch": 0.11311053984575835, "grad_norm": 0.15443913638591766, "kl": 0.0003552436828613281, "learning_rate": 9.975348529157229e-07, "loss": 0.0038, "reward": -0.04117146506905556, "reward_std": 0.4872736781835556, "rewards/cosine_scaled_reward": -0.02058573253452778, "rewards/format_reward": 0.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 3150.8194580078125, "epoch": 0.11482433590402742, "grad_norm": 0.15191471576690674, "kl": 0.0016102790832519531, "learning_rate": 9.971955636222684e-07, "loss": 0.0316, "reward": -0.23821864277124405, "reward_std": 0.5326030105352402, "rewards/cosine_scaled_reward": -0.11910932138562202, "rewards/format_reward": 0.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 2845.6806030273438, "epoch": 0.11653813196229648, "grad_norm": 0.1388000249862671, "kl": 0.0018000602722167969, "learning_rate": 9.968344786479415e-07, "loss": 0.0376, "reward": -0.17579936794936657, "reward_std": 0.6001454517245293, "rewards/cosine_scaled_reward": -0.08789968676865101, "rewards/format_reward": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 3050.7361450195312, "epoch": 0.11825192802056556, "grad_norm": 0.13662724196910858, "kl": 0.0015287399291992188, "learning_rate": 9.964516155915151e-07, "loss": 0.0787, "reward": -0.09626813535578549, "reward_std": 0.6232626661658287, "rewards/cosine_scaled_reward": -0.04813406406901777, "rewards/format_reward": 0.0, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 2883.9722290039062, "epoch": 0.11996572407883462, "grad_norm": 0.18917521834373474, "kl": 0.00302886962890625, "learning_rate": 9.960469931131936e-07, "loss": -0.0608, "reward": 0.05035170167684555, "reward_std": 0.4191203862428665, "rewards/cosine_scaled_reward": 0.025175858289003372, "rewards/format_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 3137.4583740234375, "epoch": 0.12167952013710369, "grad_norm": 0.15267273783683777, "kl": 0.0017466545104980469, "learning_rate": 9.956206309337066e-07, "loss": 0.0362, "reward": -0.04426470585167408, "reward_std": 0.6740965843200684, "rewards/cosine_scaled_reward": -0.022132341749966145, "rewards/format_reward": 0.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 2443.0138549804688, "epoch": 0.12339331619537275, "grad_norm": 0.16214598715305328, "kl": 0.003936767578125, "learning_rate": 9.951725498333448e-07, "loss": -0.0396, "reward": 0.09306424111127853, "reward_std": 0.43733419477939606, "rewards/cosine_scaled_reward": 0.04653212707489729, "rewards/format_reward": 0.0, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 3163.513916015625, "epoch": 0.12510711225364182, "grad_norm": 0.23524802923202515, "kl": 0.018090248107910156, "learning_rate": 9.947027716509488e-07, "loss": -0.0168, "reward": -0.17970024980604649, "reward_std": 0.4914797991514206, "rewards/cosine_scaled_reward": -0.0898501230403781, "rewards/format_reward": 0.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 2410.25, "epoch": 0.1268209083119109, "grad_norm": 0.15706373751163483, "kl": 0.0030879974365234375, "learning_rate": 9.942113192828444e-07, "loss": 0.0191, "reward": 0.2525464817881584, "reward_std": 0.6606673151254654, "rewards/cosine_scaled_reward": 0.12627324275672436, "rewards/format_reward": 0.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 3146.8611450195312, "epoch": 0.12853470437017994, "grad_norm": 0.15255555510520935, "kl": 0.0032701492309570312, "learning_rate": 9.93698216681727e-07, "loss": 0.0281, "reward": -0.07365414220839739, "reward_std": 0.5634644776582718, "rewards/cosine_scaled_reward": -0.036827060393989086, "rewards/format_reward": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 2159.249984741211, "epoch": 0.13024850042844902, "grad_norm": 0.39581403136253357, "kl": 0.018310546875, "learning_rate": 9.931634888554935e-07, "loss": -0.0072, "reward": 0.14826004952192307, "reward_std": 0.6063434556126595, "rewards/cosine_scaled_reward": 0.07413001451641321, "rewards/format_reward": 0.0, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 3143.9443969726562, "epoch": 0.1319622964867181, "grad_norm": 0.13312797248363495, "kl": 0.00225830078125, "learning_rate": 9.926071618660237e-07, "loss": 0.0387, "reward": 0.15560828521847725, "reward_std": 0.680296927690506, "rewards/cosine_scaled_reward": 0.07780414074659348, "rewards/format_reward": 0.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 3317.5416870117188, "epoch": 0.13367609254498714, "grad_norm": 0.13495096564292908, "kl": 0.0019426345825195312, "learning_rate": 9.9202926282791e-07, "loss": -0.0019, "reward": -0.4046759568154812, "reward_std": 0.5655369237065315, "rewards/cosine_scaled_reward": -0.20233797095716, "rewards/format_reward": 0.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 2373.2361755371094, "epoch": 0.1353898886032562, "grad_norm": 0.26138797402381897, "kl": 0.010517120361328125, "learning_rate": 9.91429819907136e-07, "loss": 0.0351, "reward": -0.17695464938879013, "reward_std": 0.34004002809524536, "rewards/cosine_scaled_reward": -0.08847732283174992, "rewards/format_reward": 0.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 3025.5000610351562, "epoch": 0.13710368466152528, "grad_norm": 0.17277857661247253, "kl": 0.0012784004211425781, "learning_rate": 9.908088623197048e-07, "loss": 0.0488, "reward": -0.08927152771502733, "reward_std": 0.6381218209862709, "rewards/cosine_scaled_reward": -0.04463577060960233, "rewards/format_reward": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 3080.2777709960938, "epoch": 0.13881748071979436, "grad_norm": 0.14923037588596344, "kl": 0.0020084381103515625, "learning_rate": 9.901664203302124e-07, "loss": 0.0073, "reward": -0.27667392790317535, "reward_std": 0.39360568672418594, "rewards/cosine_scaled_reward": -0.13833696395158768, "rewards/format_reward": 0.0, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 2893.77783203125, "epoch": 0.1405312767780634, "grad_norm": 0.3161645531654358, "kl": 0.011153221130371094, "learning_rate": 9.895025252503755e-07, "loss": 0.0838, "reward": -0.08123429818078876, "reward_std": 0.6654616445302963, "rewards/cosine_scaled_reward": -0.040617153281345963, "rewards/format_reward": 0.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 2858.736083984375, "epoch": 0.14224507283633248, "grad_norm": 0.1683678925037384, "kl": 0.001474142074584961, "learning_rate": 9.888172094375033e-07, "loss": -0.0148, "reward": -0.12576034758239985, "reward_std": 0.6605924665927887, "rewards/cosine_scaled_reward": -0.06288017379119992, "rewards/format_reward": 0.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2913.3194885253906, "epoch": 0.14395886889460155, "grad_norm": 0.22510592639446259, "kl": 0.0032978057861328125, "learning_rate": 9.881105062929221e-07, "loss": 0.042, "reward": -0.05945697799324989, "reward_std": 0.5878739953041077, "rewards/cosine_scaled_reward": -0.0297284796833992, "rewards/format_reward": 0.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 2990.7916870117188, "epoch": 0.1456726649528706, "grad_norm": 0.14112693071365356, "kl": 0.0014820098876953125, "learning_rate": 9.873824502603459e-07, "loss": 0.0518, "reward": -0.05626801133621484, "reward_std": 0.5443409904837608, "rewards/cosine_scaled_reward": -0.028134002874139696, "rewards/format_reward": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 2831.6805419921875, "epoch": 0.14738646101113967, "grad_norm": 0.17547817528247833, "kl": 0.00238037109375, "learning_rate": 9.866330768241983e-07, "loss": 0.0229, "reward": -0.25049374252557755, "reward_std": 0.6190591081976891, "rewards/cosine_scaled_reward": -0.12524686381220818, "rewards/format_reward": 0.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 3439.2639770507812, "epoch": 0.14910025706940874, "grad_norm": 0.12470373511314392, "kl": 0.0005965232849121094, "learning_rate": 9.85862422507884e-07, "loss": 0.0309, "reward": -0.15761397371534258, "reward_std": 0.568816527724266, "rewards/cosine_scaled_reward": -0.07880698406370357, "rewards/format_reward": 0.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 3291.013916015625, "epoch": 0.15081405312767782, "grad_norm": 0.17072485387325287, "kl": 0.0011734962463378906, "learning_rate": 9.850705248720068e-07, "loss": -0.0003, "reward": -0.31209783256053925, "reward_std": 0.4534567594528198, "rewards/cosine_scaled_reward": -0.15604891628026962, "rewards/format_reward": 0.0, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 2711.6111450195312, "epoch": 0.15252784918594686, "grad_norm": 0.17909394204616547, "kl": 0.00319671630859375, "learning_rate": 9.8425742251254e-07, "loss": -0.0351, "reward": -0.39153438061475754, "reward_std": 0.44514787942171097, "rewards/cosine_scaled_reward": -0.19576718658208847, "rewards/format_reward": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2884.3611755371094, "epoch": 0.15424164524421594, "grad_norm": 0.1545180082321167, "kl": 0.0027666091918945312, "learning_rate": 9.83423155058946e-07, "loss": 0.0231, "reward": -0.12805988639593124, "reward_std": 0.41310104727745056, "rewards/cosine_scaled_reward": -0.06402994319796562, "rewards/format_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 2961.8194580078125, "epoch": 0.155955441302485, "grad_norm": 0.17576223611831665, "kl": 0.004119873046875, "learning_rate": 9.825677631722435e-07, "loss": 0.0293, "reward": -0.06583835743367672, "reward_std": 0.6373212188482285, "rewards/cosine_scaled_reward": -0.03291917638853192, "rewards/format_reward": 0.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 2929.5277709960938, "epoch": 0.15766923736075408, "grad_norm": 0.14930115640163422, "kl": 0.0034623146057128906, "learning_rate": 9.816912885430258e-07, "loss": 0.0296, "reward": -0.18032184429466724, "reward_std": 0.6196585968136787, "rewards/cosine_scaled_reward": -0.09016093239188194, "rewards/format_reward": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 2748.7361450195312, "epoch": 0.15938303341902313, "grad_norm": 0.1628389209508896, "kl": 0.0011005401611328125, "learning_rate": 9.807937738894303e-07, "loss": 0.0544, "reward": -0.048349371179938316, "reward_std": 0.5468417555093765, "rewards/cosine_scaled_reward": -0.024174699559807777, "rewards/format_reward": 0.0, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 2974.8194580078125, "epoch": 0.1610968294772922, "grad_norm": 0.17104412615299225, "kl": 0.0025157928466796875, "learning_rate": 9.798752629550546e-07, "loss": 0.0562, "reward": -0.10820803185924888, "reward_std": 0.5462353378534317, "rewards/cosine_scaled_reward": -0.05410401395056397, "rewards/format_reward": 0.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 2822.3055419921875, "epoch": 0.16281062553556128, "grad_norm": 0.22087068855762482, "kl": 0.0032253265380859375, "learning_rate": 9.78935800506826e-07, "loss": 0.0157, "reward": -0.2787464428693056, "reward_std": 0.5101591870188713, "rewards/cosine_scaled_reward": -0.139373216079548, "rewards/format_reward": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 3177.1389770507812, "epoch": 0.16452442159383032, "grad_norm": 0.13341942429542542, "kl": 0.0016889572143554688, "learning_rate": 9.779754323328192e-07, "loss": 0.0599, "reward": 0.22422180697321892, "reward_std": 0.6203102543950081, "rewards/cosine_scaled_reward": 0.11211090348660946, "rewards/format_reward": 0.0, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 3359.1666870117188, "epoch": 0.1662382176520994, "grad_norm": 0.17103053629398346, "kl": 0.0048770904541015625, "learning_rate": 9.769942052400235e-07, "loss": 0.0584, "reward": -0.34769631922245026, "reward_std": 0.5649063661694527, "rewards/cosine_scaled_reward": -0.17384816892445087, "rewards/format_reward": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 2853.8472290039062, "epoch": 0.16795201371036847, "grad_norm": 0.16162103414535522, "kl": 0.002391815185546875, "learning_rate": 9.759921670520634e-07, "loss": -0.0363, "reward": 0.04994424246251583, "reward_std": 0.4738911837339401, "rewards/cosine_scaled_reward": 0.024972120765596628, "rewards/format_reward": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 3113.9722290039062, "epoch": 0.16966580976863754, "grad_norm": 0.17794044315814972, "kl": 0.002719879150390625, "learning_rate": 9.749693666068663e-07, "loss": 0.0017, "reward": -0.16785867512226105, "reward_std": 0.5008634850382805, "rewards/cosine_scaled_reward": -0.08392933756113052, "rewards/format_reward": 0.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 2779.9862060546875, "epoch": 0.1713796058269066, "grad_norm": 0.1735229194164276, "kl": 0.005786895751953125, "learning_rate": 9.739258537542835e-07, "loss": -0.0595, "reward": -0.15765622071921825, "reward_std": 0.4426313266158104, "rewards/cosine_scaled_reward": -0.07882811967283487, "rewards/format_reward": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 2904.5833129882812, "epoch": 0.17309340188517566, "grad_norm": 0.16130799055099487, "kl": 0.0022287368774414062, "learning_rate": 9.728616793536587e-07, "loss": -0.027, "reward": -0.2833556551486254, "reward_std": 0.41574449837207794, "rewards/cosine_scaled_reward": -0.14167783502489328, "rewards/format_reward": 0.0, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 2956.9444580078125, "epoch": 0.17480719794344474, "grad_norm": 0.14904557168483734, "kl": 0.0023751258850097656, "learning_rate": 9.717768952713511e-07, "loss": 0.0249, "reward": -0.005829242058098316, "reward_std": 0.49208924546837807, "rewards/cosine_scaled_reward": -0.002914619166404009, "rewards/format_reward": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 3127.75, "epoch": 0.17652099400171378, "grad_norm": 0.13523682951927185, "kl": 0.0023593902587890625, "learning_rate": 9.706715543782064e-07, "loss": 0.0048, "reward": -0.16767939552664757, "reward_std": 0.497691310942173, "rewards/cosine_scaled_reward": -0.08383970521390438, "rewards/format_reward": 0.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3349.888916015625, "epoch": 0.17823479005998286, "grad_norm": 0.16127026081085205, "kl": 0.002029895782470703, "learning_rate": 9.695457105469804e-07, "loss": -0.0079, "reward": -0.4253583773970604, "reward_std": 0.5213425680994987, "rewards/cosine_scaled_reward": -0.2126791886985302, "rewards/format_reward": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2762.3056030273438, "epoch": 0.17994858611825193, "grad_norm": 0.22534409165382385, "kl": 0.004019737243652344, "learning_rate": 9.683994186497132e-07, "loss": 0.0786, "reward": -0.13280940428376198, "reward_std": 0.6939076110720634, "rewards/cosine_scaled_reward": -0.06640470400452614, "rewards/format_reward": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 3027.013885498047, "epoch": 0.181662382176521, "grad_norm": 0.18191885948181152, "kl": 0.001827239990234375, "learning_rate": 9.672327345550543e-07, "loss": 0.0572, "reward": -0.30150486156344414, "reward_std": 0.5941706523299217, "rewards/cosine_scaled_reward": -0.15075243171304464, "rewards/format_reward": 0.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 3236.4166870117188, "epoch": 0.18337617823479005, "grad_norm": 0.12520797550678253, "kl": 0.002315521240234375, "learning_rate": 9.66045715125541e-07, "loss": 0.0039, "reward": 0.061343319714069366, "reward_std": 0.5028644949197769, "rewards/cosine_scaled_reward": 0.030671661719679832, "rewards/format_reward": 0.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 3337.3056030273438, "epoch": 0.18508997429305912, "grad_norm": 0.14343461394309998, "kl": 0.0016498565673828125, "learning_rate": 9.648384182148252e-07, "loss": 0.0438, "reward": -0.17464184761047363, "reward_std": 0.5610974803566933, "rewards/cosine_scaled_reward": -0.08732092566788197, "rewards/format_reward": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 2781.4305419921875, "epoch": 0.1868037703513282, "grad_norm": 0.1800822913646698, "kl": 0.0033397674560546875, "learning_rate": 9.636109026648554e-07, "loss": 0.0242, "reward": -0.26444700360298157, "reward_std": 0.5241282097995281, "rewards/cosine_scaled_reward": -0.13222350925207138, "rewards/format_reward": 0.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 2989.3472290039062, "epoch": 0.18851756640959727, "grad_norm": 0.24952495098114014, "kl": 0.00222015380859375, "learning_rate": 9.623632283030077e-07, "loss": 0.1294, "reward": -0.038819944486021996, "reward_std": 0.7193348854780197, "rewards/cosine_scaled_reward": -0.019409974105656147, "rewards/format_reward": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2775.6111450195312, "epoch": 0.19023136246786632, "grad_norm": 0.16514870524406433, "kl": 0.0029087066650390625, "learning_rate": 9.610954559391704e-07, "loss": 0.0565, "reward": -0.3495597681030631, "reward_std": 0.3909125030040741, "rewards/cosine_scaled_reward": -0.17477987939491868, "rewards/format_reward": 0.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 2831.8055419921875, "epoch": 0.1919451585261354, "grad_norm": 0.13825589418411255, "kl": 0.0023212432861328125, "learning_rate": 9.598076473627796e-07, "loss": 0.0231, "reward": -0.17934568971395493, "reward_std": 0.5252480655908585, "rewards/cosine_scaled_reward": -0.08967284485697746, "rewards/format_reward": 0.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 2977.8333740234375, "epoch": 0.19365895458440446, "grad_norm": 0.15359072387218475, "kl": 0.002410888671875, "learning_rate": 9.58499865339809e-07, "loss": 0.0053, "reward": 0.25896316685248166, "reward_std": 0.705707773566246, "rewards/cosine_scaled_reward": 0.12948158156359568, "rewards/format_reward": 0.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 2234.2500610351562, "epoch": 0.1953727506426735, "grad_norm": 0.17650143802165985, "kl": 0.002643585205078125, "learning_rate": 9.571721736097088e-07, "loss": -0.0481, "reward": -0.20779240669799037, "reward_std": 0.50680061429739, "rewards/cosine_scaled_reward": -0.10389620521164034, "rewards/format_reward": 0.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 2372.65283203125, "epoch": 0.19708654670094258, "grad_norm": 0.21576084196567535, "kl": 0.007110595703125, "learning_rate": 9.55824636882301e-07, "loss": 0.1072, "reward": 0.03794890362769365, "reward_std": 0.6275844648480415, "rewards/cosine_scaled_reward": 0.018974455073475838, "rewards/format_reward": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 3263.27783203125, "epoch": 0.19880034275921166, "grad_norm": 0.18672628700733185, "kl": 0.0029048919677734375, "learning_rate": 9.54457320834625e-07, "loss": -0.034, "reward": -0.3033560863696039, "reward_std": 0.5516846142709255, "rewards/cosine_scaled_reward": -0.15167804807424545, "rewards/format_reward": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 3024.2500610351562, "epoch": 0.20051413881748073, "grad_norm": 0.1308911144733429, "kl": 0.00705718994140625, "learning_rate": 9.530702921077358e-07, "loss": 0.0178, "reward": 0.19102132320404053, "reward_std": 0.7014489844441414, "rewards/cosine_scaled_reward": 0.09551066905260086, "rewards/format_reward": 0.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 2574.013916015625, "epoch": 0.20222793487574978, "grad_norm": 0.325631320476532, "kl": 0.013393402099609375, "learning_rate": 9.516636183034564e-07, "loss": 0.0659, "reward": -0.29521266371011734, "reward_std": 0.5856474936008453, "rewards/cosine_scaled_reward": -0.1476063383743167, "rewards/format_reward": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 2724.3333740234375, "epoch": 0.20394173093401885, "grad_norm": 0.14827784895896912, "kl": 0.0027828216552734375, "learning_rate": 9.502373679810839e-07, "loss": 0.0141, "reward": -0.03255775198340416, "reward_std": 0.34701335430145264, "rewards/cosine_scaled_reward": -0.01627887785434723, "rewards/format_reward": 0.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 2813.1111450195312, "epoch": 0.20565552699228792, "grad_norm": 0.21779808402061462, "kl": 0.0053081512451171875, "learning_rate": 9.487916106540465e-07, "loss": 0.0158, "reward": -0.19739244412630796, "reward_std": 0.6424184814095497, "rewards/cosine_scaled_reward": -0.0986962317256257, "rewards/format_reward": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2874.0000610351562, "epoch": 0.207369323050557, "grad_norm": 0.2778118848800659, "kl": 0.0032444000244140625, "learning_rate": 9.473264167865171e-07, "loss": 0.0937, "reward": -0.15650038793683052, "reward_std": 0.5867400094866753, "rewards/cosine_scaled_reward": -0.07825020421296358, "rewards/format_reward": 0.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 3251.1112060546875, "epoch": 0.20908311910882604, "grad_norm": 0.12883791327476501, "kl": 0.003650665283203125, "learning_rate": 9.458418577899774e-07, "loss": 0.02, "reward": -0.30216934718191624, "reward_std": 0.5233990028500557, "rewards/cosine_scaled_reward": -0.1510846719611436, "rewards/format_reward": 0.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2774.3194580078125, "epoch": 0.21079691516709512, "grad_norm": 0.20982016623020172, "kl": 0.002071380615234375, "learning_rate": 9.443380060197385e-07, "loss": 0.0498, "reward": 0.3517572022974491, "reward_std": 0.7633289247751236, "rewards/cosine_scaled_reward": 0.17587858624756336, "rewards/format_reward": 0.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 3077.4166259765625, "epoch": 0.2125107112253642, "grad_norm": 0.14578428864479065, "kl": 0.0024623870849609375, "learning_rate": 9.428149347714143e-07, "loss": 0.002, "reward": -0.09189963340759277, "reward_std": 0.4004024267196655, "rewards/cosine_scaled_reward": -0.04594981297850609, "rewards/format_reward": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 2898.5556030273438, "epoch": 0.21422450728363324, "grad_norm": 0.19108974933624268, "kl": 0.0016632080078125, "learning_rate": 9.412727182773486e-07, "loss": 0.0218, "reward": 0.01400849362835288, "reward_std": 0.5958191454410553, "rewards/cosine_scaled_reward": 0.007004249142482877, "rewards/format_reward": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 3005.916748046875, "epoch": 0.2159383033419023, "grad_norm": 0.26980966329574585, "kl": 0.004871368408203125, "learning_rate": 9.397114317029974e-07, "loss": 0.0539, "reward": -0.19987820833921432, "reward_std": 0.5232749357819557, "rewards/cosine_scaled_reward": -0.09993909671902657, "rewards/format_reward": 0.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2932.5416870117188, "epoch": 0.21765209940017138, "grad_norm": 0.15654343366622925, "kl": 0.0043792724609375, "learning_rate": 9.381311511432658e-07, "loss": 0.0405, "reward": -0.17467445600777864, "reward_std": 0.5738040953874588, "rewards/cosine_scaled_reward": -0.0873372326605022, "rewards/format_reward": 0.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3097.6666870117188, "epoch": 0.21936589545844046, "grad_norm": 0.16381874680519104, "kl": 0.00551605224609375, "learning_rate": 9.36531953618799e-07, "loss": -0.0288, "reward": -0.20874720811843872, "reward_std": 0.5535652860999107, "rewards/cosine_scaled_reward": -0.10437360778450966, "rewards/format_reward": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 2531.861114501953, "epoch": 0.2210796915167095, "grad_norm": 0.26021480560302734, "kl": 0.006000518798828125, "learning_rate": 9.34913917072228e-07, "loss": 0.0459, "reward": -0.044261377304792404, "reward_std": 0.4739195331931114, "rewards/cosine_scaled_reward": -0.022130683064460754, "rewards/format_reward": 0.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 2091.138916015625, "epoch": 0.22279348757497858, "grad_norm": 0.22645580768585205, "kl": 0.00482940673828125, "learning_rate": 9.332771203643714e-07, "loss": -0.0704, "reward": 0.38943320140242577, "reward_std": 0.7351026237010956, "rewards/cosine_scaled_reward": 0.19471661932766438, "rewards/format_reward": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 3235.513916015625, "epoch": 0.22450728363324765, "grad_norm": 0.14915120601654053, "kl": 0.003574371337890625, "learning_rate": 9.316216432703916e-07, "loss": 0.0073, "reward": -0.32377296313643456, "reward_std": 0.48132046312093735, "rewards/cosine_scaled_reward": -0.16188647784292698, "rewards/format_reward": 0.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 3044.4166870117188, "epoch": 0.2262210796915167, "grad_norm": 0.16817504167556763, "kl": 0.003704071044921875, "learning_rate": 9.299475664759068e-07, "loss": 0.0174, "reward": -0.18535634828731418, "reward_std": 0.6574838161468506, "rewards/cosine_scaled_reward": -0.0926781720481813, "rewards/format_reward": 0.0, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 3003.4444580078125, "epoch": 0.22793487574978577, "grad_norm": 0.15358978509902954, "kl": 0.0041980743408203125, "learning_rate": 9.282549715730579e-07, "loss": 0.0337, "reward": -0.05171632254496217, "reward_std": 0.5909973978996277, "rewards/cosine_scaled_reward": -0.025858158012852073, "rewards/format_reward": 0.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 2156.4583740234375, "epoch": 0.22964867180805484, "grad_norm": 0.1683642566204071, "kl": 0.00467681884765625, "learning_rate": 9.265439410565328e-07, "loss": 0.0033, "reward": 0.009663693606853485, "reward_std": 0.4995303153991699, "rewards/cosine_scaled_reward": 0.004831850528717041, "rewards/format_reward": 0.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2540.2777709960938, "epoch": 0.23136246786632392, "grad_norm": 0.1953487992286682, "kl": 0.00795745849609375, "learning_rate": 9.248145583195447e-07, "loss": 0.0474, "reward": -0.21851413743570447, "reward_std": 0.5443524122238159, "rewards/cosine_scaled_reward": -0.10925705661065876, "rewards/format_reward": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 2695.0694885253906, "epoch": 0.23307626392459296, "grad_norm": 0.1705743372440338, "kl": 0.0045166015625, "learning_rate": 9.230669076497687e-07, "loss": 0.0191, "reward": 0.05242172256112099, "reward_std": 0.5593772605061531, "rewards/cosine_scaled_reward": 0.026210861280560493, "rewards/format_reward": 0.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 3158.9444580078125, "epoch": 0.23479005998286204, "grad_norm": 0.17036336660385132, "kl": 0.00504302978515625, "learning_rate": 9.213010742252327e-07, "loss": 0.0254, "reward": 0.028430916368961334, "reward_std": 0.7066435366868973, "rewards/cosine_scaled_reward": 0.014215447008609772, "rewards/format_reward": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 3004.0416259765625, "epoch": 0.2365038560411311, "grad_norm": 0.1331450194120407, "kl": 0.003459930419921875, "learning_rate": 9.195171441101668e-07, "loss": -0.0176, "reward": -0.014733657240867615, "reward_std": 0.5561396405100822, "rewards/cosine_scaled_reward": -0.007366828620433807, "rewards/format_reward": 0.0, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 2905.4444580078125, "epoch": 0.23821765209940018, "grad_norm": 0.17066888511180878, "kl": 0.00594329833984375, "learning_rate": 9.177152042508077e-07, "loss": 0.0097, "reward": -0.19389863312244415, "reward_std": 0.47480132430791855, "rewards/cosine_scaled_reward": -0.09694933146238327, "rewards/format_reward": 0.0, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 2243.4722595214844, "epoch": 0.23993144815766923, "grad_norm": 0.2052508443593979, "kl": 0.0069751739501953125, "learning_rate": 9.158953424711624e-07, "loss": 0.0149, "reward": -0.17606773134320974, "reward_std": 0.4814153388142586, "rewards/cosine_scaled_reward": -0.08803386008366942, "rewards/format_reward": 0.0, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 3140.3472290039062, "epoch": 0.2416452442159383, "grad_norm": 0.2093152105808258, "kl": 0.009227752685546875, "learning_rate": 9.140576474687263e-07, "loss": -0.0173, "reward": -0.2940823882818222, "reward_std": 0.46395206451416016, "rewards/cosine_scaled_reward": -0.1470412015914917, "rewards/format_reward": 0.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 3106.0833740234375, "epoch": 0.24335904027420738, "grad_norm": 0.15536610782146454, "kl": 0.003704071044921875, "learning_rate": 9.122022088101613e-07, "loss": -0.0133, "reward": -0.12113199383020401, "reward_std": 0.5028039142489433, "rewards/cosine_scaled_reward": -0.06056600622832775, "rewards/format_reward": 0.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 3045.0, "epoch": 0.24507283633247642, "grad_norm": 0.1554841846227646, "kl": 0.00397491455078125, "learning_rate": 9.103291169269299e-07, "loss": 0.0112, "reward": -0.24326159805059433, "reward_std": 0.545206792652607, "rewards/cosine_scaled_reward": -0.12163079530000687, "rewards/format_reward": 0.0, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 3281.52783203125, "epoch": 0.2467866323907455, "grad_norm": 0.15741369128227234, "kl": 0.004161834716796875, "learning_rate": 9.084384631108882e-07, "loss": 0.0205, "reward": -0.3316431827843189, "reward_std": 0.5960408300161362, "rewards/cosine_scaled_reward": -0.16582159511744976, "rewards/format_reward": 0.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 2848.000030517578, "epoch": 0.24850042844901457, "grad_norm": 0.16731388866901398, "kl": 0.004131317138671875, "learning_rate": 9.065303395098358e-07, "loss": 0.0076, "reward": -0.030747827142477036, "reward_std": 0.532738171517849, "rewards/cosine_scaled_reward": -0.015373910777270794, "rewards/format_reward": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2467.6944580078125, "epoch": 0.25021422450728364, "grad_norm": 0.21354977786540985, "kl": 0.004360198974609375, "learning_rate": 9.046048391230247e-07, "loss": 0.0206, "reward": 0.0201254915446043, "reward_std": 0.8671004623174667, "rewards/cosine_scaled_reward": 0.010062748566269875, "rewards/format_reward": 0.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 2855.6666870117188, "epoch": 0.2519280205655527, "grad_norm": 0.144964799284935, "kl": 0.004791259765625, "learning_rate": 9.026620557966279e-07, "loss": -0.0329, "reward": -0.2643125932663679, "reward_std": 0.5043439790606499, "rewards/cosine_scaled_reward": -0.13215629663318396, "rewards/format_reward": 0.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 3096.513916015625, "epoch": 0.2536418166238218, "grad_norm": 0.14218087494373322, "kl": 0.003002166748046875, "learning_rate": 9.007020842191634e-07, "loss": -0.0089, "reward": -0.221635602414608, "reward_std": 0.4477159082889557, "rewards/cosine_scaled_reward": -0.11081778630614281, "rewards/format_reward": 0.0, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 2720.263916015625, "epoch": 0.25535561268209084, "grad_norm": 0.17918290197849274, "kl": 0.004497528076171875, "learning_rate": 8.987250199168808e-07, "loss": -0.0669, "reward": -0.07472209073603153, "reward_std": 0.6641673818230629, "rewards/cosine_scaled_reward": -0.03736104257404804, "rewards/format_reward": 0.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 2809.666717529297, "epoch": 0.2570694087403599, "grad_norm": 0.13525010645389557, "kl": 0.004337310791015625, "learning_rate": 8.967309592491052e-07, "loss": 0.0411, "reward": 0.32146409433335066, "reward_std": 0.6463728100061417, "rewards/cosine_scaled_reward": 0.16073204344138503, "rewards/format_reward": 0.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 2688.6944580078125, "epoch": 0.258783204798629, "grad_norm": 0.29565665125846863, "kl": 0.0060577392578125, "learning_rate": 8.9471999940354e-07, "loss": -0.0998, "reward": -0.27624649833887815, "reward_std": 0.46585455536842346, "rewards/cosine_scaled_reward": -0.13812324171885848, "rewards/format_reward": 0.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 2944.888916015625, "epoch": 0.26049700085689803, "grad_norm": 0.15996259450912476, "kl": 0.005001068115234375, "learning_rate": 8.926922383915315e-07, "loss": 0.0467, "reward": -0.09553277865052223, "reward_std": 0.5195184722542763, "rewards/cosine_scaled_reward": -0.047766391187906265, "rewards/format_reward": 0.0, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 2880.5833740234375, "epoch": 0.2622107969151671, "grad_norm": 0.16603334248065948, "kl": 0.003936767578125, "learning_rate": 8.906477750432903e-07, "loss": 0.0105, "reward": -0.19235826842486858, "reward_std": 0.5736033394932747, "rewards/cosine_scaled_reward": -0.09617912326939404, "rewards/format_reward": 0.0, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 2859.0972900390625, "epoch": 0.2639245929734362, "grad_norm": 0.17567692697048187, "kl": 0.004161834716796875, "learning_rate": 8.88586709003076e-07, "loss": -0.0056, "reward": -0.19033684581518173, "reward_std": 0.5773953720927238, "rewards/cosine_scaled_reward": -0.09516842663288116, "rewards/format_reward": 0.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3215.1666870117188, "epoch": 0.2656383890317052, "grad_norm": 0.14003609120845795, "kl": 0.004474639892578125, "learning_rate": 8.865091407243394e-07, "loss": 0.0216, "reward": -0.1411176547408104, "reward_std": 0.6216752380132675, "rewards/cosine_scaled_reward": -0.07055883854627609, "rewards/format_reward": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 2929.1250610351562, "epoch": 0.26735218508997427, "grad_norm": 0.14357882738113403, "kl": 0.003513336181640625, "learning_rate": 8.844151714648274e-07, "loss": -0.0355, "reward": -0.26859963312745094, "reward_std": 0.501942828297615, "rewards/cosine_scaled_reward": -0.13429982028901577, "rewards/format_reward": 0.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2811.6666870117188, "epoch": 0.26906598114824337, "grad_norm": 0.18389619886875153, "kl": 0.006900787353515625, "learning_rate": 8.823049032816478e-07, "loss": -0.049, "reward": 0.005984093062579632, "reward_std": 0.7341288924217224, "rewards/cosine_scaled_reward": 0.0029920428059995174, "rewards/format_reward": 0.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 2909.77783203125, "epoch": 0.2707797772065124, "grad_norm": 0.13957001268863678, "kl": 0.0042877197265625, "learning_rate": 8.801784390262943e-07, "loss": 0.0033, "reward": -0.17342954874038696, "reward_std": 0.4903194531798363, "rewards/cosine_scaled_reward": -0.08671476691961288, "rewards/format_reward": 0.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 3235.013916015625, "epoch": 0.27249357326478146, "grad_norm": 0.15003739297389984, "kl": 0.005523681640625, "learning_rate": 8.780358823396352e-07, "loss": 0.0068, "reward": -0.3410843312740326, "reward_std": 0.502905935049057, "rewards/cosine_scaled_reward": -0.17054216749966145, "rewards/format_reward": 0.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 3160.6805419921875, "epoch": 0.27420736932305056, "grad_norm": 0.1586807668209076, "kl": 0.00443267822265625, "learning_rate": 8.758773376468604e-07, "loss": 0.0141, "reward": 0.04759278893470764, "reward_std": 0.6465433575212955, "rewards/cosine_scaled_reward": 0.02379640005528927, "rewards/format_reward": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 2961.3333129882812, "epoch": 0.2759211653813196, "grad_norm": 0.18396639823913574, "kl": 0.0078125, "learning_rate": 8.737029101523929e-07, "loss": 0.0282, "reward": -0.32911188155412674, "reward_std": 0.6032818555831909, "rewards/cosine_scaled_reward": -0.16455595009028912, "rewards/format_reward": 0.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2621.2222290039062, "epoch": 0.2776349614395887, "grad_norm": 0.15461236238479614, "kl": 0.00507354736328125, "learning_rate": 8.715127058347614e-07, "loss": -0.0194, "reward": -0.4356637103483081, "reward_std": 0.36323027312755585, "rewards/cosine_scaled_reward": -0.21783185191452503, "rewards/format_reward": 0.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 3038.4027709960938, "epoch": 0.27934875749785776, "grad_norm": 0.12717723846435547, "kl": 0.005706787109375, "learning_rate": 8.693068314414344e-07, "loss": 0.0023, "reward": -0.04007915942929685, "reward_std": 0.6919823586940765, "rewards/cosine_scaled_reward": -0.02003958181012422, "rewards/format_reward": 0.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 2678.7361755371094, "epoch": 0.2810625535561268, "grad_norm": 0.19941791892051697, "kl": 0.005207061767578125, "learning_rate": 8.670853944836176e-07, "loss": 0.0441, "reward": -0.040945328772068024, "reward_std": 0.5933430567383766, "rewards/cosine_scaled_reward": -0.020472656935453415, "rewards/format_reward": 0.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 2895.0416870117188, "epoch": 0.2827763496143959, "grad_norm": 0.16098277270793915, "kl": 0.0064697265625, "learning_rate": 8.648485032310144e-07, "loss": 0.0293, "reward": -0.09013996832072735, "reward_std": 0.5875271111726761, "rewards/cosine_scaled_reward": -0.04506997298449278, "rewards/format_reward": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 2754.263916015625, "epoch": 0.28449014567266495, "grad_norm": 0.15243615210056305, "kl": 0.00676727294921875, "learning_rate": 8.625962667065487e-07, "loss": 0.0191, "reward": -0.0630449466407299, "reward_std": 0.6104780063033104, "rewards/cosine_scaled_reward": -0.0315224789083004, "rewards/format_reward": 0.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 3024.9166870117188, "epoch": 0.286203941730934, "grad_norm": 0.14153960347175598, "kl": 0.0053863525390625, "learning_rate": 8.603287946810513e-07, "loss": 0.0428, "reward": -0.1417745603248477, "reward_std": 0.7242364957928658, "rewards/cosine_scaled_reward": -0.07088728016242385, "rewards/format_reward": 0.0, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 3074.4306030273438, "epoch": 0.2879177377892031, "grad_norm": 0.1459978222846985, "kl": 0.0064067840576171875, "learning_rate": 8.580461976679099e-07, "loss": 0.0112, "reward": -0.01038459874689579, "reward_std": 0.7124739363789558, "rewards/cosine_scaled_reward": -0.0051923105493187904, "rewards/format_reward": 0.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 3006.638916015625, "epoch": 0.28963153384747214, "grad_norm": 0.2151106894016266, "kl": 0.00925445556640625, "learning_rate": 8.557485869176825e-07, "loss": 0.0553, "reward": -0.2934446856379509, "reward_std": 0.5195991396903992, "rewards/cosine_scaled_reward": -0.14672234281897545, "rewards/format_reward": 0.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 2720.8194274902344, "epoch": 0.2913453299057412, "grad_norm": 0.16352801024913788, "kl": 0.00519561767578125, "learning_rate": 8.534360744126753e-07, "loss": 0.061, "reward": 0.10783382831141353, "reward_std": 0.6230225935578346, "rewards/cosine_scaled_reward": 0.053916911128908396, "rewards/format_reward": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 2681.388946533203, "epoch": 0.2930591259640103, "grad_norm": 0.17118766903877258, "kl": 0.00641632080078125, "learning_rate": 8.511087728614862e-07, "loss": 0.026, "reward": -0.0785403607878834, "reward_std": 0.5736416950821877, "rewards/cosine_scaled_reward": -0.039270187029615045, "rewards/format_reward": 0.0, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 2987.2361450195312, "epoch": 0.29477292202227934, "grad_norm": 0.15370719134807587, "kl": 0.0035552978515625, "learning_rate": 8.487667956935087e-07, "loss": -0.0033, "reward": -0.02974682953208685, "reward_std": 0.5253070890903473, "rewards/cosine_scaled_reward": -0.014873407315462828, "rewards/format_reward": 0.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 2586.4027709960938, "epoch": 0.29648671808054844, "grad_norm": 0.22191597521305084, "kl": 0.0059356689453125, "learning_rate": 8.464102570534061e-07, "loss": -0.0092, "reward": -0.2831332399509847, "reward_std": 0.5445848181843758, "rewards/cosine_scaled_reward": -0.14156663417816162, "rewards/format_reward": 0.0, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 2655.2222900390625, "epoch": 0.2982005141388175, "grad_norm": 0.22858025133609772, "kl": 0.01116943359375, "learning_rate": 8.440392717955475e-07, "loss": -0.0181, "reward": -0.2866486459970474, "reward_std": 0.5677091330289841, "rewards/cosine_scaled_reward": -0.14332432113587856, "rewards/format_reward": 0.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 2976.8194580078125, "epoch": 0.29991431019708653, "grad_norm": 0.15686574578285217, "kl": 0.00734710693359375, "learning_rate": 8.416539554784089e-07, "loss": 0.0308, "reward": -0.3254437707364559, "reward_std": 0.5169026479125023, "rewards/cosine_scaled_reward": -0.16272189188748598, "rewards/format_reward": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 2500.736114501953, "epoch": 0.30162810625535563, "grad_norm": 0.2628232538700104, "kl": 0.03589630126953125, "learning_rate": 8.392544243589427e-07, "loss": 0.0534, "reward": -0.1589430421590805, "reward_std": 0.6641415655612946, "rewards/cosine_scaled_reward": -0.07947152107954025, "rewards/format_reward": 0.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 3302.7361450195312, "epoch": 0.3033419023136247, "grad_norm": 0.13509048521518707, "kl": 0.003936767578125, "learning_rate": 8.368407953869103e-07, "loss": -0.0077, "reward": -0.3392331041395664, "reward_std": 0.44542837142944336, "rewards/cosine_scaled_reward": -0.1696165525354445, "rewards/format_reward": 0.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 2946.9306030273438, "epoch": 0.3050556983718937, "grad_norm": 0.14318227767944336, "kl": 0.004425048828125, "learning_rate": 8.344131861991828e-07, "loss": 0.0129, "reward": 0.040801383554935455, "reward_std": 0.47273271530866623, "rewards/cosine_scaled_reward": 0.02040068805217743, "rewards/format_reward": 0.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 3092.125, "epoch": 0.3067694944301628, "grad_norm": 0.15563301742076874, "kl": 0.010219573974609375, "learning_rate": 8.319717151140072e-07, "loss": 0.045, "reward": -0.1892098607495427, "reward_std": 0.5936430767178535, "rewards/cosine_scaled_reward": -0.09460492385551333, "rewards/format_reward": 0.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1971.5138854980469, "epoch": 0.30848329048843187, "grad_norm": 0.19795306026935577, "kl": 0.00974273681640625, "learning_rate": 8.295165011252396e-07, "loss": -0.0138, "reward": -0.11939475126564503, "reward_std": 0.6153334528207779, "rewards/cosine_scaled_reward": -0.05969736957922578, "rewards/format_reward": 0.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 3067.1945190429688, "epoch": 0.3101970865467009, "grad_norm": 0.15797466039657593, "kl": 0.005138397216796875, "learning_rate": 8.270476638965461e-07, "loss": -0.0212, "reward": 0.10869292449206114, "reward_std": 0.6324612945318222, "rewards/cosine_scaled_reward": 0.054346468299627304, "rewards/format_reward": 0.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 3268.486083984375, "epoch": 0.31191088260497, "grad_norm": 0.15513566136360168, "kl": 0.006145477294921875, "learning_rate": 8.245653237555705e-07, "loss": 0.0633, "reward": -0.2609336208552122, "reward_std": 0.49053191393613815, "rewards/cosine_scaled_reward": -0.13046680949628353, "rewards/format_reward": 0.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 2989.84716796875, "epoch": 0.31362467866323906, "grad_norm": 0.15209534764289856, "kl": 0.00447845458984375, "learning_rate": 8.220696016880687e-07, "loss": 0.0061, "reward": -0.040327644906938076, "reward_std": 0.717703215777874, "rewards/cosine_scaled_reward": -0.02016383269801736, "rewards/format_reward": 0.0, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 2890.4583740234375, "epoch": 0.31533847472150817, "grad_norm": 0.1359020322561264, "kl": 0.006763458251953125, "learning_rate": 8.195606193320136e-07, "loss": 0.0369, "reward": -0.22703023999929428, "reward_std": 0.6005472913384438, "rewards/cosine_scaled_reward": -0.11351512093096972, "rewards/format_reward": 0.0, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2926.3194580078125, "epoch": 0.3170522707797772, "grad_norm": 0.13524238765239716, "kl": 0.0067901611328125, "learning_rate": 8.170384989716657e-07, "loss": 0.0495, "reward": -0.17516471818089485, "reward_std": 0.5499648228287697, "rewards/cosine_scaled_reward": -0.08758235163986683, "rewards/format_reward": 0.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2758.0833740234375, "epoch": 0.31876606683804626, "grad_norm": 0.19634363055229187, "kl": 0.0046234130859375, "learning_rate": 8.145033635316128e-07, "loss": -0.0094, "reward": -0.17140711098909378, "reward_std": 0.5592127367854118, "rewards/cosine_scaled_reward": -0.08570355176925659, "rewards/format_reward": 0.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 2399.749969482422, "epoch": 0.32047986289631536, "grad_norm": 0.14529581367969513, "kl": 0.00421905517578125, "learning_rate": 8.119553365707802e-07, "loss": 0.0233, "reward": 0.07888301834464073, "reward_std": 0.7940803468227386, "rewards/cosine_scaled_reward": 0.03944151382893324, "rewards/format_reward": 0.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 3158.486083984375, "epoch": 0.3221936589545844, "grad_norm": 0.15482233464717865, "kl": 0.0067596435546875, "learning_rate": 8.093945422764069e-07, "loss": -0.0061, "reward": -0.22822286747395992, "reward_std": 0.48042069375514984, "rewards/cosine_scaled_reward": -0.1141114397905767, "rewards/format_reward": 0.0, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 2875.52783203125, "epoch": 0.32390745501285345, "grad_norm": 0.16830354928970337, "kl": 0.00731658935546875, "learning_rate": 8.068211054579943e-07, "loss": -0.0214, "reward": -0.27129118889570236, "reward_std": 0.44227684289216995, "rewards/cosine_scaled_reward": -0.13564559258520603, "rewards/format_reward": 0.0, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 3101.52783203125, "epoch": 0.32562125107112255, "grad_norm": 0.17314012348651886, "kl": 0.006618499755859375, "learning_rate": 8.04235151541222e-07, "loss": 0.0361, "reward": -0.29743205150589347, "reward_std": 0.6253781244158745, "rewards/cosine_scaled_reward": -0.1487160255201161, "rewards/format_reward": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 3005.0694580078125, "epoch": 0.3273350471293916, "grad_norm": 0.13242636620998383, "kl": 0.005157470703125, "learning_rate": 8.01636806561836e-07, "loss": 0.0398, "reward": -0.2783219777047634, "reward_std": 0.5744869485497475, "rewards/cosine_scaled_reward": -0.139160992577672, "rewards/format_reward": 0.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 2682.2777709960938, "epoch": 0.32904884318766064, "grad_norm": 0.1771107167005539, "kl": 0.00849151611328125, "learning_rate": 7.990261971595048e-07, "loss": -0.0275, "reward": -0.16758478805422783, "reward_std": 0.5308270826935768, "rewards/cosine_scaled_reward": -0.08379239588975906, "rewards/format_reward": 0.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 2224.2083587646484, "epoch": 0.33076263924592975, "grad_norm": 0.2606137990951538, "kl": 0.0111236572265625, "learning_rate": 7.964034505716476e-07, "loss": 0.0598, "reward": -0.1425977125763893, "reward_std": 0.6462048292160034, "rewards/cosine_scaled_reward": -0.0712988581508398, "rewards/format_reward": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 2366.65283203125, "epoch": 0.3324764353041988, "grad_norm": 0.20748130977153778, "kl": 0.00627899169921875, "learning_rate": 7.93768694627233e-07, "loss": 0.0302, "reward": 0.07216466031968594, "reward_std": 0.5604969188570976, "rewards/cosine_scaled_reward": 0.03608234319835901, "rewards/format_reward": 0.0, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 3086.3889770507812, "epoch": 0.3341902313624679, "grad_norm": 0.16518257558345795, "kl": 0.007572174072265625, "learning_rate": 7.911220577405484e-07, "loss": 0.0403, "reward": -0.2750488445162773, "reward_std": 0.44911373406648636, "rewards/cosine_scaled_reward": -0.13752441480755806, "rewards/format_reward": 0.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2695.0416870117188, "epoch": 0.33590402742073694, "grad_norm": 0.14707054197788239, "kl": 0.0079803466796875, "learning_rate": 7.884636689049422e-07, "loss": 0.0299, "reward": 0.3252771459519863, "reward_std": 0.7292146235704422, "rewards/cosine_scaled_reward": 0.162638571113348, "rewards/format_reward": 0.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 2711.7083129882812, "epoch": 0.337617823479006, "grad_norm": 0.19674766063690186, "kl": 0.00566864013671875, "learning_rate": 7.857936576865356e-07, "loss": 0.0125, "reward": 0.1904342882335186, "reward_std": 0.6823486983776093, "rewards/cosine_scaled_reward": 0.09521715994924307, "rewards/format_reward": 0.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2801.2361450195312, "epoch": 0.3393316195372751, "grad_norm": 0.17002622783184052, "kl": 0.00652313232421875, "learning_rate": 7.831121542179086e-07, "loss": 0.0551, "reward": -0.1881256103515625, "reward_std": 0.41709040850400925, "rewards/cosine_scaled_reward": -0.0940628070384264, "rewards/format_reward": 0.0, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 2945.0139770507812, "epoch": 0.34104541559554413, "grad_norm": 0.17246587574481964, "kl": 0.006256103515625, "learning_rate": 7.804192891917571e-07, "loss": -0.0014, "reward": -0.20545833744108677, "reward_std": 0.5765868201851845, "rewards/cosine_scaled_reward": -0.10272916965186596, "rewards/format_reward": 0.0, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 2718.9583435058594, "epoch": 0.3427592116538132, "grad_norm": 0.184196338057518, "kl": 0.008544921875, "learning_rate": 7.777151938545235e-07, "loss": 0.016, "reward": -0.036401793360710144, "reward_std": 0.7076919972896576, "rewards/cosine_scaled_reward": -0.01820090040564537, "rewards/format_reward": 0.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 2624.166717529297, "epoch": 0.3444730077120823, "grad_norm": 0.2025025188922882, "kl": 0.00604248046875, "learning_rate": 7.75e-07, "loss": -0.0684, "reward": -0.23150286450982094, "reward_std": 0.4834456667304039, "rewards/cosine_scaled_reward": -0.11575142852962017, "rewards/format_reward": 0.0, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 2488.5556030273438, "epoch": 0.3461868037703513, "grad_norm": 0.15225747227668762, "kl": 0.005893707275390625, "learning_rate": 7.72273839962904e-07, "loss": -0.0317, "reward": 0.06343521224334836, "reward_std": 0.6216820403933525, "rewards/cosine_scaled_reward": 0.03171759960241616, "rewards/format_reward": 0.0, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 2638.0556030273438, "epoch": 0.34790059982862037, "grad_norm": 0.19878201186656952, "kl": 0.00801849365234375, "learning_rate": 7.695368466124296e-07, "loss": 0.0177, "reward": 0.24296507984399796, "reward_std": 0.7006724625825882, "rewards/cosine_scaled_reward": 0.12148253805935383, "rewards/format_reward": 0.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 2701.2499389648438, "epoch": 0.3496143958868895, "grad_norm": 0.16115106642246246, "kl": 0.005344390869140625, "learning_rate": 7.667891533457718e-07, "loss": 0.0175, "reward": -0.01583041623234749, "reward_std": 0.5048926845192909, "rewards/cosine_scaled_reward": -0.007915209047496319, "rewards/format_reward": 0.0, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 2820.15283203125, "epoch": 0.3513281919451585, "grad_norm": 0.1620146483182907, "kl": 0.00921630859375, "learning_rate": 7.640308940816239e-07, "loss": 0.0106, "reward": 0.10508427396416664, "reward_std": 0.5011924579739571, "rewards/cosine_scaled_reward": 0.05254213139414787, "rewards/format_reward": 0.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 2766.77783203125, "epoch": 0.35304198800342756, "grad_norm": 0.17725811898708344, "kl": 0.0068359375, "learning_rate": 7.612622032536507e-07, "loss": 0.0292, "reward": -0.025651058182120323, "reward_std": 0.6831357106566429, "rewards/cosine_scaled_reward": -0.012825531885027885, "rewards/format_reward": 0.0, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 2571.3333740234375, "epoch": 0.35475578406169667, "grad_norm": 0.2153560221195221, "kl": 0.00772857666015625, "learning_rate": 7.584832158039378e-07, "loss": -0.0053, "reward": -0.0772455558180809, "reward_std": 0.5703203156590462, "rewards/cosine_scaled_reward": -0.038622772321105, "rewards/format_reward": 0.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 2722.9444580078125, "epoch": 0.3564695801199657, "grad_norm": 0.2059468924999237, "kl": 0.00662994384765625, "learning_rate": 7.556940671764124e-07, "loss": 0.0612, "reward": -0.18379988404922187, "reward_std": 0.6482012867927551, "rewards/cosine_scaled_reward": -0.09189994307234883, "rewards/format_reward": 0.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2714.9445190429688, "epoch": 0.3581833761782348, "grad_norm": 0.1764851063489914, "kl": 0.00818634033203125, "learning_rate": 7.528948933102438e-07, "loss": 0.0477, "reward": -0.011997078021522611, "reward_std": 0.6311939656734467, "rewards/cosine_scaled_reward": -0.005998534747050144, "rewards/format_reward": 0.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 2458.9583740234375, "epoch": 0.35989717223650386, "grad_norm": 0.23969826102256775, "kl": 0.00959014892578125, "learning_rate": 7.500858306332172e-07, "loss": -0.0174, "reward": -0.052909690886735916, "reward_std": 0.6342033296823502, "rewards/cosine_scaled_reward": -0.026454854756593704, "rewards/format_reward": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 3275.9722290039062, "epoch": 0.3616109682947729, "grad_norm": 0.14406003057956696, "kl": 0.0072784423828125, "learning_rate": 7.472670160550848e-07, "loss": -0.0075, "reward": -0.4154173508286476, "reward_std": 0.47341830283403397, "rewards/cosine_scaled_reward": -0.2077086754143238, "rewards/format_reward": 0.0, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 2626.1805419921875, "epoch": 0.363324764353042, "grad_norm": 0.1677497923374176, "kl": 0.007781982421875, "learning_rate": 7.444385869608921e-07, "loss": -0.0218, "reward": -0.05068176053464413, "reward_std": 0.5218113884329796, "rewards/cosine_scaled_reward": -0.025340883061289787, "rewards/format_reward": 0.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2894.8472290039062, "epoch": 0.36503856041131105, "grad_norm": 0.17942826449871063, "kl": 0.00614166259765625, "learning_rate": 7.416006812042827e-07, "loss": 0.0757, "reward": -0.09456230141222477, "reward_std": 0.6797711104154587, "rewards/cosine_scaled_reward": -0.0472811465151608, "rewards/format_reward": 0.0, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2945.8750610351562, "epoch": 0.3667523564695801, "grad_norm": 0.1967238038778305, "kl": 0.0100555419921875, "learning_rate": 7.387534371007797e-07, "loss": -0.0128, "reward": -0.10412277281284332, "reward_std": 0.7091450989246368, "rewards/cosine_scaled_reward": -0.05206138640642166, "rewards/format_reward": 0.0, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 2597.4444580078125, "epoch": 0.3684661525278492, "grad_norm": 0.19232463836669922, "kl": 0.00603485107421875, "learning_rate": 7.358969934210438e-07, "loss": 0.0557, "reward": 0.07514850981533527, "reward_std": 0.5688696801662445, "rewards/cosine_scaled_reward": 0.03757425490766764, "rewards/format_reward": 0.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 3071.6805419921875, "epoch": 0.37017994858611825, "grad_norm": 0.15334580838680267, "kl": 0.00914764404296875, "learning_rate": 7.330314893841101e-07, "loss": -0.0092, "reward": -0.3550204383209348, "reward_std": 0.36161456257104874, "rewards/cosine_scaled_reward": -0.1775102224200964, "rewards/format_reward": 0.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 2696.5972595214844, "epoch": 0.3718937446443873, "grad_norm": 0.1864735186100006, "kl": 0.005496978759765625, "learning_rate": 7.301570646506027e-07, "loss": 0.0101, "reward": -0.07679219171404839, "reward_std": 0.6243979334831238, "rewards/cosine_scaled_reward": -0.03839609259739518, "rewards/format_reward": 0.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2563.0834045410156, "epoch": 0.3736075407026564, "grad_norm": 0.16931480169296265, "kl": 0.005645751953125, "learning_rate": 7.27273859315928e-07, "loss": -0.006, "reward": -0.06438015587627888, "reward_std": 0.4739932492375374, "rewards/cosine_scaled_reward": -0.032190063036978245, "rewards/format_reward": 0.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 3087.9306030273438, "epoch": 0.37532133676092544, "grad_norm": 0.15486636757850647, "kl": 0.00830841064453125, "learning_rate": 7.243820139034464e-07, "loss": 0.034, "reward": -0.1913878731429577, "reward_std": 0.7374170869588852, "rewards/cosine_scaled_reward": -0.09569394029676914, "rewards/format_reward": 0.0, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 3060.9445190429688, "epoch": 0.37703513281919454, "grad_norm": 0.1392551213502884, "kl": 0.00780487060546875, "learning_rate": 7.214816693576234e-07, "loss": -0.0219, "reward": -0.3524288460612297, "reward_std": 0.4711146801710129, "rewards/cosine_scaled_reward": -0.17621441558003426, "rewards/format_reward": 0.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 3082.2083129882812, "epoch": 0.3787489288774636, "grad_norm": 0.15662223100662231, "kl": 0.006549835205078125, "learning_rate": 7.185729670371604e-07, "loss": 0.0127, "reward": -0.051485654432326555, "reward_std": 0.5929789990186691, "rewards/cosine_scaled_reward": -0.025742830068338662, "rewards/format_reward": 0.0, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 2778.666717529297, "epoch": 0.38046272493573263, "grad_norm": 0.17736981809139252, "kl": 0.008647918701171875, "learning_rate": 7.156560487081051e-07, "loss": 0.0084, "reward": -0.13847951218485832, "reward_std": 0.5384139195084572, "rewards/cosine_scaled_reward": -0.06923975050449371, "rewards/format_reward": 0.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 3187.9305419921875, "epoch": 0.38217652099400173, "grad_norm": 0.13862548768520355, "kl": 0.00606536865234375, "learning_rate": 7.127310565369415e-07, "loss": 0.0539, "reward": -0.3446214310824871, "reward_std": 0.46420831978321075, "rewards/cosine_scaled_reward": -0.1723107136785984, "rewards/format_reward": 0.0, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 3035.513916015625, "epoch": 0.3838903170522708, "grad_norm": 0.17018531262874603, "kl": 0.0105438232421875, "learning_rate": 7.097981330836616e-07, "loss": 0.0259, "reward": -0.21396764740347862, "reward_std": 0.5872293263673782, "rewards/cosine_scaled_reward": -0.10698381997644901, "rewards/format_reward": 0.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 2804.013916015625, "epoch": 0.3856041131105398, "grad_norm": 0.19500206410884857, "kl": 0.00768280029296875, "learning_rate": 7.068574212948169e-07, "loss": 0.1055, "reward": -0.22558368369936943, "reward_std": 0.6132937371730804, "rewards/cosine_scaled_reward": -0.11279183439910412, "rewards/format_reward": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2667.0555725097656, "epoch": 0.3873179091688089, "grad_norm": 0.18770119547843933, "kl": 0.007568359375, "learning_rate": 7.039090644965509e-07, "loss": 0.0042, "reward": -0.05162630486302078, "reward_std": 0.6696203723549843, "rewards/cosine_scaled_reward": -0.025813143118284643, "rewards/format_reward": 0.0, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2906.6805419921875, "epoch": 0.389031705227078, "grad_norm": 0.16604122519493103, "kl": 0.0076141357421875, "learning_rate": 7.009532063876148e-07, "loss": 0.0146, "reward": -0.1345351382624358, "reward_std": 0.7545941472053528, "rewards/cosine_scaled_reward": -0.0672675691312179, "rewards/format_reward": 0.0, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 2913.791748046875, "epoch": 0.390745501285347, "grad_norm": 0.36757490038871765, "kl": 0.006805419921875, "learning_rate": 6.979899910323624e-07, "loss": 0.0796, "reward": 0.061263229697942734, "reward_std": 0.5674895793199539, "rewards/cosine_scaled_reward": 0.030631612986326218, "rewards/format_reward": 0.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2708.4027709960938, "epoch": 0.3924592973436161, "grad_norm": 0.17164915800094604, "kl": 0.0065765380859375, "learning_rate": 6.950195628537299e-07, "loss": 0.061, "reward": -0.17019816813990474, "reward_std": 0.5833596885204315, "rewards/cosine_scaled_reward": -0.08509908034466207, "rewards/format_reward": 0.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 2934.0555419921875, "epoch": 0.39417309340188517, "grad_norm": 0.16252191364765167, "kl": 0.0125732421875, "learning_rate": 6.920420666261961e-07, "loss": -0.0251, "reward": -0.27500685676932335, "reward_std": 0.4450754225254059, "rewards/cosine_scaled_reward": -0.13750343304127455, "rewards/format_reward": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 3062.4583740234375, "epoch": 0.39588688946015427, "grad_norm": 0.13106314837932587, "kl": 0.0096435546875, "learning_rate": 6.890576474687263e-07, "loss": 0.0074, "reward": -0.11593299638479948, "reward_std": 0.5865771174430847, "rewards/cosine_scaled_reward": -0.057966490276157856, "rewards/format_reward": 0.0, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2548.4444580078125, "epoch": 0.3976006855184233, "grad_norm": 0.15283794701099396, "kl": 0.00823974609375, "learning_rate": 6.860664508377001e-07, "loss": 0.0156, "reward": 0.012726329267024994, "reward_std": 0.6339813768863678, "rewards/cosine_scaled_reward": 0.006363175809383392, "rewards/format_reward": 0.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1752.4861602783203, "epoch": 0.39931448157669236, "grad_norm": 0.17673321068286896, "kl": 0.0053558349609375, "learning_rate": 6.83068622519821e-07, "loss": 0.0344, "reward": 0.3881940320134163, "reward_std": 0.6750105991959572, "rewards/cosine_scaled_reward": 0.19409702718257904, "rewards/format_reward": 0.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 2785.6111450195312, "epoch": 0.40102827763496146, "grad_norm": 0.172316312789917, "kl": 0.01160430908203125, "learning_rate": 6.800643086250121e-07, "loss": -0.0154, "reward": -0.2950245440006256, "reward_std": 0.6799461841583252, "rewards/cosine_scaled_reward": -0.1475122720003128, "rewards/format_reward": 0.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 3018.3056030273438, "epoch": 0.4027420736932305, "grad_norm": 0.15055446326732635, "kl": 0.0095977783203125, "learning_rate": 6.770536555792944e-07, "loss": -0.0457, "reward": -0.19169194623827934, "reward_std": 0.4096248298883438, "rewards/cosine_scaled_reward": -0.09584598150104284, "rewards/format_reward": 0.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 3233.8889770507812, "epoch": 0.40445586975149955, "grad_norm": 0.14838387072086334, "kl": 0.009735107421875, "learning_rate": 6.740368101176495e-07, "loss": 0.0306, "reward": -0.14736445620656013, "reward_std": 0.6041549146175385, "rewards/cosine_scaled_reward": -0.07368221180513501, "rewards/format_reward": 0.0, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 2462.7361755371094, "epoch": 0.40616966580976865, "grad_norm": 0.21186563372612, "kl": 0.0077667236328125, "learning_rate": 6.710139192768694e-07, "loss": 0.0142, "reward": -0.2296012807637453, "reward_std": 0.5129070654511452, "rewards/cosine_scaled_reward": -0.11480064131319523, "rewards/format_reward": 0.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 3028.5277709960938, "epoch": 0.4078834618680377, "grad_norm": 0.15430369973182678, "kl": 0.00980377197265625, "learning_rate": 6.679851303883891e-07, "loss": 0.0326, "reward": -0.04171431064605713, "reward_std": 0.5160864554345608, "rewards/cosine_scaled_reward": -0.020857159048318863, "rewards/format_reward": 0.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 2878.597198486328, "epoch": 0.40959725792630675, "grad_norm": 0.1511092185974121, "kl": 0.00661468505859375, "learning_rate": 6.649505910711058e-07, "loss": 0.0053, "reward": -0.08533445000648499, "reward_std": 0.48660216480493546, "rewards/cosine_scaled_reward": -0.04266723245382309, "rewards/format_reward": 0.0, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 2145.986114501953, "epoch": 0.41131105398457585, "grad_norm": 0.19034837186336517, "kl": 0.00772857666015625, "learning_rate": 6.619104492241847e-07, "loss": 0.0412, "reward": 0.22470230411272496, "reward_std": 0.5070570334792137, "rewards/cosine_scaled_reward": 0.11235115380259231, "rewards/format_reward": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2227.7361450195312, "epoch": 0.4130248500428449, "grad_norm": 0.1779133826494217, "kl": 0.008626937866210938, "learning_rate": 6.588648530198504e-07, "loss": 0.0419, "reward": -0.0513172447681427, "reward_std": 0.617318756878376, "rewards/cosine_scaled_reward": -0.025658607482910156, "rewards/format_reward": 0.0, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 3336.3333740234375, "epoch": 0.414738646101114, "grad_norm": 0.1791980117559433, "kl": 0.00740814208984375, "learning_rate": 6.558139508961654e-07, "loss": -0.0064, "reward": -0.14741731621325016, "reward_std": 0.7067866027355194, "rewards/cosine_scaled_reward": -0.07370865810662508, "rewards/format_reward": 0.0, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 3000.888916015625, "epoch": 0.41645244215938304, "grad_norm": 0.146419495344162, "kl": 0.006435394287109375, "learning_rate": 6.527578915497951e-07, "loss": 0.0311, "reward": -0.012151572853326797, "reward_std": 0.7768204510211945, "rewards/cosine_scaled_reward": -0.006075790151953697, "rewards/format_reward": 0.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2964.9444885253906, "epoch": 0.4181662382176521, "grad_norm": 0.1862625777721405, "kl": 0.00849151611328125, "learning_rate": 6.496968239287603e-07, "loss": 0.0372, "reward": -0.16059484332799911, "reward_std": 0.5683267489075661, "rewards/cosine_scaled_reward": -0.08029741793870926, "rewards/format_reward": 0.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1833.4166717529297, "epoch": 0.4198800342759212, "grad_norm": 0.3224428594112396, "kl": 0.0082550048828125, "learning_rate": 6.466308972251785e-07, "loss": -0.0459, "reward": 0.06598322093486786, "reward_std": 0.6559992954134941, "rewards/cosine_scaled_reward": 0.032991619780659676, "rewards/format_reward": 0.0, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2580.0694580078125, "epoch": 0.42159383033419023, "grad_norm": 0.1514631062746048, "kl": 0.01023101806640625, "learning_rate": 6.435602608679916e-07, "loss": 0.0527, "reward": -0.02805427461862564, "reward_std": 0.6845656186342239, "rewards/cosine_scaled_reward": -0.01402713917195797, "rewards/format_reward": 0.0, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 3155.2500610351562, "epoch": 0.4233076263924593, "grad_norm": 0.1348743587732315, "kl": 0.01012420654296875, "learning_rate": 6.404850645156841e-07, "loss": 0.0538, "reward": -0.11579635553061962, "reward_std": 0.7224173843860626, "rewards/cosine_scaled_reward": -0.057898176833987236, "rewards/format_reward": 0.0, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 2719.7222290039062, "epoch": 0.4250214224507284, "grad_norm": 0.16808690130710602, "kl": 0.010040283203125, "learning_rate": 6.374054580489873e-07, "loss": -0.0027, "reward": -0.1423700600862503, "reward_std": 0.41877883672714233, "rewards/cosine_scaled_reward": -0.07118503004312515, "rewards/format_reward": 0.0, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 3017.5416870117188, "epoch": 0.4267352185089974, "grad_norm": 0.13636116683483124, "kl": 0.01111602783203125, "learning_rate": 6.343215915635761e-07, "loss": 0.0335, "reward": -0.16177499457262456, "reward_std": 0.41518206894397736, "rewards/cosine_scaled_reward": -0.08088749897433445, "rewards/format_reward": 0.0, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2664.6666870117188, "epoch": 0.4284490145672665, "grad_norm": 0.1738223433494568, "kl": 0.00909423828125, "learning_rate": 6.31233615362752e-07, "loss": 0.0581, "reward": -0.14353771694004536, "reward_std": 0.5664958357810974, "rewards/cosine_scaled_reward": -0.07176885847002268, "rewards/format_reward": 0.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2695.2916870117188, "epoch": 0.4301628106255356, "grad_norm": 0.16924519836902618, "kl": 0.009357452392578125, "learning_rate": 6.281416799501187e-07, "loss": -0.0019, "reward": 0.09459428116679192, "reward_std": 0.6146803349256516, "rewards/cosine_scaled_reward": 0.04729713872075081, "rewards/format_reward": 0.0, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2714.5416259765625, "epoch": 0.4318766066838046, "grad_norm": 0.19001764059066772, "kl": 0.0080413818359375, "learning_rate": 6.25045936022246e-07, "loss": -0.0049, "reward": -0.00815525185316801, "reward_std": 0.5676329433917999, "rewards/cosine_scaled_reward": -0.00407763384282589, "rewards/format_reward": 0.0, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2847.4166870117188, "epoch": 0.43359040274207367, "grad_norm": 0.1463892161846161, "kl": 0.01003265380859375, "learning_rate": 6.219465344613258e-07, "loss": -0.0337, "reward": 0.009062569588422775, "reward_std": 0.5907448679208755, "rewards/cosine_scaled_reward": 0.0045312922447919846, "rewards/format_reward": 0.0, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 2974.9166870117188, "epoch": 0.43530419880034277, "grad_norm": 0.15965710580348969, "kl": 0.009674072265625, "learning_rate": 6.188436263278172e-07, "loss": 0.0032, "reward": 0.09970302879810333, "reward_std": 0.4728682413697243, "rewards/cosine_scaled_reward": 0.04985151067376137, "rewards/format_reward": 0.0, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2707.4027709960938, "epoch": 0.4370179948586118, "grad_norm": 0.1550796627998352, "kl": 0.01021575927734375, "learning_rate": 6.157373628530852e-07, "loss": 0.0007, "reward": -0.08888162672519684, "reward_std": 0.4977044016122818, "rewards/cosine_scaled_reward": -0.04444081336259842, "rewards/format_reward": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 2744.763916015625, "epoch": 0.4387317909168809, "grad_norm": 0.21653364598751068, "kl": 0.009979248046875, "learning_rate": 6.126278954320294e-07, "loss": 0.0062, "reward": -0.24449253268539906, "reward_std": 0.4354872331023216, "rewards/cosine_scaled_reward": -0.12224626448005438, "rewards/format_reward": 0.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 2490.9722290039062, "epoch": 0.44044558697514996, "grad_norm": 0.1892397254705429, "kl": 0.00962066650390625, "learning_rate": 6.095153756157051e-07, "loss": 0.0269, "reward": 0.1365387246478349, "reward_std": 0.6730539947748184, "rewards/cosine_scaled_reward": 0.06826936185825616, "rewards/format_reward": 0.0, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 2796.7916870117188, "epoch": 0.442159383033419, "grad_norm": 0.16943146288394928, "kl": 0.0094757080078125, "learning_rate": 6.06399955103937e-07, "loss": -0.0094, "reward": -0.27603928185999393, "reward_std": 0.517802283167839, "rewards/cosine_scaled_reward": -0.13801964186131954, "rewards/format_reward": 0.0, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2211.4305419921875, "epoch": 0.4438731790916881, "grad_norm": 0.23119370639324188, "kl": 0.0067596435546875, "learning_rate": 6.032817857379256e-07, "loss": 0.1106, "reward": -0.10240336135029793, "reward_std": 0.5084675773978233, "rewards/cosine_scaled_reward": -0.05120168812572956, "rewards/format_reward": 0.0, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 2785.3055419921875, "epoch": 0.44558697514995715, "grad_norm": 0.21458233892917633, "kl": 0.00876617431640625, "learning_rate": 6.001610194928464e-07, "loss": -0.0013, "reward": 0.051987094804644585, "reward_std": 0.5341488644480705, "rewards/cosine_scaled_reward": 0.02599355112761259, "rewards/format_reward": 0.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 2539.2083740234375, "epoch": 0.4473007712082262, "grad_norm": 0.1954081803560257, "kl": 0.01007843017578125, "learning_rate": 5.97037808470444e-07, "loss": 0.0316, "reward": 0.026572998613119125, "reward_std": 0.42085136845707893, "rewards/cosine_scaled_reward": 0.013286499306559563, "rewards/format_reward": 0.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 2897.3472900390625, "epoch": 0.4490145672664953, "grad_norm": 0.1940917670726776, "kl": 0.01140594482421875, "learning_rate": 5.939123048916173e-07, "loss": 0.0318, "reward": -0.05599740147590637, "reward_std": 0.6964142769575119, "rewards/cosine_scaled_reward": -0.027998706325888634, "rewards/format_reward": 0.0, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 2977.875, "epoch": 0.45072836332476435, "grad_norm": 0.2107793092727661, "kl": 0.011260986328125, "learning_rate": 5.907846610890011e-07, "loss": 0.0458, "reward": -0.443071685731411, "reward_std": 0.4884059280157089, "rewards/cosine_scaled_reward": -0.22153585404157639, "rewards/format_reward": 0.0, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 2835.9166564941406, "epoch": 0.4524421593830334, "grad_norm": 0.1562654972076416, "kl": 0.0097503662109375, "learning_rate": 5.87655029499542e-07, "loss": 0.0527, "reward": -0.31120575219392776, "reward_std": 0.42043986171483994, "rewards/cosine_scaled_reward": -0.15560288727283478, "rewards/format_reward": 0.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 3025.416748046875, "epoch": 0.4541559554413025, "grad_norm": 0.35641464591026306, "kl": 0.008880615234375, "learning_rate": 5.845235626570683e-07, "loss": 0.0023, "reward": 0.14537757262587547, "reward_std": 0.4222983121871948, "rewards/cosine_scaled_reward": 0.07268879748880863, "rewards/format_reward": 0.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 2631.9445190429688, "epoch": 0.45586975149957154, "grad_norm": 0.1993650197982788, "kl": 0.006988525390625, "learning_rate": 5.813904131848564e-07, "loss": 0.02, "reward": -0.21425554435700178, "reward_std": 0.6343535855412483, "rewards/cosine_scaled_reward": -0.10712776239961386, "rewards/format_reward": 0.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 2562.3611450195312, "epoch": 0.45758354755784064, "grad_norm": 0.22106732428073883, "kl": 0.01442718505859375, "learning_rate": 5.78255733788191e-07, "loss": 0.0396, "reward": -0.292802631855011, "reward_std": 0.3813341185450554, "rewards/cosine_scaled_reward": -0.1464013159275055, "rewards/format_reward": 0.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 2572.2083740234375, "epoch": 0.4592973436161097, "grad_norm": 0.1711571365594864, "kl": 0.00759124755859375, "learning_rate": 5.751196772469237e-07, "loss": 0.0197, "reward": -0.2553995121270418, "reward_std": 0.5235799252986908, "rewards/cosine_scaled_reward": -0.1276997560635209, "rewards/format_reward": 0.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 3053.2638549804688, "epoch": 0.46101113967437873, "grad_norm": 0.1386088728904724, "kl": 0.00843048095703125, "learning_rate": 5.71982396408026e-07, "loss": 0.0184, "reward": -0.17865224927663803, "reward_std": 0.5562375336885452, "rewards/cosine_scaled_reward": -0.08932612743228674, "rewards/format_reward": 0.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2723.638916015625, "epoch": 0.46272493573264784, "grad_norm": 2.8520348072052, "kl": 0.05461883544921875, "learning_rate": 5.688440441781398e-07, "loss": -0.0068, "reward": 0.27612179331481457, "reward_std": 0.7261447310447693, "rewards/cosine_scaled_reward": 0.13806088734418154, "rewards/format_reward": 0.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 2795.666748046875, "epoch": 0.4644387317909169, "grad_norm": 0.1723846048116684, "kl": 0.01483154296875, "learning_rate": 5.657047735161255e-07, "loss": 0.0373, "reward": -0.03490264154970646, "reward_std": 0.6204687505960464, "rewards/cosine_scaled_reward": -0.017451307736337185, "rewards/format_reward": 0.0, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2526.1944580078125, "epoch": 0.4661525278491859, "grad_norm": 0.2960320711135864, "kl": 0.0132598876953125, "learning_rate": 5.625647374256061e-07, "loss": 0.0815, "reward": 0.11341174505650997, "reward_std": 0.5083474740386009, "rewards/cosine_scaled_reward": 0.05670587276108563, "rewards/format_reward": 0.0, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 2640.1944580078125, "epoch": 0.46786632390745503, "grad_norm": 0.17620131373405457, "kl": 0.009765625, "learning_rate": 5.594240889475106e-07, "loss": 0.0112, "reward": 0.11540575325489044, "reward_std": 0.5552510917186737, "rewards/cosine_scaled_reward": 0.05770287476480007, "rewards/format_reward": 0.0, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 3208.8333129882812, "epoch": 0.4695801199657241, "grad_norm": 0.15742135047912598, "kl": 0.01263427734375, "learning_rate": 5.562829811526154e-07, "loss": -0.0201, "reward": -0.4686981365084648, "reward_std": 0.3511890172958374, "rewards/cosine_scaled_reward": -0.2343490682542324, "rewards/format_reward": 0.0, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 2885.9306030273438, "epoch": 0.4712939160239931, "grad_norm": 0.18644562363624573, "kl": 0.01094818115234375, "learning_rate": 5.531415671340826e-07, "loss": 0.0224, "reward": -0.2238161340355873, "reward_std": 0.5779955387115479, "rewards/cosine_scaled_reward": -0.11190806701779366, "rewards/format_reward": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 2641.7222290039062, "epoch": 0.4730077120822622, "grad_norm": 0.2060326635837555, "kl": 0.009002685546875, "learning_rate": 5.5e-07, "loss": 0.0921, "reward": -0.10621737875044346, "reward_std": 0.572068989276886, "rewards/cosine_scaled_reward": -0.053108690306544304, "rewards/format_reward": 0.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 3040.3193969726562, "epoch": 0.47472150814053127, "grad_norm": 0.15230870246887207, "kl": 0.009868621826171875, "learning_rate": 5.468584328659172e-07, "loss": 0.0243, "reward": -0.27920062592602335, "reward_std": 0.4912775382399559, "rewards/cosine_scaled_reward": -0.13960031296301167, "rewards/format_reward": 0.0, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2733.8472900390625, "epoch": 0.47643530419880037, "grad_norm": 0.16092219948768616, "kl": 0.0078125, "learning_rate": 5.437170188473847e-07, "loss": 0.0214, "reward": 0.1801936998963356, "reward_std": 0.7019116431474686, "rewards/cosine_scaled_reward": 0.09009685181081295, "rewards/format_reward": 0.0, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 3269.8194580078125, "epoch": 0.4781491002570694, "grad_norm": 0.13919058442115784, "kl": 0.01202392578125, "learning_rate": 5.405759110524894e-07, "loss": 0.0359, "reward": -0.2203904101625085, "reward_std": 0.5241215899586678, "rewards/cosine_scaled_reward": -0.11019521998241544, "rewards/format_reward": 0.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2767.611083984375, "epoch": 0.47986289631533846, "grad_norm": 0.17986002564430237, "kl": 0.01087188720703125, "learning_rate": 5.37435262574394e-07, "loss": 0.0177, "reward": 0.20984390750527382, "reward_std": 0.6492117866873741, "rewards/cosine_scaled_reward": 0.10492195282131433, "rewards/format_reward": 0.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 2349.902801513672, "epoch": 0.48157669237360756, "grad_norm": 0.20755039155483246, "kl": 0.0091094970703125, "learning_rate": 5.342952264838747e-07, "loss": 0.0718, "reward": 0.09011890506371856, "reward_std": 0.755554661154747, "rewards/cosine_scaled_reward": 0.04505945247365162, "rewards/format_reward": 0.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 2690.3333740234375, "epoch": 0.4832904884318766, "grad_norm": 0.16312456130981445, "kl": 0.0081787109375, "learning_rate": 5.311559558218603e-07, "loss": -0.0225, "reward": -0.1038619177415967, "reward_std": 0.6092793643474579, "rewards/cosine_scaled_reward": -0.05193095514550805, "rewards/format_reward": 0.0, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2992.4443969726562, "epoch": 0.48500428449014565, "grad_norm": 0.14668744802474976, "kl": 0.009429931640625, "learning_rate": 5.28017603591974e-07, "loss": 0.0094, "reward": -0.18053901614621282, "reward_std": 0.5393766239285469, "rewards/cosine_scaled_reward": -0.09026950527913868, "rewards/format_reward": 0.0, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2731.1666870117188, "epoch": 0.48671808054841476, "grad_norm": 0.18690791726112366, "kl": 0.01348876953125, "learning_rate": 5.248803227530763e-07, "loss": 0.0274, "reward": 0.05301067978143692, "reward_std": 0.8040451109409332, "rewards/cosine_scaled_reward": 0.026505338959395885, "rewards/format_reward": 0.0, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 3109.9722290039062, "epoch": 0.4884318766066838, "grad_norm": 0.13096371293067932, "kl": 0.0122222900390625, "learning_rate": 5.21744266211809e-07, "loss": -0.0039, "reward": -0.10303456708788872, "reward_std": 0.6089868098497391, "rewards/cosine_scaled_reward": -0.05151727236807346, "rewards/format_reward": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 2716.9166259765625, "epoch": 0.49014567266495285, "grad_norm": 0.21381936967372894, "kl": 0.00853729248046875, "learning_rate": 5.186095868151436e-07, "loss": -0.0087, "reward": -0.08554558828473091, "reward_std": 0.6172359138727188, "rewards/cosine_scaled_reward": -0.042772796005010605, "rewards/format_reward": 0.0, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 3128.3195190429688, "epoch": 0.49185946872322195, "grad_norm": 0.20883357524871826, "kl": 0.0149688720703125, "learning_rate": 5.154764373429315e-07, "loss": 0.0885, "reward": -0.16452566534280777, "reward_std": 0.6313002184033394, "rewards/cosine_scaled_reward": -0.08226283825933933, "rewards/format_reward": 0.0, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 2783.2777709960938, "epoch": 0.493573264781491, "grad_norm": 0.18023322522640228, "kl": 0.016357421875, "learning_rate": 5.123449705004581e-07, "loss": 0.0778, "reward": -0.29250151151791215, "reward_std": 0.5800458639860153, "rewards/cosine_scaled_reward": -0.14625075762160122, "rewards/format_reward": 0.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 2974.1944274902344, "epoch": 0.4952870608397601, "grad_norm": 0.226176917552948, "kl": 0.01458740234375, "learning_rate": 5.09215338910999e-07, "loss": 0.0448, "reward": -0.33544909581542015, "reward_std": 0.5062796398997307, "rewards/cosine_scaled_reward": -0.16772454418241978, "rewards/format_reward": 0.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 2436.1944274902344, "epoch": 0.49700085689802914, "grad_norm": 0.1747155487537384, "kl": 0.012481689453125, "learning_rate": 5.060876951083828e-07, "loss": -0.0449, "reward": -0.14955687522888184, "reward_std": 0.5533142015337944, "rewards/cosine_scaled_reward": -0.07477843947708607, "rewards/format_reward": 0.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 3039.3611450195312, "epoch": 0.4987146529562982, "grad_norm": 0.1754036843776703, "kl": 0.01313018798828125, "learning_rate": 5.02962191529556e-07, "loss": 0.0491, "reward": -0.44222037494182587, "reward_std": 0.49202967807650566, "rewards/cosine_scaled_reward": -0.22111019119620323, "rewards/format_reward": 0.0, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 2386.4722595214844, "epoch": 0.5004284490145673, "grad_norm": 0.23209446668624878, "kl": 0.0133209228515625, "learning_rate": 4.998389805071536e-07, "loss": 0.1035, "reward": 0.11830113036558032, "reward_std": 0.7409112825989723, "rewards/cosine_scaled_reward": 0.059150564251467586, "rewards/format_reward": 0.0, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 2481.1666870117188, "epoch": 0.5021422450728363, "grad_norm": 0.17551322281360626, "kl": 0.0098724365234375, "learning_rate": 4.967182142620745e-07, "loss": 0.0432, "reward": -0.1329963468015194, "reward_std": 0.5577030703425407, "rewards/cosine_scaled_reward": -0.06649817898869514, "rewards/format_reward": 0.0, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 2711.5000610351562, "epoch": 0.5038560411311054, "grad_norm": 0.18221919238567352, "kl": 0.01308441162109375, "learning_rate": 4.93600044896063e-07, "loss": 0.0587, "reward": -0.21110662072896957, "reward_std": 0.5812349170446396, "rewards/cosine_scaled_reward": -0.10555331036448479, "rewards/format_reward": 0.0, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 2619.5972290039062, "epoch": 0.5055698371893744, "grad_norm": 0.18888363242149353, "kl": 0.0113067626953125, "learning_rate": 4.904846243842949e-07, "loss": -0.0068, "reward": 0.10603267699480057, "reward_std": 0.6550966873764992, "rewards/cosine_scaled_reward": 0.053016334772109985, "rewards/format_reward": 0.0, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2851.4583129882812, "epoch": 0.5072836332476436, "grad_norm": 0.15981672704219818, "kl": 0.0122833251953125, "learning_rate": 4.873721045679706e-07, "loss": 0.0399, "reward": 0.07413195073604584, "reward_std": 0.6663401573896408, "rewards/cosine_scaled_reward": 0.03706597909331322, "rewards/format_reward": 0.0, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 2682.5833435058594, "epoch": 0.5089974293059126, "grad_norm": 0.18823187053203583, "kl": 0.0098419189453125, "learning_rate": 4.842626371469149e-07, "loss": 0.0705, "reward": -0.035793907940387726, "reward_std": 0.5416731983423233, "rewards/cosine_scaled_reward": -0.017896955832839012, "rewards/format_reward": 0.0, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 2439.0556030273438, "epoch": 0.5107112253641817, "grad_norm": 0.17318564653396606, "kl": 0.012298583984375, "learning_rate": 4.811563736721829e-07, "loss": 0.0134, "reward": 0.026430480182170868, "reward_std": 0.5753844156861305, "rewards/cosine_scaled_reward": 0.013215240091085434, "rewards/format_reward": 0.0, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 2576.9027709960938, "epoch": 0.5124250214224507, "grad_norm": 0.21229924261569977, "kl": 0.0172576904296875, "learning_rate": 4.780534655386743e-07, "loss": 0.0552, "reward": 0.3652267027646303, "reward_std": 0.6922546178102493, "rewards/cosine_scaled_reward": 0.18261335138231516, "rewards/format_reward": 0.0, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 2756.9305419921875, "epoch": 0.5141388174807198, "grad_norm": 0.19316470623016357, "kl": 0.011383056640625, "learning_rate": 4.749540639777539e-07, "loss": 0.0299, "reward": 0.22619394585490227, "reward_std": 0.4907483011484146, "rewards/cosine_scaled_reward": 0.11309697106480598, "rewards/format_reward": 0.0, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 2826.9306640625, "epoch": 0.5158526135389888, "grad_norm": 0.233236625790596, "kl": 0.01507568359375, "learning_rate": 4.7185832004988133e-07, "loss": 0.0851, "reward": -0.13008400797843933, "reward_std": 0.7507277429103851, "rewards/cosine_scaled_reward": -0.06504200212657452, "rewards/format_reward": 0.0, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 3145.986083984375, "epoch": 0.517566409597258, "grad_norm": 0.1533445417881012, "kl": 0.0132293701171875, "learning_rate": 4.68766384637248e-07, "loss": 0.0074, "reward": -0.00015814602375030518, "reward_std": 0.7809525281190872, "rewards/cosine_scaled_reward": -7.90674239397049e-05, "rewards/format_reward": 0.0, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 3029.40283203125, "epoch": 0.519280205655527, "grad_norm": 0.1652766764163971, "kl": 0.0153656005859375, "learning_rate": 4.656784084364238e-07, "loss": 0.0139, "reward": -0.06143874488770962, "reward_std": 0.7485700696706772, "rewards/cosine_scaled_reward": -0.030719374306499958, "rewards/format_reward": 0.0, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 2729.52783203125, "epoch": 0.5209940017137961, "grad_norm": 0.18553942441940308, "kl": 0.01348876953125, "learning_rate": 4.6259454195101267e-07, "loss": -0.0083, "reward": -0.042102924548089504, "reward_std": 0.5168112218379974, "rewards/cosine_scaled_reward": -0.02105145249515772, "rewards/format_reward": 0.0, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 2928.9444580078125, "epoch": 0.5227077977720651, "grad_norm": 0.24608513712882996, "kl": 0.01422119140625, "learning_rate": 4.59514935484316e-07, "loss": 0.0859, "reward": -0.11776435747742653, "reward_std": 0.6116138771176338, "rewards/cosine_scaled_reward": -0.058882176876068115, "rewards/format_reward": 0.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 2568.7222290039062, "epoch": 0.5244215938303342, "grad_norm": 0.1760726422071457, "kl": 0.0126800537109375, "learning_rate": 4.5643973913200837e-07, "loss": 0.0106, "reward": -0.2896502474322915, "reward_std": 0.542039155960083, "rewards/cosine_scaled_reward": -0.1448251255787909, "rewards/format_reward": 0.0, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 2816.7916259765625, "epoch": 0.5261353898886033, "grad_norm": 0.20767201483249664, "kl": 0.015869140625, "learning_rate": 4.5336910277482155e-07, "loss": 0.0605, "reward": -0.07595526240766048, "reward_std": 0.7446087747812271, "rewards/cosine_scaled_reward": -0.03797762934118509, "rewards/format_reward": 0.0, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2619.0, "epoch": 0.5278491859468724, "grad_norm": 0.18840792775154114, "kl": 0.01406097412109375, "learning_rate": 4.503031760712397e-07, "loss": 0.0317, "reward": -0.13000392355024815, "reward_std": 0.5407935008406639, "rewards/cosine_scaled_reward": -0.0650019682943821, "rewards/format_reward": 0.0, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 2706.4583740234375, "epoch": 0.5295629820051414, "grad_norm": 0.20385313034057617, "kl": 0.017333984375, "learning_rate": 4.4724210845020494e-07, "loss": 0.0402, "reward": 0.09998160088434815, "reward_std": 0.6437982618808746, "rewards/cosine_scaled_reward": 0.049990794621407986, "rewards/format_reward": 0.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 3076.0694580078125, "epoch": 0.5312767780634104, "grad_norm": 0.15351690351963043, "kl": 0.0145416259765625, "learning_rate": 4.441860491038345e-07, "loss": 0.0112, "reward": -0.1288044311950216, "reward_std": 0.5119795873761177, "rewards/cosine_scaled_reward": -0.0644022131091333, "rewards/format_reward": 0.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 3078.3333129882812, "epoch": 0.5329905741216795, "grad_norm": 0.21041399240493774, "kl": 0.0153045654296875, "learning_rate": 4.4113514698014953e-07, "loss": 0.0569, "reward": -0.24592324905097485, "reward_std": 0.5915715545415878, "rewards/cosine_scaled_reward": -0.12296162731945515, "rewards/format_reward": 0.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 2794.888916015625, "epoch": 0.5347043701799485, "grad_norm": 0.3006104528903961, "kl": 0.0116729736328125, "learning_rate": 4.3808955077581546e-07, "loss": 0.1717, "reward": 0.2339099831879139, "reward_std": 0.6782252490520477, "rewards/cosine_scaled_reward": 0.1169549860060215, "rewards/format_reward": 0.0, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2454.65283203125, "epoch": 0.5364181662382177, "grad_norm": 0.213435098528862, "kl": 0.0183868408203125, "learning_rate": 4.350494089288943e-07, "loss": -0.0051, "reward": -0.29112886637449265, "reward_std": 0.48665956407785416, "rewards/cosine_scaled_reward": -0.14556444063782692, "rewards/format_reward": 0.0, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2845.3194580078125, "epoch": 0.5381319622964867, "grad_norm": 0.23916800320148468, "kl": 0.0161285400390625, "learning_rate": 4.3201486961161093e-07, "loss": 0.0824, "reward": -0.16251583769917488, "reward_std": 0.4937269687652588, "rewards/cosine_scaled_reward": -0.08125792350620031, "rewards/format_reward": 0.0, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 2697.9583740234375, "epoch": 0.5398457583547558, "grad_norm": 0.19882318377494812, "kl": 0.018157958984375, "learning_rate": 4.2898608072313045e-07, "loss": 0.0178, "reward": -0.25365344155579805, "reward_std": 0.5236896127462387, "rewards/cosine_scaled_reward": -0.12682672249502502, "rewards/format_reward": 0.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 2544.9861450195312, "epoch": 0.5415595544130248, "grad_norm": 0.20584873855113983, "kl": 0.014862060546875, "learning_rate": 4.2596318988235037e-07, "loss": 0.0389, "reward": -0.09484067000448704, "reward_std": 0.6149067878723145, "rewards/cosine_scaled_reward": -0.04742033500224352, "rewards/format_reward": 0.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2950.8333740234375, "epoch": 0.5432733504712939, "grad_norm": 0.15868861973285675, "kl": 0.018310546875, "learning_rate": 4.2294634442070553e-07, "loss": 0.0378, "reward": -0.39894504845142365, "reward_std": 0.4898769110441208, "rewards/cosine_scaled_reward": -0.19947252236306667, "rewards/format_reward": 0.0, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 2152.3195190429688, "epoch": 0.5449871465295629, "grad_norm": 0.1994917094707489, "kl": 0.0172882080078125, "learning_rate": 4.1993569137498776e-07, "loss": -0.0091, "reward": 0.24264823482371867, "reward_std": 0.6610805988311768, "rewards/cosine_scaled_reward": 0.12132412963546813, "rewards/format_reward": 0.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2402.5556030273438, "epoch": 0.5467009425878321, "grad_norm": 0.2102198302745819, "kl": 0.01351165771484375, "learning_rate": 4.1693137748017915e-07, "loss": -0.0681, "reward": 0.05987721309065819, "reward_std": 0.5766515731811523, "rewards/cosine_scaled_reward": 0.029938601423054934, "rewards/format_reward": 0.0, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2677.4027709960938, "epoch": 0.5484147386461011, "grad_norm": 0.2358679324388504, "kl": 0.01690673828125, "learning_rate": 4.1393354916230005e-07, "loss": 0.0956, "reward": -0.05587568995542824, "reward_std": 0.6320854872465134, "rewards/cosine_scaled_reward": -0.02793784497771412, "rewards/format_reward": 0.0, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 3042.4722900390625, "epoch": 0.5501285347043702, "grad_norm": 0.18476322293281555, "kl": 0.017547607421875, "learning_rate": 4.1094235253127374e-07, "loss": 0.0512, "reward": -0.2119649334345013, "reward_std": 0.585174448788166, "rewards/cosine_scaled_reward": -0.1059824712574482, "rewards/format_reward": 0.0, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 2080.375030517578, "epoch": 0.5518423307626392, "grad_norm": 0.18924832344055176, "kl": 0.0111083984375, "learning_rate": 4.079579333738039e-07, "loss": 0.0098, "reward": 0.3428979776799679, "reward_std": 0.7396816238760948, "rewards/cosine_scaled_reward": 0.1714489795267582, "rewards/format_reward": 0.0, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 2770.7916870117188, "epoch": 0.5535561268209083, "grad_norm": 0.17449912428855896, "kl": 0.0141143798828125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0149, "reward": -0.15011528879404068, "reward_std": 0.5199657753109932, "rewards/cosine_scaled_reward": -0.07505764067173004, "rewards/format_reward": 0.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 2524.4305725097656, "epoch": 0.5552699228791774, "grad_norm": 0.25161027908325195, "kl": 0.01303863525390625, "learning_rate": 4.020100089676376e-07, "loss": 0.1119, "reward": 0.2225971333682537, "reward_std": 0.7053848057985306, "rewards/cosine_scaled_reward": 0.11129856202751398, "rewards/format_reward": 0.0, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2823.5694580078125, "epoch": 0.5569837189374465, "grad_norm": 0.17407697439193726, "kl": 0.016265869140625, "learning_rate": 3.9904679361238526e-07, "loss": -0.0328, "reward": -0.11739783291704953, "reward_std": 0.6684166565537453, "rewards/cosine_scaled_reward": -0.058698914712294936, "rewards/format_reward": 0.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2481.6944580078125, "epoch": 0.5586975149957155, "grad_norm": 0.16408374905586243, "kl": 0.01628875732421875, "learning_rate": 3.9609093550344907e-07, "loss": 0.0145, "reward": 0.05000840872526169, "reward_std": 0.4738306663930416, "rewards/cosine_scaled_reward": 0.025004200637340546, "rewards/format_reward": 0.0, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2850.8611450195312, "epoch": 0.5604113110539846, "grad_norm": 0.1830449402332306, "kl": 0.0183563232421875, "learning_rate": 3.931425787051832e-07, "loss": 0.054, "reward": -0.26191626861691475, "reward_std": 0.4200581759214401, "rewards/cosine_scaled_reward": -0.1309581445530057, "rewards/format_reward": 0.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2681.875030517578, "epoch": 0.5621251071122536, "grad_norm": 0.3444949984550476, "kl": 0.031097412109375, "learning_rate": 3.902018669163384e-07, "loss": 0.0002, "reward": 0.058326710015535355, "reward_std": 0.5914809927344322, "rewards/cosine_scaled_reward": 0.029163353145122528, "rewards/format_reward": 0.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 2444.1805419921875, "epoch": 0.5638389031705227, "grad_norm": 0.20234812796115875, "kl": 0.0186004638671875, "learning_rate": 3.872689434630585e-07, "loss": 0.0297, "reward": 0.015948079526424408, "reward_std": 0.5476803705096245, "rewards/cosine_scaled_reward": 0.00797403953038156, "rewards/format_reward": 0.0, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2751.6666870117188, "epoch": 0.5655526992287918, "grad_norm": 0.20875848829746246, "kl": 0.0170745849609375, "learning_rate": 3.843439512918949e-07, "loss": 0.0404, "reward": -0.1900151213631034, "reward_std": 0.552287369966507, "rewards/cosine_scaled_reward": -0.09500756207853556, "rewards/format_reward": 0.0, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 2700.7222900390625, "epoch": 0.5672664952870609, "grad_norm": 0.17264467477798462, "kl": 0.0172119140625, "learning_rate": 3.8142703296283953e-07, "loss": 0.0526, "reward": 0.03160311561077833, "reward_std": 0.5627969726920128, "rewards/cosine_scaled_reward": 0.015801557805389166, "rewards/format_reward": 0.0, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 2822.8472900390625, "epoch": 0.5689802913453299, "grad_norm": 0.27976417541503906, "kl": 0.023895263671875, "learning_rate": 3.785183306423767e-07, "loss": 0.0355, "reward": 0.02845914661884308, "reward_std": 0.5001804158091545, "rewards/cosine_scaled_reward": 0.014229563996195793, "rewards/format_reward": 0.0, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2853.3333740234375, "epoch": 0.570694087403599, "grad_norm": 0.1514306664466858, "kl": 0.017669677734375, "learning_rate": 3.7561798609655373e-07, "loss": -0.0082, "reward": -0.13629086455330253, "reward_std": 0.4956332743167877, "rewards/cosine_scaled_reward": -0.06814542971551418, "rewards/format_reward": 0.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 3072.9166870117188, "epoch": 0.572407883461868, "grad_norm": 0.14293867349624634, "kl": 0.0254974365234375, "learning_rate": 3.72726140684072e-07, "loss": 0.0174, "reward": 0.02665301039814949, "reward_std": 0.6765051260590553, "rewards/cosine_scaled_reward": 0.013326505199074745, "rewards/format_reward": 0.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 2824.861114501953, "epoch": 0.5741216795201372, "grad_norm": 0.19958122074604034, "kl": 0.0171356201171875, "learning_rate": 3.6984293534939737e-07, "loss": 0.0929, "reward": -0.056068588979542255, "reward_std": 0.8257120847702026, "rewards/cosine_scaled_reward": -0.028034291230142117, "rewards/format_reward": 0.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 2580.0, "epoch": 0.5758354755784062, "grad_norm": 0.229178324341774, "kl": 0.019195556640625, "learning_rate": 3.6696851061588994e-07, "loss": 0.006, "reward": -0.291859433054924, "reward_std": 0.4463714547455311, "rewards/cosine_scaled_reward": -0.1459297128021717, "rewards/format_reward": 0.0, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2450.138916015625, "epoch": 0.5775492716366752, "grad_norm": 0.27258360385894775, "kl": 0.0173187255859375, "learning_rate": 3.641030065789562e-07, "loss": 0.0321, "reward": 0.07944206055253744, "reward_std": 0.6395395249128342, "rewards/cosine_scaled_reward": 0.03972102585248649, "rewards/format_reward": 0.0, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2720.1805725097656, "epoch": 0.5792630676949443, "grad_norm": 0.20289485156536102, "kl": 0.019683837890625, "learning_rate": 3.612465628992203e-07, "loss": 0.069, "reward": 0.48021042346954346, "reward_std": 0.7420852333307266, "rewards/cosine_scaled_reward": 0.24010521546006203, "rewards/format_reward": 0.0, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 3105.0833740234375, "epoch": 0.5809768637532133, "grad_norm": 0.18909570574760437, "kl": 0.022308349609375, "learning_rate": 3.5839931879571725e-07, "loss": 0.0378, "reward": -0.22961215861141682, "reward_std": 0.5897372663021088, "rewards/cosine_scaled_reward": -0.11480608023703098, "rewards/format_reward": 0.0, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2911.2083129882812, "epoch": 0.5826906598114824, "grad_norm": 0.20473147928714752, "kl": 0.022918701171875, "learning_rate": 3.555614130391079e-07, "loss": -0.0498, "reward": -0.1040644682943821, "reward_std": 0.57014200091362, "rewards/cosine_scaled_reward": -0.052032231353223324, "rewards/format_reward": 0.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 2422.777801513672, "epoch": 0.5844044558697515, "grad_norm": 0.1749623566865921, "kl": 0.019775390625, "learning_rate": 3.5273298394491515e-07, "loss": 0.0118, "reward": -0.429408997297287, "reward_std": 0.39798443764448166, "rewards/cosine_scaled_reward": -0.2147044911980629, "rewards/format_reward": 0.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 2946.736083984375, "epoch": 0.5861182519280206, "grad_norm": 0.20565366744995117, "kl": 0.01824951171875, "learning_rate": 3.4991416936678276e-07, "loss": 0.0572, "reward": -0.09714518021792173, "reward_std": 0.6395711675286293, "rewards/cosine_scaled_reward": -0.048572588711977005, "rewards/format_reward": 0.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 2240.361114501953, "epoch": 0.5878320479862896, "grad_norm": 0.19630080461502075, "kl": 0.013671875, "learning_rate": 3.471051066897562e-07, "loss": 0.0286, "reward": 0.09563972940668464, "reward_std": 0.5933751873672009, "rewards/cosine_scaled_reward": 0.047819861210882664, "rewards/format_reward": 0.0, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2915.2916870117188, "epoch": 0.5895458440445587, "grad_norm": 0.20998388528823853, "kl": 0.0269317626953125, "learning_rate": 3.4430593282358777e-07, "loss": -0.0348, "reward": -0.3282645223662257, "reward_std": 0.49101946130394936, "rewards/cosine_scaled_reward": -0.16413226234726608, "rewards/format_reward": 0.0, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 2668.0277709960938, "epoch": 0.5912596401028277, "grad_norm": 0.25542527437210083, "kl": 0.020050048828125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0754, "reward": 0.21342255361378193, "reward_std": 0.653385765850544, "rewards/cosine_scaled_reward": 0.10671127680689096, "rewards/format_reward": 0.0, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 2691.2638549804688, "epoch": 0.5929734361610969, "grad_norm": 0.21436557173728943, "kl": 0.0188446044921875, "learning_rate": 3.387377967463493e-07, "loss": 0.0297, "reward": -0.08409620448946953, "reward_std": 0.6964321285486221, "rewards/cosine_scaled_reward": -0.04204810503870249, "rewards/format_reward": 0.0, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2423.2083740234375, "epoch": 0.5946872322193659, "grad_norm": 0.2174253612756729, "kl": 0.0170745849609375, "learning_rate": 3.359691059183761e-07, "loss": -0.0145, "reward": 0.05711523536592722, "reward_std": 0.6910872906446457, "rewards/cosine_scaled_reward": 0.02855762024410069, "rewards/format_reward": 0.0, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 2334.3194580078125, "epoch": 0.596401028277635, "grad_norm": 0.20871306955814362, "kl": 0.019622802734375, "learning_rate": 3.3321084665422803e-07, "loss": 0.0377, "reward": -0.29262126237154007, "reward_std": 0.5664101913571358, "rewards/cosine_scaled_reward": -0.14631063491106033, "rewards/format_reward": 0.0, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 2684.4861450195312, "epoch": 0.598114824335904, "grad_norm": 0.2084410935640335, "kl": 0.0155181884765625, "learning_rate": 3.3046315338757026e-07, "loss": -0.0696, "reward": 0.2747867554426193, "reward_std": 0.6360199972987175, "rewards/cosine_scaled_reward": 0.13739337399601936, "rewards/format_reward": 0.0, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 2624.9444580078125, "epoch": 0.5998286203941731, "grad_norm": 0.27559694647789, "kl": 0.01519775390625, "learning_rate": 3.2772616003709616e-07, "loss": 0.0439, "reward": 0.16777711734175682, "reward_std": 0.6573140621185303, "rewards/cosine_scaled_reward": 0.08388857543468475, "rewards/format_reward": 0.0, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 2644.0416564941406, "epoch": 0.6015424164524421, "grad_norm": 0.21829356253147125, "kl": 0.020263671875, "learning_rate": 3.250000000000001e-07, "loss": 0.019, "reward": 0.04395672678947449, "reward_std": 0.5275484099984169, "rewards/cosine_scaled_reward": 0.02197836432605982, "rewards/format_reward": 0.0, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2948.3056030273438, "epoch": 0.6032562125107113, "grad_norm": 0.15744946897029877, "kl": 0.0189056396484375, "learning_rate": 3.222848061454764e-07, "loss": -0.0085, "reward": -0.41702286154031754, "reward_std": 0.5593557730317116, "rewards/cosine_scaled_reward": -0.20851144194602966, "rewards/format_reward": 0.0, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 2635.7222595214844, "epoch": 0.6049700085689803, "grad_norm": 0.22034288942813873, "kl": 0.021209716796875, "learning_rate": 3.195807108082429e-07, "loss": -0.0335, "reward": -0.30768171697854996, "reward_std": 0.5821868106722832, "rewards/cosine_scaled_reward": -0.15384084545075893, "rewards/format_reward": 0.0, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 2137.3055725097656, "epoch": 0.6066838046272494, "grad_norm": 0.276947557926178, "kl": 0.015472412109375, "learning_rate": 3.168878457820915e-07, "loss": 0.0844, "reward": 0.3251216746866703, "reward_std": 0.716858297586441, "rewards/cosine_scaled_reward": 0.16256084106862545, "rewards/format_reward": 0.0, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2492.5555725097656, "epoch": 0.6083976006855184, "grad_norm": 0.2037208080291748, "kl": 0.0183258056640625, "learning_rate": 3.142063423134644e-07, "loss": -0.0014, "reward": -0.21882931515574455, "reward_std": 0.47944844514131546, "rewards/cosine_scaled_reward": -0.10941465757787228, "rewards/format_reward": 0.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 2614.3472290039062, "epoch": 0.6101113967437874, "grad_norm": 0.19817198812961578, "kl": 0.0220947265625, "learning_rate": 3.115363310950578e-07, "loss": 0.0141, "reward": -0.4298449754714966, "reward_std": 0.520567923784256, "rewards/cosine_scaled_reward": -0.2149224765598774, "rewards/format_reward": 0.0, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 2584.777801513672, "epoch": 0.6118251928020566, "grad_norm": 0.18728262186050415, "kl": 0.021331787109375, "learning_rate": 3.0887794225945143e-07, "loss": 0.04, "reward": 0.04458676278591156, "reward_std": 0.49945997446775436, "rewards/cosine_scaled_reward": 0.02229338139295578, "rewards/format_reward": 0.0, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 2934.513916015625, "epoch": 0.6135389888603257, "grad_norm": 0.17515863478183746, "kl": 0.017791748046875, "learning_rate": 3.062313053727671e-07, "loss": -0.0046, "reward": -0.0155550935305655, "reward_std": 0.607760101556778, "rewards/cosine_scaled_reward": -0.007777547696605325, "rewards/format_reward": 0.0, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 2598.2638549804688, "epoch": 0.6152527849185947, "grad_norm": 0.20000198483467102, "kl": 0.0205535888671875, "learning_rate": 3.0359654942835247e-07, "loss": -0.008, "reward": -0.21508236415684223, "reward_std": 0.4807446375489235, "rewards/cosine_scaled_reward": -0.10754118673503399, "rewards/format_reward": 0.0, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 2585.3333435058594, "epoch": 0.6169665809768637, "grad_norm": 0.1761714369058609, "kl": 0.01947021484375, "learning_rate": 3.0097380284049523e-07, "loss": 0.0011, "reward": -0.027444179635494947, "reward_std": 0.6417821869254112, "rewards/cosine_scaled_reward": -0.013722071889787912, "rewards/format_reward": 0.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 2367.3611450195312, "epoch": 0.6186803770351328, "grad_norm": 0.1938982903957367, "kl": 0.01788330078125, "learning_rate": 2.9836319343816397e-07, "loss": -0.023, "reward": 0.0992561224848032, "reward_std": 0.7357365190982819, "rewards/cosine_scaled_reward": 0.04962805658578873, "rewards/format_reward": 0.0, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 3139.541748046875, "epoch": 0.6203941730934018, "grad_norm": 0.17356501519680023, "kl": 0.024200439453125, "learning_rate": 2.9576484845877793e-07, "loss": -0.0258, "reward": -0.128750279545784, "reward_std": 0.5727476924657822, "rewards/cosine_scaled_reward": -0.06437514536082745, "rewards/format_reward": 0.0, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 2882.4306030273438, "epoch": 0.622107969151671, "grad_norm": 0.19220975041389465, "kl": 0.0243377685546875, "learning_rate": 2.931788945420058e-07, "loss": -0.0247, "reward": -0.019596407189965248, "reward_std": 0.6233709305524826, "rewards/cosine_scaled_reward": -0.009798200335353613, "rewards/format_reward": 0.0, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 2840.0555419921875, "epoch": 0.62382176520994, "grad_norm": 0.237908735871315, "kl": 0.02325439453125, "learning_rate": 2.9060545772359305e-07, "loss": 0.0684, "reward": -0.17538912501186132, "reward_std": 0.7643003761768341, "rewards/cosine_scaled_reward": -0.08769455272704363, "rewards/format_reward": 0.0, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 2791.75, "epoch": 0.6255355612682091, "grad_norm": 0.1972544640302658, "kl": 0.022613525390625, "learning_rate": 2.8804466342921987e-07, "loss": -0.0356, "reward": -0.19943542033433914, "reward_std": 0.6234779357910156, "rewards/cosine_scaled_reward": -0.09971771761775017, "rewards/format_reward": 0.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 2936.7916564941406, "epoch": 0.6272493573264781, "grad_norm": 0.1693785935640335, "kl": 0.022491455078125, "learning_rate": 2.854966364683872e-07, "loss": 0.0289, "reward": -0.07167929410934448, "reward_std": 0.41813354194164276, "rewards/cosine_scaled_reward": -0.035839639604091644, "rewards/format_reward": 0.0, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2579.8056030273438, "epoch": 0.6289631533847472, "grad_norm": 0.18452903628349304, "kl": 0.02313232421875, "learning_rate": 2.829615010283344e-07, "loss": 0.0131, "reward": 0.13851050520315766, "reward_std": 0.6860260739922523, "rewards/cosine_scaled_reward": 0.06925524887628853, "rewards/format_reward": 0.0, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 3069.2361450195312, "epoch": 0.6306769494430163, "grad_norm": 0.207699254155159, "kl": 0.026519775390625, "learning_rate": 2.8043938066798645e-07, "loss": 0.0636, "reward": -0.25442312750965357, "reward_std": 0.5900055021047592, "rewards/cosine_scaled_reward": -0.12721156049519777, "rewards/format_reward": 0.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 2244.763916015625, "epoch": 0.6323907455012854, "grad_norm": 0.17845271527767181, "kl": 0.0156707763671875, "learning_rate": 2.7793039831193133e-07, "loss": 0.0488, "reward": 0.17914995457977057, "reward_std": 0.7317003160715103, "rewards/cosine_scaled_reward": 0.08957497263327241, "rewards/format_reward": 0.0, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 2134.1944274902344, "epoch": 0.6341045415595544, "grad_norm": 0.2277487814426422, "kl": 0.01385498046875, "learning_rate": 2.7543467624442956e-07, "loss": -0.0127, "reward": 0.11734075238928199, "reward_std": 0.5018965676426888, "rewards/cosine_scaled_reward": 0.05867037340067327, "rewards/format_reward": 0.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 2490.500030517578, "epoch": 0.6358183376178235, "grad_norm": 0.21075375378131866, "kl": 0.02069091796875, "learning_rate": 2.729523361034538e-07, "loss": 0.0493, "reward": -0.03656116779893637, "reward_std": 0.4987756237387657, "rewards/cosine_scaled_reward": -0.018280583899468184, "rewards/format_reward": 0.0, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 2664.625030517578, "epoch": 0.6375321336760925, "grad_norm": 0.22036224603652954, "kl": 0.01885986328125, "learning_rate": 2.7048349887476037e-07, "loss": 0.0736, "reward": -0.017365715699270368, "reward_std": 0.7068077325820923, "rewards/cosine_scaled_reward": -0.008682856685481966, "rewards/format_reward": 0.0, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 2659.3889770507812, "epoch": 0.6392459297343616, "grad_norm": 0.2022118866443634, "kl": 0.0198822021484375, "learning_rate": 2.6802828488599294e-07, "loss": 0.011, "reward": -0.049437786685302854, "reward_std": 0.5779630020260811, "rewards/cosine_scaled_reward": -0.024718896602280438, "rewards/format_reward": 0.0, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 2616.4306030273438, "epoch": 0.6409597257926307, "grad_norm": 0.1780145913362503, "kl": 0.0235595703125, "learning_rate": 2.655868138008171e-07, "loss": 0.0089, "reward": -0.017803641967475414, "reward_std": 0.6717728674411774, "rewards/cosine_scaled_reward": -0.00890181539580226, "rewards/format_reward": 0.0, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 2676.3472290039062, "epoch": 0.6426735218508998, "grad_norm": 0.15247489511966705, "kl": 0.026458740234375, "learning_rate": 2.631592046130896e-07, "loss": 0.0205, "reward": -0.31310519203543663, "reward_std": 0.5878890082240105, "rewards/cosine_scaled_reward": -0.15655260160565376, "rewards/format_reward": 0.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 2356.2083129882812, "epoch": 0.6443873179091688, "grad_norm": 0.2001314014196396, "kl": 0.0235595703125, "learning_rate": 2.6074557564105724e-07, "loss": 0.0174, "reward": -0.2070534396916628, "reward_std": 0.4216439947485924, "rewards/cosine_scaled_reward": -0.1035267198458314, "rewards/format_reward": 0.0, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 2577.5695190429688, "epoch": 0.6461011139674379, "grad_norm": 0.19885659217834473, "kl": 0.0179290771484375, "learning_rate": 2.583460445215911e-07, "loss": -0.0114, "reward": -0.2356225922703743, "reward_std": 0.4705282226204872, "rewards/cosine_scaled_reward": -0.1178113017231226, "rewards/format_reward": 0.0, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 2827.4305419921875, "epoch": 0.6478149100257069, "grad_norm": 0.16866172850131989, "kl": 0.022796630859375, "learning_rate": 2.5596072820445254e-07, "loss": 0.0359, "reward": -0.2195772840641439, "reward_std": 0.7464367002248764, "rewards/cosine_scaled_reward": -0.1097886401694268, "rewards/format_reward": 0.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 2001.8472595214844, "epoch": 0.6495287060839761, "grad_norm": 0.27339431643486023, "kl": 0.025421142578125, "learning_rate": 2.5358974294659373e-07, "loss": -0.0481, "reward": -0.053384889382869005, "reward_std": 0.7801851779222488, "rewards/cosine_scaled_reward": -0.026692438637837768, "rewards/format_reward": 0.0, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 2380.8611450195312, "epoch": 0.6512425021422451, "grad_norm": 0.49418047070503235, "kl": 0.028839111328125, "learning_rate": 2.512332043064913e-07, "loss": 0.1507, "reward": -0.04335943330079317, "reward_std": 0.7678016275167465, "rewards/cosine_scaled_reward": -0.021679717116057873, "rewards/format_reward": 0.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 2910.1806640625, "epoch": 0.6529562982005142, "grad_norm": 0.19250288605690002, "kl": 0.022003173828125, "learning_rate": 2.488912271385139e-07, "loss": 0.0447, "reward": -0.1130654625594616, "reward_std": 0.5473960787057877, "rewards/cosine_scaled_reward": -0.05653274059295654, "rewards/format_reward": 0.0, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 2682.9861450195312, "epoch": 0.6546700942587832, "grad_norm": 0.1798926293849945, "kl": 0.019439697265625, "learning_rate": 2.465639255873246e-07, "loss": -0.0224, "reward": -0.07310536503791809, "reward_std": 0.6817247718572617, "rewards/cosine_scaled_reward": -0.036552680656313896, "rewards/format_reward": 0.0, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 2607.763916015625, "epoch": 0.6563838903170522, "grad_norm": 0.24983283877372742, "kl": 0.026153564453125, "learning_rate": 2.4425141308231765e-07, "loss": 0.0197, "reward": -0.24107037298381329, "reward_std": 0.6102746799588203, "rewards/cosine_scaled_reward": -0.12053518556058407, "rewards/format_reward": 0.0, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 2681.277801513672, "epoch": 0.6580976863753213, "grad_norm": 0.21532803773880005, "kl": 0.0153350830078125, "learning_rate": 2.4195380233209006e-07, "loss": 0.0375, "reward": -0.2287786863744259, "reward_std": 0.5439959019422531, "rewards/cosine_scaled_reward": -0.11438935063779354, "rewards/format_reward": 0.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 2749.4305725097656, "epoch": 0.6598114824335904, "grad_norm": 0.23645354807376862, "kl": 0.02471923828125, "learning_rate": 2.3967120531894857e-07, "loss": -0.0225, "reward": -0.1737481877207756, "reward_std": 0.5551631152629852, "rewards/cosine_scaled_reward": -0.0868740938603878, "rewards/format_reward": 0.0, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 3019.7361450195312, "epoch": 0.6615252784918595, "grad_norm": 0.18375760316848755, "kl": 0.019195556640625, "learning_rate": 2.374037332934512e-07, "loss": 0.0429, "reward": -0.34039000049233437, "reward_std": 0.5544994547963142, "rewards/cosine_scaled_reward": -0.17019500210881233, "rewards/format_reward": 0.0, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 2555.625030517578, "epoch": 0.6632390745501285, "grad_norm": 0.2520519196987152, "kl": 0.0201873779296875, "learning_rate": 2.3515149676898552e-07, "loss": 0.0754, "reward": 0.06691954471170902, "reward_std": 0.4953342378139496, "rewards/cosine_scaled_reward": 0.03345977142453194, "rewards/format_reward": 0.0, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 2198.75, "epoch": 0.6649528706083976, "grad_norm": 0.21169999241828918, "kl": 0.0220489501953125, "learning_rate": 2.3291460551638237e-07, "loss": -0.0328, "reward": 0.10132637619972229, "reward_std": 0.6322794482111931, "rewards/cosine_scaled_reward": 0.050663191825151443, "rewards/format_reward": 0.0, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 2786.27783203125, "epoch": 0.6666666666666666, "grad_norm": 0.18405954539775848, "kl": 0.0251617431640625, "learning_rate": 2.306931685585657e-07, "loss": -0.0196, "reward": 0.03023771196603775, "reward_std": 0.46946871280670166, "rewards/cosine_scaled_reward": 0.015118852257728577, "rewards/format_reward": 0.0, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 2521.5693969726562, "epoch": 0.6683804627249358, "grad_norm": 0.19272808730602264, "kl": 0.0200347900390625, "learning_rate": 2.2848729416523859e-07, "loss": 0.0461, "reward": 0.00521535862935707, "reward_std": 0.616911455988884, "rewards/cosine_scaled_reward": 0.0026076845824718475, "rewards/format_reward": 0.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 2864.041748046875, "epoch": 0.6700942587832048, "grad_norm": 0.2623915672302246, "kl": 0.0235595703125, "learning_rate": 2.2629708984760706e-07, "loss": -0.002, "reward": -0.1861814223229885, "reward_std": 0.5339604392647743, "rewards/cosine_scaled_reward": -0.0930907130241394, "rewards/format_reward": 0.0, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 2380.9583740234375, "epoch": 0.6718080548414739, "grad_norm": 0.25610801577568054, "kl": 0.02032470703125, "learning_rate": 2.2412266235313973e-07, "loss": -0.0448, "reward": -0.07657308876514435, "reward_std": 0.6799488365650177, "rewards/cosine_scaled_reward": -0.038286540657281876, "rewards/format_reward": 0.0, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 2998.5139770507812, "epoch": 0.6735218508997429, "grad_norm": 0.19235925376415253, "kl": 0.0222015380859375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0569, "reward": -0.001154482364654541, "reward_std": 0.5102438926696777, "rewards/cosine_scaled_reward": -0.0005772355943918228, "rewards/format_reward": 0.0, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 2115.1806030273438, "epoch": 0.675235646958012, "grad_norm": 0.2744181752204895, "kl": 0.027099609375, "learning_rate": 2.1982156097370557e-07, "loss": 0.0221, "reward": 0.058095297776162624, "reward_std": 0.718009740114212, "rewards/cosine_scaled_reward": 0.029047648888081312, "rewards/format_reward": 0.0, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 2774.1806030273438, "epoch": 0.676949443016281, "grad_norm": 0.19175058603286743, "kl": 0.025482177734375, "learning_rate": 2.1769509671835223e-07, "loss": 0.0352, "reward": -0.136960469186306, "reward_std": 0.511358916759491, "rewards/cosine_scaled_reward": -0.0684802271425724, "rewards/format_reward": 0.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 3057.9584350585938, "epoch": 0.6786632390745502, "grad_norm": 0.16223175823688507, "kl": 0.019256591796875, "learning_rate": 2.1558482853517253e-07, "loss": 0.0288, "reward": -0.04862111946567893, "reward_std": 0.5186164565384388, "rewards/cosine_scaled_reward": -0.024310562410391867, "rewards/format_reward": 0.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 2582.2916870117188, "epoch": 0.6803770351328192, "grad_norm": 0.287038654088974, "kl": 0.0208740234375, "learning_rate": 2.134908592756607e-07, "loss": -0.0666, "reward": -0.17554645985364914, "reward_std": 0.5096240639686584, "rewards/cosine_scaled_reward": -0.08777323365211487, "rewards/format_reward": 0.0, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 3108.4444580078125, "epoch": 0.6820908311910883, "grad_norm": 0.1679101139307022, "kl": 0.0231781005859375, "learning_rate": 2.1141329099692406e-07, "loss": -0.0035, "reward": 0.038632214069366455, "reward_std": 0.7707736194133759, "rewards/cosine_scaled_reward": 0.019316108897328377, "rewards/format_reward": 0.0, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1893.5972442626953, "epoch": 0.6838046272493573, "grad_norm": 0.2708974778652191, "kl": 0.0231170654296875, "learning_rate": 2.0935222495670968e-07, "loss": 0.0065, "reward": 0.1442592293024063, "reward_std": 0.5131981894373894, "rewards/cosine_scaled_reward": 0.07212962210178375, "rewards/format_reward": 0.0, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 2922.7916870117188, "epoch": 0.6855184233076264, "grad_norm": 0.3239152133464813, "kl": 0.023956298828125, "learning_rate": 2.0730776160846853e-07, "loss": -0.0809, "reward": -0.12957404926419258, "reward_std": 0.5665386915206909, "rewards/cosine_scaled_reward": -0.06478701997548342, "rewards/format_reward": 0.0, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 2859.4444580078125, "epoch": 0.6872322193658955, "grad_norm": 0.19043125212192535, "kl": 0.0223541259765625, "learning_rate": 2.0528000059645995e-07, "loss": 0.0588, "reward": -0.32605881802737713, "reward_std": 0.5183117464184761, "rewards/cosine_scaled_reward": -0.16302942391484976, "rewards/format_reward": 0.0, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2781.5556030273438, "epoch": 0.6889460154241646, "grad_norm": 0.26216545701026917, "kl": 0.026580810546875, "learning_rate": 2.032690407508949e-07, "loss": -0.0263, "reward": -0.4961502104997635, "reward_std": 0.3931718245148659, "rewards/cosine_scaled_reward": -0.24807510524988174, "rewards/format_reward": 0.0, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 2736.1111450195312, "epoch": 0.6906598114824336, "grad_norm": 0.21833109855651855, "kl": 0.02435302734375, "learning_rate": 2.0127498008311922e-07, "loss": 0.0585, "reward": 0.3037844013888389, "reward_std": 0.5833063200116158, "rewards/cosine_scaled_reward": 0.1518922229297459, "rewards/format_reward": 0.0, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 2712.861114501953, "epoch": 0.6923736075407027, "grad_norm": 0.2146695852279663, "kl": 0.025238037109375, "learning_rate": 1.9929791578083655e-07, "loss": -0.041, "reward": -0.21084421500563622, "reward_std": 0.4842342808842659, "rewards/cosine_scaled_reward": -0.10542210191488266, "rewards/format_reward": 0.0, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 2936.3333129882812, "epoch": 0.6940874035989717, "grad_norm": 0.18586868047714233, "kl": 0.024658203125, "learning_rate": 1.9733794420337213e-07, "loss": 0.005, "reward": 0.050316065549850464, "reward_std": 0.5316065326333046, "rewards/cosine_scaled_reward": 0.02515802625566721, "rewards/format_reward": 0.0, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 2209.0972595214844, "epoch": 0.6958011996572407, "grad_norm": 0.2406679093837738, "kl": 0.02728271484375, "learning_rate": 1.9539516087697517e-07, "loss": -0.0131, "reward": -0.021612104028463364, "reward_std": 0.5742413327097893, "rewards/cosine_scaled_reward": -0.010806052014231682, "rewards/format_reward": 0.0, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 2988.3056640625, "epoch": 0.6975149957155099, "grad_norm": 0.23395532369613647, "kl": 0.027374267578125, "learning_rate": 1.934696604901642e-07, "loss": 0.0598, "reward": -0.08433661237359047, "reward_std": 0.5562912449240685, "rewards/cosine_scaled_reward": -0.04216831736266613, "rewards/format_reward": 0.0, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 2319.6111755371094, "epoch": 0.699228791773779, "grad_norm": 0.2508476972579956, "kl": 0.0188751220703125, "learning_rate": 1.915615368891117e-07, "loss": -0.0462, "reward": 0.5069457921199501, "reward_std": 0.5437265560030937, "rewards/cosine_scaled_reward": 0.25347290316130966, "rewards/format_reward": 0.0, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 2740.7777709960938, "epoch": 0.700942587832048, "grad_norm": 0.18038439750671387, "kl": 0.030517578125, "learning_rate": 1.8967088307307e-07, "loss": 0.0239, "reward": 0.10421705152839422, "reward_std": 0.6194805726408958, "rewards/cosine_scaled_reward": 0.052108526695519686, "rewards/format_reward": 0.0, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 2735.8472595214844, "epoch": 0.702656383890317, "grad_norm": 0.2515905201435089, "kl": 0.02587890625, "learning_rate": 1.8779779118983867e-07, "loss": -0.0311, "reward": -0.1710510030388832, "reward_std": 0.5620269253849983, "rewards/cosine_scaled_reward": -0.0855255089700222, "rewards/format_reward": 0.0, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 2493.388916015625, "epoch": 0.7043701799485861, "grad_norm": 0.21452462673187256, "kl": 0.0193328857421875, "learning_rate": 1.8594235253127372e-07, "loss": -0.0157, "reward": -0.25840797275304794, "reward_std": 0.4374122992157936, "rewards/cosine_scaled_reward": -0.12920398078858852, "rewards/format_reward": 0.0, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 2339.2083740234375, "epoch": 0.7060839760068551, "grad_norm": 0.2920970320701599, "kl": 0.02203369140625, "learning_rate": 1.8410465752883758e-07, "loss": -0.0518, "reward": -0.25962352380156517, "reward_std": 0.5908957123756409, "rewards/cosine_scaled_reward": -0.129811754450202, "rewards/format_reward": 0.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 2770.0694580078125, "epoch": 0.7077977720651243, "grad_norm": 0.2641778290271759, "kl": 0.026947021484375, "learning_rate": 1.822847957491922e-07, "loss": 0.061, "reward": -0.15783867985010147, "reward_std": 0.5947980135679245, "rewards/cosine_scaled_reward": -0.07891935110092163, "rewards/format_reward": 0.0, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 2950.1945190429688, "epoch": 0.7095115681233933, "grad_norm": 0.2011335790157318, "kl": 0.024566650390625, "learning_rate": 1.804828558898332e-07, "loss": 0.0014, "reward": -0.0009787320159375668, "reward_std": 0.7296510636806488, "rewards/cosine_scaled_reward": -0.0004893671721220016, "rewards/format_reward": 0.0, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2707.9305419921875, "epoch": 0.7112253641816624, "grad_norm": 0.3163622319698334, "kl": 0.027496337890625, "learning_rate": 1.7869892577476722e-07, "loss": 0.0567, "reward": -0.3990987651050091, "reward_std": 0.43145136535167694, "rewards/cosine_scaled_reward": -0.1995493769645691, "rewards/format_reward": 0.0, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 2968.0833129882812, "epoch": 0.7129391602399314, "grad_norm": 0.23538915812969208, "kl": 0.02618408203125, "learning_rate": 1.7693309235023127e-07, "loss": 0.0018, "reward": -0.08291278406977654, "reward_std": 0.4231496602296829, "rewards/cosine_scaled_reward": -0.041456387378275394, "rewards/format_reward": 0.0, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 2606.3333129882812, "epoch": 0.7146529562982005, "grad_norm": 0.3015352785587311, "kl": 0.0229949951171875, "learning_rate": 1.7518544168045524e-07, "loss": 0.0752, "reward": -0.3149372674524784, "reward_std": 0.6667703241109848, "rewards/cosine_scaled_reward": -0.1574686411768198, "rewards/format_reward": 0.0, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 2380.4445190429688, "epoch": 0.7163667523564696, "grad_norm": 0.22723092138767242, "kl": 0.0203704833984375, "learning_rate": 1.7345605894346726e-07, "loss": 0.0512, "reward": -0.34560693614184856, "reward_std": 0.4205815941095352, "rewards/cosine_scaled_reward": -0.17280346807092428, "rewards/format_reward": 0.0, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 2430.125, "epoch": 0.7180805484147387, "grad_norm": 0.23899339139461517, "kl": 0.02789306640625, "learning_rate": 1.7174502842694212e-07, "loss": -0.0302, "reward": -0.18839553371071815, "reward_std": 0.4583168476819992, "rewards/cosine_scaled_reward": -0.09419775661081076, "rewards/format_reward": 0.0, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 2552.4304809570312, "epoch": 0.7197943444730077, "grad_norm": 0.2333478480577469, "kl": 0.0173492431640625, "learning_rate": 1.7005243352409333e-07, "loss": 0.0486, "reward": -0.1561539713293314, "reward_std": 0.6325561329722404, "rewards/cosine_scaled_reward": -0.07807699032127857, "rewards/format_reward": 0.0, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 2746.0694580078125, "epoch": 0.7215081405312768, "grad_norm": 0.2854664921760559, "kl": 0.0223388671875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0759, "reward": -0.08271846733987331, "reward_std": 0.6506856456398964, "rewards/cosine_scaled_reward": -0.04135924857109785, "rewards/format_reward": 0.0, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 2300.3055725097656, "epoch": 0.7232219365895458, "grad_norm": 0.17767125368118286, "kl": 0.0181427001953125, "learning_rate": 1.6672287963562852e-07, "loss": 0.0242, "reward": -0.16465576738119125, "reward_std": 0.5095989629626274, "rewards/cosine_scaled_reward": -0.08232788741588593, "rewards/format_reward": 0.0, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 2779.0694580078125, "epoch": 0.7249357326478149, "grad_norm": 0.1646861433982849, "kl": 0.019927978515625, "learning_rate": 1.6508608292777203e-07, "loss": 0.0035, "reward": -0.05413434375077486, "reward_std": 0.7594424337148666, "rewards/cosine_scaled_reward": -0.02706717373803258, "rewards/format_reward": 0.0, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 2956.7083129882812, "epoch": 0.726649528706084, "grad_norm": 0.22844961285591125, "kl": 0.03106689453125, "learning_rate": 1.6346804638120098e-07, "loss": -0.023, "reward": -0.16962197236716747, "reward_std": 0.6577330157160759, "rewards/cosine_scaled_reward": -0.08481098245829344, "rewards/format_reward": 0.0, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 2825.4444580078125, "epoch": 0.7283633247643531, "grad_norm": 0.21431787312030792, "kl": 0.03460693359375, "learning_rate": 1.6186884885673413e-07, "loss": -0.0182, "reward": -0.06549269519746304, "reward_std": 0.6411803439259529, "rewards/cosine_scaled_reward": -0.032746341079473495, "rewards/format_reward": 0.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 2121.5556030273438, "epoch": 0.7300771208226221, "grad_norm": 0.28984084725379944, "kl": 0.0258026123046875, "learning_rate": 1.6028856829700258e-07, "loss": -0.0539, "reward": 0.15816697012633085, "reward_std": 0.5270659551024437, "rewards/cosine_scaled_reward": 0.07908349251374602, "rewards/format_reward": 0.0, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 2789.486083984375, "epoch": 0.7317909168808912, "grad_norm": 0.20834913849830627, "kl": 0.022003173828125, "learning_rate": 1.5872728172265146e-07, "loss": -0.0337, "reward": -0.370651263743639, "reward_std": 0.526657946407795, "rewards/cosine_scaled_reward": -0.18532563000917435, "rewards/format_reward": 0.0, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 2442.777801513672, "epoch": 0.7335047129391602, "grad_norm": 0.2500321567058563, "kl": 0.021209716796875, "learning_rate": 1.5718506522858572e-07, "loss": 0.0647, "reward": 0.2879646308720112, "reward_std": 0.6987240761518478, "rewards/cosine_scaled_reward": 0.1439823191612959, "rewards/format_reward": 0.0, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 2859.0694580078125, "epoch": 0.7352185089974294, "grad_norm": 0.17108042538166046, "kl": 0.02716064453125, "learning_rate": 1.5566199398026147e-07, "loss": 0.0365, "reward": -0.21791245974600315, "reward_std": 0.5681828185915947, "rewards/cosine_scaled_reward": -0.10895622940734029, "rewards/format_reward": 0.0, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 2675.52783203125, "epoch": 0.7369323050556984, "grad_norm": 0.18789908289909363, "kl": 0.022308349609375, "learning_rate": 1.5415814221002265e-07, "loss": 0.0154, "reward": -0.023968554101884365, "reward_std": 0.5900578051805496, "rewards/cosine_scaled_reward": -0.01198427053168416, "rewards/format_reward": 0.0, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 2358.041717529297, "epoch": 0.7386461011139674, "grad_norm": 0.24318993091583252, "kl": 0.022705078125, "learning_rate": 1.5267358321348285e-07, "loss": 0.0687, "reward": 0.029904491268098354, "reward_std": 0.7376819550991058, "rewards/cosine_scaled_reward": 0.01495224516838789, "rewards/format_reward": 0.0, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 3008.9306030273438, "epoch": 0.7403598971722365, "grad_norm": 0.17112420499324799, "kl": 0.0255126953125, "learning_rate": 1.5120838934595337e-07, "loss": 0.0164, "reward": -0.09738675877451897, "reward_std": 0.39827052876353264, "rewards/cosine_scaled_reward": -0.04869337775744498, "rewards/format_reward": 0.0, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 2719.166717529297, "epoch": 0.7420736932305055, "grad_norm": 0.17819786071777344, "kl": 0.024810791015625, "learning_rate": 1.4976263201891613e-07, "loss": 0.0207, "reward": 0.0039961859583854675, "reward_std": 0.4406754970550537, "rewards/cosine_scaled_reward": 0.0019980808719992638, "rewards/format_reward": 0.0, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 2033.6805419921875, "epoch": 0.7437874892887746, "grad_norm": 0.25923460721969604, "kl": 0.0193939208984375, "learning_rate": 1.483363816965435e-07, "loss": 0.0555, "reward": -0.2742752702906728, "reward_std": 0.617987684905529, "rewards/cosine_scaled_reward": -0.1371376351453364, "rewards/format_reward": 0.0, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 2954.9027709960938, "epoch": 0.7455012853470437, "grad_norm": 0.21946591138839722, "kl": 0.021759033203125, "learning_rate": 1.469297078922642e-07, "loss": 0.0302, "reward": 0.07878507301211357, "reward_std": 0.5823550596833229, "rewards/cosine_scaled_reward": 0.03939253278076649, "rewards/format_reward": 0.0, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 2508.8056030273438, "epoch": 0.7472150814053128, "grad_norm": 0.18389303982257843, "kl": 0.022979736328125, "learning_rate": 1.4554267916537495e-07, "loss": 0.0511, "reward": 0.11886966414749622, "reward_std": 0.6237533167004585, "rewards/cosine_scaled_reward": 0.05943482369184494, "rewards/format_reward": 0.0, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 2480.375030517578, "epoch": 0.7489288774635818, "grad_norm": 0.18969161808490753, "kl": 0.022491455078125, "learning_rate": 1.4417536311769885e-07, "loss": -0.0202, "reward": -0.3488190211355686, "reward_std": 0.6528129577636719, "rewards/cosine_scaled_reward": -0.17440950870513916, "rewards/format_reward": 0.0, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 2536.4444580078125, "epoch": 0.7506426735218509, "grad_norm": 0.2745197117328644, "kl": 0.0180511474609375, "learning_rate": 1.4282782639029128e-07, "loss": -0.0504, "reward": 0.2845611646771431, "reward_std": 0.4479832947254181, "rewards/cosine_scaled_reward": 0.14228056371212006, "rewards/format_reward": 0.0, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 3101.6528930664062, "epoch": 0.7523564695801199, "grad_norm": 0.16732795536518097, "kl": 0.023773193359375, "learning_rate": 1.4150013466019114e-07, "loss": -0.0111, "reward": -0.2682619922561571, "reward_std": 0.6106480062007904, "rewards/cosine_scaled_reward": -0.134130991587881, "rewards/format_reward": 0.0, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 2572.5555725097656, "epoch": 0.7540702656383891, "grad_norm": 0.19039277732372284, "kl": 0.023834228515625, "learning_rate": 1.4019235263722034e-07, "loss": -0.0026, "reward": 0.057762331794947386, "reward_std": 0.5597369149327278, "rewards/cosine_scaled_reward": 0.02888116310350597, "rewards/format_reward": 0.0, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 2865.7916259765625, "epoch": 0.7557840616966581, "grad_norm": 0.19560997188091278, "kl": 0.024139404296875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0056, "reward": -0.09995577030349523, "reward_std": 0.6689890846610069, "rewards/cosine_scaled_reward": -0.049977882008533925, "rewards/format_reward": 0.0, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 2008.4305419921875, "epoch": 0.7574978577549272, "grad_norm": 0.2906733751296997, "kl": 0.02508544921875, "learning_rate": 1.3763677169699217e-07, "loss": -0.0189, "reward": 0.1475011482834816, "reward_std": 0.6993541121482849, "rewards/cosine_scaled_reward": 0.07375057972967625, "rewards/format_reward": 0.0, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 2263.736114501953, "epoch": 0.7592116538131962, "grad_norm": 0.27822345495224, "kl": 0.027435302734375, "learning_rate": 1.3638909733514452e-07, "loss": -0.0396, "reward": 0.08332556113600731, "reward_std": 0.692223846912384, "rewards/cosine_scaled_reward": 0.04166277777403593, "rewards/format_reward": 0.0, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 2766.52783203125, "epoch": 0.7609254498714653, "grad_norm": 0.20186901092529297, "kl": 0.02777099609375, "learning_rate": 1.351615817851748e-07, "loss": 0.0635, "reward": 0.03594814520329237, "reward_std": 0.6744156032800674, "rewards/cosine_scaled_reward": 0.017974070739001036, "rewards/format_reward": 0.0, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 3045.90283203125, "epoch": 0.7626392459297343, "grad_norm": 0.24945306777954102, "kl": 0.0214080810546875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0559, "reward": -0.03293860936537385, "reward_std": 0.6841256394982338, "rewards/cosine_scaled_reward": -0.016469309804961085, "rewards/format_reward": 0.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 2726.013885498047, "epoch": 0.7643530419880035, "grad_norm": 0.267711341381073, "kl": 0.02545166015625, "learning_rate": 1.3276726544494571e-07, "loss": -0.0273, "reward": 0.17773457616567612, "reward_std": 0.47991518676280975, "rewards/cosine_scaled_reward": 0.08886728808283806, "rewards/format_reward": 0.0, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 2652.3750610351562, "epoch": 0.7660668380462725, "grad_norm": 0.1955571472644806, "kl": 0.024688720703125, "learning_rate": 1.316005813502869e-07, "loss": -0.0013, "reward": -0.21300538629293442, "reward_std": 0.5716921538114548, "rewards/cosine_scaled_reward": -0.10650269035249949, "rewards/format_reward": 0.0, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 2869.0972290039062, "epoch": 0.7677806341045416, "grad_norm": 0.18771569430828094, "kl": 0.020751953125, "learning_rate": 1.3045428945301953e-07, "loss": 0.0449, "reward": 0.015052955597639084, "reward_std": 0.6415582820773125, "rewards/cosine_scaled_reward": 0.007526477798819542, "rewards/format_reward": 0.0, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 2951.72216796875, "epoch": 0.7694944301628106, "grad_norm": 0.186003640294075, "kl": 0.0264739990234375, "learning_rate": 1.2932844562179352e-07, "loss": -0.0117, "reward": -0.1732272356748581, "reward_std": 0.6033661440014839, "rewards/cosine_scaled_reward": -0.0866136197000742, "rewards/format_reward": 0.0, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 2393.916717529297, "epoch": 0.7712082262210797, "grad_norm": 0.18100591003894806, "kl": 0.018280029296875, "learning_rate": 1.2822310472864885e-07, "loss": 0.0174, "reward": -0.17222392931580544, "reward_std": 0.4759965166449547, "rewards/cosine_scaled_reward": -0.08611196093261242, "rewards/format_reward": 0.0, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 3247.5555419921875, "epoch": 0.7729220222793488, "grad_norm": 0.16927795112133026, "kl": 0.026092529296875, "learning_rate": 1.2713832064634125e-07, "loss": 0.0101, "reward": -0.1602705717086792, "reward_std": 0.5965098738670349, "rewards/cosine_scaled_reward": -0.0801352858543396, "rewards/format_reward": 0.0, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 2699.2222900390625, "epoch": 0.7746358183376179, "grad_norm": 0.16316962242126465, "kl": 0.0219879150390625, "learning_rate": 1.260741462457165e-07, "loss": 0.055, "reward": -0.06079525873064995, "reward_std": 0.6986799910664558, "rewards/cosine_scaled_reward": -0.030397622846066952, "rewards/format_reward": 0.0, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 2595.3472290039062, "epoch": 0.7763496143958869, "grad_norm": 0.5946676135063171, "kl": 0.029022216796875, "learning_rate": 1.2503063339313356e-07, "loss": -0.0538, "reward": -0.26486414577811956, "reward_std": 0.415864534676075, "rewards/cosine_scaled_reward": -0.13243207102641463, "rewards/format_reward": 0.0, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 2008.9305725097656, "epoch": 0.778063410454156, "grad_norm": 0.19199617207050323, "kl": 0.0153961181640625, "learning_rate": 1.2400783294793668e-07, "loss": 0.0063, "reward": 0.13748213648796082, "reward_std": 0.6150016859173775, "rewards/cosine_scaled_reward": 0.06874106079339981, "rewards/format_reward": 0.0, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 2725.0000610351562, "epoch": 0.779777206512425, "grad_norm": 0.1928061991930008, "kl": 0.02520751953125, "learning_rate": 1.2300579475997657e-07, "loss": 0.0235, "reward": -0.0030081644654273987, "reward_std": 0.7155122309923172, "rewards/cosine_scaled_reward": -0.0015040775761008263, "rewards/format_reward": 0.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 2560.888916015625, "epoch": 0.781491002570694, "grad_norm": 0.2278081625699997, "kl": 0.0233154296875, "learning_rate": 1.220245676671809e-07, "loss": 0.0422, "reward": -0.040069979906547815, "reward_std": 0.6579814180731773, "rewards/cosine_scaled_reward": -0.02003499452257529, "rewards/format_reward": 0.0, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2620.638916015625, "epoch": 0.7832047986289632, "grad_norm": 0.2580767571926117, "kl": 0.0252532958984375, "learning_rate": 1.2106419949317388e-07, "loss": 0.033, "reward": 0.23012623190879822, "reward_std": 0.6976396143436432, "rewards/cosine_scaled_reward": 0.11506311595439911, "rewards/format_reward": 0.0, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 2410.513946533203, "epoch": 0.7849185946872322, "grad_norm": 0.2296840250492096, "kl": 0.0242919921875, "learning_rate": 1.2012473704494537e-07, "loss": -0.0221, "reward": -0.00019283778965473175, "reward_std": 0.6016373038291931, "rewards/cosine_scaled_reward": -9.64207574725151e-05, "rewards/format_reward": 0.0, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 2265.4583435058594, "epoch": 0.7866323907455013, "grad_norm": 0.23107987642288208, "kl": 0.0201568603515625, "learning_rate": 1.1920622611056974e-07, "loss": -0.0309, "reward": 0.18072006362490356, "reward_std": 0.5644106566905975, "rewards/cosine_scaled_reward": 0.09036003064829856, "rewards/format_reward": 0.0, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2700.541748046875, "epoch": 0.7883461868037703, "grad_norm": 0.21530865132808685, "kl": 0.024566650390625, "learning_rate": 1.1830871145697412e-07, "loss": 0.0267, "reward": 0.030735374661162496, "reward_std": 0.7207788527011871, "rewards/cosine_scaled_reward": 0.01536769128870219, "rewards/format_reward": 0.0, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 2300.8333435058594, "epoch": 0.7900599828620394, "grad_norm": 0.31069549918174744, "kl": 0.023162841796875, "learning_rate": 1.1743223682775649e-07, "loss": -0.0563, "reward": 0.09233328700065613, "reward_std": 0.7090381979942322, "rewards/cosine_scaled_reward": 0.046166639775037766, "rewards/format_reward": 0.0, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 2608.8056030273438, "epoch": 0.7917737789203085, "grad_norm": 0.20271840691566467, "kl": 0.0250244140625, "learning_rate": 1.1657684494105386e-07, "loss": 0.0362, "reward": -0.07103721424937248, "reward_std": 0.7956888303160667, "rewards/cosine_scaled_reward": -0.03551860898733139, "rewards/format_reward": 0.0, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 2780.4444580078125, "epoch": 0.7934875749785776, "grad_norm": 0.2081584334373474, "kl": 0.0209808349609375, "learning_rate": 1.1574257748745986e-07, "loss": -0.0471, "reward": 0.021411696448922157, "reward_std": 0.48744403570890427, "rewards/cosine_scaled_reward": 0.010705851949751377, "rewards/format_reward": 0.0, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 2673.4861450195312, "epoch": 0.7952013710368466, "grad_norm": 0.19560140371322632, "kl": 0.0238037109375, "learning_rate": 1.1492947512799328e-07, "loss": -0.0409, "reward": 0.18470758572220802, "reward_std": 0.5649774596095085, "rewards/cosine_scaled_reward": 0.09235379751771688, "rewards/format_reward": 0.0, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 3105.27783203125, "epoch": 0.7969151670951157, "grad_norm": 0.20849952101707458, "kl": 0.030609130859375, "learning_rate": 1.1413757749211602e-07, "loss": 0.041, "reward": -0.284846730530262, "reward_std": 0.5418054684996605, "rewards/cosine_scaled_reward": -0.14242336247116327, "rewards/format_reward": 0.0, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 2965.8611450195312, "epoch": 0.7986289631533847, "grad_norm": 0.17242176830768585, "kl": 0.025604248046875, "learning_rate": 1.1336692317580158e-07, "loss": 0.0034, "reward": -0.3226154297590256, "reward_std": 0.5333989933133125, "rewards/cosine_scaled_reward": -0.16130771208554506, "rewards/format_reward": 0.0, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 3079.6806030273438, "epoch": 0.8003427592116538, "grad_norm": 0.20415925979614258, "kl": 0.0255126953125, "learning_rate": 1.1261754973965422e-07, "loss": 0.0969, "reward": -0.24770671501755714, "reward_std": 0.5701889246702194, "rewards/cosine_scaled_reward": -0.12385335750877857, "rewards/format_reward": 0.0, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 2234.986083984375, "epoch": 0.8020565552699229, "grad_norm": 0.3467549979686737, "kl": 0.024200439453125, "learning_rate": 1.1188949370707787e-07, "loss": 0.0855, "reward": -0.009742069989442825, "reward_std": 0.591868631541729, "rewards/cosine_scaled_reward": -0.004871031269431114, "rewards/format_reward": 0.0, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 2717.0833435058594, "epoch": 0.803770351328192, "grad_norm": 0.20191779732704163, "kl": 0.0237579345703125, "learning_rate": 1.1118279056249653e-07, "loss": 0.0622, "reward": 0.058277749456465244, "reward_std": 0.7684449702501297, "rewards/cosine_scaled_reward": 0.029138876125216484, "rewards/format_reward": 0.0, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 2607.1388549804688, "epoch": 0.805484147386461, "grad_norm": 0.2015339732170105, "kl": 0.0255126953125, "learning_rate": 1.1049747474962444e-07, "loss": -0.0145, "reward": -0.2184823751449585, "reward_std": 0.5644990280270576, "rewards/cosine_scaled_reward": -0.10924118757247925, "rewards/format_reward": 0.0, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 3260.3333740234375, "epoch": 0.8071979434447301, "grad_norm": 0.2259937822818756, "kl": 0.02581787109375, "learning_rate": 1.0983357966978745e-07, "loss": 0.0565, "reward": -0.22493689320981503, "reward_std": 0.5675350055098534, "rewards/cosine_scaled_reward": -0.11246845219284296, "rewards/format_reward": 0.0, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 2854.1806030273438, "epoch": 0.8089117395029991, "grad_norm": 0.21495820581912994, "kl": 0.028594970703125, "learning_rate": 1.0919113768029517e-07, "loss": -0.0313, "reward": 0.14072632044553757, "reward_std": 0.6395101621747017, "rewards/cosine_scaled_reward": 0.07036316394805908, "rewards/format_reward": 0.0, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 2890.4444580078125, "epoch": 0.8106255355612683, "grad_norm": 0.18685470521450043, "kl": 0.023468017578125, "learning_rate": 1.0857018009286381e-07, "loss": -0.0026, "reward": -0.1240294948220253, "reward_std": 0.48969001322984695, "rewards/cosine_scaled_reward": -0.06201474368572235, "rewards/format_reward": 0.0, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2470.0972900390625, "epoch": 0.8123393316195373, "grad_norm": 0.2818465232849121, "kl": 0.029144287109375, "learning_rate": 1.0797073717209013e-07, "loss": 0.0035, "reward": 0.24050107831135392, "reward_std": 0.5852163806557655, "rewards/cosine_scaled_reward": 0.12025054381228983, "rewards/format_reward": 0.0, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 2933.541748046875, "epoch": 0.8140531276778064, "grad_norm": 0.19675259292125702, "kl": 0.027496337890625, "learning_rate": 1.0739283813397639e-07, "loss": 0.0321, "reward": -0.22623535431921482, "reward_std": 0.6677599251270294, "rewards/cosine_scaled_reward": -0.1131176782073453, "rewards/format_reward": 0.0, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 2794.9862060546875, "epoch": 0.8157669237360754, "grad_norm": 0.24406589567661285, "kl": 0.02947998046875, "learning_rate": 1.068365111445064e-07, "loss": -0.0179, "reward": -0.3521595522761345, "reward_std": 0.5985631048679352, "rewards/cosine_scaled_reward": -0.17607977613806725, "rewards/format_reward": 0.0, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 2567.4722290039062, "epoch": 0.8174807197943444, "grad_norm": 0.17105937004089355, "kl": 0.016357421875, "learning_rate": 1.063017833182728e-07, "loss": 0.0385, "reward": -0.2558911629021168, "reward_std": 0.4246537983417511, "rewards/cosine_scaled_reward": -0.12794558703899384, "rewards/format_reward": 0.0, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 2355.6806030273438, "epoch": 0.8191945158526135, "grad_norm": 0.24741511046886444, "kl": 0.02423095703125, "learning_rate": 1.0578868071715544e-07, "loss": 0.0854, "reward": -0.1565770129673183, "reward_std": 0.6820876449346542, "rewards/cosine_scaled_reward": -0.07828850811347365, "rewards/format_reward": 0.0, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 2624.0833435058594, "epoch": 0.8209083119108826, "grad_norm": 0.20646637678146362, "kl": 0.0313720703125, "learning_rate": 1.0529722834905125e-07, "loss": 0.0146, "reward": 0.1560894399881363, "reward_std": 0.5770560130476952, "rewards/cosine_scaled_reward": 0.0780447069555521, "rewards/format_reward": 0.0, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 2327.7222290039062, "epoch": 0.8226221079691517, "grad_norm": 0.23056431114673615, "kl": 0.01910400390625, "learning_rate": 1.0482745016665526e-07, "loss": 0.0635, "reward": 0.149917745962739, "reward_std": 0.7103602811694145, "rewards/cosine_scaled_reward": 0.07495887111872435, "rewards/format_reward": 0.0, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 2461.5972900390625, "epoch": 0.8243359040274207, "grad_norm": 0.24706712365150452, "kl": 0.03350830078125, "learning_rate": 1.0437936906629334e-07, "loss": 0.0029, "reward": -0.1800133902579546, "reward_std": 0.49367547780275345, "rewards/cosine_scaled_reward": -0.09000669163651764, "rewards/format_reward": 0.0, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 2864.0694580078125, "epoch": 0.8260497000856898, "grad_norm": 0.25076547265052795, "kl": 0.021514892578125, "learning_rate": 1.0395300688680625e-07, "loss": 0.0445, "reward": -0.14674655348062515, "reward_std": 0.5242787301540375, "rewards/cosine_scaled_reward": -0.07337328046560287, "rewards/format_reward": 0.0, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 2596.874969482422, "epoch": 0.8277634961439588, "grad_norm": 0.2056618481874466, "kl": 0.02398681640625, "learning_rate": 1.0354838440848501e-07, "loss": 0.0449, "reward": -0.12835523579269648, "reward_std": 0.5452019795775414, "rewards/cosine_scaled_reward": -0.06417762162163854, "rewards/format_reward": 0.0, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 2179.500030517578, "epoch": 0.829477292202228, "grad_norm": 0.17514649033546448, "kl": 0.0175018310546875, "learning_rate": 1.0316552135205837e-07, "loss": 0.058, "reward": -0.154528075363487, "reward_std": 0.5336438938975334, "rewards/cosine_scaled_reward": -0.07726403628475964, "rewards/format_reward": 0.0, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 2221.6666259765625, "epoch": 0.831191088260497, "grad_norm": 0.2613643705844879, "kl": 0.028350830078125, "learning_rate": 1.0280443637773163e-07, "loss": 0.0496, "reward": -0.2054775208234787, "reward_std": 0.5721682235598564, "rewards/cosine_scaled_reward": -0.1027387659996748, "rewards/format_reward": 0.0, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2448.3193969726562, "epoch": 0.8329048843187661, "grad_norm": 0.2419876903295517, "kl": 0.028411865234375, "learning_rate": 1.0246514708427701e-07, "loss": -0.0397, "reward": -0.06807173043489456, "reward_std": 0.507116761058569, "rewards/cosine_scaled_reward": -0.03403585962951183, "rewards/format_reward": 0.0, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 2735.7084045410156, "epoch": 0.8346186803770351, "grad_norm": 0.2406454235315323, "kl": 0.0244140625, "learning_rate": 1.0214767000817596e-07, "loss": 0.0244, "reward": -0.2896123267710209, "reward_std": 0.5326507315039635, "rewards/cosine_scaled_reward": -0.1448061689734459, "rewards/format_reward": 0.0, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 2310.27783203125, "epoch": 0.8363324764353042, "grad_norm": 0.252458781003952, "kl": 0.021209716796875, "learning_rate": 1.0185202062281336e-07, "loss": 0.0239, "reward": 0.10779337584972382, "reward_std": 0.6982715576887131, "rewards/cosine_scaled_reward": 0.053896697354502976, "rewards/format_reward": 0.0, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 2743.541748046875, "epoch": 0.8380462724935732, "grad_norm": 0.17434372007846832, "kl": 0.024749755859375, "learning_rate": 1.0157821333772304e-07, "loss": 0.0124, "reward": -0.16761679388582706, "reward_std": 0.6089917570352554, "rewards/cosine_scaled_reward": -0.08380839880555868, "rewards/format_reward": 0.0, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 2486.236114501953, "epoch": 0.8397600685518424, "grad_norm": 0.2090701311826706, "kl": 0.027984619140625, "learning_rate": 1.013262614978859e-07, "loss": 0.0036, "reward": -0.05014536017552018, "reward_std": 0.5763295590877533, "rewards/cosine_scaled_reward": -0.025072677060961723, "rewards/format_reward": 0.0, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 2848.888916015625, "epoch": 0.8414738646101114, "grad_norm": 0.22741778194904327, "kl": 0.029052734375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0629, "reward": -0.16439465060830116, "reward_std": 0.6782207787036896, "rewards/cosine_scaled_reward": -0.08219731226563454, "rewards/format_reward": 0.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 3189.6250610351562, "epoch": 0.8431876606683805, "grad_norm": 0.17802605032920837, "kl": 0.028839111328125, "learning_rate": 1.0088797220727779e-07, "loss": 0.0435, "reward": -0.35712628811597824, "reward_std": 0.6088190823793411, "rewards/cosine_scaled_reward": -0.17856314033269882, "rewards/format_reward": 0.0, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 2387.888885498047, "epoch": 0.8449014567266495, "grad_norm": 0.2299438714981079, "kl": 0.025177001953125, "learning_rate": 1.0070165611810855e-07, "loss": 0.0576, "reward": -0.17006561160087585, "reward_std": 0.3991905003786087, "rewards/cosine_scaled_reward": -0.08503280207514763, "rewards/format_reward": 0.0, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 2941.8611450195312, "epoch": 0.8466152527849186, "grad_norm": 0.18643692135810852, "kl": 0.0261383056640625, "learning_rate": 1.005372381963547e-07, "loss": 0.0474, "reward": -0.04825907852500677, "reward_std": 0.6627323552966118, "rewards/cosine_scaled_reward": -0.02412955043837428, "rewards/format_reward": 0.0, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 2563.375030517578, "epoch": 0.8483290488431876, "grad_norm": 0.32022979855537415, "kl": 0.03094482421875, "learning_rate": 1.0039472645551372e-07, "loss": -0.0218, "reward": 0.03584544826298952, "reward_std": 0.5564405769109726, "rewards/cosine_scaled_reward": 0.017922731814906, "rewards/format_reward": 0.0, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 2572.77783203125, "epoch": 0.8500428449014568, "grad_norm": 0.23663800954818726, "kl": 0.0213623046875, "learning_rate": 1.002741278414069e-07, "loss": 0.0756, "reward": 0.13385188579559326, "reward_std": 0.6874089986085892, "rewards/cosine_scaled_reward": 0.06692593172192574, "rewards/format_reward": 0.0, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 2480.5694274902344, "epoch": 0.8517566409597258, "grad_norm": 0.2888961732387543, "kl": 0.0250244140625, "learning_rate": 1.0017544823184055e-07, "loss": 0.1087, "reward": 0.2151249535381794, "reward_std": 0.7869587689638138, "rewards/cosine_scaled_reward": 0.10756248049438, "rewards/format_reward": 0.0, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 3000.1666259765625, "epoch": 0.8534704370179949, "grad_norm": 0.16567862033843994, "kl": 0.0272216796875, "learning_rate": 1.0009869243631952e-07, "loss": -0.0254, "reward": -0.33249833807349205, "reward_std": 0.48314109444618225, "rewards/cosine_scaled_reward": -0.16624917834997177, "rewards/format_reward": 0.0, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 2385.5416870117188, "epoch": 0.8551842330762639, "grad_norm": 0.20905697345733643, "kl": 0.027862548828125, "learning_rate": 1.000438641958131e-07, "loss": 0.017, "reward": 0.046394890174269676, "reward_std": 0.649334043264389, "rewards/cosine_scaled_reward": 0.02319744322448969, "rewards/format_reward": 0.0, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1828.2777862548828, "epoch": 0.856898029134533, "grad_norm": 0.25425171852111816, "kl": 0.0164031982421875, "learning_rate": 1.0001096618257236e-07, "loss": -0.0015, "reward": 0.1327105201780796, "reward_std": 0.5622994378209114, "rewards/cosine_scaled_reward": 0.06635526567697525, "rewards/format_reward": 0.0, "step": 500 }, { "epoch": 0.856898029134533, "step": 500, "total_flos": 0.0, "train_loss": 0.01978990149567835, "train_runtime": 91059.0796, "train_samples_per_second": 0.395, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }