{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08333333333333333, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 250.0, "epoch": 8.333333333333333e-05, "grad_norm": 5370.26904296875, "kl": 1680.79296875, "learning_rate": 5.0000000000000004e-08, "loss": 67.2317, "reward": 0.4166666865348816, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.0416666679084301, "step": 1 }, { "completion_length": 250.0, "epoch": 0.00016666666666666666, "grad_norm": 2.1227879524230957, "kl": 0.3190357983112335, "learning_rate": 1.0000000000000001e-07, "loss": 0.0128, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 2 }, { "completion_length": 250.0, "epoch": 0.00025, "grad_norm": 1779814.75, "kl": 204788.625, "learning_rate": 1.5000000000000002e-07, "loss": 8191.5454, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0416666679084301, "step": 3 }, { "completion_length": 250.0, "epoch": 0.0003333333333333333, "grad_norm": 3.283656358718872, "kl": 1.0824432373046875, "learning_rate": 2.0000000000000002e-07, "loss": 0.0433, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 4 }, { "completion_length": 250.0, "epoch": 0.0004166666666666667, "grad_norm": 2.5271332263946533, "kl": 0.6556969881057739, "learning_rate": 2.5000000000000004e-07, "loss": 0.0262, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 5 }, { "completion_length": 250.0, "epoch": 0.0005, "grad_norm": 1.6124398708343506, "kl": 0.33115604519844055, "learning_rate": 3.0000000000000004e-07, "loss": 0.0132, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 6 }, { "completion_length": 250.0, "epoch": 0.0005833333333333334, "grad_norm": 1.7685329914093018, "kl": 0.4470013380050659, "learning_rate": 3.5000000000000004e-07, "loss": 0.0179, "reward": 0.2916666865348816, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 7 }, { "completion_length": 250.0, "epoch": 0.0006666666666666666, "grad_norm": 0.2993689179420471, "kl": 0.24829219281673431, "learning_rate": 4.0000000000000003e-07, "loss": 0.0099, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 8 }, { "completion_length": 250.0, "epoch": 0.00075, "grad_norm": 995.0140991210938, "kl": 280.3321533203125, "learning_rate": 4.5000000000000003e-07, "loss": 11.2133, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.125, "step": 9 }, { "completion_length": 250.0, "epoch": 0.0008333333333333334, "grad_norm": 0.5820446014404297, "kl": 0.38791656494140625, "learning_rate": 5.000000000000001e-07, "loss": 0.0155, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0, "step": 10 }, { "completion_length": 250.0, "epoch": 0.0009166666666666666, "grad_norm": 2.4436404705047607, "kl": 0.6116478443145752, "learning_rate": 5.5e-07, "loss": 0.0245, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 11 }, { "completion_length": 250.0, "epoch": 0.001, "grad_norm": 5655916544.0, "kl": 771800640.0, "learning_rate": 6.000000000000001e-07, "loss": 30872024.0, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 12 }, { "completion_length": 250.0, "epoch": 0.0010833333333333333, "grad_norm": 15.001657485961914, "kl": 3.9553842544555664, "learning_rate": 6.5e-07, "loss": 0.1582, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 13 }, { "completion_length": 250.0, "epoch": 0.0011666666666666668, "grad_norm": 24.87034034729004, "kl": 6.287238121032715, "learning_rate": 7.000000000000001e-07, "loss": 0.2515, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 14 }, { "completion_length": 250.0, "epoch": 0.00125, "grad_norm": 0.8860446214675903, "kl": 0.7432097792625427, "learning_rate": 7.5e-07, "loss": 0.0297, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 15 }, { "completion_length": 250.0, "epoch": 0.0013333333333333333, "grad_norm": 16639295488.0, "kl": 2705503488.0, "learning_rate": 8.000000000000001e-07, "loss": 108220136.0, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0416666679084301, "step": 16 }, { "completion_length": 250.0, "epoch": 0.0014166666666666668, "grad_norm": 0.05391894653439522, "kl": 0.284786581993103, "learning_rate": 8.500000000000001e-07, "loss": 0.0114, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 17 }, { "completion_length": 250.0, "epoch": 0.0015, "grad_norm": 1350.6583251953125, "kl": 165.6782989501953, "learning_rate": 9.000000000000001e-07, "loss": 6.6271, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 18 }, { "completion_length": 250.0, "epoch": 0.0015833333333333333, "grad_norm": 26.208847045898438, "kl": 4.027891635894775, "learning_rate": 9.500000000000001e-07, "loss": 0.1611, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.25, "step": 19 }, { "completion_length": 250.0, "epoch": 0.0016666666666666668, "grad_norm": 1.4429972171783447, "kl": 0.31500595808029175, "learning_rate": 1.0000000000000002e-06, "loss": 0.0126, "reward": 0.1666666716337204, "reward_std": 0.30860671401023865, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 20 }, { "completion_length": 250.0, "epoch": 0.00175, "grad_norm": 0.10234619677066803, "kl": 0.3105039894580841, "learning_rate": 1.0500000000000001e-06, "loss": 0.0124, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 21 }, { "completion_length": 250.0, "epoch": 0.0018333333333333333, "grad_norm": 0.3241354525089264, "kl": 0.26765021681785583, "learning_rate": 1.1e-06, "loss": 0.0107, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 22 }, { "completion_length": 250.0, "epoch": 0.0019166666666666666, "grad_norm": 1.2852263450622559, "kl": 0.31648027896881104, "learning_rate": 1.1500000000000002e-06, "loss": 0.0127, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.0, "step": 23 }, { "completion_length": 250.0, "epoch": 0.002, "grad_norm": 2.508118152618408, "kl": 0.2790696918964386, "learning_rate": 1.2000000000000002e-06, "loss": 0.0112, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 24 }, { "completion_length": 250.0, "epoch": 0.0020833333333333333, "grad_norm": 254.1594696044922, "kl": 28.16029930114746, "learning_rate": 1.25e-06, "loss": 1.1264, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 25 }, { "completion_length": 250.0, "epoch": 0.0021666666666666666, "grad_norm": 47.175018310546875, "kl": 3.506722927093506, "learning_rate": 1.3e-06, "loss": 0.1403, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 26 }, { "completion_length": 250.0, "epoch": 0.00225, "grad_norm": 514.6583862304688, "kl": 64.28365325927734, "learning_rate": 1.3500000000000002e-06, "loss": 2.5713, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 27 }, { "completion_length": 250.0, "epoch": 0.0023333333333333335, "grad_norm": 0.4107680916786194, "kl": 0.31204113364219666, "learning_rate": 1.4000000000000001e-06, "loss": 0.0125, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 28 }, { "completion_length": 250.0, "epoch": 0.002416666666666667, "grad_norm": 242400784.0, "kl": 28910430.0, "learning_rate": 1.45e-06, "loss": 1156417.25, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 29 }, { "completion_length": 250.0, "epoch": 0.0025, "grad_norm": 0.3205853998661041, "kl": 0.24721822142601013, "learning_rate": 1.5e-06, "loss": 0.0099, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 30 }, { "completion_length": 250.0, "epoch": 0.0025833333333333333, "grad_norm": 4.041526794433594, "kl": 1.1066405773162842, "learning_rate": 1.5500000000000002e-06, "loss": 0.0443, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 31 }, { "completion_length": 250.0, "epoch": 0.0026666666666666666, "grad_norm": 160318.890625, "kl": 8970.6455078125, "learning_rate": 1.6000000000000001e-06, "loss": 358.8259, "reward": 0.375, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.375, "step": 32 }, { "completion_length": 250.0, "epoch": 0.00275, "grad_norm": 0.11564349383115768, "kl": 0.3156932294368744, "learning_rate": 1.6500000000000003e-06, "loss": 0.0126, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 33 }, { "completion_length": 250.0, "epoch": 0.0028333333333333335, "grad_norm": 11.49873161315918, "kl": 3.654203176498413, "learning_rate": 1.7000000000000002e-06, "loss": 0.1462, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0416666679084301, "step": 34 }, { "completion_length": 250.0, "epoch": 0.002916666666666667, "grad_norm": 3.487309455871582, "kl": 0.47985804080963135, "learning_rate": 1.75e-06, "loss": 0.0192, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 35 }, { "completion_length": 250.0, "epoch": 0.003, "grad_norm": 1.3627883195877075, "kl": 0.7914985418319702, "learning_rate": 1.8000000000000001e-06, "loss": 0.0317, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 36 }, { "completion_length": 250.0, "epoch": 0.0030833333333333333, "grad_norm": 0.7565972208976746, "kl": 0.2916364371776581, "learning_rate": 1.85e-06, "loss": 0.0117, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 37 }, { "completion_length": 250.0, "epoch": 0.0031666666666666666, "grad_norm": 230.62786865234375, "kl": 28.600061416625977, "learning_rate": 1.9000000000000002e-06, "loss": 1.144, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 38 }, { "completion_length": 250.0, "epoch": 0.00325, "grad_norm": 14.349813461303711, "kl": 2.471902847290039, "learning_rate": 1.9500000000000004e-06, "loss": 0.0989, "reward": 0.125, "reward_std": 0.24800793826580048, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 39 }, { "completion_length": 250.0, "epoch": 0.0033333333333333335, "grad_norm": 0.34092557430267334, "kl": 0.2546476423740387, "learning_rate": 2.0000000000000003e-06, "loss": 0.0102, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 40 }, { "completion_length": 250.0, "epoch": 0.003416666666666667, "grad_norm": 2.506922483444214, "kl": 0.2611209452152252, "learning_rate": 2.05e-06, "loss": 0.0104, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.25, "step": 41 }, { "completion_length": 250.0, "epoch": 0.0035, "grad_norm": 2.241769790649414, "kl": 0.6660223603248596, "learning_rate": 2.1000000000000002e-06, "loss": 0.0266, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 42 }, { "completion_length": 250.0, "epoch": 0.0035833333333333333, "grad_norm": 0.8412100076675415, "kl": 0.35636767745018005, "learning_rate": 2.15e-06, "loss": 0.0143, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 43 }, { "completion_length": 250.0, "epoch": 0.0036666666666666666, "grad_norm": 3.74702525138855, "kl": 1.0497126579284668, "learning_rate": 2.2e-06, "loss": 0.042, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 44 }, { "completion_length": 250.0, "epoch": 0.00375, "grad_norm": 0.1603858321905136, "kl": 0.2380465269088745, "learning_rate": 2.25e-06, "loss": 0.0095, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 45 }, { "completion_length": 250.0, "epoch": 0.003833333333333333, "grad_norm": 3.948719024658203, "kl": 0.958899974822998, "learning_rate": 2.3000000000000004e-06, "loss": 0.0384, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 46 }, { "completion_length": 250.0, "epoch": 0.003916666666666666, "grad_norm": 0.3620174825191498, "kl": 0.24766167998313904, "learning_rate": 2.35e-06, "loss": 0.0099, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 47 }, { "completion_length": 250.0, "epoch": 0.004, "grad_norm": 0.1155853196978569, "kl": 0.26840996742248535, "learning_rate": 2.4000000000000003e-06, "loss": 0.0107, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 48 }, { "completion_length": 250.0, "epoch": 0.004083333333333333, "grad_norm": 19.22943878173828, "kl": 2.2793242931365967, "learning_rate": 2.4500000000000003e-06, "loss": 0.0912, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 49 }, { "completion_length": 250.0, "epoch": 0.004166666666666667, "grad_norm": 2.660940647125244, "kl": 0.9060783386230469, "learning_rate": 2.5e-06, "loss": 0.0362, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 50 }, { "completion_length": 250.0, "epoch": 0.00425, "grad_norm": 2.71610426902771, "kl": 0.7481005191802979, "learning_rate": 2.55e-06, "loss": 0.0299, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 51 }, { "completion_length": 250.0, "epoch": 0.004333333333333333, "grad_norm": 4604.9912109375, "kl": 107.24817657470703, "learning_rate": 2.6e-06, "loss": 4.2899, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 52 }, { "completion_length": 250.0, "epoch": 0.004416666666666667, "grad_norm": 0.5669422745704651, "kl": 0.31511324644088745, "learning_rate": 2.6500000000000005e-06, "loss": 0.0126, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 53 }, { "completion_length": 250.0, "epoch": 0.0045, "grad_norm": 669.4596557617188, "kl": 16.526533126831055, "learning_rate": 2.7000000000000004e-06, "loss": 0.6611, "reward": 0.2083333432674408, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 54 }, { "completion_length": 250.0, "epoch": 0.004583333333333333, "grad_norm": 1.862663745880127, "kl": 0.26984715461730957, "learning_rate": 2.7500000000000004e-06, "loss": 0.0108, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 55 }, { "completion_length": 250.0, "epoch": 0.004666666666666667, "grad_norm": 0.026774972677230835, "kl": 0.22219908237457275, "learning_rate": 2.8000000000000003e-06, "loss": 0.0089, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 56 }, { "completion_length": 250.0, "epoch": 0.00475, "grad_norm": 0.03144953399896622, "kl": 0.2409312129020691, "learning_rate": 2.85e-06, "loss": 0.0096, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 57 }, { "completion_length": 250.0, "epoch": 0.004833333333333334, "grad_norm": 1.8306984901428223, "kl": 0.3393392562866211, "learning_rate": 2.9e-06, "loss": 0.0136, "reward": 0.2083333432674408, "reward_std": 0.589255690574646, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0833333358168602, "step": 58 }, { "completion_length": 250.0, "epoch": 0.004916666666666666, "grad_norm": 185.6766815185547, "kl": 31.755769729614258, "learning_rate": 2.95e-06, "loss": 1.2702, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 59 }, { "completion_length": 250.0, "epoch": 0.005, "grad_norm": 4095.82421875, "kl": 48.20116424560547, "learning_rate": 3e-06, "loss": 1.928, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 60 }, { "completion_length": 250.0, "epoch": 0.005083333333333333, "grad_norm": 85.37049102783203, "kl": 12.757698059082031, "learning_rate": 3.05e-06, "loss": 0.5103, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 61 }, { "completion_length": 250.0, "epoch": 0.005166666666666667, "grad_norm": 0.6013548374176025, "kl": 0.2426263988018036, "learning_rate": 3.1000000000000004e-06, "loss": 0.0097, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0, "step": 62 }, { "completion_length": 250.0, "epoch": 0.00525, "grad_norm": 3.4427809715270996, "kl": 0.9819362163543701, "learning_rate": 3.1500000000000003e-06, "loss": 0.0393, "reward": 0.125, "reward_std": 0.24800795316696167, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 63 }, { "completion_length": 250.0, "epoch": 0.005333333333333333, "grad_norm": 0.37425747513771057, "kl": 0.3634275197982788, "learning_rate": 3.2000000000000003e-06, "loss": 0.0145, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 64 }, { "completion_length": 250.0, "epoch": 0.005416666666666667, "grad_norm": 2.08392596244812, "kl": 0.30326148867607117, "learning_rate": 3.2500000000000002e-06, "loss": 0.0121, "reward": 0.125, "reward_std": 0.24800793826580048, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 65 }, { "completion_length": 250.0, "epoch": 0.0055, "grad_norm": 12975.3134765625, "kl": 1615.21484375, "learning_rate": 3.3000000000000006e-06, "loss": 64.6086, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 66 }, { "completion_length": 250.0, "epoch": 0.005583333333333333, "grad_norm": 0.7209013104438782, "kl": 0.3030403256416321, "learning_rate": 3.3500000000000005e-06, "loss": 0.0121, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 67 }, { "completion_length": 250.0, "epoch": 0.005666666666666667, "grad_norm": 2.0693342685699463, "kl": 0.3132570683956146, "learning_rate": 3.4000000000000005e-06, "loss": 0.0125, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 68 }, { "completion_length": 250.0, "epoch": 0.00575, "grad_norm": 148.04296875, "kl": 8.410460472106934, "learning_rate": 3.45e-06, "loss": 0.3364, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 69 }, { "completion_length": 250.0, "epoch": 0.005833333333333334, "grad_norm": 2913710.75, "kl": 355526.25, "learning_rate": 3.5e-06, "loss": 14221.0527, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 70 }, { "completion_length": 250.0, "epoch": 0.005916666666666666, "grad_norm": 1.4818073511123657, "kl": 0.28156542778015137, "learning_rate": 3.5500000000000003e-06, "loss": 0.0113, "reward": 0.1666666716337204, "reward_std": 0.35634827613830566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 71 }, { "completion_length": 250.0, "epoch": 0.006, "grad_norm": 1.2178808450698853, "kl": 0.42147043347358704, "learning_rate": 3.6000000000000003e-06, "loss": 0.0169, "reward": 0.125, "reward_std": 0.24800793826580048, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 72 }, { "completion_length": 250.0, "epoch": 0.006083333333333333, "grad_norm": 9.053384780883789, "kl": 2.088000535964966, "learning_rate": 3.65e-06, "loss": 0.0835, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 73 }, { "completion_length": 250.0, "epoch": 0.006166666666666667, "grad_norm": 16.30586814880371, "kl": 2.418818712234497, "learning_rate": 3.7e-06, "loss": 0.0968, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 74 }, { "completion_length": 250.0, "epoch": 0.00625, "grad_norm": 0.9915629625320435, "kl": 0.4261716902256012, "learning_rate": 3.7500000000000005e-06, "loss": 0.017, "reward": 0.2916666865348816, "reward_std": 0.5473601818084717, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.1666666716337204, "step": 75 }, { "completion_length": 250.0, "epoch": 0.006333333333333333, "grad_norm": 35.2829704284668, "kl": 4.973476409912109, "learning_rate": 3.8000000000000005e-06, "loss": 0.1989, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 76 }, { "completion_length": 250.0, "epoch": 0.006416666666666667, "grad_norm": 0.44628340005874634, "kl": 0.3691655397415161, "learning_rate": 3.85e-06, "loss": 0.0148, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0, "step": 77 }, { "completion_length": 250.0, "epoch": 0.0065, "grad_norm": 1.1199471950531006, "kl": 0.40812310576438904, "learning_rate": 3.900000000000001e-06, "loss": 0.0163, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.25, "step": 78 }, { "completion_length": 250.0, "epoch": 0.006583333333333333, "grad_norm": 3.701707363128662, "kl": 0.302755743265152, "learning_rate": 3.95e-06, "loss": 0.0121, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 79 }, { "completion_length": 250.0, "epoch": 0.006666666666666667, "grad_norm": 173.58099365234375, "kl": 35.22575378417969, "learning_rate": 4.000000000000001e-06, "loss": 1.409, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 80 }, { "completion_length": 250.0, "epoch": 0.00675, "grad_norm": 0.9575639367103577, "kl": 0.45123860239982605, "learning_rate": 4.05e-06, "loss": 0.018, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 81 }, { "completion_length": 250.0, "epoch": 0.006833333333333334, "grad_norm": 0.15877264738082886, "kl": 0.3092237710952759, "learning_rate": 4.1e-06, "loss": 0.0124, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 82 }, { "completion_length": 250.0, "epoch": 0.0069166666666666664, "grad_norm": 4.570583820343018, "kl": 0.6996808648109436, "learning_rate": 4.15e-06, "loss": 0.028, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 83 }, { "completion_length": 250.0, "epoch": 0.007, "grad_norm": 1.1855798959732056, "kl": 0.2818334698677063, "learning_rate": 4.2000000000000004e-06, "loss": 0.0113, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 84 }, { "completion_length": 250.0, "epoch": 0.007083333333333333, "grad_norm": 0.5873503088951111, "kl": 0.47469863295555115, "learning_rate": 4.25e-06, "loss": 0.019, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 85 }, { "completion_length": 250.0, "epoch": 0.007166666666666667, "grad_norm": 0.5640818476676941, "kl": 0.4295016825199127, "learning_rate": 4.3e-06, "loss": 0.0172, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 86 }, { "completion_length": 250.0, "epoch": 0.00725, "grad_norm": 115.95929718017578, "kl": 21.73447036743164, "learning_rate": 4.350000000000001e-06, "loss": 0.8694, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 87 }, { "completion_length": 250.0, "epoch": 0.007333333333333333, "grad_norm": 47534.85546875, "kl": 5644.39404296875, "learning_rate": 4.4e-06, "loss": 225.7758, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.0, "step": 88 }, { "completion_length": 250.0, "epoch": 0.007416666666666667, "grad_norm": 0.42040178179740906, "kl": 0.3909342885017395, "learning_rate": 4.450000000000001e-06, "loss": 0.0156, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 89 }, { "completion_length": 250.0, "epoch": 0.0075, "grad_norm": 14.471317291259766, "kl": 3.260627031326294, "learning_rate": 4.5e-06, "loss": 0.1304, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 90 }, { "completion_length": 250.0, "epoch": 0.007583333333333333, "grad_norm": 0.6545644998550415, "kl": 0.31067171692848206, "learning_rate": 4.5500000000000005e-06, "loss": 0.0124, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 91 }, { "completion_length": 250.0, "epoch": 0.007666666666666666, "grad_norm": 2.0157968997955322, "kl": 0.8457777500152588, "learning_rate": 4.600000000000001e-06, "loss": 0.0338, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 92 }, { "completion_length": 250.0, "epoch": 0.00775, "grad_norm": 0.27556928992271423, "kl": 0.21060839295387268, "learning_rate": 4.65e-06, "loss": 0.0084, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 93 }, { "completion_length": 250.0, "epoch": 0.007833333333333333, "grad_norm": 1.8622896671295166, "kl": 0.38421598076820374, "learning_rate": 4.7e-06, "loss": 0.0154, "reward": 0.125, "reward_std": 0.24800795316696167, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 94 }, { "completion_length": 250.0, "epoch": 0.007916666666666667, "grad_norm": 1.3327813148498535, "kl": 0.26667675375938416, "learning_rate": 4.75e-06, "loss": 0.0107, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 95 }, { "completion_length": 250.0, "epoch": 0.008, "grad_norm": 0.4855498969554901, "kl": 0.2688427269458771, "learning_rate": 4.800000000000001e-06, "loss": 0.0108, "reward": 0.125, "reward_std": 0.24800795316696167, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 96 }, { "completion_length": 250.0, "epoch": 0.008083333333333333, "grad_norm": 0.4623148739337921, "kl": 0.26257768273353577, "learning_rate": 4.85e-06, "loss": 0.0105, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 97 }, { "completion_length": 250.0, "epoch": 0.008166666666666666, "grad_norm": 0.6498294472694397, "kl": 0.3181808292865753, "learning_rate": 4.9000000000000005e-06, "loss": 0.0127, "reward": 0.2083333432674408, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 98 }, { "completion_length": 250.0, "epoch": 0.00825, "grad_norm": 0.8791100382804871, "kl": 0.5302932262420654, "learning_rate": 4.95e-06, "loss": 0.0212, "reward": 0.4583333134651184, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 99 }, { "completion_length": 250.0, "epoch": 0.008333333333333333, "grad_norm": 78.05960845947266, "kl": 18.444576263427734, "learning_rate": 5e-06, "loss": 0.7378, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0, "step": 100 }, { "completion_length": 250.0, "epoch": 0.008416666666666666, "grad_norm": 0.6424371004104614, "kl": 0.3804784119129181, "learning_rate": 4.999984769144476e-06, "loss": 0.0152, "reward": 0.3333333432674408, "reward_std": 0.6900655627250671, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2083333432674408, "step": 101 }, { "completion_length": 250.0, "epoch": 0.0085, "grad_norm": 115.73678588867188, "kl": 4.351790904998779, "learning_rate": 4.999939076763487e-06, "loss": 0.1741, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 102 }, { "completion_length": 250.0, "epoch": 0.008583333333333333, "grad_norm": 0.12965546548366547, "kl": 0.2807658612728119, "learning_rate": 4.999862923413781e-06, "loss": 0.0112, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 103 }, { "completion_length": 250.0, "epoch": 0.008666666666666666, "grad_norm": 2.4104604721069336, "kl": 0.5225258469581604, "learning_rate": 4.999756310023261e-06, "loss": 0.0209, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 104 }, { "completion_length": 250.0, "epoch": 0.00875, "grad_norm": 0.3326241075992584, "kl": 0.29482659697532654, "learning_rate": 4.9996192378909785e-06, "loss": 0.0118, "reward": 0.2916666567325592, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.1666666716337204, "step": 105 }, { "completion_length": 250.0, "epoch": 0.008833333333333334, "grad_norm": 1.436844825744629, "kl": 0.40061473846435547, "learning_rate": 4.999451708687114e-06, "loss": 0.016, "reward": 0.4583333134651184, "reward_std": 0.7113032937049866, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.2083333432674408, "step": 106 }, { "completion_length": 250.0, "epoch": 0.008916666666666666, "grad_norm": 95.7150650024414, "kl": 2.640369176864624, "learning_rate": 4.9992537244529585e-06, "loss": 0.1056, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 107 }, { "completion_length": 250.0, "epoch": 0.009, "grad_norm": 108.41773223876953, "kl": 6.058004856109619, "learning_rate": 4.999025287600886e-06, "loss": 0.2423, "reward": 0.125, "reward_std": 0.24800795316696167, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 108 }, { "completion_length": 250.0, "epoch": 0.009083333333333334, "grad_norm": 25.03618049621582, "kl": 3.463536262512207, "learning_rate": 4.998766400914329e-06, "loss": 0.1385, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 109 }, { "completion_length": 250.0, "epoch": 0.009166666666666667, "grad_norm": 0.5373579263687134, "kl": 0.3192852735519409, "learning_rate": 4.99847706754774e-06, "loss": 0.0128, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 110 }, { "completion_length": 250.0, "epoch": 0.00925, "grad_norm": 0.2514178454875946, "kl": 0.4298870861530304, "learning_rate": 4.998157291026553e-06, "loss": 0.0172, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 111 }, { "completion_length": 250.0, "epoch": 0.009333333333333334, "grad_norm": 49.19013214111328, "kl": 8.058541297912598, "learning_rate": 4.997807075247147e-06, "loss": 0.3223, "reward": 0.125, "reward_std": 0.17251639068126678, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 112 }, { "completion_length": 250.0, "epoch": 0.009416666666666667, "grad_norm": 1.157423973083496, "kl": 0.41131162643432617, "learning_rate": 4.997426424476787e-06, "loss": 0.0165, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 113 }, { "completion_length": 250.0, "epoch": 0.0095, "grad_norm": 122.4400863647461, "kl": 6.790146350860596, "learning_rate": 4.9970153433535855e-06, "loss": 0.2716, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 114 }, { "completion_length": 250.0, "epoch": 0.009583333333333333, "grad_norm": 0.07149865478277206, "kl": 0.3394841253757477, "learning_rate": 4.9965738368864345e-06, "loss": 0.0136, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 115 }, { "completion_length": 250.0, "epoch": 0.009666666666666667, "grad_norm": 1.3715343475341797, "kl": 0.4577629864215851, "learning_rate": 4.996101910454953e-06, "loss": 0.0183, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 116 }, { "completion_length": 250.0, "epoch": 0.00975, "grad_norm": 0.551726222038269, "kl": 0.3871065080165863, "learning_rate": 4.995599569809414e-06, "loss": 0.0155, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0, "step": 117 }, { "completion_length": 250.0, "epoch": 0.009833333333333333, "grad_norm": 2.083500623703003, "kl": 0.7725878357887268, "learning_rate": 4.9950668210706795e-06, "loss": 0.0309, "reward": 0.3333333432674408, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2083333432674408, "step": 118 }, { "completion_length": 250.0, "epoch": 0.009916666666666667, "grad_norm": 1.0463969707489014, "kl": 0.4617312550544739, "learning_rate": 4.994503670730126e-06, "loss": 0.0185, "reward": 0.1666666716337204, "reward_std": 0.30860671401023865, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 119 }, { "completion_length": 250.0, "epoch": 0.01, "grad_norm": 0.4884531795978546, "kl": 0.3635513186454773, "learning_rate": 4.993910125649561e-06, "loss": 0.0145, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.25, "step": 120 }, { "completion_length": 250.0, "epoch": 0.010083333333333333, "grad_norm": 0.30621615052223206, "kl": 0.37473881244659424, "learning_rate": 4.993286193061145e-06, "loss": 0.015, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 121 }, { "completion_length": 250.0, "epoch": 0.010166666666666666, "grad_norm": 88.8227767944336, "kl": 24.4356689453125, "learning_rate": 4.992631880567301e-06, "loss": 0.9774, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 122 }, { "completion_length": 250.0, "epoch": 0.01025, "grad_norm": 0.6021216511726379, "kl": 0.4579870402812958, "learning_rate": 4.991947196140619e-06, "loss": 0.0183, "reward": 0.25, "reward_std": 0.38832157850265503, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.125, "step": 123 }, { "completion_length": 250.0, "epoch": 0.010333333333333333, "grad_norm": 1.0776395797729492, "kl": 0.3739393353462219, "learning_rate": 4.9912321481237616e-06, "loss": 0.015, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 124 }, { "completion_length": 250.0, "epoch": 0.010416666666666666, "grad_norm": 0.704728364944458, "kl": 0.33970534801483154, "learning_rate": 4.990486745229364e-06, "loss": 0.0136, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 125 }, { "completion_length": 250.0, "epoch": 0.0105, "grad_norm": 0.4066079556941986, "kl": 0.34832248091697693, "learning_rate": 4.989710996539926e-06, "loss": 0.0139, "reward": 0.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.25, "step": 126 }, { "completion_length": 250.0, "epoch": 0.010583333333333333, "grad_norm": 20332.4453125, "kl": 2519.667236328125, "learning_rate": 4.9889049115077e-06, "loss": 100.7867, "reward": 0.125, "reward_std": 0.17251639068126678, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 127 }, { "completion_length": 250.0, "epoch": 0.010666666666666666, "grad_norm": 1014.6124877929688, "kl": 151.2786102294922, "learning_rate": 4.988068499954578e-06, "loss": 6.0511, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 128 }, { "completion_length": 250.0, "epoch": 0.01075, "grad_norm": 2.2194466590881348, "kl": 0.45352834463119507, "learning_rate": 4.987201772071971e-06, "loss": 0.0181, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 129 }, { "completion_length": 250.0, "epoch": 0.010833333333333334, "grad_norm": 0.8629388213157654, "kl": 0.4860643446445465, "learning_rate": 4.986304738420684e-06, "loss": 0.0194, "reward": 0.2083333432674408, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 130 }, { "completion_length": 250.0, "epoch": 0.010916666666666667, "grad_norm": 3.691181182861328, "kl": 0.396503210067749, "learning_rate": 4.985377409930789e-06, "loss": 0.0159, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 131 }, { "completion_length": 250.0, "epoch": 0.011, "grad_norm": 20.175403594970703, "kl": 3.6169893741607666, "learning_rate": 4.984419797901491e-06, "loss": 0.1447, "reward": 0.2083333432674408, "reward_std": 0.3535533547401428, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 132 }, { "completion_length": 250.0, "epoch": 0.011083333333333334, "grad_norm": 0.6853134632110596, "kl": 0.3578898012638092, "learning_rate": 4.983431914000991e-06, "loss": 0.0143, "reward": 0.3333333432674408, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.3333333432674408, "step": 133 }, { "completion_length": 250.0, "epoch": 0.011166666666666667, "grad_norm": 2.199497699737549, "kl": 1.075109839439392, "learning_rate": 4.9824137702663424e-06, "loss": 0.043, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 134 }, { "completion_length": 250.0, "epoch": 0.01125, "grad_norm": 0.43966594338417053, "kl": 0.38842320442199707, "learning_rate": 4.981365379103306e-06, "loss": 0.0155, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 135 }, { "completion_length": 250.0, "epoch": 0.011333333333333334, "grad_norm": 0.5732003450393677, "kl": 0.3031355142593384, "learning_rate": 4.980286753286196e-06, "loss": 0.0121, "reward": 0.2083333432674408, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 136 }, { "completion_length": 250.0, "epoch": 0.011416666666666667, "grad_norm": 1.1415404081344604, "kl": 0.47937247157096863, "learning_rate": 4.979177905957726e-06, "loss": 0.0192, "reward": 0.2083333432674408, "reward_std": 0.3053751289844513, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 137 }, { "completion_length": 250.0, "epoch": 0.0115, "grad_norm": 5.704109191894531, "kl": 0.6407493352890015, "learning_rate": 4.978038850628855e-06, "loss": 0.0256, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 138 }, { "completion_length": 250.0, "epoch": 0.011583333333333333, "grad_norm": 1.1015725135803223, "kl": 0.40485408902168274, "learning_rate": 4.9768696011786095e-06, "loss": 0.0162, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 139 }, { "completion_length": 250.0, "epoch": 0.011666666666666667, "grad_norm": 0.06077095866203308, "kl": 0.2962065041065216, "learning_rate": 4.975670171853926e-06, "loss": 0.0118, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 140 }, { "completion_length": 250.0, "epoch": 0.01175, "grad_norm": 0.030355358496308327, "kl": 0.25768283009529114, "learning_rate": 4.974440577269473e-06, "loss": 0.0103, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 141 }, { "completion_length": 250.0, "epoch": 0.011833333333333333, "grad_norm": 0.22492274641990662, "kl": 0.4085735082626343, "learning_rate": 4.973180832407471e-06, "loss": 0.0163, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 142 }, { "completion_length": 250.0, "epoch": 0.011916666666666667, "grad_norm": 9.671571731567383, "kl": 1.6705416440963745, "learning_rate": 4.971890952617515e-06, "loss": 0.0668, "reward": 0.1666666716337204, "reward_std": 0.4714045524597168, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.0416666679084301, "step": 143 }, { "completion_length": 250.0, "epoch": 0.012, "grad_norm": 0.4590764343738556, "kl": 0.40203264355659485, "learning_rate": 4.970570953616383e-06, "loss": 0.0161, "reward": 0.5, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.375, "step": 144 }, { "completion_length": 250.0, "epoch": 0.012083333333333333, "grad_norm": 0.6464155912399292, "kl": 0.4423307478427887, "learning_rate": 4.9692208514878445e-06, "loss": 0.0177, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 145 }, { "completion_length": 250.0, "epoch": 0.012166666666666666, "grad_norm": 0.4339180886745453, "kl": 0.3777730166912079, "learning_rate": 4.96784066268247e-06, "loss": 0.0151, "reward": 0.2916666567325592, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.1666666716337204, "step": 146 }, { "completion_length": 250.0, "epoch": 0.01225, "grad_norm": 1.3693031072616577, "kl": 0.4329844117164612, "learning_rate": 4.966430404017424e-06, "loss": 0.0173, "reward": 0.125, "reward_std": 0.17251640558242798, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 147 }, { "completion_length": 250.0, "epoch": 0.012333333333333333, "grad_norm": 0.5217644572257996, "kl": 0.3636080324649811, "learning_rate": 4.964990092676263e-06, "loss": 0.0145, "reward": 0.2916666865348816, "reward_std": 0.37533053755760193, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666567325592, "step": 148 }, { "completion_length": 250.0, "epoch": 0.012416666666666666, "grad_norm": 19.131145477294922, "kl": 4.095874786376953, "learning_rate": 4.963519746208726e-06, "loss": 0.1638, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 149 }, { "completion_length": 250.0, "epoch": 0.0125, "grad_norm": 1.2726384401321411, "kl": 0.4131162762641907, "learning_rate": 4.962019382530521e-06, "loss": 0.0165, "reward": 0.2916666865348816, "reward_std": 0.41547447443008423, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.1666666716337204, "step": 150 }, { "completion_length": 250.0, "epoch": 0.012583333333333334, "grad_norm": 0.28327932953834534, "kl": 0.32023757696151733, "learning_rate": 4.960489019923105e-06, "loss": 0.0128, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.125, "step": 151 }, { "completion_length": 250.0, "epoch": 0.012666666666666666, "grad_norm": 0.3637878894805908, "kl": 0.36896905303001404, "learning_rate": 4.958928677033465e-06, "loss": 0.0148, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 152 }, { "completion_length": 250.0, "epoch": 0.01275, "grad_norm": 0.3663618862628937, "kl": 0.3021296560764313, "learning_rate": 4.957338372873886e-06, "loss": 0.0121, "reward": 0.0833333358168602, "reward_std": 0.15430335700511932, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 153 }, { "completion_length": 250.0, "epoch": 0.012833333333333334, "grad_norm": 1.0241296291351318, "kl": 0.3188839554786682, "learning_rate": 4.9557181268217225e-06, "loss": 0.0128, "reward": 0.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.125, "step": 154 }, { "completion_length": 250.0, "epoch": 0.012916666666666667, "grad_norm": 2.020049810409546, "kl": 0.40295442938804626, "learning_rate": 4.9540679586191605e-06, "loss": 0.0161, "reward": 0.2916666865348816, "reward_std": 0.2781743109226227, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 155 }, { "completion_length": 250.0, "epoch": 0.013, "grad_norm": 0.6685758829116821, "kl": 0.41572338342666626, "learning_rate": 4.9523878883729794e-06, "loss": 0.0166, "reward": 0.4583333432674408, "reward_std": 0.6886264681816101, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 156 }, { "completion_length": 250.0, "epoch": 0.013083333333333334, "grad_norm": 0.4234911799430847, "kl": 0.3872148096561432, "learning_rate": 4.9506779365543054e-06, "loss": 0.0155, "reward": 0.2083333432674408, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 157 }, { "completion_length": 250.0, "epoch": 0.013166666666666667, "grad_norm": 0.6146082878112793, "kl": 0.3194940388202667, "learning_rate": 4.94893812399836e-06, "loss": 0.0128, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 158 }, { "completion_length": 250.0, "epoch": 0.01325, "grad_norm": 6.197954177856445, "kl": 0.9764306545257568, "learning_rate": 4.947168471904213e-06, "loss": 0.0391, "reward": 0.1666666716337204, "reward_std": 0.2519763112068176, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 159 }, { "completion_length": 250.0, "epoch": 0.013333333333333334, "grad_norm": 0.5794118046760559, "kl": 0.42038655281066895, "learning_rate": 4.9453690018345144e-06, "loss": 0.0168, "reward": 0.75, "reward_std": 1.0350983142852783, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.375, "step": 160 }, { "completion_length": 250.0, "epoch": 0.013416666666666667, "grad_norm": 23.13763999938965, "kl": 2.790783405303955, "learning_rate": 4.9435397357152406e-06, "loss": 0.1116, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 161 }, { "completion_length": 250.0, "epoch": 0.0135, "grad_norm": 0.9066824913024902, "kl": 0.4245196580886841, "learning_rate": 4.9416806958354206e-06, "loss": 0.017, "reward": 0.2083333432674408, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 162 }, { "completion_length": 250.0, "epoch": 0.013583333333333333, "grad_norm": 0.6648077964782715, "kl": 0.5795211791992188, "learning_rate": 4.939791904846869e-06, "loss": 0.0232, "reward": 0.2916666865348816, "reward_std": 0.4154745042324066, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 163 }, { "completion_length": 250.0, "epoch": 0.013666666666666667, "grad_norm": 0.5751465559005737, "kl": 0.3582766056060791, "learning_rate": 4.937873385763909e-06, "loss": 0.0143, "reward": 0.2916666865348816, "reward_std": 0.4154745042324066, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 164 }, { "completion_length": 250.0, "epoch": 0.01375, "grad_norm": 0.407955139875412, "kl": 0.31000596284866333, "learning_rate": 4.935925161963089e-06, "loss": 0.0124, "reward": 0.2083333432674408, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 165 }, { "completion_length": 250.0, "epoch": 0.013833333333333333, "grad_norm": 1.1364095211029053, "kl": 0.3804260492324829, "learning_rate": 4.933947257182901e-06, "loss": 0.0152, "reward": 0.2083333432674408, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 166 }, { "completion_length": 250.0, "epoch": 0.013916666666666667, "grad_norm": 5.946568489074707, "kl": 0.4389808773994446, "learning_rate": 4.9319396955234925e-06, "loss": 0.0176, "reward": 0.4166666865348816, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666567325592, "step": 167 }, { "completion_length": 250.0, "epoch": 0.014, "grad_norm": 0.48689746856689453, "kl": 0.4137730002403259, "learning_rate": 4.9299025014463665e-06, "loss": 0.0166, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 168 }, { "completion_length": 250.0, "epoch": 0.014083333333333333, "grad_norm": 0.30347901582717896, "kl": 0.2522311210632324, "learning_rate": 4.92783569977409e-06, "loss": 0.0101, "reward": 0.1666666716337204, "reward_std": 0.2519763112068176, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 169 }, { "completion_length": 250.0, "epoch": 0.014166666666666666, "grad_norm": 0.7049718499183655, "kl": 0.5534805655479431, "learning_rate": 4.925739315689991e-06, "loss": 0.0221, "reward": 0.25, "reward_std": 0.3450327515602112, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.125, "step": 170 }, { "completion_length": 250.0, "epoch": 0.01425, "grad_norm": 3.0675125122070312, "kl": 0.2963431477546692, "learning_rate": 4.923613374737848e-06, "loss": 0.0119, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 171 }, { "completion_length": 250.0, "epoch": 0.014333333333333333, "grad_norm": 0.5279906392097473, "kl": 0.276735782623291, "learning_rate": 4.921457902821578e-06, "loss": 0.0111, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 172 }, { "completion_length": 250.0, "epoch": 0.014416666666666666, "grad_norm": 0.3988935947418213, "kl": 0.34347009658813477, "learning_rate": 4.9192729262049285e-06, "loss": 0.0137, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5, "step": 173 }, { "completion_length": 250.0, "epoch": 0.0145, "grad_norm": 1.7523726224899292, "kl": 0.44115620851516724, "learning_rate": 4.917058471511149e-06, "loss": 0.0176, "reward": 0.1666666716337204, "reward_std": 0.2519763112068176, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 174 }, { "completion_length": 250.0, "epoch": 0.014583333333333334, "grad_norm": 0.5499840378761292, "kl": 0.37359291315078735, "learning_rate": 4.914814565722671e-06, "loss": 0.0149, "reward": 0.0833333358168602, "reward_std": 0.2357022762298584, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0833333358168602, "step": 175 }, { "completion_length": 250.0, "epoch": 0.014666666666666666, "grad_norm": 1.1000566482543945, "kl": 0.5037091374397278, "learning_rate": 4.912541236180779e-06, "loss": 0.0201, "reward": 0.2916666567325592, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.1666666716337204, "step": 176 }, { "completion_length": 250.0, "epoch": 0.01475, "grad_norm": 0.3692109286785126, "kl": 0.32822439074516296, "learning_rate": 4.910238510585275e-06, "loss": 0.0131, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 177 }, { "completion_length": 250.0, "epoch": 0.014833333333333334, "grad_norm": 0.38307639956474304, "kl": 0.45367297530174255, "learning_rate": 4.907906416994146e-06, "loss": 0.0181, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0, "step": 178 }, { "completion_length": 250.0, "epoch": 0.014916666666666667, "grad_norm": 0.8600552678108215, "kl": 0.4305053949356079, "learning_rate": 4.905544983823214e-06, "loss": 0.0172, "reward": 0.625, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5, "step": 179 }, { "completion_length": 250.0, "epoch": 0.015, "grad_norm": 0.8289276957511902, "kl": 0.36276039481163025, "learning_rate": 4.903154239845798e-06, "loss": 0.0145, "reward": 0.2083333432674408, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 180 }, { "completion_length": 250.0, "epoch": 0.015083333333333334, "grad_norm": 0.8905088901519775, "kl": 0.42038679122924805, "learning_rate": 4.900734214192358e-06, "loss": 0.0168, "reward": 0.375, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.25, "step": 181 }, { "completion_length": 250.0, "epoch": 0.015166666666666667, "grad_norm": 0.4034233093261719, "kl": 0.35581687092781067, "learning_rate": 4.898284936350144e-06, "loss": 0.0142, "reward": 0.4166666567325592, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666865348816, "step": 182 }, { "completion_length": 250.0, "epoch": 0.01525, "grad_norm": 0.392464816570282, "kl": 0.6867326498031616, "learning_rate": 4.8958064361628334e-06, "loss": 0.0275, "reward": 0.5, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5, "step": 183 }, { "completion_length": 250.0, "epoch": 0.015333333333333332, "grad_norm": 0.7030049562454224, "kl": 0.3916150629520416, "learning_rate": 4.893298743830168e-06, "loss": 0.0157, "reward": 0.3333333432674408, "reward_std": 0.6900655627250671, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2083333432674408, "step": 184 }, { "completion_length": 250.0, "epoch": 0.015416666666666667, "grad_norm": 0.7821568846702576, "kl": 0.7238714098930359, "learning_rate": 4.890761889907589e-06, "loss": 0.029, "reward": 0.75, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.625, "step": 185 }, { "completion_length": 250.0, "epoch": 0.0155, "grad_norm": 0.8786028027534485, "kl": 0.4036749303340912, "learning_rate": 4.888195905305859e-06, "loss": 0.0161, "reward": 0.5416666865348816, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.4166666865348816, "step": 186 }, { "completion_length": 250.0, "epoch": 0.015583333333333333, "grad_norm": 0.5510672926902771, "kl": 0.4351351857185364, "learning_rate": 4.885600821290692e-06, "loss": 0.0174, "reward": 0.4166666865348816, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666865348816, "step": 187 }, { "completion_length": 250.0, "epoch": 0.015666666666666666, "grad_norm": 0.2654396891593933, "kl": 0.2575778365135193, "learning_rate": 4.882976669482368e-06, "loss": 0.0103, "reward": 0.9166666865348816, "reward_std": 0.8498365879058838, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.4166666567325592, "step": 188 }, { "completion_length": 250.0, "epoch": 0.01575, "grad_norm": 0.6470081210136414, "kl": 0.43255674839019775, "learning_rate": 4.880323481855347e-06, "loss": 0.0173, "reward": 0.4583333730697632, "reward_std": 0.5616726875305176, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.2083333432674408, "step": 189 }, { "completion_length": 250.0, "epoch": 0.015833333333333335, "grad_norm": 0.6133238673210144, "kl": 0.370347797870636, "learning_rate": 4.8776412907378845e-06, "loss": 0.0148, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.0416666679084301, "step": 190 }, { "completion_length": 250.0, "epoch": 0.015916666666666666, "grad_norm": 3.239405393600464, "kl": 0.5021273493766785, "learning_rate": 4.874930128811631e-06, "loss": 0.0201, "reward": 0.5416666269302368, "reward_std": 0.6651768684387207, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.2916666567325592, "step": 191 }, { "completion_length": 250.0, "epoch": 0.016, "grad_norm": 0.8710654973983765, "kl": 0.5387289524078369, "learning_rate": 4.8721900291112415e-06, "loss": 0.0215, "reward": 0.625, "reward_std": 0.6770032048225403, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5, "step": 192 }, { "completion_length": 250.0, "epoch": 0.016083333333333335, "grad_norm": 1.41133451461792, "kl": 0.5218385457992554, "learning_rate": 4.869421025023965e-06, "loss": 0.0209, "reward": 0.5416666865348816, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.4166666865348816, "step": 193 }, { "completion_length": 250.0, "epoch": 0.016166666666666666, "grad_norm": 0.9869065880775452, "kl": 0.5528762340545654, "learning_rate": 4.866623150289241e-06, "loss": 0.0221, "reward": 0.4166666865348816, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666865348816, "step": 194 }, { "completion_length": 250.0, "epoch": 0.01625, "grad_norm": 0.347484827041626, "kl": 0.3075355589389801, "learning_rate": 4.863796438998293e-06, "loss": 0.0123, "reward": 0.375, "reward_std": 0.4154745042324066, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.375, "step": 195 }, { "completion_length": 250.0, "epoch": 0.01633333333333333, "grad_norm": 0.5246497988700867, "kl": 0.47872331738471985, "learning_rate": 4.860940925593703e-06, "loss": 0.0191, "reward": 0.4166666567325592, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4166666865348816, "step": 196 }, { "completion_length": 250.0, "epoch": 0.016416666666666666, "grad_norm": 0.5665456056594849, "kl": 0.40581831336021423, "learning_rate": 4.858056644869002e-06, "loss": 0.0162, "reward": 0.7916666865348816, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.4166666865348816, "step": 197 }, { "completion_length": 250.0, "epoch": 0.0165, "grad_norm": 0.37036922574043274, "kl": 0.4440433382987976, "learning_rate": 4.855143631968242e-06, "loss": 0.0178, "reward": 0.625, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.625, "step": 198 }, { "completion_length": 250.0, "epoch": 0.016583333333333332, "grad_norm": 1.1761598587036133, "kl": 0.5070418119430542, "learning_rate": 4.852201922385564e-06, "loss": 0.0203, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.25, "step": 199 }, { "completion_length": 250.0, "epoch": 0.016666666666666666, "grad_norm": 0.3598565459251404, "kl": 0.38801202178001404, "learning_rate": 4.849231551964771e-06, "loss": 0.0155, "reward": 0.4583333134651184, "reward_std": 0.7753647565841675, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.2083333432674408, "step": 200 }, { "completion_length": 250.0, "epoch": 0.01675, "grad_norm": 6.092010974884033, "kl": 0.5048520565032959, "learning_rate": 4.84623255689889e-06, "loss": 0.0202, "reward": 0.2083333432674408, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 201 }, { "completion_length": 250.0, "epoch": 0.016833333333333332, "grad_norm": 2.679046392440796, "kl": 0.5951515436172485, "learning_rate": 4.84320497372973e-06, "loss": 0.0238, "reward": 0.5, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.25, "step": 202 }, { "completion_length": 250.0, "epoch": 0.016916666666666667, "grad_norm": 1.4502449035644531, "kl": 0.6235775351524353, "learning_rate": 4.840148839347434e-06, "loss": 0.0249, "reward": 0.5416666269302368, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.4166666865348816, "step": 203 }, { "completion_length": 250.0, "epoch": 0.017, "grad_norm": 0.34813621640205383, "kl": 0.3194099962711334, "learning_rate": 4.837064190990036e-06, "loss": 0.0128, "reward": 0.6666666269302368, "reward_std": 0.8908708095550537, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.2916666567325592, "step": 204 }, { "completion_length": 250.0, "epoch": 0.017083333333333332, "grad_norm": 0.7955384254455566, "kl": 0.6405460238456726, "learning_rate": 4.833951066243004e-06, "loss": 0.0256, "reward": 0.4583333432674408, "reward_std": 0.46929529309272766, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4583333134651184, "step": 205 }, { "completion_length": 250.0, "epoch": 0.017166666666666667, "grad_norm": 0.6468902826309204, "kl": 0.4588083326816559, "learning_rate": 4.830809503038781e-06, "loss": 0.0184, "reward": 0.2083333432674408, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 206 }, { "completion_length": 250.0, "epoch": 0.01725, "grad_norm": 0.6557303071022034, "kl": 0.4808078706264496, "learning_rate": 4.8276395396563215e-06, "loss": 0.0192, "reward": 0.4583333432674408, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 207 }, { "completion_length": 250.0, "epoch": 0.017333333333333333, "grad_norm": 0.8586503267288208, "kl": 0.7754402756690979, "learning_rate": 4.824441214720629e-06, "loss": 0.031, "reward": 0.5416666865348816, "reward_std": 0.43415671586990356, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5416666865348816, "step": 208 }, { "completion_length": 250.0, "epoch": 0.017416666666666667, "grad_norm": 0.2894943356513977, "kl": 0.2780013978481293, "learning_rate": 4.821214567202284e-06, "loss": 0.0111, "reward": 0.3333333432674408, "reward_std": 0.4364357590675354, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.3333333432674408, "step": 209 }, { "completion_length": 250.0, "epoch": 0.0175, "grad_norm": 0.4333679676055908, "kl": 0.4271698594093323, "learning_rate": 4.817959636416969e-06, "loss": 0.0171, "reward": 0.5416666865348816, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.4166666865348816, "step": 210 }, { "completion_length": 250.0, "epoch": 0.017583333333333333, "grad_norm": 2.1452853679656982, "kl": 0.41794443130493164, "learning_rate": 4.814676462024988e-06, "loss": 0.0167, "reward": 0.375, "reward_std": 0.41547447443008423, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.25, "step": 211 }, { "completion_length": 250.0, "epoch": 0.017666666666666667, "grad_norm": 0.37924328446388245, "kl": 0.5044135451316833, "learning_rate": 4.811365084030784e-06, "loss": 0.0202, "reward": 0.4166666567325592, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666865348816, "step": 212 }, { "completion_length": 250.0, "epoch": 0.01775, "grad_norm": 1.1589990854263306, "kl": 0.5812166333198547, "learning_rate": 4.808025542782453e-06, "loss": 0.0232, "reward": 0.4166666865348816, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666865348816, "step": 213 }, { "completion_length": 250.0, "epoch": 0.017833333333333333, "grad_norm": 2.28521728515625, "kl": 0.5571350455284119, "learning_rate": 4.804657878971252e-06, "loss": 0.0223, "reward": 0.4583333432674408, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 214 }, { "completion_length": 250.0, "epoch": 0.017916666666666668, "grad_norm": 0.861594557762146, "kl": 0.6830026507377625, "learning_rate": 4.801262133631101e-06, "loss": 0.0273, "reward": 0.8333333134651184, "reward_std": 0.8164966106414795, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333730697632, "step": 215 }, { "completion_length": 250.0, "epoch": 0.018, "grad_norm": 1.8360395431518555, "kl": 1.10158371925354, "learning_rate": 4.7978383481380865e-06, "loss": 0.0441, "reward": 0.4583333730697632, "reward_std": 0.43415671586990356, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 216 }, { "completion_length": 250.0, "epoch": 0.018083333333333333, "grad_norm": 0.957562267780304, "kl": 0.542965292930603, "learning_rate": 4.794386564209953e-06, "loss": 0.0217, "reward": 1.1666666269302368, "reward_std": 0.9428090453147888, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.6666666269302368, "step": 217 }, { "completion_length": 250.0, "epoch": 0.018166666666666668, "grad_norm": 0.33301106095314026, "kl": 0.3330531120300293, "learning_rate": 4.790906823905599e-06, "loss": 0.0133, "reward": 0.2916666567325592, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 218 }, { "completion_length": 250.0, "epoch": 0.01825, "grad_norm": 0.9023637771606445, "kl": 0.5449975728988647, "learning_rate": 4.787399169624562e-06, "loss": 0.0218, "reward": 0.2083333432674408, "reward_std": 0.24800793826580048, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 219 }, { "completion_length": 250.0, "epoch": 0.018333333333333333, "grad_norm": 4.706124782562256, "kl": 0.884423553943634, "learning_rate": 4.783863644106502e-06, "loss": 0.0354, "reward": 0.75, "reward_std": 0.7918233275413513, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5, "step": 220 }, { "completion_length": 250.0, "epoch": 0.018416666666666668, "grad_norm": 0.7690190076828003, "kl": 0.6677464246749878, "learning_rate": 4.780300290430683e-06, "loss": 0.0267, "reward": 0.4583333432674408, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 221 }, { "completion_length": 250.0, "epoch": 0.0185, "grad_norm": 0.7956643104553223, "kl": 0.4805806577205658, "learning_rate": 4.776709152015443e-06, "loss": 0.0192, "reward": 0.375, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.3750000298023224, "step": 222 }, { "completion_length": 250.0, "epoch": 0.018583333333333334, "grad_norm": 0.3264180123806, "kl": 0.46136581897735596, "learning_rate": 4.773090272617672e-06, "loss": 0.0185, "reward": 0.2916666865348816, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 223 }, { "completion_length": 250.0, "epoch": 0.018666666666666668, "grad_norm": 0.33508577942848206, "kl": 0.33475160598754883, "learning_rate": 4.769443696332272e-06, "loss": 0.0134, "reward": 0.1666666716337204, "reward_std": 0.17817416787147522, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.1666666716337204, "step": 224 }, { "completion_length": 250.0, "epoch": 0.01875, "grad_norm": 0.35157206654548645, "kl": 0.48729386925697327, "learning_rate": 4.765769467591626e-06, "loss": 0.0195, "reward": 0.4166666865348816, "reward_std": 0.42724665999412537, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4166666865348816, "step": 225 }, { "completion_length": 250.0, "epoch": 0.018833333333333334, "grad_norm": 0.44434380531311035, "kl": 0.27789148688316345, "learning_rate": 4.762067631165049e-06, "loss": 0.0111, "reward": 0.25, "reward_std": 0.34503278136253357, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.125, "step": 226 }, { "completion_length": 250.0, "epoch": 0.018916666666666665, "grad_norm": 1.0383962392807007, "kl": 0.6592018604278564, "learning_rate": 4.7583382321582525e-06, "loss": 0.0264, "reward": 0.75, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5, "step": 227 }, { "completion_length": 250.0, "epoch": 0.019, "grad_norm": 0.7657462954521179, "kl": 0.45288151502609253, "learning_rate": 4.754581316012785e-06, "loss": 0.0181, "reward": 0.375, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.375, "step": 228 }, { "completion_length": 250.0, "epoch": 0.019083333333333334, "grad_norm": 0.35769039392471313, "kl": 0.4359067380428314, "learning_rate": 4.750796928505484e-06, "loss": 0.0174, "reward": 0.3333333432674408, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.3333333432674408, "step": 229 }, { "completion_length": 250.0, "epoch": 0.019166666666666665, "grad_norm": 0.6205803155899048, "kl": 0.43002450466156006, "learning_rate": 4.746985115747918e-06, "loss": 0.0172, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.375, "step": 230 }, { "completion_length": 250.0, "epoch": 0.01925, "grad_norm": 0.44161456823349, "kl": 0.4217735230922699, "learning_rate": 4.743145924185821e-06, "loss": 0.0169, "reward": 0.375, "reward_std": 0.4154745042324066, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.375, "step": 231 }, { "completion_length": 250.0, "epoch": 0.019333333333333334, "grad_norm": 1.402330994606018, "kl": 0.5382075309753418, "learning_rate": 4.7392794005985324e-06, "loss": 0.0215, "reward": 0.5416666865348816, "reward_std": 0.43415671586990356, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5416666865348816, "step": 232 }, { "completion_length": 250.0, "epoch": 0.019416666666666665, "grad_norm": 0.30393776297569275, "kl": 0.41637468338012695, "learning_rate": 4.735385592098421e-06, "loss": 0.0167, "reward": 0.2083333432674408, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2083333432674408, "step": 233 }, { "completion_length": 250.0, "epoch": 0.0195, "grad_norm": 0.29830071330070496, "kl": 0.38214111328125, "learning_rate": 4.731464546130315e-06, "loss": 0.0153, "reward": 0.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.4583333432674408, "step": 234 }, { "completion_length": 250.0, "epoch": 0.019583333333333335, "grad_norm": 0.7765196561813354, "kl": 0.6346302032470703, "learning_rate": 4.72751631047092e-06, "loss": 0.0254, "reward": 0.6250000596046448, "reward_std": 0.8249579668045044, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.3750000298023224, "step": 235 }, { "completion_length": 250.0, "epoch": 0.019666666666666666, "grad_norm": 0.5595923662185669, "kl": 0.5578448176383972, "learning_rate": 4.723540933228245e-06, "loss": 0.0223, "reward": 0.4583333432674408, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 236 }, { "completion_length": 250.0, "epoch": 0.01975, "grad_norm": 3.7788710594177246, "kl": 0.4666965901851654, "learning_rate": 4.719538462841003e-06, "loss": 0.0187, "reward": 0.3333333432674408, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.3333333432674408, "step": 237 }, { "completion_length": 250.0, "epoch": 0.019833333333333335, "grad_norm": 0.2744882106781006, "kl": 0.49767276644706726, "learning_rate": 4.715508948078037e-06, "loss": 0.0199, "reward": 0.5833333730697632, "reward_std": 0.34503278136253357, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5833333730697632, "step": 238 }, { "completion_length": 250.0, "epoch": 0.019916666666666666, "grad_norm": 37.09307098388672, "kl": 2.531292676925659, "learning_rate": 4.71145243803771e-06, "loss": 0.1013, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5, "step": 239 }, { "completion_length": 250.0, "epoch": 0.02, "grad_norm": 0.5616350173950195, "kl": 0.8509740829467773, "learning_rate": 4.707368982147318e-06, "loss": 0.034, "reward": 0.6666666865348816, "reward_std": 0.6900655627250671, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5416666865348816, "step": 240 }, { "completion_length": 250.0, "epoch": 0.020083333333333335, "grad_norm": 0.3082992136478424, "kl": 0.4213227331638336, "learning_rate": 4.703258630162481e-06, "loss": 0.0169, "reward": 0.2916666567325592, "reward_std": 0.37533050775527954, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 241 }, { "completion_length": 250.0, "epoch": 0.020166666666666666, "grad_norm": 0.35087594389915466, "kl": 0.34501180052757263, "learning_rate": 4.699121432166542e-06, "loss": 0.0138, "reward": 0.4583333432674408, "reward_std": 0.43415671586990356, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4583333432674408, "step": 242 }, { "completion_length": 250.0, "epoch": 0.02025, "grad_norm": 1.2634652853012085, "kl": 0.8103005290031433, "learning_rate": 4.6949574385699514e-06, "loss": 0.0324, "reward": 1.1666666269302368, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 243 }, { "completion_length": 250.0, "epoch": 0.02033333333333333, "grad_norm": 0.8633314967155457, "kl": 0.8408775925636292, "learning_rate": 4.690766700109659e-06, "loss": 0.0336, "reward": 0.8333333730697632, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.7083333730697632, "step": 244 }, { "completion_length": 250.0, "epoch": 0.020416666666666666, "grad_norm": 0.34025847911834717, "kl": 0.3001478612422943, "learning_rate": 4.68654926784849e-06, "loss": 0.012, "reward": 0.2916666865348816, "reward_std": 0.33034375309944153, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 245 }, { "completion_length": 250.0, "epoch": 0.0205, "grad_norm": 0.6269357800483704, "kl": 0.6774225831031799, "learning_rate": 4.682305193174524e-06, "loss": 0.0271, "reward": 0.5, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5, "step": 246 }, { "completion_length": 250.0, "epoch": 0.020583333333333332, "grad_norm": 0.261139839887619, "kl": 0.628397524356842, "learning_rate": 4.6780345278004744e-06, "loss": 0.0251, "reward": 0.9583333730697632, "reward_std": 0.7650604248046875, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.5833333730697632, "step": 247 }, { "completion_length": 250.0, "epoch": 0.020666666666666667, "grad_norm": 0.3640219569206238, "kl": 0.35570889711380005, "learning_rate": 4.673737323763048e-06, "loss": 0.0142, "reward": 0.4166666567325592, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4166666567325592, "step": 248 }, { "completion_length": 250.0, "epoch": 0.02075, "grad_norm": 0.3684917688369751, "kl": 0.3530455231666565, "learning_rate": 4.669413633422322e-06, "loss": 0.0141, "reward": 0.2916666865348816, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.2916666865348816, "step": 249 }, { "completion_length": 250.0, "epoch": 0.020833333333333332, "grad_norm": 0.4557255804538727, "kl": 0.6199475526809692, "learning_rate": 4.665063509461098e-06, "loss": 0.0248, "reward": 1.1666667461395264, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 250 }, { "completion_length": 250.0, "epoch": 0.020916666666666667, "grad_norm": 0.2662737965583801, "kl": 0.4792260229587555, "learning_rate": 4.6606870048842626e-06, "loss": 0.0192, "reward": 0.4583333134651184, "reward_std": 0.7113032937049866, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 251 }, { "completion_length": 250.0, "epoch": 0.021, "grad_norm": 0.3056776821613312, "kl": 0.6897392868995667, "learning_rate": 4.656284173018144e-06, "loss": 0.0276, "reward": 1.2916667461395264, "reward_std": 0.6770032048225403, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.6666666865348816, "step": 252 }, { "completion_length": 250.0, "epoch": 0.021083333333333332, "grad_norm": 0.33795520663261414, "kl": 0.3045516312122345, "learning_rate": 4.65185506750986e-06, "loss": 0.0122, "reward": 0.4583333134651184, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4583333730697632, "step": 253 }, { "completion_length": 250.0, "epoch": 0.021166666666666667, "grad_norm": 0.24205689132213593, "kl": 0.42878177762031555, "learning_rate": 4.6473997423266615e-06, "loss": 0.0172, "reward": 1.125, "reward_std": 0.589255690574646, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7500000596046448, "step": 254 }, { "completion_length": 250.0, "epoch": 0.02125, "grad_norm": 1.4344029426574707, "kl": 0.7129054665565491, "learning_rate": 4.642918251755281e-06, "loss": 0.0285, "reward": 0.5416666269302368, "reward_std": 0.5019802451133728, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5416666865348816, "step": 255 }, { "completion_length": 250.0, "epoch": 0.021333333333333333, "grad_norm": 2.667504072189331, "kl": 1.6249091625213623, "learning_rate": 4.638410650401267e-06, "loss": 0.065, "reward": 0.6666666269302368, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5416666269302368, "step": 256 }, { "completion_length": 250.0, "epoch": 0.021416666666666667, "grad_norm": 0.5786775946617126, "kl": 0.8081812262535095, "learning_rate": 4.633876993188319e-06, "loss": 0.0323, "reward": 0.8333333730697632, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333730697632, "step": 257 }, { "completion_length": 250.0, "epoch": 0.0215, "grad_norm": 4.823422908782959, "kl": 0.7295472025871277, "learning_rate": 4.62931733535762e-06, "loss": 0.0292, "reward": 0.7083333730697632, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.4583333432674408, "step": 258 }, { "completion_length": 250.0, "epoch": 0.021583333333333333, "grad_norm": 0.7597243785858154, "kl": 0.7165347337722778, "learning_rate": 4.62473173246716e-06, "loss": 0.0287, "reward": 0.7083333730697632, "reward_std": 0.6770032048225403, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5833333730697632, "step": 259 }, { "completion_length": 250.0, "epoch": 0.021666666666666667, "grad_norm": 0.6498469710350037, "kl": 0.6449453830718994, "learning_rate": 4.620120240391065e-06, "loss": 0.0258, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.375, "step": 260 }, { "completion_length": 250.0, "epoch": 0.02175, "grad_norm": 1.3547675609588623, "kl": 0.7189813256263733, "learning_rate": 4.6154829153189105e-06, "loss": 0.0288, "reward": 0.5, "reward_std": 0.4364357888698578, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5, "step": 261 }, { "completion_length": 250.0, "epoch": 0.021833333333333333, "grad_norm": 0.38959622383117676, "kl": 0.6248072981834412, "learning_rate": 4.610819813755038e-06, "loss": 0.025, "reward": 0.6666666865348816, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5416666865348816, "step": 262 }, { "completion_length": 250.0, "epoch": 0.021916666666666668, "grad_norm": 0.5569881200790405, "kl": 0.5907061696052551, "learning_rate": 4.60613099251787e-06, "loss": 0.0236, "reward": 0.625, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.625, "step": 263 }, { "completion_length": 250.0, "epoch": 0.022, "grad_norm": 0.9613457322120667, "kl": 0.7936864495277405, "learning_rate": 4.601416508739211e-06, "loss": 0.0317, "reward": 0.7083333730697632, "reward_std": 0.6770032048225403, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5833333730697632, "step": 264 }, { "completion_length": 250.0, "epoch": 0.022083333333333333, "grad_norm": 0.9828578233718872, "kl": 0.6290979981422424, "learning_rate": 4.596676419863561e-06, "loss": 0.0252, "reward": 0.75, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5, "step": 265 }, { "completion_length": 250.0, "epoch": 0.022166666666666668, "grad_norm": 20.848907470703125, "kl": 2.7031993865966797, "learning_rate": 4.591910783647405e-06, "loss": 0.1081, "reward": 0.7083333730697632, "reward_std": 0.6531364917755127, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5833333730697632, "step": 266 }, { "completion_length": 250.0, "epoch": 0.02225, "grad_norm": 0.6771628260612488, "kl": 0.762807309627533, "learning_rate": 4.587119658158517e-06, "loss": 0.0305, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.625, "step": 267 }, { "completion_length": 250.0, "epoch": 0.022333333333333334, "grad_norm": 1.2213846445083618, "kl": 0.7535417675971985, "learning_rate": 4.582303101775249e-06, "loss": 0.0301, "reward": 0.8333333730697632, "reward_std": 0.5909367799758911, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.7083333730697632, "step": 268 }, { "completion_length": 250.0, "epoch": 0.022416666666666668, "grad_norm": 0.3612246513366699, "kl": 0.4376307725906372, "learning_rate": 4.577461173185821e-06, "loss": 0.0175, "reward": 0.4583333432674408, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.3333333432674408, "step": 269 }, { "completion_length": 250.0, "epoch": 0.0225, "grad_norm": 0.5589926242828369, "kl": 0.43723782896995544, "learning_rate": 4.572593931387604e-06, "loss": 0.0175, "reward": 0.375, "reward_std": 0.3753305673599243, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.375, "step": 270 }, { "completion_length": 250.0, "epoch": 0.022583333333333334, "grad_norm": 0.4354017972946167, "kl": 0.6601411700248718, "learning_rate": 4.567701435686405e-06, "loss": 0.0264, "reward": 0.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.7083333730697632, "step": 271 }, { "completion_length": 250.0, "epoch": 0.02266666666666667, "grad_norm": 0.31577983498573303, "kl": 0.5072486996650696, "learning_rate": 4.562783745695738e-06, "loss": 0.0203, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.625, "step": 272 }, { "completion_length": 250.0, "epoch": 0.02275, "grad_norm": 0.44634732604026794, "kl": 0.631403386592865, "learning_rate": 4.5578409213361055e-06, "loss": 0.0253, "reward": 1.125, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 273 }, { "completion_length": 250.0, "epoch": 0.022833333333333334, "grad_norm": 2.6211862564086914, "kl": 0.8090041875839233, "learning_rate": 4.55287302283426e-06, "loss": 0.0324, "reward": 0.8333333134651184, "reward_std": 0.8164964914321899, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333730697632, "step": 274 }, { "completion_length": 250.0, "epoch": 0.022916666666666665, "grad_norm": 5.3712663650512695, "kl": 1.6774547100067139, "learning_rate": 4.54788011072248e-06, "loss": 0.0671, "reward": 0.7916666865348816, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5416666865348816, "step": 275 }, { "completion_length": 250.0, "epoch": 0.023, "grad_norm": 0.9446873068809509, "kl": 0.5351336002349854, "learning_rate": 4.542862245837821e-06, "loss": 0.0214, "reward": 0.4166666865348816, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.2916666567325592, "step": 276 }, { "completion_length": 250.0, "epoch": 0.023083333333333334, "grad_norm": 2.5900323390960693, "kl": 1.0092023611068726, "learning_rate": 4.537819489321385e-06, "loss": 0.0404, "reward": 0.9166666865348816, "reward_std": 0.5841830372810364, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.7916666269302368, "step": 277 }, { "completion_length": 250.0, "epoch": 0.023166666666666665, "grad_norm": 0.4755576550960541, "kl": 0.756645917892456, "learning_rate": 4.5327519026175694e-06, "loss": 0.0303, "reward": 0.8333333730697632, "reward_std": 0.4364357888698578, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333730697632, "step": 278 }, { "completion_length": 250.0, "epoch": 0.02325, "grad_norm": 0.26933223009109497, "kl": 0.7350080013275146, "learning_rate": 4.527659547473317e-06, "loss": 0.0294, "reward": 1.0, "reward_std": 0.6900655031204224, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 279 }, { "completion_length": 189.0, "epoch": 0.023333333333333334, "grad_norm": 0.3307403028011322, "kl": 0.6269306540489197, "learning_rate": 4.522542485937369e-06, "loss": 0.0251, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 280 }, { "completion_length": 250.0, "epoch": 0.023416666666666665, "grad_norm": 0.4250253140926361, "kl": 0.7219645977020264, "learning_rate": 4.517400780359505e-06, "loss": 0.0289, "reward": 1.0, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 281 }, { "completion_length": 250.0, "epoch": 0.0235, "grad_norm": 0.2644083499908447, "kl": 0.7858296632766724, "learning_rate": 4.512234493389785e-06, "loss": 0.0314, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9583333730697632, "step": 282 }, { "completion_length": 250.0, "epoch": 0.023583333333333335, "grad_norm": 0.9490629434585571, "kl": 1.0001749992370605, "learning_rate": 4.507043687977787e-06, "loss": 0.04, "reward": 0.8333333730697632, "reward_std": 0.6900655627250671, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333730697632, "step": 283 }, { "completion_length": 250.0, "epoch": 0.023666666666666666, "grad_norm": 2.613924741744995, "kl": 0.9775833487510681, "learning_rate": 4.501828427371834e-06, "loss": 0.0391, "reward": 0.875, "reward_std": 0.9074209332466125, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.375, "step": 284 }, { "completion_length": 250.0, "epoch": 0.02375, "grad_norm": 0.32292431592941284, "kl": 0.39211633801460266, "learning_rate": 4.496588775118232e-06, "loss": 0.0157, "reward": 0.4583333432674408, "reward_std": 0.43415671586990356, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.4583333432674408, "step": 285 }, { "completion_length": 250.0, "epoch": 0.023833333333333335, "grad_norm": 3.7025341987609863, "kl": 0.681837260723114, "learning_rate": 4.491324795060491e-06, "loss": 0.0273, "reward": 0.7916666865348816, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.6666666865348816, "step": 286 }, { "completion_length": 250.0, "epoch": 0.023916666666666666, "grad_norm": 0.4885029196739197, "kl": 0.9224212765693665, "learning_rate": 4.4860365513385456e-06, "loss": 0.0369, "reward": 0.9583333730697632, "reward_std": 0.7650604248046875, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.5833333730697632, "step": 287 }, { "completion_length": 250.0, "epoch": 0.024, "grad_norm": 0.3269827365875244, "kl": 0.5056464672088623, "learning_rate": 4.4807241083879774e-06, "loss": 0.0202, "reward": 0.7083333730697632, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5833333134651184, "step": 288 }, { "completion_length": 250.0, "epoch": 0.024083333333333335, "grad_norm": 1.628382682800293, "kl": 0.7768495678901672, "learning_rate": 4.475387530939226e-06, "loss": 0.0311, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 289 }, { "completion_length": 250.0, "epoch": 0.024166666666666666, "grad_norm": 0.42525872588157654, "kl": 0.44087234139442444, "learning_rate": 4.470026884016805e-06, "loss": 0.0176, "reward": 0.6666666865348816, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5416666269302368, "step": 290 }, { "completion_length": 250.0, "epoch": 0.02425, "grad_norm": 1.2830743789672852, "kl": 0.5833079814910889, "learning_rate": 4.464642232938505e-06, "loss": 0.0233, "reward": 0.5416666865348816, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.4166666865348816, "step": 291 }, { "completion_length": 250.0, "epoch": 0.024333333333333332, "grad_norm": 0.2711179256439209, "kl": 0.6251975297927856, "learning_rate": 4.4592336433146e-06, "loss": 0.025, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333134651184, "step": 292 }, { "completion_length": 250.0, "epoch": 0.024416666666666666, "grad_norm": 1.7788009643554688, "kl": 1.1422821283340454, "learning_rate": 4.453801181047047e-06, "loss": 0.0457, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 293 }, { "completion_length": 250.0, "epoch": 0.0245, "grad_norm": 0.6440579295158386, "kl": 0.7067055106163025, "learning_rate": 4.448344912328686e-06, "loss": 0.0283, "reward": 0.9583333730697632, "reward_std": 0.5473601818084717, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.8333333730697632, "step": 294 }, { "completion_length": 250.0, "epoch": 0.024583333333333332, "grad_norm": 1.1637884378433228, "kl": 1.3202813863754272, "learning_rate": 4.442864903642428e-06, "loss": 0.0528, "reward": 1.2083333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 295 }, { "completion_length": 250.0, "epoch": 0.024666666666666667, "grad_norm": 0.35194841027259827, "kl": 0.6548830270767212, "learning_rate": 4.437361221760449e-06, "loss": 0.0262, "reward": 1.0, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 296 }, { "completion_length": 250.0, "epoch": 0.02475, "grad_norm": 0.7272174954414368, "kl": 0.7445266842842102, "learning_rate": 4.431833933743378e-06, "loss": 0.0298, "reward": 1.0416666269302368, "reward_std": 1.0302951335906982, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.5416666865348816, "step": 297 }, { "completion_length": 250.0, "epoch": 0.024833333333333332, "grad_norm": 0.36612656712532043, "kl": 0.6318120956420898, "learning_rate": 4.426283106939474e-06, "loss": 0.0253, "reward": 0.7916666269302368, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5416666269302368, "step": 298 }, { "completion_length": 250.0, "epoch": 0.024916666666666667, "grad_norm": 4.444971561431885, "kl": 1.4804537296295166, "learning_rate": 4.420708808983809e-06, "loss": 0.0592, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 299 }, { "completion_length": 250.0, "epoch": 0.025, "grad_norm": 0.3847966194152832, "kl": 0.6287730932235718, "learning_rate": 4.415111107797445e-06, "loss": 0.0252, "reward": 1.0416667461395264, "reward_std": 0.9332908391952515, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.4166666865348816, "step": 300 }, { "completion_length": 250.0, "epoch": 0.025083333333333332, "grad_norm": 1.1420190334320068, "kl": 0.900063157081604, "learning_rate": 4.409490071586606e-06, "loss": 0.036, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 301 }, { "completion_length": 250.0, "epoch": 0.025166666666666667, "grad_norm": 1.9523299932479858, "kl": 1.0245622396469116, "learning_rate": 4.403845768841842e-06, "loss": 0.041, "reward": 0.8333333134651184, "reward_std": 0.8164965510368347, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333134651184, "step": 302 }, { "completion_length": 250.0, "epoch": 0.02525, "grad_norm": 0.2748126685619354, "kl": 0.49438124895095825, "learning_rate": 4.398178268337202e-06, "loss": 0.0198, "reward": 0.7083333730697632, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.4583333432674408, "step": 303 }, { "completion_length": 250.0, "epoch": 0.025333333333333333, "grad_norm": 0.9594418406486511, "kl": 0.8395068049430847, "learning_rate": 4.3924876391293915e-06, "loss": 0.0336, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 304 }, { "completion_length": 250.0, "epoch": 0.025416666666666667, "grad_norm": 0.33724528551101685, "kl": 0.4758547246456146, "learning_rate": 4.386773950556931e-06, "loss": 0.019, "reward": 0.3750000298023224, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.375, "step": 305 }, { "completion_length": 250.0, "epoch": 0.0255, "grad_norm": 0.8724843859672546, "kl": 0.882912278175354, "learning_rate": 4.381037272239311e-06, "loss": 0.0353, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 306 }, { "completion_length": 250.0, "epoch": 0.025583333333333333, "grad_norm": 0.30091574788093567, "kl": 0.7076762318611145, "learning_rate": 4.3752776740761495e-06, "loss": 0.0283, "reward": 1.0833333730697632, "reward_std": 0.5841830968856812, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7083333730697632, "step": 307 }, { "completion_length": 250.0, "epoch": 0.025666666666666667, "grad_norm": 0.3942156434059143, "kl": 1.1950002908706665, "learning_rate": 4.36949522624633e-06, "loss": 0.0478, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 308 }, { "completion_length": 250.0, "epoch": 0.02575, "grad_norm": 1.6910313367843628, "kl": 1.2377409934997559, "learning_rate": 4.3636899992071555e-06, "loss": 0.0495, "reward": 1.125, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8750000596046448, "step": 309 }, { "completion_length": 250.0, "epoch": 0.025833333333333333, "grad_norm": 3.3976781368255615, "kl": 1.8446786403656006, "learning_rate": 4.357862063693486e-06, "loss": 0.0738, "reward": 1.3333332538604736, "reward_std": 0.9428090453147888, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 310 }, { "completion_length": 250.0, "epoch": 0.025916666666666668, "grad_norm": 0.49139174818992615, "kl": 1.5378676652908325, "learning_rate": 4.352011490716875e-06, "loss": 0.0615, "reward": 1.2916666269302368, "reward_std": 0.8249579071998596, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.6666666865348816, "step": 311 }, { "completion_length": 250.0, "epoch": 0.026, "grad_norm": 0.49455150961875916, "kl": 1.1211459636688232, "learning_rate": 4.346138351564711e-06, "loss": 0.0448, "reward": 0.7083333730697632, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5833333730697632, "step": 312 }, { "completion_length": 250.0, "epoch": 0.026083333333333333, "grad_norm": 1.1779581308364868, "kl": 1.5087566375732422, "learning_rate": 4.340242717799337e-06, "loss": 0.0604, "reward": 0.75, "reward_std": 0.8498365879058838, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5, "step": 313 }, { "completion_length": 250.0, "epoch": 0.026166666666666668, "grad_norm": 0.3693522810935974, "kl": 0.5869612693786621, "learning_rate": 4.334324661257191e-06, "loss": 0.0235, "reward": 1.0, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.625, "step": 314 }, { "completion_length": 250.0, "epoch": 0.02625, "grad_norm": 2.1444251537323, "kl": 1.3732556104660034, "learning_rate": 4.328384254047927e-06, "loss": 0.0549, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 315 }, { "completion_length": 250.0, "epoch": 0.026333333333333334, "grad_norm": 0.30390700697898865, "kl": 0.9706352949142456, "learning_rate": 4.322421568553529e-06, "loss": 0.0388, "reward": 1.0416667461395264, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9166666865348816, "step": 316 }, { "completion_length": 250.0, "epoch": 0.026416666666666668, "grad_norm": 1.256115436553955, "kl": 0.730143129825592, "learning_rate": 4.316436677427441e-06, "loss": 0.0292, "reward": 1.375, "reward_std": 0.8807914853096008, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 317 }, { "completion_length": 250.0, "epoch": 0.0265, "grad_norm": 0.3741249442100525, "kl": 0.9804970622062683, "learning_rate": 4.3104296535936695e-06, "loss": 0.0392, "reward": 1.125, "reward_std": 0.589255690574646, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 318 }, { "completion_length": 250.0, "epoch": 0.026583333333333334, "grad_norm": 2.2285826206207275, "kl": 1.3444833755493164, "learning_rate": 4.3044005702459055e-06, "loss": 0.0538, "reward": 1.0416666269302368, "reward_std": 0.7223747968673706, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.6666666865348816, "step": 319 }, { "completion_length": 250.0, "epoch": 0.02666666666666667, "grad_norm": 1.4974360466003418, "kl": 0.8706364035606384, "learning_rate": 4.2983495008466285e-06, "loss": 0.0348, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 320 }, { "completion_length": 250.0, "epoch": 0.02675, "grad_norm": 0.8530201315879822, "kl": 0.9074989557266235, "learning_rate": 4.2922765191262075e-06, "loss": 0.0363, "reward": 0.75, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.625, "step": 321 }, { "completion_length": 250.0, "epoch": 0.026833333333333334, "grad_norm": 0.4051268398761749, "kl": 0.6288662552833557, "learning_rate": 4.286181699082008e-06, "loss": 0.0252, "reward": 0.875, "reward_std": 0.7753647565841675, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.625, "step": 322 }, { "completion_length": 250.0, "epoch": 0.026916666666666665, "grad_norm": 0.430800199508667, "kl": 0.9511799216270447, "learning_rate": 4.280065114977492e-06, "loss": 0.038, "reward": 1.0833333730697632, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.4583333432674408, "step": 323 }, { "completion_length": 250.0, "epoch": 0.027, "grad_norm": 0.48526695370674133, "kl": 1.0858272314071655, "learning_rate": 4.273926841341303e-06, "loss": 0.0434, "reward": 1.0833332538604736, "reward_std": 0.9880235195159912, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.5833333730697632, "step": 324 }, { "completion_length": 222.0, "epoch": 0.027083333333333334, "grad_norm": 0.22421292960643768, "kl": 0.834428071975708, "learning_rate": 4.267766952966369e-06, "loss": 0.0334, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 325 }, { "completion_length": 250.0, "epoch": 0.027166666666666665, "grad_norm": 0.3456939458847046, "kl": 1.3001376390457153, "learning_rate": 4.261585524908987e-06, "loss": 0.052, "reward": 1.375, "reward_std": 0.602573812007904, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 326 }, { "completion_length": 250.0, "epoch": 0.02725, "grad_norm": 0.3079489469528198, "kl": 0.6641181111335754, "learning_rate": 4.255382632487907e-06, "loss": 0.0266, "reward": 0.9583333730697632, "reward_std": 0.7855339050292969, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.4583333730697632, "step": 327 }, { "completion_length": 250.0, "epoch": 0.027333333333333334, "grad_norm": 0.6239248514175415, "kl": 0.5432109236717224, "learning_rate": 4.249158351283414e-06, "loss": 0.0217, "reward": 1.0416667461395264, "reward_std": 0.6770032048225403, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7916666865348816, "step": 328 }, { "completion_length": 250.0, "epoch": 0.027416666666666666, "grad_norm": 2.2812557220458984, "kl": 1.1748898029327393, "learning_rate": 4.242912757136412e-06, "loss": 0.047, "reward": 1.0416667461395264, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7916666865348816, "step": 329 }, { "completion_length": 250.0, "epoch": 0.0275, "grad_norm": 0.3581920266151428, "kl": 0.6617559790611267, "learning_rate": 4.236645926147493e-06, "loss": 0.0265, "reward": 1.125, "reward_std": 0.7955730557441711, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7500000596046448, "step": 330 }, { "completion_length": 250.0, "epoch": 0.027583333333333335, "grad_norm": 1.4809401035308838, "kl": 1.5578365325927734, "learning_rate": 4.230357934676017e-06, "loss": 0.0623, "reward": 1.4583333730697632, "reward_std": 0.7546154260635376, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 331 }, { "completion_length": 250.0, "epoch": 0.027666666666666666, "grad_norm": 0.35725685954093933, "kl": 1.0120397806167603, "learning_rate": 4.224048859339175e-06, "loss": 0.0405, "reward": 1.2083333730697632, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 332 }, { "completion_length": 250.0, "epoch": 0.02775, "grad_norm": 0.28768303990364075, "kl": 0.9968042969703674, "learning_rate": 4.217718777011058e-06, "loss": 0.0399, "reward": 1.2916667461395264, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9166666865348816, "step": 333 }, { "completion_length": 250.0, "epoch": 0.027833333333333335, "grad_norm": 0.8159481287002563, "kl": 0.9098507761955261, "learning_rate": 4.211367764821722e-06, "loss": 0.0364, "reward": 1.0833333730697632, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7083333730697632, "step": 334 }, { "completion_length": 250.0, "epoch": 0.027916666666666666, "grad_norm": 28.54840087890625, "kl": 6.921156406402588, "learning_rate": 4.204995900156247e-06, "loss": 0.2768, "reward": 1.6666667461395264, "reward_std": 0.6900655627250671, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916667461395264, "step": 335 }, { "completion_length": 250.0, "epoch": 0.028, "grad_norm": 3.128087282180786, "kl": 1.198431134223938, "learning_rate": 4.198603260653792e-06, "loss": 0.0479, "reward": 1.5833333730697632, "reward_std": 0.527046263217926, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 336 }, { "completion_length": 250.0, "epoch": 0.02808333333333333, "grad_norm": 8.092500686645508, "kl": 4.037166595458984, "learning_rate": 4.192189924206652e-06, "loss": 0.1615, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.625, "step": 337 }, { "completion_length": 250.0, "epoch": 0.028166666666666666, "grad_norm": 2.2210144996643066, "kl": 1.3574161529541016, "learning_rate": 4.185755968959308e-06, "loss": 0.0543, "reward": 1.3333333730697632, "reward_std": 0.8164965510368347, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 338 }, { "completion_length": 250.0, "epoch": 0.02825, "grad_norm": 1.8085874319076538, "kl": 0.9977900385856628, "learning_rate": 4.179301473307476e-06, "loss": 0.0399, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 339 }, { "completion_length": 250.0, "epoch": 0.028333333333333332, "grad_norm": 0.8611170649528503, "kl": 0.8231365084648132, "learning_rate": 4.172826515897146e-06, "loss": 0.0329, "reward": 1.1666666269302368, "reward_std": 0.7968190312385559, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5416666865348816, "step": 340 }, { "completion_length": 250.0, "epoch": 0.028416666666666666, "grad_norm": 0.2523643970489502, "kl": 1.1845202445983887, "learning_rate": 4.166331175623631e-06, "loss": 0.0474, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 341 }, { "completion_length": 250.0, "epoch": 0.0285, "grad_norm": 0.3289211690425873, "kl": 0.9523929953575134, "learning_rate": 4.159815531630604e-06, "loss": 0.0381, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 342 }, { "completion_length": 250.0, "epoch": 0.028583333333333332, "grad_norm": 1.336962103843689, "kl": 1.3120596408843994, "learning_rate": 4.15327966330913e-06, "loss": 0.0525, "reward": 1.1666667461395264, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 343 }, { "completion_length": 240.0, "epoch": 0.028666666666666667, "grad_norm": 0.3630130887031555, "kl": 1.1895368099212646, "learning_rate": 4.146723650296701e-06, "loss": 0.0476, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 344 }, { "completion_length": 250.0, "epoch": 0.02875, "grad_norm": 0.3989483416080475, "kl": 0.6941573619842529, "learning_rate": 4.140147572476269e-06, "loss": 0.0278, "reward": 0.7083333730697632, "reward_std": 0.6531365513801575, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.4583333432674408, "step": 345 }, { "completion_length": 250.0, "epoch": 0.028833333333333332, "grad_norm": 0.315608412027359, "kl": 0.5940113067626953, "learning_rate": 4.133551509975264e-06, "loss": 0.0238, "reward": 1.0416666269302368, "reward_std": 0.8439795970916748, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.6666666865348816, "step": 346 }, { "completion_length": 250.0, "epoch": 0.028916666666666667, "grad_norm": 0.28037476539611816, "kl": 0.553116500377655, "learning_rate": 4.126935543164628e-06, "loss": 0.0221, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.625, "step": 347 }, { "completion_length": 250.0, "epoch": 0.029, "grad_norm": 0.23763324320316315, "kl": 0.6913818717002869, "learning_rate": 4.120299752657828e-06, "loss": 0.0277, "reward": 1.6666666269302368, "reward_std": 0.6424161195755005, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 348 }, { "completion_length": 250.0, "epoch": 0.029083333333333333, "grad_norm": 0.367928147315979, "kl": 0.6871733665466309, "learning_rate": 4.113644219309877e-06, "loss": 0.0275, "reward": 1.375, "reward_std": 0.8054870963096619, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 349 }, { "completion_length": 250.0, "epoch": 0.029166666666666667, "grad_norm": 0.2744889557361603, "kl": 0.7716153860092163, "learning_rate": 4.106969024216348e-06, "loss": 0.0309, "reward": 0.9583333730697632, "reward_std": 0.8054871559143066, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.5833333134651184, "step": 350 }, { "completion_length": 250.0, "epoch": 0.02925, "grad_norm": 1.142672061920166, "kl": 1.1665054559707642, "learning_rate": 4.1002742487123896e-06, "loss": 0.0467, "reward": 1.5833333730697632, "reward_std": 0.5841830372810364, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 351 }, { "completion_length": 250.0, "epoch": 0.029333333333333333, "grad_norm": 0.0448664054274559, "kl": 1.181343674659729, "learning_rate": 4.093559974371725e-06, "loss": 0.0473, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 352 }, { "completion_length": 250.0, "epoch": 0.029416666666666667, "grad_norm": 0.2805432975292206, "kl": 1.1471784114837646, "learning_rate": 4.086826283005669e-06, "loss": 0.0459, "reward": 1.2916666269302368, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9166666865348816, "step": 353 }, { "completion_length": 250.0, "epoch": 0.0295, "grad_norm": 0.36402520537376404, "kl": 0.9837138056755066, "learning_rate": 4.080073256662128e-06, "loss": 0.0393, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 354 }, { "completion_length": 250.0, "epoch": 0.029583333333333333, "grad_norm": 0.24651949107646942, "kl": 0.8240635991096497, "learning_rate": 4.073300977624594e-06, "loss": 0.033, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 355 }, { "completion_length": 250.0, "epoch": 0.029666666666666668, "grad_norm": 0.8997929692268372, "kl": 1.0346089601516724, "learning_rate": 4.066509528411151e-06, "loss": 0.0414, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 356 }, { "completion_length": 213.0, "epoch": 0.02975, "grad_norm": 0.2964262068271637, "kl": 1.3979225158691406, "learning_rate": 4.059698991773466e-06, "loss": 0.0559, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 357 }, { "completion_length": 250.0, "epoch": 0.029833333333333333, "grad_norm": 0.5031925439834595, "kl": 1.1336249113082886, "learning_rate": 4.052869450695776e-06, "loss": 0.0453, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 358 }, { "completion_length": 250.0, "epoch": 0.029916666666666668, "grad_norm": 0.4681895971298218, "kl": 0.8900135159492493, "learning_rate": 4.046020988393886e-06, "loss": 0.0356, "reward": 1.0, "reward_std": 0.8908708095550537, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.625, "step": 359 }, { "completion_length": 250.0, "epoch": 0.03, "grad_norm": 0.320224791765213, "kl": 0.7603616714477539, "learning_rate": 4.039153688314146e-06, "loss": 0.0304, "reward": 1.25, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 360 }, { "completion_length": 250.0, "epoch": 0.030083333333333333, "grad_norm": 0.6727330684661865, "kl": 0.9661973714828491, "learning_rate": 4.032267634132442e-06, "loss": 0.0386, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 361 }, { "completion_length": 250.0, "epoch": 0.030166666666666668, "grad_norm": 0.8547607064247131, "kl": 1.3019939661026, "learning_rate": 4.02536290975317e-06, "loss": 0.0521, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 362 }, { "completion_length": 250.0, "epoch": 0.03025, "grad_norm": 0.27684468030929565, "kl": 0.980901300907135, "learning_rate": 4.018439599308217e-06, "loss": 0.0392, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.625, "step": 363 }, { "completion_length": 250.0, "epoch": 0.030333333333333334, "grad_norm": 0.6026430726051331, "kl": 0.8164455890655518, "learning_rate": 4.011497787155938e-06, "loss": 0.0327, "reward": 1.2083333730697632, "reward_std": 0.7546154260635376, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333134651184, "step": 364 }, { "completion_length": 250.0, "epoch": 0.030416666666666668, "grad_norm": 0.24824610352516174, "kl": 0.7272346019744873, "learning_rate": 4.0045375578801216e-06, "loss": 0.0291, "reward": 1.2083332538604736, "reward_std": 0.8897565007209778, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 365 }, { "completion_length": 250.0, "epoch": 0.0305, "grad_norm": 0.419993132352829, "kl": 0.791063666343689, "learning_rate": 3.997558996288965e-06, "loss": 0.0316, "reward": 0.875, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.625, "step": 366 }, { "completion_length": 250.0, "epoch": 0.030583333333333334, "grad_norm": 0.24291253089904785, "kl": 0.8568480014801025, "learning_rate": 3.9905621874140396e-06, "loss": 0.0343, "reward": 1.125, "reward_std": 0.5019802451133728, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 367 }, { "completion_length": 250.0, "epoch": 0.030666666666666665, "grad_norm": 1.9887789487838745, "kl": 1.3834556341171265, "learning_rate": 3.983547216509254e-06, "loss": 0.0553, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 368 }, { "completion_length": 250.0, "epoch": 0.03075, "grad_norm": 1.0339287519454956, "kl": 1.5022096633911133, "learning_rate": 3.976514169049814e-06, "loss": 0.0601, "reward": 0.8333333134651184, "reward_std": 0.8728715777397156, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.3333333432674408, "step": 369 }, { "completion_length": 250.0, "epoch": 0.030833333333333334, "grad_norm": 1.1466647386550903, "kl": 0.6712841987609863, "learning_rate": 3.969463130731183e-06, "loss": 0.0269, "reward": 0.5416666865348816, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.5416666865348816, "step": 370 }, { "completion_length": 250.0, "epoch": 0.030916666666666665, "grad_norm": 0.8428452014923096, "kl": 1.1327028274536133, "learning_rate": 3.96239418746804e-06, "loss": 0.0453, "reward": 1.6666666269302368, "reward_std": 0.6424161195755005, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 371 }, { "completion_length": 250.0, "epoch": 0.031, "grad_norm": 1.3757325410842896, "kl": 0.7787545919418335, "learning_rate": 3.955307425393224e-06, "loss": 0.0312, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 372 }, { "completion_length": 250.0, "epoch": 0.031083333333333334, "grad_norm": 0.7574729323387146, "kl": 1.0990623235702515, "learning_rate": 3.948202930856697e-06, "loss": 0.044, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9583333730697632, "step": 373 }, { "completion_length": 250.0, "epoch": 0.031166666666666665, "grad_norm": 4.400579452514648, "kl": 1.3642206192016602, "learning_rate": 3.941080790424483e-06, "loss": 0.0546, "reward": 1.2083333730697632, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 374 }, { "completion_length": 250.0, "epoch": 0.03125, "grad_norm": 0.5696729421615601, "kl": 1.0298165082931519, "learning_rate": 3.933941090877615e-06, "loss": 0.0412, "reward": 1.6666667461395264, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 375 }, { "completion_length": 250.0, "epoch": 0.03133333333333333, "grad_norm": 0.22187651693820953, "kl": 0.9323797225952148, "learning_rate": 3.92678391921108e-06, "loss": 0.0373, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 376 }, { "completion_length": 250.0, "epoch": 0.03141666666666667, "grad_norm": 0.35391122102737427, "kl": 1.0090752840042114, "learning_rate": 3.9196093626327535e-06, "loss": 0.0404, "reward": 1.6666667461395264, "reward_std": 0.6900655627250671, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 377 }, { "completion_length": 250.0, "epoch": 0.0315, "grad_norm": 3.709547758102417, "kl": 1.3984549045562744, "learning_rate": 3.912417508562345e-06, "loss": 0.0559, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 378 }, { "completion_length": 250.0, "epoch": 0.03158333333333333, "grad_norm": 2.316877603530884, "kl": 1.1878688335418701, "learning_rate": 3.905208444630326e-06, "loss": 0.0475, "reward": 1.0833333730697632, "reward_std": 0.8498365879058838, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7083333730697632, "step": 379 }, { "completion_length": 250.0, "epoch": 0.03166666666666667, "grad_norm": 0.40123090147972107, "kl": 1.305446982383728, "learning_rate": 3.897982258676867e-06, "loss": 0.0522, "reward": 1.0833333730697632, "reward_std": 0.771516740322113, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.5833333134651184, "step": 380 }, { "completion_length": 250.0, "epoch": 0.03175, "grad_norm": 0.3199678659439087, "kl": 1.1264021396636963, "learning_rate": 3.890739038750763e-06, "loss": 0.0451, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 381 }, { "completion_length": 250.0, "epoch": 0.03183333333333333, "grad_norm": 0.27256831526756287, "kl": 1.2236875295639038, "learning_rate": 3.88347887310836e-06, "loss": 0.0489, "reward": 1.6666667461395264, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 382 }, { "completion_length": 232.0, "epoch": 0.03191666666666667, "grad_norm": 0.2633911669254303, "kl": 1.0527544021606445, "learning_rate": 3.876201850212489e-06, "loss": 0.0421, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 383 }, { "completion_length": 250.0, "epoch": 0.032, "grad_norm": 0.2916422486305237, "kl": 0.9229851365089417, "learning_rate": 3.868908058731376e-06, "loss": 0.0369, "reward": 1.2083332538604736, "reward_std": 0.7753647565841675, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5833333730697632, "step": 384 }, { "completion_length": 185.0, "epoch": 0.03208333333333333, "grad_norm": 0.30826249718666077, "kl": 0.9985988140106201, "learning_rate": 3.861597587537568e-06, "loss": 0.0399, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 385 }, { "completion_length": 250.0, "epoch": 0.03216666666666667, "grad_norm": 0.0261689443141222, "kl": 1.0752596855163574, "learning_rate": 3.85427052570685e-06, "loss": 0.043, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 386 }, { "completion_length": 250.0, "epoch": 0.03225, "grad_norm": 0.3096613883972168, "kl": 1.0192590951919556, "learning_rate": 3.846926962517158e-06, "loss": 0.0408, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 387 }, { "completion_length": 250.0, "epoch": 0.03233333333333333, "grad_norm": 1.7860721349716187, "kl": 0.6369985342025757, "learning_rate": 3.839566987447492e-06, "loss": 0.0255, "reward": 1.2083333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 388 }, { "completion_length": 250.0, "epoch": 0.03241666666666667, "grad_norm": 0.26386263966560364, "kl": 0.7966631650924683, "learning_rate": 3.832190690176825e-06, "loss": 0.0319, "reward": 1.2083333730697632, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 389 }, { "completion_length": 250.0, "epoch": 0.0325, "grad_norm": 0.32119181752204895, "kl": 0.9594342708587646, "learning_rate": 3.824798160583012e-06, "loss": 0.0384, "reward": 1.0416667461395264, "reward_std": 0.8807914853096008, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.5416666269302368, "step": 390 }, { "completion_length": 250.0, "epoch": 0.03258333333333333, "grad_norm": 0.47729289531707764, "kl": 1.0643929243087769, "learning_rate": 3.817389488741694e-06, "loss": 0.0426, "reward": 1.5, "reward_std": 0.6172133684158325, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 391 }, { "completion_length": 250.0, "epoch": 0.03266666666666666, "grad_norm": 1.1122965812683105, "kl": 1.5673550367355347, "learning_rate": 3.8099647649251984e-06, "loss": 0.0627, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 392 }, { "completion_length": 250.0, "epoch": 0.03275, "grad_norm": 2.794879198074341, "kl": 1.3167065382003784, "learning_rate": 3.802524079601442e-06, "loss": 0.0527, "reward": 1.4166667461395264, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916666865348816, "step": 393 }, { "completion_length": 250.0, "epoch": 0.03283333333333333, "grad_norm": 0.9184636473655701, "kl": 1.0699944496154785, "learning_rate": 3.795067523432826e-06, "loss": 0.0428, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 394 }, { "completion_length": 250.0, "epoch": 0.032916666666666664, "grad_norm": 0.21204939484596252, "kl": 1.1911050081253052, "learning_rate": 3.787595187275136e-06, "loss": 0.0476, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.875, "step": 395 }, { "completion_length": 250.0, "epoch": 0.033, "grad_norm": 0.6764863729476929, "kl": 0.9250705242156982, "learning_rate": 3.780107162176429e-06, "loss": 0.037, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 396 }, { "completion_length": 250.0, "epoch": 0.03308333333333333, "grad_norm": 5.473031997680664, "kl": 1.4827823638916016, "learning_rate": 3.772603539375929e-06, "loss": 0.0593, "reward": 1.0416666269302368, "reward_std": 0.8807914853096008, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.6666666865348816, "step": 397 }, { "completion_length": 250.0, "epoch": 0.033166666666666664, "grad_norm": 3.915686845779419, "kl": 1.3783940076828003, "learning_rate": 3.7650844103029093e-06, "loss": 0.0551, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333134651184, "step": 398 }, { "completion_length": 250.0, "epoch": 0.03325, "grad_norm": 0.343504935503006, "kl": 0.7900997400283813, "learning_rate": 3.7575498665755884e-06, "loss": 0.0316, "reward": 1.0, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.625, "step": 399 }, { "completion_length": 250.0, "epoch": 0.03333333333333333, "grad_norm": 0.32992246747016907, "kl": 0.8160569071769714, "learning_rate": 3.7500000000000005e-06, "loss": 0.0326, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 400 }, { "completion_length": 250.0, "epoch": 0.033416666666666664, "grad_norm": 0.726000964641571, "kl": 1.2352204322814941, "learning_rate": 3.742434902568889e-06, "loss": 0.0494, "reward": 1.5833332538604736, "reward_std": 0.6362089514732361, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7083333134651184, "step": 401 }, { "completion_length": 250.0, "epoch": 0.0335, "grad_norm": 0.31690701842308044, "kl": 0.9032598733901978, "learning_rate": 3.7348546664605777e-06, "loss": 0.0361, "reward": 0.9583333730697632, "reward_std": 0.7650604248046875, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7083333730697632, "step": 402 }, { "completion_length": 250.0, "epoch": 0.03358333333333333, "grad_norm": 0.3370535969734192, "kl": 0.8206230998039246, "learning_rate": 3.7272593840378526e-06, "loss": 0.0328, "reward": 0.7916666865348816, "reward_std": 0.6651768684387207, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.6666666865348816, "step": 403 }, { "completion_length": 250.0, "epoch": 0.033666666666666664, "grad_norm": 0.3390369415283203, "kl": 1.0624157190322876, "learning_rate": 3.7196491478468322e-06, "loss": 0.0425, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 404 }, { "completion_length": 250.0, "epoch": 0.03375, "grad_norm": 0.2972298860549927, "kl": 1.257109522819519, "learning_rate": 3.7120240506158433e-06, "loss": 0.0503, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 405 }, { "completion_length": 250.0, "epoch": 0.03383333333333333, "grad_norm": 0.38008397817611694, "kl": 0.9990907907485962, "learning_rate": 3.7043841852542884e-06, "loss": 0.04, "reward": 1.4166667461395264, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 406 }, { "completion_length": 250.0, "epoch": 0.033916666666666664, "grad_norm": 0.5591140389442444, "kl": 1.0515403747558594, "learning_rate": 3.6967296448515176e-06, "loss": 0.0421, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 407 }, { "completion_length": 250.0, "epoch": 0.034, "grad_norm": 0.6368426084518433, "kl": 0.9551165103912354, "learning_rate": 3.689060522675689e-06, "loss": 0.0382, "reward": 1.2916666269302368, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7916666865348816, "step": 408 }, { "completion_length": 250.0, "epoch": 0.034083333333333334, "grad_norm": 0.30579978227615356, "kl": 0.6054512858390808, "learning_rate": 3.6813769121726356e-06, "loss": 0.0242, "reward": 1.0416667461395264, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9166666865348816, "step": 409 }, { "completion_length": 250.0, "epoch": 0.034166666666666665, "grad_norm": 0.6442110538482666, "kl": 0.9826160073280334, "learning_rate": 3.6736789069647273e-06, "loss": 0.0393, "reward": 1.5416667461395264, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 410 }, { "completion_length": 250.0, "epoch": 0.03425, "grad_norm": 0.264870285987854, "kl": 0.5808348059654236, "learning_rate": 3.6659666008497287e-06, "loss": 0.0232, "reward": 1.0833332538604736, "reward_std": 0.8309490084648132, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7083333730697632, "step": 411 }, { "completion_length": 250.0, "epoch": 0.034333333333333334, "grad_norm": 0.5558726787567139, "kl": 1.1091468334197998, "learning_rate": 3.658240087799655e-06, "loss": 0.0444, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 412 }, { "completion_length": 250.0, "epoch": 0.034416666666666665, "grad_norm": 0.5184251666069031, "kl": 0.9304683804512024, "learning_rate": 3.6504994619596295e-06, "loss": 0.0372, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 413 }, { "completion_length": 250.0, "epoch": 0.0345, "grad_norm": 1.4748616218566895, "kl": 1.3676832914352417, "learning_rate": 3.642744817646736e-06, "loss": 0.0547, "reward": 1.4583332538604736, "reward_std": 0.8345229029655457, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7083333730697632, "step": 414 }, { "completion_length": 250.0, "epoch": 0.034583333333333334, "grad_norm": 0.43191081285476685, "kl": 1.3092877864837646, "learning_rate": 3.634976249348867e-06, "loss": 0.0524, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 415 }, { "completion_length": 250.0, "epoch": 0.034666666666666665, "grad_norm": 0.4750445485115051, "kl": 1.3486921787261963, "learning_rate": 3.627193851723577e-06, "loss": 0.0539, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 416 }, { "completion_length": 250.0, "epoch": 0.03475, "grad_norm": 0.5661317110061646, "kl": 1.1146034002304077, "learning_rate": 3.6193977195969243e-06, "loss": 0.0446, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 417 }, { "completion_length": 250.0, "epoch": 0.034833333333333334, "grad_norm": 0.36562928557395935, "kl": 1.1447898149490356, "learning_rate": 3.611587947962319e-06, "loss": 0.0458, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 418 }, { "completion_length": 250.0, "epoch": 0.034916666666666665, "grad_norm": 0.2762870788574219, "kl": 0.48091429471969604, "learning_rate": 3.6037646319793635e-06, "loss": 0.0192, "reward": 1.5833333730697632, "reward_std": 0.5841830372810364, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 419 }, { "completion_length": 250.0, "epoch": 0.035, "grad_norm": 0.668820321559906, "kl": 1.2909846305847168, "learning_rate": 3.595927866972694e-06, "loss": 0.0516, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7083333730697632, "step": 420 }, { "completion_length": 250.0, "epoch": 0.035083333333333334, "grad_norm": 0.4891945421695709, "kl": 1.1140190362930298, "learning_rate": 3.5880777484308193e-06, "loss": 0.0446, "reward": 1.1666667461395264, "reward_std": 0.8728715181350708, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5416666865348816, "step": 421 }, { "completion_length": 250.0, "epoch": 0.035166666666666666, "grad_norm": 0.33090484142303467, "kl": 0.6019576191902161, "learning_rate": 3.5802143720049565e-06, "loss": 0.0241, "reward": 1.0833333730697632, "reward_std": 0.6606875061988831, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 422 }, { "completion_length": 250.0, "epoch": 0.03525, "grad_norm": 0.6281843781471252, "kl": 1.1191685199737549, "learning_rate": 3.5723378335078653e-06, "loss": 0.0448, "reward": 1.2083333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 423 }, { "completion_length": 250.0, "epoch": 0.035333333333333335, "grad_norm": 0.24378234148025513, "kl": 0.8178219795227051, "learning_rate": 3.564448228912682e-06, "loss": 0.0327, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 424 }, { "completion_length": 250.0, "epoch": 0.035416666666666666, "grad_norm": 0.24927818775177002, "kl": 1.2881877422332764, "learning_rate": 3.556545654351749e-06, "loss": 0.0515, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 425 }, { "completion_length": 250.0, "epoch": 0.0355, "grad_norm": 0.4163737893104553, "kl": 0.7852768301963806, "learning_rate": 3.5486302061154433e-06, "loss": 0.0314, "reward": 1.2083332538604736, "reward_std": 0.9074209332466125, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 426 }, { "completion_length": 250.0, "epoch": 0.035583333333333335, "grad_norm": 0.38066983222961426, "kl": 1.0970571041107178, "learning_rate": 3.5407019806510035e-06, "loss": 0.0439, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 427 }, { "completion_length": 250.0, "epoch": 0.035666666666666666, "grad_norm": 1.0730751752853394, "kl": 0.7307629585266113, "learning_rate": 3.532761074561355e-06, "loss": 0.0292, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 428 }, { "completion_length": 250.0, "epoch": 0.03575, "grad_norm": 0.2747225761413574, "kl": 0.9983835816383362, "learning_rate": 3.524807584603932e-06, "loss": 0.0399, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 429 }, { "completion_length": 250.0, "epoch": 0.035833333333333335, "grad_norm": 0.3085501492023468, "kl": 1.0624085664749146, "learning_rate": 3.516841607689501e-06, "loss": 0.0425, "reward": 1.4583333730697632, "reward_std": 0.7955730557441711, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 430 }, { "completion_length": 250.0, "epoch": 0.035916666666666666, "grad_norm": 0.3820912539958954, "kl": 0.8280222415924072, "learning_rate": 3.5088632408809757e-06, "loss": 0.0331, "reward": 1.0833333730697632, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 431 }, { "completion_length": 250.0, "epoch": 0.036, "grad_norm": 0.31436631083488464, "kl": 0.5407013893127441, "learning_rate": 3.5008725813922383e-06, "loss": 0.0216, "reward": 1.0416666269302368, "reward_std": 0.8807914853096008, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.5416666865348816, "step": 432 }, { "completion_length": 250.0, "epoch": 0.036083333333333335, "grad_norm": 1.559922695159912, "kl": 1.1256322860717773, "learning_rate": 3.4928697265869516e-06, "loss": 0.045, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 433 }, { "completion_length": 250.0, "epoch": 0.036166666666666666, "grad_norm": 0.36709415912628174, "kl": 0.810982346534729, "learning_rate": 3.4848547739773782e-06, "loss": 0.0324, "reward": 0.7916666269302368, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5416666865348816, "step": 434 }, { "completion_length": 250.0, "epoch": 0.03625, "grad_norm": 0.38521692156791687, "kl": 1.0653852224349976, "learning_rate": 3.476827821223184e-06, "loss": 0.0426, "reward": 1.1666667461395264, "reward_std": 0.8728715181350708, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.6666666865348816, "step": 435 }, { "completion_length": 250.0, "epoch": 0.036333333333333336, "grad_norm": 0.4810712933540344, "kl": 1.0313137769699097, "learning_rate": 3.4687889661302577e-06, "loss": 0.0413, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 436 }, { "completion_length": 250.0, "epoch": 0.03641666666666667, "grad_norm": 1.5242727994918823, "kl": 1.0612870454788208, "learning_rate": 3.460738306649509e-06, "loss": 0.0425, "reward": 0.7083333730697632, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.5833333730697632, "step": 437 }, { "completion_length": 250.0, "epoch": 0.0365, "grad_norm": 0.27597489953041077, "kl": 0.7890688180923462, "learning_rate": 3.452675940875686e-06, "loss": 0.0316, "reward": 1.6666667461395264, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 438 }, { "completion_length": 250.0, "epoch": 0.036583333333333336, "grad_norm": 0.35958653688430786, "kl": 0.9022118449211121, "learning_rate": 3.4446019670461684e-06, "loss": 0.0361, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 439 }, { "completion_length": 250.0, "epoch": 0.03666666666666667, "grad_norm": 0.28990885615348816, "kl": 0.7540520429611206, "learning_rate": 3.436516483539781e-06, "loss": 0.0302, "reward": 1.2916667461395264, "reward_std": 0.6770032048225403, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.5416666865348816, "step": 440 }, { "completion_length": 250.0, "epoch": 0.03675, "grad_norm": 0.4592471420764923, "kl": 1.2466713190078735, "learning_rate": 3.4284195888755877e-06, "loss": 0.0499, "reward": 1.25, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 441 }, { "completion_length": 250.0, "epoch": 0.036833333333333336, "grad_norm": 0.2260519117116928, "kl": 1.1097904443740845, "learning_rate": 3.4203113817116955e-06, "loss": 0.0444, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 442 }, { "completion_length": 250.0, "epoch": 0.03691666666666667, "grad_norm": 0.3158986568450928, "kl": 0.7736285924911499, "learning_rate": 3.412191960844049e-06, "loss": 0.0309, "reward": 1.1666667461395264, "reward_std": 0.942808985710144, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5416666865348816, "step": 443 }, { "completion_length": 250.0, "epoch": 0.037, "grad_norm": 0.35627302527427673, "kl": 0.7449521422386169, "learning_rate": 3.4040614252052305e-06, "loss": 0.0298, "reward": 1.1666667461395264, "reward_std": 0.835710883140564, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5416666865348816, "step": 444 }, { "completion_length": 250.0, "epoch": 0.037083333333333336, "grad_norm": 1.3280401229858398, "kl": 0.9883641600608826, "learning_rate": 3.39591987386325e-06, "loss": 0.0395, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5, "step": 445 }, { "completion_length": 250.0, "epoch": 0.03716666666666667, "grad_norm": 0.3993532359600067, "kl": 1.3265860080718994, "learning_rate": 3.387767406020343e-06, "loss": 0.0531, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 446 }, { "completion_length": 250.0, "epoch": 0.03725, "grad_norm": 0.3469555079936981, "kl": 0.6623489260673523, "learning_rate": 3.3796041210117545e-06, "loss": 0.0265, "reward": 1.1666667461395264, "reward_std": 0.7968190908432007, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.6666666269302368, "step": 447 }, { "completion_length": 250.0, "epoch": 0.037333333333333336, "grad_norm": 0.7821484804153442, "kl": 1.5054004192352295, "learning_rate": 3.3714301183045382e-06, "loss": 0.0602, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 448 }, { "completion_length": 250.0, "epoch": 0.03741666666666667, "grad_norm": 0.6973661780357361, "kl": 1.2113670110702515, "learning_rate": 3.3632454974963368e-06, "loss": 0.0485, "reward": 1.2083333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 449 }, { "completion_length": 250.0, "epoch": 0.0375, "grad_norm": 0.32803142070770264, "kl": 0.9755523204803467, "learning_rate": 3.3550503583141726e-06, "loss": 0.039, "reward": 1.0, "reward_std": 0.7126965522766113, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.625, "step": 450 }, { "completion_length": 250.0, "epoch": 0.03758333333333334, "grad_norm": 0.3818299472332001, "kl": 0.9919738173484802, "learning_rate": 3.346844800613229e-06, "loss": 0.0397, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 451 }, { "completion_length": 250.0, "epoch": 0.03766666666666667, "grad_norm": 0.3395150303840637, "kl": 1.0552934408187866, "learning_rate": 3.338628924375638e-06, "loss": 0.0422, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 452 }, { "completion_length": 250.0, "epoch": 0.03775, "grad_norm": 0.22457285225391388, "kl": 1.2115274667739868, "learning_rate": 3.3304028297092583e-06, "loss": 0.0485, "reward": 1.5833333730697632, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 453 }, { "completion_length": 250.0, "epoch": 0.03783333333333333, "grad_norm": 0.32445210218429565, "kl": 0.7955477237701416, "learning_rate": 3.3221666168464584e-06, "loss": 0.0318, "reward": 0.875, "reward_std": 0.7955731153488159, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.625, "step": 454 }, { "completion_length": 250.0, "epoch": 0.03791666666666667, "grad_norm": 1.0360915660858154, "kl": 1.115738034248352, "learning_rate": 3.313920386142892e-06, "loss": 0.0446, "reward": 1.4583333730697632, "reward_std": 0.7955730557441711, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 455 }, { "completion_length": 250.0, "epoch": 0.038, "grad_norm": 0.26157450675964355, "kl": 1.0352263450622559, "learning_rate": 3.3056642380762783e-06, "loss": 0.0414, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 456 }, { "completion_length": 250.0, "epoch": 0.03808333333333333, "grad_norm": 0.33227723836898804, "kl": 0.9722086191177368, "learning_rate": 3.2973982732451753e-06, "loss": 0.0389, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 457 }, { "completion_length": 250.0, "epoch": 0.03816666666666667, "grad_norm": 0.48032528162002563, "kl": 1.262650966644287, "learning_rate": 3.2891225923677565e-06, "loss": 0.0505, "reward": 1.1666667461395264, "reward_std": 0.6424161195755005, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5416666865348816, "step": 458 }, { "completion_length": 250.0, "epoch": 0.03825, "grad_norm": 0.2080181986093521, "kl": 1.3035730123519897, "learning_rate": 3.280837296280582e-06, "loss": 0.0521, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 459 }, { "completion_length": 250.0, "epoch": 0.03833333333333333, "grad_norm": 0.2657634913921356, "kl": 1.0605506896972656, "learning_rate": 3.272542485937369e-06, "loss": 0.0424, "reward": 1.0833333730697632, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333134651184, "step": 460 }, { "completion_length": 250.0, "epoch": 0.03841666666666667, "grad_norm": 0.32230645418167114, "kl": 1.6198720932006836, "learning_rate": 3.2642382624077647e-06, "loss": 0.0648, "reward": 1.6666667461395264, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 461 }, { "completion_length": 250.0, "epoch": 0.0385, "grad_norm": 0.31350037455558777, "kl": 1.1919293403625488, "learning_rate": 3.2559247268761117e-06, "loss": 0.0477, "reward": 1.6666667461395264, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 462 }, { "completion_length": 250.0, "epoch": 0.03858333333333333, "grad_norm": 0.30738067626953125, "kl": 0.9954556226730347, "learning_rate": 3.247601980640217e-06, "loss": 0.0398, "reward": 1.625, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8750000596046448, "step": 463 }, { "completion_length": 236.0, "epoch": 0.03866666666666667, "grad_norm": 1.2310477495193481, "kl": 1.2627601623535156, "learning_rate": 3.2392701251101172e-06, "loss": 0.0505, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 464 }, { "completion_length": 250.0, "epoch": 0.03875, "grad_norm": 0.2840711772441864, "kl": 0.8784255981445312, "learning_rate": 3.230929261806842e-06, "loss": 0.0351, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333134651184, "step": 465 }, { "completion_length": 250.0, "epoch": 0.03883333333333333, "grad_norm": 0.22466637194156647, "kl": 1.233144760131836, "learning_rate": 3.222579492361179e-06, "loss": 0.0493, "reward": 1.5416667461395264, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 466 }, { "completion_length": 250.0, "epoch": 0.03891666666666667, "grad_norm": 0.31554660201072693, "kl": 0.8954232335090637, "learning_rate": 3.214220918512434e-06, "loss": 0.0358, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 467 }, { "completion_length": 250.0, "epoch": 0.039, "grad_norm": 0.35043928027153015, "kl": 0.9754191040992737, "learning_rate": 3.205853642107192e-06, "loss": 0.039, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 468 }, { "completion_length": 250.0, "epoch": 0.03908333333333333, "grad_norm": 0.3606829047203064, "kl": 1.6161555051803589, "learning_rate": 3.1974777650980737e-06, "loss": 0.0646, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 469 }, { "completion_length": 250.0, "epoch": 0.03916666666666667, "grad_norm": 0.3894639015197754, "kl": 0.9157559871673584, "learning_rate": 3.189093389542498e-06, "loss": 0.0366, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 470 }, { "completion_length": 250.0, "epoch": 0.03925, "grad_norm": 0.5538480281829834, "kl": 0.9871796369552612, "learning_rate": 3.180700617601436e-06, "loss": 0.0395, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 471 }, { "completion_length": 250.0, "epoch": 0.03933333333333333, "grad_norm": 0.03941356763243675, "kl": 1.603979468345642, "learning_rate": 3.1722995515381644e-06, "loss": 0.0642, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 472 }, { "completion_length": 250.0, "epoch": 0.03941666666666667, "grad_norm": 0.36027953028678894, "kl": 1.4444609880447388, "learning_rate": 3.1638902937170224e-06, "loss": 0.0578, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 473 }, { "completion_length": 250.0, "epoch": 0.0395, "grad_norm": 0.2484903782606125, "kl": 1.535847783088684, "learning_rate": 3.155472946602162e-06, "loss": 0.0614, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 474 }, { "completion_length": 250.0, "epoch": 0.03958333333333333, "grad_norm": 0.43076378107070923, "kl": 1.3793715238571167, "learning_rate": 3.147047612756302e-06, "loss": 0.0552, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 475 }, { "completion_length": 250.0, "epoch": 0.03966666666666667, "grad_norm": 0.24676185846328735, "kl": 1.1927064657211304, "learning_rate": 3.1386143948394764e-06, "loss": 0.0477, "reward": 1.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 476 }, { "completion_length": 250.0, "epoch": 0.03975, "grad_norm": 0.27644336223602295, "kl": 1.0569162368774414, "learning_rate": 3.130173395607785e-06, "loss": 0.0423, "reward": 1.25, "reward_std": 0.8498365879058838, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7500000596046448, "step": 477 }, { "completion_length": 250.0, "epoch": 0.03983333333333333, "grad_norm": 1.9261839389801025, "kl": 1.384006381034851, "learning_rate": 3.121724717912138e-06, "loss": 0.0554, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 478 }, { "completion_length": 250.0, "epoch": 0.03991666666666667, "grad_norm": 0.25609302520751953, "kl": 1.1626935005187988, "learning_rate": 3.1132684646970068e-06, "loss": 0.0465, "reward": 1.1666667461395264, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9166666865348816, "step": 479 }, { "completion_length": 250.0, "epoch": 0.04, "grad_norm": 1.250272512435913, "kl": 1.0145455598831177, "learning_rate": 3.1048047389991693e-06, "loss": 0.0406, "reward": 1.4583333730697632, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 480 }, { "completion_length": 250.0, "epoch": 0.04008333333333333, "grad_norm": 0.2401411086320877, "kl": 0.8946865200996399, "learning_rate": 3.0963336439464527e-06, "loss": 0.0358, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 481 }, { "completion_length": 250.0, "epoch": 0.04016666666666667, "grad_norm": 0.23744302988052368, "kl": 0.5881522297859192, "learning_rate": 3.087855282756475e-06, "loss": 0.0235, "reward": 1.2916666269302368, "reward_std": 0.8249579071998596, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7916666865348816, "step": 482 }, { "completion_length": 250.0, "epoch": 0.04025, "grad_norm": 0.5861865878105164, "kl": 1.260177493095398, "learning_rate": 3.079369758735393e-06, "loss": 0.0504, "reward": 1.5416667461395264, "reward_std": 0.7113032937049866, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.6666666865348816, "step": 483 }, { "completion_length": 250.0, "epoch": 0.04033333333333333, "grad_norm": 0.3826155364513397, "kl": 1.2853862047195435, "learning_rate": 3.0708771752766397e-06, "loss": 0.0514, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 484 }, { "completion_length": 250.0, "epoch": 0.04041666666666666, "grad_norm": 0.22148284316062927, "kl": 1.1608880758285522, "learning_rate": 3.062377635859663e-06, "loss": 0.0464, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 485 }, { "completion_length": 250.0, "epoch": 0.0405, "grad_norm": 0.3896584212779999, "kl": 1.1302204132080078, "learning_rate": 3.053871244048669e-06, "loss": 0.0452, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 486 }, { "completion_length": 250.0, "epoch": 0.04058333333333333, "grad_norm": 0.8122249245643616, "kl": 1.505839467048645, "learning_rate": 3.045358103491357e-06, "loss": 0.0602, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 487 }, { "completion_length": 250.0, "epoch": 0.04066666666666666, "grad_norm": 0.2632407248020172, "kl": 1.4302340745925903, "learning_rate": 3.0368383179176584e-06, "loss": 0.0572, "reward": 1.7916666269302368, "reward_std": 0.5892555713653564, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 488 }, { "completion_length": 250.0, "epoch": 0.04075, "grad_norm": 0.30897876620292664, "kl": 0.74764084815979, "learning_rate": 3.0283119911384724e-06, "loss": 0.0299, "reward": 1.0833333730697632, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 489 }, { "completion_length": 250.0, "epoch": 0.04083333333333333, "grad_norm": 0.25567445158958435, "kl": 1.0470243692398071, "learning_rate": 3.019779227044398e-06, "loss": 0.0419, "reward": 1.8333333730697632, "reward_std": 0.35634827613830566, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 490 }, { "completion_length": 250.0, "epoch": 0.040916666666666664, "grad_norm": 0.3491830825805664, "kl": 1.0824482440948486, "learning_rate": 3.0112401296044756e-06, "loss": 0.0433, "reward": 1.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 491 }, { "completion_length": 248.0, "epoch": 0.041, "grad_norm": 0.26145610213279724, "kl": 1.5022797584533691, "learning_rate": 3.002694802864912e-06, "loss": 0.0601, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 492 }, { "completion_length": 250.0, "epoch": 0.04108333333333333, "grad_norm": 2.2558088302612305, "kl": 1.2809425592422485, "learning_rate": 2.9941433509478157e-06, "loss": 0.0512, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 493 }, { "completion_length": 250.0, "epoch": 0.041166666666666664, "grad_norm": 0.24117594957351685, "kl": 1.0864267349243164, "learning_rate": 2.98558587804993e-06, "loss": 0.0435, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 494 }, { "completion_length": 250.0, "epoch": 0.04125, "grad_norm": 1.2394402027130127, "kl": 1.355527400970459, "learning_rate": 2.9770224884413625e-06, "loss": 0.0542, "reward": 1.4583333730697632, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 495 }, { "completion_length": 250.0, "epoch": 0.04133333333333333, "grad_norm": 0.3969794511795044, "kl": 1.4105161428451538, "learning_rate": 2.9684532864643123e-06, "loss": 0.0564, "reward": 1.25, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 496 }, { "completion_length": 250.0, "epoch": 0.041416666666666664, "grad_norm": 0.2702740728855133, "kl": 1.046398639678955, "learning_rate": 2.9598783765318005e-06, "loss": 0.0419, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.875, "step": 497 }, { "completion_length": 250.0, "epoch": 0.0415, "grad_norm": 0.22273120284080505, "kl": 0.8685632944107056, "learning_rate": 2.9512978631264006e-06, "loss": 0.0347, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 498 }, { "completion_length": 250.0, "epoch": 0.04158333333333333, "grad_norm": 0.5162345767021179, "kl": 1.2586992979049683, "learning_rate": 2.942711850798959e-06, "loss": 0.0503, "reward": 1.4166667461395264, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916666865348816, "step": 499 }, { "completion_length": 250.0, "epoch": 0.041666666666666664, "grad_norm": 0.41288647055625916, "kl": 1.1819934844970703, "learning_rate": 2.9341204441673267e-06, "loss": 0.0473, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 500 }, { "completion_length": 250.0, "epoch": 0.04175, "grad_norm": 0.8625938892364502, "kl": 0.8906731009483337, "learning_rate": 2.9255237479150815e-06, "loss": 0.0356, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 501 }, { "completion_length": 250.0, "epoch": 0.041833333333333333, "grad_norm": 0.8455320000648499, "kl": 1.6155681610107422, "learning_rate": 2.9169218667902562e-06, "loss": 0.0646, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 502 }, { "completion_length": 250.0, "epoch": 0.041916666666666665, "grad_norm": 0.2829776406288147, "kl": 1.0219650268554688, "learning_rate": 2.908314905604056e-06, "loss": 0.0409, "reward": 1.2083333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 503 }, { "completion_length": 250.0, "epoch": 0.042, "grad_norm": 0.5874464511871338, "kl": 1.6597095727920532, "learning_rate": 2.8997029692295875e-06, "loss": 0.0664, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 504 }, { "completion_length": 250.0, "epoch": 0.042083333333333334, "grad_norm": 0.9483749270439148, "kl": 1.8449759483337402, "learning_rate": 2.8910861626005774e-06, "loss": 0.0738, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 505 }, { "completion_length": 250.0, "epoch": 0.042166666666666665, "grad_norm": 0.2268337458372116, "kl": 0.8647147417068481, "learning_rate": 2.8824645907100957e-06, "loss": 0.0346, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 506 }, { "completion_length": 250.0, "epoch": 0.04225, "grad_norm": 0.2466856986284256, "kl": 0.6964913010597229, "learning_rate": 2.8738383586092745e-06, "loss": 0.0279, "reward": 1.375, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 507 }, { "completion_length": 250.0, "epoch": 0.042333333333333334, "grad_norm": 1.2652459144592285, "kl": 0.9631326794624329, "learning_rate": 2.8652075714060296e-06, "loss": 0.0385, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 508 }, { "completion_length": 250.0, "epoch": 0.042416666666666665, "grad_norm": 0.27476221323013306, "kl": 0.9217783212661743, "learning_rate": 2.8565723342637797e-06, "loss": 0.0369, "reward": 0.8333333730697632, "reward_std": 0.9920317530632019, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.4583333432674408, "step": 509 }, { "completion_length": 250.0, "epoch": 0.0425, "grad_norm": 0.21916547417640686, "kl": 1.3772886991500854, "learning_rate": 2.847932752400164e-06, "loss": 0.0551, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 510 }, { "completion_length": 250.0, "epoch": 0.042583333333333334, "grad_norm": 0.33844467997550964, "kl": 1.338585376739502, "learning_rate": 2.8392889310857615e-06, "loss": 0.0535, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 511 }, { "completion_length": 250.0, "epoch": 0.042666666666666665, "grad_norm": 0.29742342233657837, "kl": 1.344675898551941, "learning_rate": 2.8306409756428067e-06, "loss": 0.0538, "reward": 1.7916666269302368, "reward_std": 0.589255690574646, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 512 }, { "completion_length": 250.0, "epoch": 0.04275, "grad_norm": 0.3239578902721405, "kl": 1.0606533288955688, "learning_rate": 2.8219889914439073e-06, "loss": 0.0424, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 513 }, { "completion_length": 250.0, "epoch": 0.042833333333333334, "grad_norm": 0.3085322678089142, "kl": 0.7539732456207275, "learning_rate": 2.813333083910761e-06, "loss": 0.0302, "reward": 1.375, "reward_std": 0.7855339050292969, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 514 }, { "completion_length": 250.0, "epoch": 0.042916666666666665, "grad_norm": 0.645790159702301, "kl": 0.8328765630722046, "learning_rate": 2.804673358512869e-06, "loss": 0.0333, "reward": 0.875, "reward_std": 0.7955730557441711, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.625, "step": 515 }, { "completion_length": 250.0, "epoch": 0.043, "grad_norm": 0.8226057291030884, "kl": 1.01832914352417, "learning_rate": 2.7960099207662535e-06, "loss": 0.0407, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 516 }, { "completion_length": 212.0, "epoch": 0.043083333333333335, "grad_norm": 0.24604718387126923, "kl": 1.0776610374450684, "learning_rate": 2.7873428762321667e-06, "loss": 0.0431, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 517 }, { "completion_length": 250.0, "epoch": 0.043166666666666666, "grad_norm": 0.33582380414009094, "kl": 0.8894166946411133, "learning_rate": 2.778672330515814e-06, "loss": 0.0356, "reward": 1.2083333730697632, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.4583333432674408, "step": 518 }, { "completion_length": 250.0, "epoch": 0.04325, "grad_norm": 0.4405062198638916, "kl": 1.2915191650390625, "learning_rate": 2.769998389265057e-06, "loss": 0.0517, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 519 }, { "completion_length": 232.0, "epoch": 0.043333333333333335, "grad_norm": 1.5345605611801147, "kl": 1.553789734840393, "learning_rate": 2.761321158169134e-06, "loss": 0.0622, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 520 }, { "completion_length": 206.0, "epoch": 0.043416666666666666, "grad_norm": 0.25380414724349976, "kl": 1.496401071548462, "learning_rate": 2.752640742957366e-06, "loss": 0.0599, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 521 }, { "completion_length": 250.0, "epoch": 0.0435, "grad_norm": 0.2672164738178253, "kl": 0.8028614521026611, "learning_rate": 2.743957249397874e-06, "loss": 0.0321, "reward": 1.5416666269302368, "reward_std": 0.6651768684387207, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 522 }, { "completion_length": 250.0, "epoch": 0.043583333333333335, "grad_norm": 0.2636195123195648, "kl": 1.4980413913726807, "learning_rate": 2.7352707832962865e-06, "loss": 0.0599, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 523 }, { "completion_length": 250.0, "epoch": 0.043666666666666666, "grad_norm": 1.0833460092544556, "kl": 1.3643357753753662, "learning_rate": 2.726581450494451e-06, "loss": 0.0546, "reward": 1.3333332538604736, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 524 }, { "completion_length": 250.0, "epoch": 0.04375, "grad_norm": 0.22439569234848022, "kl": 1.132871150970459, "learning_rate": 2.717889356869146e-06, "loss": 0.0453, "reward": 1.3333333730697632, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 525 }, { "completion_length": 250.0, "epoch": 0.043833333333333335, "grad_norm": 0.2796757221221924, "kl": 1.0327954292297363, "learning_rate": 2.70919460833079e-06, "loss": 0.0413, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 526 }, { "completion_length": 250.0, "epoch": 0.043916666666666666, "grad_norm": 0.31149035692214966, "kl": 1.374606966972351, "learning_rate": 2.700497310822147e-06, "loss": 0.055, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 527 }, { "completion_length": 250.0, "epoch": 0.044, "grad_norm": 0.7924208641052246, "kl": 0.987855076789856, "learning_rate": 2.6917975703170466e-06, "loss": 0.0395, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.625, "step": 528 }, { "completion_length": 250.0, "epoch": 0.044083333333333335, "grad_norm": 0.25965285301208496, "kl": 0.8640234470367432, "learning_rate": 2.6830954928190795e-06, "loss": 0.0346, "reward": 1.6666667461395264, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 529 }, { "completion_length": 250.0, "epoch": 0.04416666666666667, "grad_norm": 0.38326141238212585, "kl": 0.8228756189346313, "learning_rate": 2.6743911843603134e-06, "loss": 0.0329, "reward": 1.2916666269302368, "reward_std": 0.8249579071998596, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7916666865348816, "step": 530 }, { "completion_length": 250.0, "epoch": 0.04425, "grad_norm": 1.5146892070770264, "kl": 1.9087010622024536, "learning_rate": 2.6656847510000013e-06, "loss": 0.0763, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 531 }, { "completion_length": 215.0, "epoch": 0.044333333333333336, "grad_norm": 0.2388104796409607, "kl": 1.3102915287017822, "learning_rate": 2.6569762988232838e-06, "loss": 0.0524, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 532 }, { "completion_length": 250.0, "epoch": 0.04441666666666667, "grad_norm": 0.33024802803993225, "kl": 0.8110081553459167, "learning_rate": 2.6482659339399047e-06, "loss": 0.0324, "reward": 1.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 533 }, { "completion_length": 250.0, "epoch": 0.0445, "grad_norm": 0.28449299931526184, "kl": 0.9894317388534546, "learning_rate": 2.63955376248291e-06, "loss": 0.0396, "reward": 1.4583333730697632, "reward_std": 0.7955730557441711, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 534 }, { "completion_length": 250.0, "epoch": 0.044583333333333336, "grad_norm": 0.24825812876224518, "kl": 0.8840400576591492, "learning_rate": 2.6308398906073603e-06, "loss": 0.0354, "reward": 1.0416667461395264, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9166666865348816, "step": 535 }, { "completion_length": 250.0, "epoch": 0.04466666666666667, "grad_norm": 1.5011301040649414, "kl": 1.2059112787246704, "learning_rate": 2.6221244244890336e-06, "loss": 0.0482, "reward": 1.2083333730697632, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9583333730697632, "step": 536 }, { "completion_length": 250.0, "epoch": 0.04475, "grad_norm": 0.7638510465621948, "kl": 1.5593329668045044, "learning_rate": 2.613407470323134e-06, "loss": 0.0624, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 537 }, { "completion_length": 168.0, "epoch": 0.044833333333333336, "grad_norm": 0.23012110590934753, "kl": 0.8535071611404419, "learning_rate": 2.604689134322999e-06, "loss": 0.0341, "reward": 1.4583333730697632, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 538 }, { "completion_length": 250.0, "epoch": 0.04491666666666667, "grad_norm": 0.3940114676952362, "kl": 1.290722370147705, "learning_rate": 2.5959695227188e-06, "loss": 0.0516, "reward": 1.5416666269302368, "reward_std": 0.853331446647644, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 539 }, { "completion_length": 250.0, "epoch": 0.045, "grad_norm": 0.2533872723579407, "kl": 1.0061720609664917, "learning_rate": 2.587248741756253e-06, "loss": 0.0402, "reward": 1.8333333730697632, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 540 }, { "completion_length": 250.0, "epoch": 0.045083333333333336, "grad_norm": 0.2459549903869629, "kl": 1.3281100988388062, "learning_rate": 2.578526897695321e-06, "loss": 0.0531, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 541 }, { "completion_length": 250.0, "epoch": 0.04516666666666667, "grad_norm": 0.45357388257980347, "kl": 1.2036126852035522, "learning_rate": 2.569804096808923e-06, "loss": 0.0481, "reward": 1.0416667461395264, "reward_std": 0.8807914853096008, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.6666666269302368, "step": 542 }, { "completion_length": 250.0, "epoch": 0.04525, "grad_norm": 0.639238715171814, "kl": 1.46484375, "learning_rate": 2.5610804453816333e-06, "loss": 0.0586, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 543 }, { "completion_length": 250.0, "epoch": 0.04533333333333334, "grad_norm": 0.2612769305706024, "kl": 0.8998706340789795, "learning_rate": 2.5523560497083927e-06, "loss": 0.036, "reward": 1.0833333730697632, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 544 }, { "completion_length": 250.0, "epoch": 0.04541666666666667, "grad_norm": 0.3391062021255493, "kl": 0.9011841416358948, "learning_rate": 2.543631016093209e-06, "loss": 0.036, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 545 }, { "completion_length": 250.0, "epoch": 0.0455, "grad_norm": 0.909966230392456, "kl": 1.1737316846847534, "learning_rate": 2.5349054508478636e-06, "loss": 0.0469, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 546 }, { "completion_length": 250.0, "epoch": 0.04558333333333333, "grad_norm": 0.7492355704307556, "kl": 1.3316445350646973, "learning_rate": 2.526179460290615e-06, "loss": 0.0533, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 547 }, { "completion_length": 250.0, "epoch": 0.04566666666666667, "grad_norm": 0.2507287263870239, "kl": 0.7561590671539307, "learning_rate": 2.517453150744904e-06, "loss": 0.0302, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 548 }, { "completion_length": 250.0, "epoch": 0.04575, "grad_norm": 1.408830165863037, "kl": 1.2084333896636963, "learning_rate": 2.5087266285380597e-06, "loss": 0.0483, "reward": 1.2083333730697632, "reward_std": 0.9074209332466125, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 549 }, { "completion_length": 250.0, "epoch": 0.04583333333333333, "grad_norm": 0.5603641867637634, "kl": 1.1831409931182861, "learning_rate": 2.5e-06, "loss": 0.0473, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 550 }, { "completion_length": 250.0, "epoch": 0.04591666666666667, "grad_norm": 0.24828428030014038, "kl": 1.2554258108139038, "learning_rate": 2.4912733714619415e-06, "loss": 0.0502, "reward": 1.3333332538604736, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 551 }, { "completion_length": 250.0, "epoch": 0.046, "grad_norm": 0.2216569483280182, "kl": 1.5270963907241821, "learning_rate": 2.482546849255096e-06, "loss": 0.0611, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 552 }, { "completion_length": 250.0, "epoch": 0.04608333333333333, "grad_norm": 0.7645318508148193, "kl": 0.9473637342453003, "learning_rate": 2.4738205397093863e-06, "loss": 0.0379, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.75, "step": 553 }, { "completion_length": 250.0, "epoch": 0.04616666666666667, "grad_norm": 0.2752268612384796, "kl": 0.5936354994773865, "learning_rate": 2.4650945491521372e-06, "loss": 0.0237, "reward": 1.5, "reward_std": 0.471404492855072, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 554 }, { "completion_length": 250.0, "epoch": 0.04625, "grad_norm": 0.029000254347920418, "kl": 1.6099191904067993, "learning_rate": 2.4563689839067913e-06, "loss": 0.0644, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 555 }, { "completion_length": 250.0, "epoch": 0.04633333333333333, "grad_norm": 0.21292659640312195, "kl": 1.1784905195236206, "learning_rate": 2.447643950291608e-06, "loss": 0.0471, "reward": 1.1666667461395264, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9166666865348816, "step": 556 }, { "completion_length": 250.0, "epoch": 0.04641666666666667, "grad_norm": 0.23645764589309692, "kl": 1.1839085817337036, "learning_rate": 2.4389195546183676e-06, "loss": 0.0474, "reward": 1.7083333730697632, "reward_std": 0.48591262102127075, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 557 }, { "completion_length": 250.0, "epoch": 0.0465, "grad_norm": 0.38307103514671326, "kl": 1.4955880641937256, "learning_rate": 2.4301959031910785e-06, "loss": 0.0598, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 558 }, { "completion_length": 250.0, "epoch": 0.04658333333333333, "grad_norm": 0.2519403398036957, "kl": 1.4596502780914307, "learning_rate": 2.4214731023046795e-06, "loss": 0.0584, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9583333730697632, "step": 559 }, { "completion_length": 250.0, "epoch": 0.04666666666666667, "grad_norm": 0.8129304647445679, "kl": 1.21884024143219, "learning_rate": 2.4127512582437486e-06, "loss": 0.0488, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 560 }, { "completion_length": 250.0, "epoch": 0.04675, "grad_norm": 0.6133831143379211, "kl": 1.6948648691177368, "learning_rate": 2.4040304772812002e-06, "loss": 0.0678, "reward": 1.5, "reward_std": 0.8357109427452087, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 561 }, { "completion_length": 250.0, "epoch": 0.04683333333333333, "grad_norm": 0.30969029664993286, "kl": 1.4396021366119385, "learning_rate": 2.3953108656770018e-06, "loss": 0.0576, "reward": 1.25, "reward_std": 0.7918233275413513, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 562 }, { "completion_length": 250.0, "epoch": 0.04691666666666667, "grad_norm": 0.2306750863790512, "kl": 1.3534696102142334, "learning_rate": 2.3865925296768658e-06, "loss": 0.0541, "reward": 1.2916666269302368, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9166666865348816, "step": 563 }, { "completion_length": 250.0, "epoch": 0.047, "grad_norm": 0.2677202820777893, "kl": 1.3551838397979736, "learning_rate": 2.377875575510967e-06, "loss": 0.0542, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 564 }, { "completion_length": 250.0, "epoch": 0.04708333333333333, "grad_norm": 0.7604575157165527, "kl": 1.264426350593567, "learning_rate": 2.3691601093926406e-06, "loss": 0.0506, "reward": 1.1666667461395264, "reward_std": 0.50395268201828, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 565 }, { "completion_length": 250.0, "epoch": 0.04716666666666667, "grad_norm": 1.7140240669250488, "kl": 1.1938304901123047, "learning_rate": 2.3604462375170905e-06, "loss": 0.0478, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 566 }, { "completion_length": 250.0, "epoch": 0.04725, "grad_norm": 0.3368653655052185, "kl": 1.5280756950378418, "learning_rate": 2.3517340660600965e-06, "loss": 0.0611, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 567 }, { "completion_length": 250.0, "epoch": 0.04733333333333333, "grad_norm": 0.4907490015029907, "kl": 1.3557684421539307, "learning_rate": 2.3430237011767166e-06, "loss": 0.0542, "reward": 1.5416666269302368, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 568 }, { "completion_length": 250.0, "epoch": 0.04741666666666667, "grad_norm": 0.4219551086425781, "kl": 1.487517237663269, "learning_rate": 2.3343152490000004e-06, "loss": 0.0595, "reward": 1.625, "reward_std": 0.6283639073371887, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 569 }, { "completion_length": 250.0, "epoch": 0.0475, "grad_norm": 0.37886741757392883, "kl": 0.992394745349884, "learning_rate": 2.325608815639687e-06, "loss": 0.0397, "reward": 1.1666667461395264, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 570 }, { "completion_length": 250.0, "epoch": 0.04758333333333333, "grad_norm": 0.2531243562698364, "kl": 1.362313985824585, "learning_rate": 2.3169045071809217e-06, "loss": 0.0545, "reward": 1.75, "reward_std": 0.38832157850265503, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 571 }, { "completion_length": 250.0, "epoch": 0.04766666666666667, "grad_norm": 0.27313292026519775, "kl": 1.0673291683197021, "learning_rate": 2.3082024296829538e-06, "loss": 0.0427, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 572 }, { "completion_length": 250.0, "epoch": 0.04775, "grad_norm": 0.3057982325553894, "kl": 1.4214304685592651, "learning_rate": 2.2995026891778533e-06, "loss": 0.0569, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 573 }, { "completion_length": 250.0, "epoch": 0.04783333333333333, "grad_norm": 0.27380916476249695, "kl": 1.0907694101333618, "learning_rate": 2.290805391669212e-06, "loss": 0.0436, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 574 }, { "completion_length": 250.0, "epoch": 0.04791666666666667, "grad_norm": 0.4117473065853119, "kl": 1.1665431261062622, "learning_rate": 2.2821106431308546e-06, "loss": 0.0467, "reward": 1.1666667461395264, "reward_std": 0.5345224142074585, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 575 }, { "completion_length": 250.0, "epoch": 0.048, "grad_norm": 0.6448284983634949, "kl": 1.6899546384811401, "learning_rate": 2.2734185495055503e-06, "loss": 0.0676, "reward": 1.1666667461395264, "reward_std": 0.7766432166099548, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 576 }, { "completion_length": 250.0, "epoch": 0.04808333333333333, "grad_norm": 0.2402116358280182, "kl": 0.989196240901947, "learning_rate": 2.2647292167037143e-06, "loss": 0.0396, "reward": 1.5, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 577 }, { "completion_length": 250.0, "epoch": 0.04816666666666667, "grad_norm": 0.23641373217105865, "kl": 1.2374866008758545, "learning_rate": 2.256042750602127e-06, "loss": 0.0495, "reward": 1.7083333730697632, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 578 }, { "completion_length": 250.0, "epoch": 0.04825, "grad_norm": 0.4009597897529602, "kl": 1.3696413040161133, "learning_rate": 2.2473592570426343e-06, "loss": 0.0548, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 579 }, { "completion_length": 250.0, "epoch": 0.04833333333333333, "grad_norm": 0.47816938161849976, "kl": 1.0150160789489746, "learning_rate": 2.238678841830867e-06, "loss": 0.0406, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 580 }, { "completion_length": 250.0, "epoch": 0.04841666666666666, "grad_norm": 0.2902098000049591, "kl": 1.2081258296966553, "learning_rate": 2.230001610734943e-06, "loss": 0.0483, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 581 }, { "completion_length": 250.0, "epoch": 0.0485, "grad_norm": 0.5224894285202026, "kl": 1.5874559879302979, "learning_rate": 2.2213276694841866e-06, "loss": 0.0635, "reward": 1.2916667461395264, "reward_std": 0.8249579668045044, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7916666865348816, "step": 582 }, { "completion_length": 250.0, "epoch": 0.04858333333333333, "grad_norm": 0.3580199182033539, "kl": 0.8156141638755798, "learning_rate": 2.212657123767834e-06, "loss": 0.0326, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 583 }, { "completion_length": 250.0, "epoch": 0.048666666666666664, "grad_norm": 0.28623875975608826, "kl": 1.2345472574234009, "learning_rate": 2.2039900792337477e-06, "loss": 0.0494, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 584 }, { "completion_length": 250.0, "epoch": 0.04875, "grad_norm": 0.340573251247406, "kl": 1.5262372493743896, "learning_rate": 2.195326641487132e-06, "loss": 0.061, "reward": 1.7083333730697632, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 585 }, { "completion_length": 250.0, "epoch": 0.04883333333333333, "grad_norm": 0.425148069858551, "kl": 1.291420340538025, "learning_rate": 2.186666916089239e-06, "loss": 0.0517, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 586 }, { "completion_length": 250.0, "epoch": 0.048916666666666664, "grad_norm": 0.21294091641902924, "kl": 1.372128963470459, "learning_rate": 2.1780110085560935e-06, "loss": 0.0549, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 587 }, { "completion_length": 250.0, "epoch": 0.049, "grad_norm": 0.8173606395721436, "kl": 1.2428475618362427, "learning_rate": 2.1693590243571937e-06, "loss": 0.0497, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 588 }, { "completion_length": 250.0, "epoch": 0.04908333333333333, "grad_norm": 0.3507099449634552, "kl": 0.9352976083755493, "learning_rate": 2.1607110689142393e-06, "loss": 0.0374, "reward": 1.2083333730697632, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.5833333730697632, "step": 589 }, { "completion_length": 250.0, "epoch": 0.049166666666666664, "grad_norm": 0.6860688924789429, "kl": 1.3327147960662842, "learning_rate": 2.1520672475998374e-06, "loss": 0.0533, "reward": 1.4583333730697632, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 590 }, { "completion_length": 250.0, "epoch": 0.04925, "grad_norm": 0.1836400032043457, "kl": 1.7783141136169434, "learning_rate": 2.143427665736221e-06, "loss": 0.0711, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 591 }, { "completion_length": 250.0, "epoch": 0.04933333333333333, "grad_norm": 1.7451122999191284, "kl": 1.7091467380523682, "learning_rate": 2.134792428593971e-06, "loss": 0.0684, "reward": 1.3333332538604736, "reward_std": 0.6666666269302368, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333134651184, "step": 592 }, { "completion_length": 250.0, "epoch": 0.049416666666666664, "grad_norm": 0.23946592211723328, "kl": 1.4415324926376343, "learning_rate": 2.1261616413907267e-06, "loss": 0.0577, "reward": 1.2916667461395264, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9166666865348816, "step": 593 }, { "completion_length": 250.0, "epoch": 0.0495, "grad_norm": 0.02028987742960453, "kl": 1.161889910697937, "learning_rate": 2.117535409289905e-06, "loss": 0.0465, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 594 }, { "completion_length": 250.0, "epoch": 0.04958333333333333, "grad_norm": 0.23525746166706085, "kl": 1.4621292352676392, "learning_rate": 2.1089138373994226e-06, "loss": 0.0585, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 595 }, { "completion_length": 250.0, "epoch": 0.049666666666666665, "grad_norm": 0.2379079908132553, "kl": 0.9912834763526917, "learning_rate": 2.1002970307704134e-06, "loss": 0.0397, "reward": 1.0416667461395264, "reward_std": 0.5473601818084717, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7916666865348816, "step": 596 }, { "completion_length": 250.0, "epoch": 0.04975, "grad_norm": 1.0380606651306152, "kl": 1.8222761154174805, "learning_rate": 2.0916850943959453e-06, "loss": 0.0729, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 597 }, { "completion_length": 250.0, "epoch": 0.049833333333333334, "grad_norm": 0.2623489797115326, "kl": 0.9441961646080017, "learning_rate": 2.0830781332097446e-06, "loss": 0.0378, "reward": 1.5, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 598 }, { "completion_length": 250.0, "epoch": 0.049916666666666665, "grad_norm": 0.2724643647670746, "kl": 1.209511637687683, "learning_rate": 2.0744762520849193e-06, "loss": 0.0484, "reward": 1.5833333730697632, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 599 }, { "completion_length": 250.0, "epoch": 0.05, "grad_norm": 0.02976626716554165, "kl": 1.7429208755493164, "learning_rate": 2.0658795558326745e-06, "loss": 0.0697, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 600 }, { "completion_length": 250.0, "epoch": 0.050083333333333334, "grad_norm": 0.2598009407520294, "kl": 1.0037543773651123, "learning_rate": 2.0572881492010423e-06, "loss": 0.0402, "reward": 1.3333333730697632, "reward_std": 0.854493260383606, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 601 }, { "completion_length": 250.0, "epoch": 0.050166666666666665, "grad_norm": 0.2374367117881775, "kl": 1.5422213077545166, "learning_rate": 2.0487021368736002e-06, "loss": 0.0617, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 602 }, { "completion_length": 202.0, "epoch": 0.05025, "grad_norm": 0.23875918984413147, "kl": 1.7543926239013672, "learning_rate": 2.0401216234682e-06, "loss": 0.0702, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 603 }, { "completion_length": 250.0, "epoch": 0.050333333333333334, "grad_norm": 0.03209434449672699, "kl": 1.4833548069000244, "learning_rate": 2.031546713535688e-06, "loss": 0.0593, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 1.0, "step": 604 }, { "completion_length": 250.0, "epoch": 0.050416666666666665, "grad_norm": 0.27828800678253174, "kl": 1.3084222078323364, "learning_rate": 2.022977511558638e-06, "loss": 0.0523, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 605 }, { "completion_length": 250.0, "epoch": 0.0505, "grad_norm": 1.4770981073379517, "kl": 1.1198890209197998, "learning_rate": 2.0144141219500707e-06, "loss": 0.0448, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 606 }, { "completion_length": 250.0, "epoch": 0.050583333333333334, "grad_norm": 0.33261868357658386, "kl": 1.2287256717681885, "learning_rate": 2.0058566490521848e-06, "loss": 0.0491, "reward": 1.0416666269302368, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7916666865348816, "step": 607 }, { "completion_length": 250.0, "epoch": 0.050666666666666665, "grad_norm": 0.28096532821655273, "kl": 1.3212240934371948, "learning_rate": 1.997305197135089e-06, "loss": 0.0528, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 608 }, { "completion_length": 250.0, "epoch": 0.05075, "grad_norm": 0.7251350283622742, "kl": 2.047295331954956, "learning_rate": 1.9887598703955244e-06, "loss": 0.0819, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 609 }, { "completion_length": 250.0, "epoch": 0.050833333333333335, "grad_norm": 0.22892563045024872, "kl": 1.1812776327133179, "learning_rate": 1.9802207729556023e-06, "loss": 0.0473, "reward": 1.7916667461395264, "reward_std": 0.46929529309272766, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166667461395264, "step": 610 }, { "completion_length": 250.0, "epoch": 0.050916666666666666, "grad_norm": 0.446869432926178, "kl": 1.157643437385559, "learning_rate": 1.971688008861529e-06, "loss": 0.0463, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 611 }, { "completion_length": 250.0, "epoch": 0.051, "grad_norm": 0.889122486114502, "kl": 0.7201449275016785, "learning_rate": 1.963161682082342e-06, "loss": 0.0288, "reward": 0.9166666865348816, "reward_std": 0.5563486218452454, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.7916666865348816, "step": 612 }, { "completion_length": 250.0, "epoch": 0.051083333333333335, "grad_norm": 0.3091413378715515, "kl": 1.346756935119629, "learning_rate": 1.9546418965086444e-06, "loss": 0.0539, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9166666865348816, "step": 613 }, { "completion_length": 250.0, "epoch": 0.051166666666666666, "grad_norm": 0.40438079833984375, "kl": 1.2623693943023682, "learning_rate": 1.946128755951332e-06, "loss": 0.0505, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 614 }, { "completion_length": 244.0, "epoch": 0.05125, "grad_norm": 0.31069228053092957, "kl": 1.4803262948989868, "learning_rate": 1.937622364140338e-06, "loss": 0.0592, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 615 }, { "completion_length": 250.0, "epoch": 0.051333333333333335, "grad_norm": 0.34841683506965637, "kl": 1.1213486194610596, "learning_rate": 1.9291228247233607e-06, "loss": 0.0449, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 616 }, { "completion_length": 250.0, "epoch": 0.051416666666666666, "grad_norm": 0.2804429829120636, "kl": 1.3213424682617188, "learning_rate": 1.9206302412646074e-06, "loss": 0.0529, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.9583333730697632, "step": 617 }, { "completion_length": 250.0, "epoch": 0.0515, "grad_norm": 0.2578163146972656, "kl": 1.5600756406784058, "learning_rate": 1.912144717243525e-06, "loss": 0.0624, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 618 }, { "completion_length": 250.0, "epoch": 0.051583333333333335, "grad_norm": 0.27005648612976074, "kl": 1.027485728263855, "learning_rate": 1.9036663560535484e-06, "loss": 0.0411, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 619 }, { "completion_length": 250.0, "epoch": 0.051666666666666666, "grad_norm": 0.3639249801635742, "kl": 1.0344536304473877, "learning_rate": 1.895195261000831e-06, "loss": 0.0414, "reward": 1.6666667461395264, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 620 }, { "completion_length": 250.0, "epoch": 0.05175, "grad_norm": 0.23271331191062927, "kl": 0.8880565166473389, "learning_rate": 1.8867315353029937e-06, "loss": 0.0355, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9583333730697632, "step": 621 }, { "completion_length": 250.0, "epoch": 0.051833333333333335, "grad_norm": 0.2460673302412033, "kl": 2.0310378074645996, "learning_rate": 1.8782752820878636e-06, "loss": 0.0812, "reward": 1.6666667461395264, "reward_std": 0.4714045226573944, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 622 }, { "completion_length": 250.0, "epoch": 0.051916666666666667, "grad_norm": 0.3399658501148224, "kl": 1.346755027770996, "learning_rate": 1.8698266043922159e-06, "loss": 0.0539, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 623 }, { "completion_length": 250.0, "epoch": 0.052, "grad_norm": 0.2878936529159546, "kl": 1.044472575187683, "learning_rate": 1.8613856051605242e-06, "loss": 0.0418, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 624 }, { "completion_length": 250.0, "epoch": 0.052083333333333336, "grad_norm": 0.30562835931777954, "kl": 1.278110384941101, "learning_rate": 1.852952387243698e-06, "loss": 0.0511, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 625 }, { "completion_length": 250.0, "epoch": 0.05216666666666667, "grad_norm": 0.30309000611305237, "kl": 1.116170048713684, "learning_rate": 1.8445270533978387e-06, "loss": 0.0446, "reward": 0.7916666865348816, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.7916666865348816, "step": 626 }, { "completion_length": 250.0, "epoch": 0.05225, "grad_norm": 0.29444682598114014, "kl": 1.0253041982650757, "learning_rate": 1.836109706282978e-06, "loss": 0.041, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 627 }, { "completion_length": 250.0, "epoch": 0.052333333333333336, "grad_norm": 0.3537648320198059, "kl": 0.7810394167900085, "learning_rate": 1.827700448461836e-06, "loss": 0.0312, "reward": 1.2083333730697632, "reward_std": 0.8717342615127563, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 628 }, { "completion_length": 250.0, "epoch": 0.05241666666666667, "grad_norm": 0.1647401601076126, "kl": 1.585963249206543, "learning_rate": 1.8192993823985643e-06, "loss": 0.0634, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 629 }, { "completion_length": 250.0, "epoch": 0.0525, "grad_norm": 0.9263207316398621, "kl": 1.3402163982391357, "learning_rate": 1.8109066104575023e-06, "loss": 0.0536, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9583333730697632, "step": 630 }, { "completion_length": 250.0, "epoch": 0.052583333333333336, "grad_norm": 0.29536283016204834, "kl": 1.0759024620056152, "learning_rate": 1.8025222349019273e-06, "loss": 0.043, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 631 }, { "completion_length": 250.0, "epoch": 0.05266666666666667, "grad_norm": 0.7649843692779541, "kl": 0.7642300724983215, "learning_rate": 1.7941463578928088e-06, "loss": 0.0306, "reward": 1.5, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 632 }, { "completion_length": 250.0, "epoch": 0.05275, "grad_norm": 0.26629337668418884, "kl": 1.3809568881988525, "learning_rate": 1.7857790814875665e-06, "loss": 0.0552, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 633 }, { "completion_length": 250.0, "epoch": 0.052833333333333336, "grad_norm": 0.2140216827392578, "kl": 0.8888575434684753, "learning_rate": 1.7774205076388207e-06, "loss": 0.0356, "reward": 1.7083333730697632, "reward_std": 0.5473601818084717, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9583333730697632, "step": 634 }, { "completion_length": 250.0, "epoch": 0.05291666666666667, "grad_norm": 0.27039581537246704, "kl": 0.562234103679657, "learning_rate": 1.7690707381931585e-06, "loss": 0.0225, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.75, "step": 635 }, { "completion_length": 250.0, "epoch": 0.053, "grad_norm": 0.2814823091030121, "kl": 1.5715206861495972, "learning_rate": 1.7607298748898844e-06, "loss": 0.0629, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 636 }, { "completion_length": 250.0, "epoch": 0.05308333333333334, "grad_norm": 0.21937192976474762, "kl": 0.5414950251579285, "learning_rate": 1.7523980193597837e-06, "loss": 0.0217, "reward": 1.5833333730697632, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 637 }, { "completion_length": 250.0, "epoch": 0.05316666666666667, "grad_norm": 0.28082460165023804, "kl": 1.2513439655303955, "learning_rate": 1.744075273123889e-06, "loss": 0.0501, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 638 }, { "completion_length": 250.0, "epoch": 0.05325, "grad_norm": 0.029399115592241287, "kl": 1.4624956846237183, "learning_rate": 1.735761737592236e-06, "loss": 0.0585, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 639 }, { "completion_length": 250.0, "epoch": 0.05333333333333334, "grad_norm": 0.39860522747039795, "kl": 1.6155064105987549, "learning_rate": 1.7274575140626318e-06, "loss": 0.0646, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.75, "step": 640 }, { "completion_length": 250.0, "epoch": 0.05341666666666667, "grad_norm": 0.25739023089408875, "kl": 1.2121546268463135, "learning_rate": 1.7191627037194187e-06, "loss": 0.0485, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 641 }, { "completion_length": 250.0, "epoch": 0.0535, "grad_norm": 0.3668891191482544, "kl": 1.4224536418914795, "learning_rate": 1.7108774076322443e-06, "loss": 0.0569, "reward": 1.5833333730697632, "reward_std": 0.5841829776763916, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 642 }, { "completion_length": 212.0, "epoch": 0.05358333333333333, "grad_norm": 0.2385721206665039, "kl": 1.3418517112731934, "learning_rate": 1.702601726754825e-06, "loss": 0.0537, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 643 }, { "completion_length": 250.0, "epoch": 0.05366666666666667, "grad_norm": 0.34340476989746094, "kl": 1.1666271686553955, "learning_rate": 1.6943357619237227e-06, "loss": 0.0467, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5, "step": 644 }, { "completion_length": 250.0, "epoch": 0.05375, "grad_norm": 0.22176310420036316, "kl": 1.4181245565414429, "learning_rate": 1.686079613857109e-06, "loss": 0.0567, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 645 }, { "completion_length": 250.0, "epoch": 0.05383333333333333, "grad_norm": 0.27106255292892456, "kl": 0.7385506629943848, "learning_rate": 1.677833383153542e-06, "loss": 0.0295, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.875, "step": 646 }, { "completion_length": 250.0, "epoch": 0.05391666666666667, "grad_norm": 0.23391051590442657, "kl": 1.1499696969985962, "learning_rate": 1.6695971702907425e-06, "loss": 0.046, "reward": 1.7083333730697632, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 647 }, { "completion_length": 250.0, "epoch": 0.054, "grad_norm": 0.29008620977401733, "kl": 1.0525387525558472, "learning_rate": 1.661371075624363e-06, "loss": 0.0421, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 648 }, { "completion_length": 250.0, "epoch": 0.05408333333333333, "grad_norm": 0.2697657644748688, "kl": 0.778380811214447, "learning_rate": 1.6531551993867717e-06, "loss": 0.0311, "reward": 1.9166667461395264, "reward_std": 0.15430331230163574, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9166666865348816, "step": 649 }, { "completion_length": 250.0, "epoch": 0.05416666666666667, "grad_norm": 0.2077609896659851, "kl": 0.8145649433135986, "learning_rate": 1.6449496416858285e-06, "loss": 0.0326, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 650 }, { "completion_length": 250.0, "epoch": 0.05425, "grad_norm": 0.2946789562702179, "kl": 1.0525617599487305, "learning_rate": 1.6367545025036634e-06, "loss": 0.0421, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9166666865348816, "step": 651 }, { "completion_length": 250.0, "epoch": 0.05433333333333333, "grad_norm": 0.2999846041202545, "kl": 1.0172115564346313, "learning_rate": 1.6285698816954626e-06, "loss": 0.0407, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 652 }, { "completion_length": 250.0, "epoch": 0.05441666666666667, "grad_norm": 0.3423196077346802, "kl": 1.435135006904602, "learning_rate": 1.6203958789882457e-06, "loss": 0.0574, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 653 }, { "completion_length": 250.0, "epoch": 0.0545, "grad_norm": 0.4042535424232483, "kl": 0.9854827523231506, "learning_rate": 1.612232593979658e-06, "loss": 0.0394, "reward": 1.5416666269302368, "reward_std": 0.6651769280433655, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 654 }, { "completion_length": 250.0, "epoch": 0.05458333333333333, "grad_norm": 0.5451151132583618, "kl": 1.6612234115600586, "learning_rate": 1.6040801261367494e-06, "loss": 0.0664, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.7916666865348816, "step": 655 }, { "completion_length": 250.0, "epoch": 0.05466666666666667, "grad_norm": 0.28427278995513916, "kl": 0.8143528699874878, "learning_rate": 1.5959385747947697e-06, "loss": 0.0326, "reward": 1.5416666269302368, "reward_std": 0.853331446647644, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 656 }, { "completion_length": 250.0, "epoch": 0.05475, "grad_norm": 0.26303741335868835, "kl": 1.1400749683380127, "learning_rate": 1.5878080391559507e-06, "loss": 0.0456, "reward": 1.4583333730697632, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 657 }, { "completion_length": 250.0, "epoch": 0.05483333333333333, "grad_norm": 2.4117088317871094, "kl": 1.7515506744384766, "learning_rate": 1.5796886182883053e-06, "loss": 0.0701, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 658 }, { "completion_length": 250.0, "epoch": 0.05491666666666667, "grad_norm": 0.37918296456336975, "kl": 1.5612022876739502, "learning_rate": 1.5715804111244138e-06, "loss": 0.0624, "reward": 1.6666667461395264, "reward_std": 0.4364357888698578, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 659 }, { "completion_length": 250.0, "epoch": 0.055, "grad_norm": 0.3990735709667206, "kl": 0.9025101065635681, "learning_rate": 1.56348351646022e-06, "loss": 0.0361, "reward": 1.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 660 }, { "completion_length": 250.0, "epoch": 0.05508333333333333, "grad_norm": 0.2836557924747467, "kl": 1.0622992515563965, "learning_rate": 1.5553980329538326e-06, "loss": 0.0425, "reward": 1.5416666269302368, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 661 }, { "completion_length": 249.0, "epoch": 0.05516666666666667, "grad_norm": 0.4254859685897827, "kl": 1.7287112474441528, "learning_rate": 1.547324059124315e-06, "loss": 0.0691, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 662 }, { "completion_length": 250.0, "epoch": 0.05525, "grad_norm": 0.517515242099762, "kl": 1.6896018981933594, "learning_rate": 1.539261693350491e-06, "loss": 0.0676, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 663 }, { "completion_length": 250.0, "epoch": 0.05533333333333333, "grad_norm": 0.03853216394782066, "kl": 1.4163974523544312, "learning_rate": 1.5312110338697427e-06, "loss": 0.0567, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 664 }, { "completion_length": 250.0, "epoch": 0.05541666666666667, "grad_norm": 5.298870086669922, "kl": 1.7589707374572754, "learning_rate": 1.5231721787768162e-06, "loss": 0.0704, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 665 }, { "completion_length": 250.0, "epoch": 0.0555, "grad_norm": 0.28842854499816895, "kl": 1.1976323127746582, "learning_rate": 1.5151452260226224e-06, "loss": 0.0479, "reward": 1.6666667461395264, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 666 }, { "completion_length": 250.0, "epoch": 0.05558333333333333, "grad_norm": 0.4191083610057831, "kl": 1.463090419769287, "learning_rate": 1.5071302734130488e-06, "loss": 0.0585, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 667 }, { "completion_length": 250.0, "epoch": 0.05566666666666667, "grad_norm": 0.2382550835609436, "kl": 1.061838150024414, "learning_rate": 1.4991274186077632e-06, "loss": 0.0425, "reward": 1.5833332538604736, "reward_std": 0.6362090110778809, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 668 }, { "completion_length": 250.0, "epoch": 0.05575, "grad_norm": 0.2757638394832611, "kl": 1.002281665802002, "learning_rate": 1.491136759119025e-06, "loss": 0.0401, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 669 }, { "completion_length": 250.0, "epoch": 0.05583333333333333, "grad_norm": 0.4550880193710327, "kl": 0.8322465419769287, "learning_rate": 1.4831583923105e-06, "loss": 0.0333, "reward": 1.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7083333730697632, "step": 670 }, { "completion_length": 250.0, "epoch": 0.05591666666666667, "grad_norm": 0.52930748462677, "kl": 1.6172356605529785, "learning_rate": 1.4751924153960681e-06, "loss": 0.0647, "reward": 1.5833333730697632, "reward_std": 0.5841830372810364, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 671 }, { "completion_length": 250.0, "epoch": 0.056, "grad_norm": 0.35954219102859497, "kl": 1.1327468156814575, "learning_rate": 1.467238925438646e-06, "loss": 0.0453, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 672 }, { "completion_length": 250.0, "epoch": 0.05608333333333333, "grad_norm": 0.29655808210372925, "kl": 1.7345565557479858, "learning_rate": 1.4592980193489975e-06, "loss": 0.0694, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 673 }, { "completion_length": 250.0, "epoch": 0.05616666666666666, "grad_norm": 0.2724839746952057, "kl": 0.983393669128418, "learning_rate": 1.4513697938845571e-06, "loss": 0.0393, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 674 }, { "completion_length": 250.0, "epoch": 0.05625, "grad_norm": 0.2653176784515381, "kl": 1.4598757028579712, "learning_rate": 1.443454345648252e-06, "loss": 0.0584, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.875, "step": 675 }, { "completion_length": 250.0, "epoch": 0.05633333333333333, "grad_norm": 0.4087188243865967, "kl": 1.2594470977783203, "learning_rate": 1.4355517710873184e-06, "loss": 0.0504, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 676 }, { "completion_length": 250.0, "epoch": 0.056416666666666664, "grad_norm": 0.25172901153564453, "kl": 0.8408035635948181, "learning_rate": 1.4276621664921358e-06, "loss": 0.0336, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 677 }, { "completion_length": 250.0, "epoch": 0.0565, "grad_norm": 0.26606500148773193, "kl": 1.4946765899658203, "learning_rate": 1.419785627995044e-06, "loss": 0.0598, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 678 }, { "completion_length": 250.0, "epoch": 0.05658333333333333, "grad_norm": 0.2562119662761688, "kl": 1.0255094766616821, "learning_rate": 1.4119222515691817e-06, "loss": 0.041, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 679 }, { "completion_length": 250.0, "epoch": 0.056666666666666664, "grad_norm": 0.2506910264492035, "kl": 1.0600122213363647, "learning_rate": 1.4040721330273063e-06, "loss": 0.0424, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 680 }, { "completion_length": 250.0, "epoch": 0.05675, "grad_norm": 0.2096366286277771, "kl": 0.6006632447242737, "learning_rate": 1.3962353680206372e-06, "loss": 0.024, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 681 }, { "completion_length": 250.0, "epoch": 0.05683333333333333, "grad_norm": 0.032199203968048096, "kl": 1.5546534061431885, "learning_rate": 1.388412052037682e-06, "loss": 0.0622, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 682 }, { "completion_length": 214.0, "epoch": 0.056916666666666664, "grad_norm": 0.5028219819068909, "kl": 1.4313411712646484, "learning_rate": 1.380602280403076e-06, "loss": 0.0573, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 683 }, { "completion_length": 250.0, "epoch": 0.057, "grad_norm": 0.19551897048950195, "kl": 0.8340595960617065, "learning_rate": 1.3728061482764238e-06, "loss": 0.0334, "reward": 1.2083333730697632, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9583333730697632, "step": 684 }, { "completion_length": 250.0, "epoch": 0.05708333333333333, "grad_norm": 0.4893221855163574, "kl": 1.325058102607727, "learning_rate": 1.3650237506511333e-06, "loss": 0.053, "reward": 1.8333333730697632, "reward_std": 0.35634827613830566, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 685 }, { "completion_length": 250.0, "epoch": 0.057166666666666664, "grad_norm": 0.18707111477851868, "kl": 1.2680920362472534, "learning_rate": 1.3572551823532654e-06, "loss": 0.0507, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 686 }, { "completion_length": 250.0, "epoch": 0.05725, "grad_norm": 0.2992132306098938, "kl": 1.0152003765106201, "learning_rate": 1.349500538040371e-06, "loss": 0.0406, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 687 }, { "completion_length": 250.0, "epoch": 0.05733333333333333, "grad_norm": 0.24606172740459442, "kl": 1.0200669765472412, "learning_rate": 1.3417599122003464e-06, "loss": 0.0408, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 688 }, { "completion_length": 250.0, "epoch": 0.057416666666666664, "grad_norm": 0.269815593957901, "kl": 1.2791739702224731, "learning_rate": 1.3340333991502723e-06, "loss": 0.0512, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 689 }, { "completion_length": 250.0, "epoch": 0.0575, "grad_norm": 0.3196219205856323, "kl": 1.3466354608535767, "learning_rate": 1.3263210930352737e-06, "loss": 0.0539, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 690 }, { "completion_length": 250.0, "epoch": 0.057583333333333334, "grad_norm": 0.397535115480423, "kl": 1.0252505540847778, "learning_rate": 1.3186230878273654e-06, "loss": 0.041, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.625, "step": 691 }, { "completion_length": 250.0, "epoch": 0.057666666666666665, "grad_norm": 0.29124775528907776, "kl": 1.024878740310669, "learning_rate": 1.3109394773243117e-06, "loss": 0.041, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 692 }, { "completion_length": 250.0, "epoch": 0.05775, "grad_norm": 0.2738453447818756, "kl": 1.3520758152008057, "learning_rate": 1.3032703551484832e-06, "loss": 0.0541, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 693 }, { "completion_length": 250.0, "epoch": 0.057833333333333334, "grad_norm": 0.2758381962776184, "kl": 1.5181013345718384, "learning_rate": 1.2956158147457116e-06, "loss": 0.0607, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 694 }, { "completion_length": 250.0, "epoch": 0.057916666666666665, "grad_norm": 0.7927420139312744, "kl": 0.9507368803024292, "learning_rate": 1.2879759493841577e-06, "loss": 0.038, "reward": 1.2916666269302368, "reward_std": 0.8249579071998596, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7916666865348816, "step": 695 }, { "completion_length": 250.0, "epoch": 0.058, "grad_norm": 0.2301512509584427, "kl": 1.0944889783859253, "learning_rate": 1.280350852153168e-06, "loss": 0.0438, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 696 }, { "completion_length": 250.0, "epoch": 0.058083333333333334, "grad_norm": 0.2712516784667969, "kl": 0.8457162976264954, "learning_rate": 1.272740615962148e-06, "loss": 0.0338, "reward": 1.8333333730697632, "reward_std": 0.25197628140449524, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.8333333730697632, "step": 697 }, { "completion_length": 250.0, "epoch": 0.058166666666666665, "grad_norm": 0.32763615250587463, "kl": 0.9605115652084351, "learning_rate": 1.2651453335394232e-06, "loss": 0.0384, "reward": 1.2916667461395264, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.6666666865348816, "step": 698 }, { "completion_length": 250.0, "epoch": 0.05825, "grad_norm": 0.29685455560684204, "kl": 1.2807537317276, "learning_rate": 1.2575650974311118e-06, "loss": 0.0512, "reward": 1.4166667461395264, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 699 }, { "completion_length": 250.0, "epoch": 0.058333333333333334, "grad_norm": 0.3393094837665558, "kl": 1.317583441734314, "learning_rate": 1.2500000000000007e-06, "loss": 0.0527, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 700 }, { "completion_length": 250.0, "epoch": 0.058416666666666665, "grad_norm": 0.24073301255702972, "kl": 1.267662763595581, "learning_rate": 1.2424501334244124e-06, "loss": 0.0507, "reward": 1.0416667461395264, "reward_std": 0.41547447443008423, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9166667461395264, "step": 701 }, { "completion_length": 250.0, "epoch": 0.0585, "grad_norm": 0.2873644530773163, "kl": 1.1300421953201294, "learning_rate": 1.234915589697091e-06, "loss": 0.0452, "reward": 1.7083333730697632, "reward_std": 0.602573812007904, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 702 }, { "completion_length": 250.0, "epoch": 0.058583333333333334, "grad_norm": 0.2742319703102112, "kl": 0.9622892141342163, "learning_rate": 1.2273964606240718e-06, "loss": 0.0385, "reward": 1.6666666269302368, "reward_std": 0.6424161195755005, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 703 }, { "completion_length": 250.0, "epoch": 0.058666666666666666, "grad_norm": 0.21177729964256287, "kl": 0.9296140670776367, "learning_rate": 1.2198928378235717e-06, "loss": 0.0372, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 704 }, { "completion_length": 250.0, "epoch": 0.05875, "grad_norm": 0.6481500267982483, "kl": 1.4008327722549438, "learning_rate": 1.2124048127248644e-06, "loss": 0.056, "reward": 1.2083333730697632, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9583333730697632, "step": 705 }, { "completion_length": 250.0, "epoch": 0.058833333333333335, "grad_norm": 0.21664856374263763, "kl": 1.1676733493804932, "learning_rate": 1.204932476567175e-06, "loss": 0.0467, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 706 }, { "completion_length": 250.0, "epoch": 0.058916666666666666, "grad_norm": 0.33540189266204834, "kl": 1.3017733097076416, "learning_rate": 1.19747592039856e-06, "loss": 0.0521, "reward": 1.8333333730697632, "reward_std": 0.35634833574295044, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 707 }, { "completion_length": 250.0, "epoch": 0.059, "grad_norm": 0.2600402235984802, "kl": 1.0948798656463623, "learning_rate": 1.1900352350748026e-06, "loss": 0.0438, "reward": 1.7916666269302368, "reward_std": 0.5892555713653564, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 708 }, { "completion_length": 250.0, "epoch": 0.059083333333333335, "grad_norm": 0.20079267024993896, "kl": 1.0153428316116333, "learning_rate": 1.1826105112583061e-06, "loss": 0.0406, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 709 }, { "completion_length": 250.0, "epoch": 0.059166666666666666, "grad_norm": 0.29417306184768677, "kl": 0.6207944750785828, "learning_rate": 1.1752018394169882e-06, "loss": 0.0248, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 710 }, { "completion_length": 250.0, "epoch": 0.05925, "grad_norm": 0.40088775753974915, "kl": 1.1724050045013428, "learning_rate": 1.1678093098231748e-06, "loss": 0.0469, "reward": 1.1666666269302368, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 711 }, { "completion_length": 250.0, "epoch": 0.059333333333333335, "grad_norm": 0.4337019920349121, "kl": 1.4313944578170776, "learning_rate": 1.160433012552508e-06, "loss": 0.0573, "reward": 1.6666666269302368, "reward_std": 0.6424161195755005, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 712 }, { "completion_length": 250.0, "epoch": 0.059416666666666666, "grad_norm": 0.25332921743392944, "kl": 1.3762283325195312, "learning_rate": 1.1530730374828422e-06, "loss": 0.055, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 713 }, { "completion_length": 250.0, "epoch": 0.0595, "grad_norm": 0.4357444941997528, "kl": 1.0639551877975464, "learning_rate": 1.1457294742931508e-06, "loss": 0.0426, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 714 }, { "completion_length": 250.0, "epoch": 0.059583333333333335, "grad_norm": 0.3148086369037628, "kl": 1.8188680410385132, "learning_rate": 1.1384024124624324e-06, "loss": 0.0728, "reward": 1.4166667461395264, "reward_std": 0.771516740322113, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916666865348816, "step": 715 }, { "completion_length": 250.0, "epoch": 0.059666666666666666, "grad_norm": 0.3978966176509857, "kl": 2.0942423343658447, "learning_rate": 1.1310919412686248e-06, "loss": 0.0838, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 716 }, { "completion_length": 250.0, "epoch": 0.05975, "grad_norm": 0.3257802128791809, "kl": 1.4454894065856934, "learning_rate": 1.1237981497875112e-06, "loss": 0.0578, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 717 }, { "completion_length": 250.0, "epoch": 0.059833333333333336, "grad_norm": 0.29710352420806885, "kl": 0.89763343334198, "learning_rate": 1.11652112689164e-06, "loss": 0.0359, "reward": 1.5, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 718 }, { "completion_length": 250.0, "epoch": 0.05991666666666667, "grad_norm": 0.34548911452293396, "kl": 1.2456828355789185, "learning_rate": 1.109260961249238e-06, "loss": 0.0498, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 719 }, { "completion_length": 250.0, "epoch": 0.06, "grad_norm": 0.46419206261634827, "kl": 1.6931712627410889, "learning_rate": 1.1020177413231334e-06, "loss": 0.0677, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 720 }, { "completion_length": 250.0, "epoch": 0.060083333333333336, "grad_norm": 0.2870257794857025, "kl": 0.8886799216270447, "learning_rate": 1.0947915553696742e-06, "loss": 0.0355, "reward": 1.8333333730697632, "reward_std": 0.35634827613830566, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 721 }, { "completion_length": 250.0, "epoch": 0.06016666666666667, "grad_norm": 0.334460973739624, "kl": 1.3340364694595337, "learning_rate": 1.0875824914376555e-06, "loss": 0.0534, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 722 }, { "completion_length": 250.0, "epoch": 0.06025, "grad_norm": 0.24705474078655243, "kl": 0.7932687997817993, "learning_rate": 1.0803906373672477e-06, "loss": 0.0317, "reward": 1.0416666269302368, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7916666865348816, "step": 723 }, { "completion_length": 250.0, "epoch": 0.060333333333333336, "grad_norm": 0.27795547246932983, "kl": 0.7931269407272339, "learning_rate": 1.073216080788921e-06, "loss": 0.0317, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.625, "step": 724 }, { "completion_length": 202.0, "epoch": 0.06041666666666667, "grad_norm": 0.1886208951473236, "kl": 1.4156323671340942, "learning_rate": 1.0660589091223854e-06, "loss": 0.0566, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 725 }, { "completion_length": 250.0, "epoch": 0.0605, "grad_norm": 0.23371046781539917, "kl": 0.7740318179130554, "learning_rate": 1.0589192095755172e-06, "loss": 0.031, "reward": 1.4583333730697632, "reward_std": 0.6886264085769653, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 726 }, { "completion_length": 250.0, "epoch": 0.060583333333333336, "grad_norm": 1.0416193008422852, "kl": 1.2525185346603394, "learning_rate": 1.0517970691433035e-06, "loss": 0.0501, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 727 }, { "completion_length": 250.0, "epoch": 0.06066666666666667, "grad_norm": 0.335510790348053, "kl": 0.8215186595916748, "learning_rate": 1.0446925746067768e-06, "loss": 0.0329, "reward": 1.4583332538604736, "reward_std": 0.7955730557441711, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 728 }, { "completion_length": 250.0, "epoch": 0.06075, "grad_norm": 0.4560522735118866, "kl": 0.985789954662323, "learning_rate": 1.0376058125319614e-06, "loss": 0.0394, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 729 }, { "completion_length": 227.0, "epoch": 0.060833333333333336, "grad_norm": 0.25721290707588196, "kl": 1.056433916091919, "learning_rate": 1.0305368692688175e-06, "loss": 0.0423, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9583333730697632, "step": 730 }, { "completion_length": 250.0, "epoch": 0.06091666666666667, "grad_norm": 1.3237224817276, "kl": 0.6960461735725403, "learning_rate": 1.0234858309501864e-06, "loss": 0.0278, "reward": 1.0, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 731 }, { "completion_length": 250.0, "epoch": 0.061, "grad_norm": 0.2881050407886505, "kl": 0.5406643152236938, "learning_rate": 1.0164527834907468e-06, "loss": 0.0216, "reward": 1.3333333730697632, "reward_std": 0.8908708095550537, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 732 }, { "completion_length": 250.0, "epoch": 0.06108333333333334, "grad_norm": 0.21133320033550262, "kl": 1.2629066705703735, "learning_rate": 1.0094378125859602e-06, "loss": 0.0505, "reward": 1.2083333730697632, "reward_std": 0.5019802451133728, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9583333730697632, "step": 733 }, { "completion_length": 250.0, "epoch": 0.06116666666666667, "grad_norm": 0.22745351493358612, "kl": 0.8365185856819153, "learning_rate": 1.0024410037110358e-06, "loss": 0.0335, "reward": 1.2083333730697632, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.9583333730697632, "step": 734 }, { "completion_length": 250.0, "epoch": 0.06125, "grad_norm": 0.2476133406162262, "kl": 1.2012953758239746, "learning_rate": 9.95462442119879e-07, "loss": 0.0481, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 735 }, { "completion_length": 250.0, "epoch": 0.06133333333333333, "grad_norm": 0.19638416171073914, "kl": 1.6485852003097534, "learning_rate": 9.88502212844063e-07, "loss": 0.0659, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 736 }, { "completion_length": 250.0, "epoch": 0.06141666666666667, "grad_norm": 0.31301435828208923, "kl": 1.480404257774353, "learning_rate": 9.815604006917839e-07, "loss": 0.0592, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 737 }, { "completion_length": 250.0, "epoch": 0.0615, "grad_norm": 0.27803361415863037, "kl": 0.7003533244132996, "learning_rate": 9.746370902468311e-07, "loss": 0.028, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 738 }, { "completion_length": 250.0, "epoch": 0.06158333333333333, "grad_norm": 0.2773330807685852, "kl": 1.3522332906723022, "learning_rate": 9.677323658675594e-07, "loss": 0.0541, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 739 }, { "completion_length": 250.0, "epoch": 0.06166666666666667, "grad_norm": 0.6077278852462769, "kl": 1.1051663160324097, "learning_rate": 9.608463116858544e-07, "loss": 0.0442, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 740 }, { "completion_length": 250.0, "epoch": 0.06175, "grad_norm": 0.3956521451473236, "kl": 1.057028889656067, "learning_rate": 9.53979011606115e-07, "loss": 0.0423, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 741 }, { "completion_length": 250.0, "epoch": 0.06183333333333333, "grad_norm": 0.2606249451637268, "kl": 1.6235839128494263, "learning_rate": 9.471305493042243e-07, "loss": 0.0649, "reward": 1.5833333730697632, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 742 }, { "completion_length": 250.0, "epoch": 0.06191666666666667, "grad_norm": 0.20523270964622498, "kl": 1.068432092666626, "learning_rate": 9.403010082265351e-07, "loss": 0.0427, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9583333730697632, "step": 743 }, { "completion_length": 250.0, "epoch": 0.062, "grad_norm": 0.2961874306201935, "kl": 1.2741352319717407, "learning_rate": 9.334904715888496e-07, "loss": 0.051, "reward": 1.4583333730697632, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 744 }, { "completion_length": 250.0, "epoch": 0.06208333333333333, "grad_norm": 0.2894919216632843, "kl": 1.0481557846069336, "learning_rate": 9.266990223754069e-07, "loss": 0.0419, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 745 }, { "completion_length": 250.0, "epoch": 0.06216666666666667, "grad_norm": 0.2562686800956726, "kl": 1.0808312892913818, "learning_rate": 9.199267433378728e-07, "loss": 0.0432, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 746 }, { "completion_length": 250.0, "epoch": 0.06225, "grad_norm": 0.28722333908081055, "kl": 1.1899094581604004, "learning_rate": 9.131737169943314e-07, "loss": 0.0476, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 747 }, { "completion_length": 250.0, "epoch": 0.06233333333333333, "grad_norm": 0.18329280614852905, "kl": 1.901811957359314, "learning_rate": 9.064400256282757e-07, "loss": 0.0761, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9583333730697632, "step": 748 }, { "completion_length": 250.0, "epoch": 0.06241666666666667, "grad_norm": 0.22949326038360596, "kl": 1.2201263904571533, "learning_rate": 8.99725751287611e-07, "loss": 0.0488, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 749 }, { "completion_length": 250.0, "epoch": 0.0625, "grad_norm": 0.32113534212112427, "kl": 1.5129036903381348, "learning_rate": 8.930309757836517e-07, "loss": 0.0605, "reward": 1.8333333730697632, "reward_std": 0.35634827613830566, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 750 }, { "completion_length": 250.0, "epoch": 0.06258333333333334, "grad_norm": 0.41787517070770264, "kl": 1.3719799518585205, "learning_rate": 8.863557806901233e-07, "loss": 0.0549, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 751 }, { "completion_length": 250.0, "epoch": 0.06266666666666666, "grad_norm": 0.2127346247434616, "kl": 1.2286632061004639, "learning_rate": 8.797002473421729e-07, "loss": 0.0491, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 752 }, { "completion_length": 250.0, "epoch": 0.06275, "grad_norm": 0.3302915096282959, "kl": 0.7858455777168274, "learning_rate": 8.73064456835373e-07, "loss": 0.0314, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 753 }, { "completion_length": 250.0, "epoch": 0.06283333333333334, "grad_norm": 0.21576067805290222, "kl": 1.1705996990203857, "learning_rate": 8.664484900247363e-07, "loss": 0.0468, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 754 }, { "completion_length": 250.0, "epoch": 0.06291666666666666, "grad_norm": 0.22521759569644928, "kl": 1.7442842721939087, "learning_rate": 8.598524275237321e-07, "loss": 0.0698, "reward": 1.7916666269302368, "reward_std": 0.589255690574646, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 755 }, { "completion_length": 250.0, "epoch": 0.063, "grad_norm": 1.9343209266662598, "kl": 1.2830684185028076, "learning_rate": 8.532763497032987e-07, "loss": 0.0513, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 756 }, { "completion_length": 250.0, "epoch": 0.06308333333333334, "grad_norm": 0.2488812506198883, "kl": 1.0733072757720947, "learning_rate": 8.467203366908708e-07, "loss": 0.0429, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 757 }, { "completion_length": 250.0, "epoch": 0.06316666666666666, "grad_norm": 0.310738742351532, "kl": 1.2989583015441895, "learning_rate": 8.40184468369396e-07, "loss": 0.052, "reward": 1.625, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8750000596046448, "step": 758 }, { "completion_length": 250.0, "epoch": 0.06325, "grad_norm": 0.25620657205581665, "kl": 1.613889217376709, "learning_rate": 8.336688243763691e-07, "loss": 0.0646, "reward": 1.5416667461395264, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 759 }, { "completion_length": 250.0, "epoch": 0.06333333333333334, "grad_norm": 0.30064281821250916, "kl": 1.306907296180725, "learning_rate": 8.271734841028553e-07, "loss": 0.0523, "reward": 1.2083333730697632, "reward_std": 0.7753646969795227, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 760 }, { "completion_length": 250.0, "epoch": 0.06341666666666666, "grad_norm": 0.3926166594028473, "kl": 1.2214906215667725, "learning_rate": 8.206985266925249e-07, "loss": 0.0489, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 761 }, { "completion_length": 250.0, "epoch": 0.0635, "grad_norm": 0.23394834995269775, "kl": 1.3625099658966064, "learning_rate": 8.142440310406923e-07, "loss": 0.0545, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 762 }, { "completion_length": 250.0, "epoch": 0.06358333333333334, "grad_norm": 1.0904709100723267, "kl": 1.4420005083084106, "learning_rate": 8.078100757933486e-07, "loss": 0.0577, "reward": 1.5833333730697632, "reward_std": 0.5841830372810364, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 763 }, { "completion_length": 250.0, "epoch": 0.06366666666666666, "grad_norm": 0.4322141110897064, "kl": 1.1168162822723389, "learning_rate": 8.013967393462094e-07, "loss": 0.0447, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 764 }, { "completion_length": 250.0, "epoch": 0.06375, "grad_norm": 0.26758918166160583, "kl": 1.202095627784729, "learning_rate": 7.950040998437541e-07, "loss": 0.0481, "reward": 1.6666667461395264, "reward_std": 0.7126966118812561, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 765 }, { "completion_length": 250.0, "epoch": 0.06383333333333334, "grad_norm": 0.30354562401771545, "kl": 1.4736483097076416, "learning_rate": 7.886322351782782e-07, "loss": 0.0589, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 766 }, { "completion_length": 250.0, "epoch": 0.06391666666666666, "grad_norm": 0.26811686158180237, "kl": 1.1674681901931763, "learning_rate": 7.822812229889429e-07, "loss": 0.0467, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.625, "step": 767 }, { "completion_length": 250.0, "epoch": 0.064, "grad_norm": 0.3494676351547241, "kl": 0.9113569259643555, "learning_rate": 7.759511406608255e-07, "loss": 0.0365, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 768 }, { "completion_length": 250.0, "epoch": 0.06408333333333334, "grad_norm": 1.503426194190979, "kl": 1.3052912950515747, "learning_rate": 7.696420653239834e-07, "loss": 0.0522, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 769 }, { "completion_length": 250.0, "epoch": 0.06416666666666666, "grad_norm": 0.27594801783561707, "kl": 0.7944636940956116, "learning_rate": 7.633540738525066e-07, "loss": 0.0318, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 770 }, { "completion_length": 250.0, "epoch": 0.06425, "grad_norm": 0.48417186737060547, "kl": 1.5408587455749512, "learning_rate": 7.57087242863589e-07, "loss": 0.0616, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 771 }, { "completion_length": 250.0, "epoch": 0.06433333333333334, "grad_norm": 0.21317587792873383, "kl": 0.9737959504127502, "learning_rate": 7.508416487165862e-07, "loss": 0.039, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 772 }, { "completion_length": 250.0, "epoch": 0.06441666666666666, "grad_norm": 0.22758537530899048, "kl": 1.2084604501724243, "learning_rate": 7.44617367512094e-07, "loss": 0.0483, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 773 }, { "completion_length": 250.0, "epoch": 0.0645, "grad_norm": 0.42783284187316895, "kl": 1.0768017768859863, "learning_rate": 7.384144750910133e-07, "loss": 0.0431, "reward": 1.2083333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333134651184, "step": 774 }, { "completion_length": 225.0, "epoch": 0.06458333333333334, "grad_norm": 0.22378872334957123, "kl": 1.7128205299377441, "learning_rate": 7.322330470336314e-07, "loss": 0.0685, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 775 }, { "completion_length": 250.0, "epoch": 0.06466666666666666, "grad_norm": 0.6742827296257019, "kl": 1.4242603778839111, "learning_rate": 7.260731586586983e-07, "loss": 0.057, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 776 }, { "completion_length": 250.0, "epoch": 0.06475, "grad_norm": 0.3610439896583557, "kl": 1.3908641338348389, "learning_rate": 7.199348850225091e-07, "loss": 0.0556, "reward": 1.5, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.625, "step": 777 }, { "completion_length": 250.0, "epoch": 0.06483333333333334, "grad_norm": 0.30944564938545227, "kl": 1.004869818687439, "learning_rate": 7.138183009179922e-07, "loss": 0.0402, "reward": 1.4583333730697632, "reward_std": 0.9074209332466125, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7083333730697632, "step": 778 }, { "completion_length": 250.0, "epoch": 0.06491666666666666, "grad_norm": 0.36364126205444336, "kl": 0.7205187082290649, "learning_rate": 7.077234808737932e-07, "loss": 0.0288, "reward": 1.375, "reward_std": 0.6283640265464783, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 779 }, { "completion_length": 250.0, "epoch": 0.065, "grad_norm": 3.758852958679199, "kl": 1.4535225629806519, "learning_rate": 7.016504991533727e-07, "loss": 0.0581, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 780 }, { "completion_length": 250.0, "epoch": 0.06508333333333334, "grad_norm": 0.2152343988418579, "kl": 1.338955283164978, "learning_rate": 6.955994297540947e-07, "loss": 0.0536, "reward": 1.8333333730697632, "reward_std": 0.47140446305274963, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 781 }, { "completion_length": 250.0, "epoch": 0.06516666666666666, "grad_norm": 0.4497411549091339, "kl": 1.6065069437026978, "learning_rate": 6.895703464063319e-07, "loss": 0.0643, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 782 }, { "completion_length": 250.0, "epoch": 0.06525, "grad_norm": 0.350917249917984, "kl": 1.1863226890563965, "learning_rate": 6.835633225725604e-07, "loss": 0.0475, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 783 }, { "completion_length": 250.0, "epoch": 0.06533333333333333, "grad_norm": 0.2987426817417145, "kl": 1.2886359691619873, "learning_rate": 6.775784314464717e-07, "loss": 0.0515, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 784 }, { "completion_length": 250.0, "epoch": 0.06541666666666666, "grad_norm": 0.27569258213043213, "kl": 1.205812931060791, "learning_rate": 6.716157459520739e-07, "loss": 0.0482, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7916666865348816, "step": 785 }, { "completion_length": 224.0, "epoch": 0.0655, "grad_norm": 0.4730622470378876, "kl": 1.1610169410705566, "learning_rate": 6.656753387428089e-07, "loss": 0.0464, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 786 }, { "completion_length": 250.0, "epoch": 0.06558333333333333, "grad_norm": 0.36808550357818604, "kl": 0.9821891784667969, "learning_rate": 6.597572822006643e-07, "loss": 0.0393, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.7916666865348816, "step": 787 }, { "completion_length": 250.0, "epoch": 0.06566666666666666, "grad_norm": 1.0250049829483032, "kl": 1.1594293117523193, "learning_rate": 6.538616484352902e-07, "loss": 0.0464, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 788 }, { "completion_length": 250.0, "epoch": 0.06575, "grad_norm": 0.32371285557746887, "kl": 1.193271517753601, "learning_rate": 6.479885092831251e-07, "loss": 0.0477, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 789 }, { "completion_length": 240.0, "epoch": 0.06583333333333333, "grad_norm": 0.3552028238773346, "kl": 1.2126344442367554, "learning_rate": 6.421379363065142e-07, "loss": 0.0485, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 790 }, { "completion_length": 250.0, "epoch": 0.06591666666666667, "grad_norm": 0.2879108190536499, "kl": 1.4960048198699951, "learning_rate": 6.363100007928447e-07, "loss": 0.0598, "reward": 1.7916666269302368, "reward_std": 0.5892555713653564, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 791 }, { "completion_length": 250.0, "epoch": 0.066, "grad_norm": 0.30861926078796387, "kl": 0.9252521395683289, "learning_rate": 6.305047737536707e-07, "loss": 0.037, "reward": 1.4583333730697632, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 792 }, { "completion_length": 250.0, "epoch": 0.06608333333333333, "grad_norm": 0.3545387089252472, "kl": 1.508813738822937, "learning_rate": 6.247223259238511e-07, "loss": 0.0604, "reward": 0.875, "reward_std": 0.39591166377067566, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.75, "step": 793 }, { "completion_length": 250.0, "epoch": 0.06616666666666667, "grad_norm": 0.2610399127006531, "kl": 0.8176184296607971, "learning_rate": 6.189627277606894e-07, "loss": 0.0327, "reward": 1.5833333730697632, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 794 }, { "completion_length": 250.0, "epoch": 0.06625, "grad_norm": 0.2623574137687683, "kl": 1.1873376369476318, "learning_rate": 6.1322604944307e-07, "loss": 0.0475, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 795 }, { "completion_length": 250.0, "epoch": 0.06633333333333333, "grad_norm": 0.2947082817554474, "kl": 1.858878254890442, "learning_rate": 6.075123608706093e-07, "loss": 0.0744, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 796 }, { "completion_length": 250.0, "epoch": 0.06641666666666667, "grad_norm": 0.3278063237667084, "kl": 1.1497657299041748, "learning_rate": 6.01821731662798e-07, "loss": 0.046, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 797 }, { "completion_length": 250.0, "epoch": 0.0665, "grad_norm": 0.24040259420871735, "kl": 1.2203922271728516, "learning_rate": 5.961542311581586e-07, "loss": 0.0488, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 798 }, { "completion_length": 250.0, "epoch": 0.06658333333333333, "grad_norm": 0.342872679233551, "kl": 1.2977404594421387, "learning_rate": 5.905099284133953e-07, "loss": 0.0519, "reward": 1.5416666269302368, "reward_std": 0.6651768684387207, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 799 }, { "completion_length": 233.0, "epoch": 0.06666666666666667, "grad_norm": 0.22665053606033325, "kl": 1.3235529661178589, "learning_rate": 5.848888922025553e-07, "loss": 0.0529, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 800 }, { "completion_length": 250.0, "epoch": 0.06675, "grad_norm": 0.248734250664711, "kl": 0.8164328932762146, "learning_rate": 5.792911910161922e-07, "loss": 0.0327, "reward": 1.4166667461395264, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 801 }, { "completion_length": 250.0, "epoch": 0.06683333333333333, "grad_norm": 0.26447823643684387, "kl": 1.291729211807251, "learning_rate": 5.737168930605272e-07, "loss": 0.0517, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 802 }, { "completion_length": 250.0, "epoch": 0.06691666666666667, "grad_norm": 0.2176593393087387, "kl": 0.8029844164848328, "learning_rate": 5.681660662566225e-07, "loss": 0.0321, "reward": 1.0416667461395264, "reward_std": 0.6770031452178955, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7916666865348816, "step": 803 }, { "completion_length": 250.0, "epoch": 0.067, "grad_norm": 0.2643926739692688, "kl": 0.7234349846839905, "learning_rate": 5.626387782395512e-07, "loss": 0.0289, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 804 }, { "completion_length": 250.0, "epoch": 0.06708333333333333, "grad_norm": 0.2635612189769745, "kl": 1.7153481245040894, "learning_rate": 5.571350963575728e-07, "loss": 0.0686, "reward": 1.625, "reward_std": 0.5473601818084717, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8750000596046448, "step": 805 }, { "completion_length": 250.0, "epoch": 0.06716666666666667, "grad_norm": 0.28257104754447937, "kl": 1.6129707098007202, "learning_rate": 5.516550876713142e-07, "loss": 0.0645, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 806 }, { "completion_length": 250.0, "epoch": 0.06725, "grad_norm": 0.3367782533168793, "kl": 1.151456356048584, "learning_rate": 5.461988189529529e-07, "loss": 0.0461, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 807 }, { "completion_length": 250.0, "epoch": 0.06733333333333333, "grad_norm": 0.4734201431274414, "kl": 1.5179961919784546, "learning_rate": 5.407663566854008e-07, "loss": 0.0607, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 808 }, { "completion_length": 250.0, "epoch": 0.06741666666666667, "grad_norm": 0.32155460119247437, "kl": 1.1060431003570557, "learning_rate": 5.353577670614951e-07, "loss": 0.0442, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 809 }, { "completion_length": 250.0, "epoch": 0.0675, "grad_norm": 0.3959408104419708, "kl": 1.4056496620178223, "learning_rate": 5.299731159831953e-07, "loss": 0.0562, "reward": 1.5833333730697632, "reward_std": 0.49601584672927856, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 810 }, { "completion_length": 250.0, "epoch": 0.06758333333333333, "grad_norm": 2.5059800148010254, "kl": 2.0833823680877686, "learning_rate": 5.24612469060774e-07, "loss": 0.0833, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 811 }, { "completion_length": 230.0, "epoch": 0.06766666666666667, "grad_norm": 0.48341092467308044, "kl": 1.4132311344146729, "learning_rate": 5.192758916120236e-07, "loss": 0.0565, "reward": 1.7916666269302368, "reward_std": 0.589255690574646, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 812 }, { "completion_length": 250.0, "epoch": 0.06775, "grad_norm": 0.32052579522132874, "kl": 0.961887776851654, "learning_rate": 5.139634486614544e-07, "loss": 0.0385, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 813 }, { "completion_length": 250.0, "epoch": 0.06783333333333333, "grad_norm": 0.3565872013568878, "kl": 1.3793160915374756, "learning_rate": 5.086752049395094e-07, "loss": 0.0552, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 814 }, { "completion_length": 250.0, "epoch": 0.06791666666666667, "grad_norm": 0.5205422043800354, "kl": 1.2932004928588867, "learning_rate": 5.034112248817685e-07, "loss": 0.0517, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.875, "step": 815 }, { "completion_length": 250.0, "epoch": 0.068, "grad_norm": 0.3367973864078522, "kl": 1.2375197410583496, "learning_rate": 4.981715726281666e-07, "loss": 0.0495, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 816 }, { "completion_length": 250.0, "epoch": 0.06808333333333333, "grad_norm": 0.298949658870697, "kl": 1.3202153444290161, "learning_rate": 4.929563120222142e-07, "loss": 0.0528, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 817 }, { "completion_length": 250.0, "epoch": 0.06816666666666667, "grad_norm": 0.27566391229629517, "kl": 1.1884781122207642, "learning_rate": 4.87765506610215e-07, "loss": 0.0475, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 818 }, { "completion_length": 250.0, "epoch": 0.06825, "grad_norm": 0.3612639009952545, "kl": 0.9719309210777283, "learning_rate": 4.825992196404958e-07, "loss": 0.0389, "reward": 1.125, "reward_std": 0.9910312294960022, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.625, "step": 819 }, { "completion_length": 250.0, "epoch": 0.06833333333333333, "grad_norm": 0.19263684749603271, "kl": 0.9239329695701599, "learning_rate": 4.774575140626317e-07, "loss": 0.037, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 820 }, { "completion_length": 250.0, "epoch": 0.06841666666666667, "grad_norm": 0.5026285648345947, "kl": 0.8872776627540588, "learning_rate": 4.7234045252668393e-07, "loss": 0.0355, "reward": 1.5416667461395264, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 821 }, { "completion_length": 250.0, "epoch": 0.0685, "grad_norm": 0.3337138891220093, "kl": 1.1307857036590576, "learning_rate": 4.672480973824312e-07, "loss": 0.0452, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 822 }, { "completion_length": 250.0, "epoch": 0.06858333333333333, "grad_norm": 0.23775134980678558, "kl": 1.0065653324127197, "learning_rate": 4.6218051067861423e-07, "loss": 0.0403, "reward": 1.7083333730697632, "reward_std": 0.5473601818084717, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9583333730697632, "step": 823 }, { "completion_length": 250.0, "epoch": 0.06866666666666667, "grad_norm": 0.2261497527360916, "kl": 0.4448733925819397, "learning_rate": 4.5713775416217884e-07, "loss": 0.0178, "reward": 1.4166667461395264, "reward_std": 0.771516740322113, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916667461395264, "step": 824 }, { "completion_length": 250.0, "epoch": 0.06875, "grad_norm": 0.4021577537059784, "kl": 1.8268983364105225, "learning_rate": 4.5211988927752026e-07, "loss": 0.0731, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 825 }, { "completion_length": 250.0, "epoch": 0.06883333333333333, "grad_norm": 0.745639979839325, "kl": 1.327449083328247, "learning_rate": 4.4712697716573994e-07, "loss": 0.0531, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 826 }, { "completion_length": 250.0, "epoch": 0.06891666666666667, "grad_norm": 0.25842320919036865, "kl": 1.019197940826416, "learning_rate": 4.421590786638952e-07, "loss": 0.0408, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 827 }, { "completion_length": 250.0, "epoch": 0.069, "grad_norm": 0.6545089483261108, "kl": 1.8898916244506836, "learning_rate": 4.372162543042624e-07, "loss": 0.0756, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 828 }, { "completion_length": 250.0, "epoch": 0.06908333333333333, "grad_norm": 0.2776699364185333, "kl": 1.2687866687774658, "learning_rate": 4.3229856431359516e-07, "loss": 0.0508, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 829 }, { "completion_length": 250.0, "epoch": 0.06916666666666667, "grad_norm": 1.2333475351333618, "kl": 0.7890111207962036, "learning_rate": 4.27406068612396e-07, "loss": 0.0316, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 830 }, { "completion_length": 250.0, "epoch": 0.06925, "grad_norm": 47.50356674194336, "kl": 13.761457443237305, "learning_rate": 4.225388268141797e-07, "loss": 0.5505, "reward": 1.2083333730697632, "reward_std": 0.43415671586990356, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.8333333730697632, "step": 831 }, { "completion_length": 250.0, "epoch": 0.06933333333333333, "grad_norm": 0.26879197359085083, "kl": 0.6461024284362793, "learning_rate": 4.1769689822475147e-07, "loss": 0.0258, "reward": 1.2916666269302368, "reward_std": 0.9829902648925781, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.6666666865348816, "step": 832 }, { "completion_length": 250.0, "epoch": 0.06941666666666667, "grad_norm": 0.27685660123825073, "kl": 1.392545461654663, "learning_rate": 4.12880341841484e-07, "loss": 0.0557, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 833 }, { "completion_length": 250.0, "epoch": 0.0695, "grad_norm": 0.3339894115924835, "kl": 0.9655144214630127, "learning_rate": 4.0808921635259595e-07, "loss": 0.0386, "reward": 1.2083333730697632, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 834 }, { "completion_length": 250.0, "epoch": 0.06958333333333333, "grad_norm": 0.2692475914955139, "kl": 2.0277373790740967, "learning_rate": 4.033235801364402e-07, "loss": 0.0811, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 835 }, { "completion_length": 250.0, "epoch": 0.06966666666666667, "grad_norm": 0.27213945984840393, "kl": 0.46007007360458374, "learning_rate": 3.9858349126078945e-07, "loss": 0.0184, "reward": 1.4166667461395264, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916666865348816, "step": 836 }, { "completion_length": 250.0, "epoch": 0.06975, "grad_norm": 0.2522222101688385, "kl": 1.3139028549194336, "learning_rate": 3.938690074821314e-07, "loss": 0.0526, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 837 }, { "completion_length": 250.0, "epoch": 0.06983333333333333, "grad_norm": 0.2399037927389145, "kl": 0.7940414547920227, "learning_rate": 3.891801862449629e-07, "loss": 0.0318, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 1.0, "step": 838 }, { "completion_length": 250.0, "epoch": 0.06991666666666667, "grad_norm": 0.22458046674728394, "kl": 1.2483919858932495, "learning_rate": 3.8451708468109026e-07, "loss": 0.0499, "reward": 1.4583333730697632, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.8333333730697632, "step": 839 }, { "completion_length": 250.0, "epoch": 0.07, "grad_norm": 0.211387500166893, "kl": 0.9435220956802368, "learning_rate": 3.798797596089351e-07, "loss": 0.0377, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9583333730697632, "step": 840 }, { "completion_length": 250.0, "epoch": 0.07008333333333333, "grad_norm": 0.2669267952442169, "kl": 1.431569218635559, "learning_rate": 3.7526826753284065e-07, "loss": 0.0573, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 841 }, { "completion_length": 250.0, "epoch": 0.07016666666666667, "grad_norm": 0.2529590129852295, "kl": 1.039304256439209, "learning_rate": 3.7068266464238085e-07, "loss": 0.0416, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 1.0, "step": 842 }, { "completion_length": 250.0, "epoch": 0.07025, "grad_norm": 0.24863839149475098, "kl": 0.9364717602729797, "learning_rate": 3.661230068116811e-07, "loss": 0.0375, "reward": 1.1666666269302368, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666269302368, "step": 843 }, { "completion_length": 250.0, "epoch": 0.07033333333333333, "grad_norm": 0.22837644815444946, "kl": 0.9734100103378296, "learning_rate": 3.615893495987335e-07, "loss": 0.0389, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 844 }, { "completion_length": 250.0, "epoch": 0.07041666666666667, "grad_norm": 0.2933385372161865, "kl": 0.6663645505905151, "learning_rate": 3.5708174824471947e-07, "loss": 0.0267, "reward": 1.5, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 845 }, { "completion_length": 250.0, "epoch": 0.0705, "grad_norm": 0.29375800490379333, "kl": 1.2523351907730103, "learning_rate": 3.5260025767333894e-07, "loss": 0.0501, "reward": 1.5, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 846 }, { "completion_length": 250.0, "epoch": 0.07058333333333333, "grad_norm": 0.2230493724346161, "kl": 1.5347055196762085, "learning_rate": 3.481449324901412e-07, "loss": 0.0614, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 847 }, { "completion_length": 250.0, "epoch": 0.07066666666666667, "grad_norm": 0.2553749680519104, "kl": 1.2786747217178345, "learning_rate": 3.4371582698185636e-07, "loss": 0.0511, "reward": 1.4583333730697632, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 848 }, { "completion_length": 250.0, "epoch": 0.07075, "grad_norm": 1.1723978519439697, "kl": 1.5742748975753784, "learning_rate": 3.393129951157384e-07, "loss": 0.063, "reward": 1.4583333730697632, "reward_std": 0.5019802451133728, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 849 }, { "completion_length": 250.0, "epoch": 0.07083333333333333, "grad_norm": 1.294960618019104, "kl": 1.270159363746643, "learning_rate": 3.3493649053890325e-07, "loss": 0.0508, "reward": 1.5833333730697632, "reward_std": 0.7918232679367065, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.8333333730697632, "step": 850 }, { "completion_length": 250.0, "epoch": 0.07091666666666667, "grad_norm": 0.32427558302879333, "kl": 1.2746331691741943, "learning_rate": 3.3058636657767927e-07, "loss": 0.051, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 851 }, { "completion_length": 250.0, "epoch": 0.071, "grad_norm": 0.28006064891815186, "kl": 1.1999834775924683, "learning_rate": 3.262626762369525e-07, "loss": 0.048, "reward": 1.3333333730697632, "reward_std": 0.6666666269302368, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 852 }, { "completion_length": 250.0, "epoch": 0.07108333333333333, "grad_norm": 0.3267779052257538, "kl": 1.2184315919876099, "learning_rate": 3.219654721995266e-07, "loss": 0.0487, "reward": 1.2916667461395264, "reward_std": 0.8807914853096008, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.5416666865348816, "step": 853 }, { "completion_length": 250.0, "epoch": 0.07116666666666667, "grad_norm": 0.2615717947483063, "kl": 0.463012158870697, "learning_rate": 3.176948068254762e-07, "loss": 0.0185, "reward": 1.3333333730697632, "reward_std": 0.9428090453147888, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333134651184, "step": 854 }, { "completion_length": 250.0, "epoch": 0.07125, "grad_norm": 0.20684708654880524, "kl": 1.38583242893219, "learning_rate": 3.134507321515107e-07, "loss": 0.0554, "reward": 1.2916666269302368, "reward_std": 0.6283639073371887, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9166666865348816, "step": 855 }, { "completion_length": 250.0, "epoch": 0.07133333333333333, "grad_norm": 0.2460877001285553, "kl": 0.6953861117362976, "learning_rate": 3.092332998903416e-07, "loss": 0.0278, "reward": 1.2916666269302368, "reward_std": 0.7223747968673706, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7916666865348816, "step": 856 }, { "completion_length": 242.0, "epoch": 0.07141666666666667, "grad_norm": 0.030763499438762665, "kl": 1.842020869255066, "learning_rate": 3.050425614300487e-07, "loss": 0.0737, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 857 }, { "completion_length": 250.0, "epoch": 0.0715, "grad_norm": 0.3020068407058716, "kl": 1.5266697406768799, "learning_rate": 3.0087856783345916e-07, "loss": 0.0611, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 858 }, { "completion_length": 250.0, "epoch": 0.07158333333333333, "grad_norm": 0.2564064860343933, "kl": 1.2908424139022827, "learning_rate": 2.967413698375196e-07, "loss": 0.0516, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 859 }, { "completion_length": 250.0, "epoch": 0.07166666666666667, "grad_norm": 1.984915852546692, "kl": 1.585814118385315, "learning_rate": 2.9263101785268253e-07, "loss": 0.0634, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 860 }, { "completion_length": 250.0, "epoch": 0.07175, "grad_norm": 0.22617851197719574, "kl": 0.9019542336463928, "learning_rate": 2.8854756196229017e-07, "loss": 0.0361, "reward": 1.2916666269302368, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.9166666865348816, "step": 861 }, { "completion_length": 250.0, "epoch": 0.07183333333333333, "grad_norm": 0.24866744875907898, "kl": 0.7339966893196106, "learning_rate": 2.844910519219632e-07, "loss": 0.0294, "reward": 1.2083333730697632, "reward_std": 0.8533315062522888, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 862 }, { "completion_length": 250.0, "epoch": 0.07191666666666667, "grad_norm": 0.2894342839717865, "kl": 0.7534470558166504, "learning_rate": 2.8046153715899695e-07, "loss": 0.0301, "reward": 1.0833333730697632, "reward_std": 0.29546844959259033, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 863 }, { "completion_length": 250.0, "epoch": 0.072, "grad_norm": 0.24600763618946075, "kl": 1.024735450744629, "learning_rate": 2.764590667717562e-07, "loss": 0.041, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 864 }, { "completion_length": 250.0, "epoch": 0.07208333333333333, "grad_norm": 0.3743076026439667, "kl": 1.0410536527633667, "learning_rate": 2.7248368952908055e-07, "loss": 0.0416, "reward": 1.375, "reward_std": 0.7000566720962524, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 865 }, { "completion_length": 250.0, "epoch": 0.07216666666666667, "grad_norm": 0.2570086121559143, "kl": 1.0770304203033447, "learning_rate": 2.6853545386968607e-07, "loss": 0.0431, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 866 }, { "completion_length": 250.0, "epoch": 0.07225, "grad_norm": 0.31037428975105286, "kl": 0.8965320587158203, "learning_rate": 2.6461440790157974e-07, "loss": 0.0359, "reward": 1.4166667461395264, "reward_std": 0.8498366475105286, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916666865348816, "step": 867 }, { "completion_length": 250.0, "epoch": 0.07233333333333333, "grad_norm": 0.2683153450489044, "kl": 0.8436253666877747, "learning_rate": 2.6072059940146775e-07, "loss": 0.0337, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 868 }, { "completion_length": 250.0, "epoch": 0.07241666666666667, "grad_norm": 0.2785882353782654, "kl": 0.9701066017150879, "learning_rate": 2.568540758141791e-07, "loss": 0.0388, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 869 }, { "completion_length": 250.0, "epoch": 0.0725, "grad_norm": 0.4102500379085541, "kl": 1.3558361530303955, "learning_rate": 2.53014884252083e-07, "loss": 0.0542, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 870 }, { "completion_length": 250.0, "epoch": 0.07258333333333333, "grad_norm": 0.23862656950950623, "kl": 0.8607050776481628, "learning_rate": 2.492030714945162e-07, "loss": 0.0344, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.9166666865348816, "step": 871 }, { "completion_length": 250.0, "epoch": 0.07266666666666667, "grad_norm": 0.31678903102874756, "kl": 0.5520573258399963, "learning_rate": 2.454186839872158e-07, "loss": 0.0221, "reward": 0.9166666865348816, "reward_std": 0.6606875061988831, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.6666666865348816, "step": 872 }, { "completion_length": 250.0, "epoch": 0.07275, "grad_norm": 0.297191321849823, "kl": 0.6400251984596252, "learning_rate": 2.4166176784174795e-07, "loss": 0.0256, "reward": 1.5416667461395264, "reward_std": 0.46929532289505005, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166667461395264, "step": 873 }, { "completion_length": 250.0, "epoch": 0.07283333333333333, "grad_norm": 0.46804097294807434, "kl": 1.2724863290786743, "learning_rate": 2.3793236883495164e-07, "loss": 0.0509, "reward": 1.3333333730697632, "reward_std": 0.8164966106414795, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 874 }, { "completion_length": 250.0, "epoch": 0.07291666666666667, "grad_norm": 0.2768522799015045, "kl": 1.2393690347671509, "learning_rate": 2.3423053240837518e-07, "loss": 0.0496, "reward": 1.4583333730697632, "reward_std": 0.5892556309700012, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9583333730697632, "step": 875 }, { "completion_length": 250.0, "epoch": 0.073, "grad_norm": 0.4209572672843933, "kl": 0.9548825025558472, "learning_rate": 2.3055630366772857e-07, "loss": 0.0382, "reward": 1.4166667461395264, "reward_std": 0.9041350483894348, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.6666666865348816, "step": 876 }, { "completion_length": 250.0, "epoch": 0.07308333333333333, "grad_norm": 0.29563507437705994, "kl": 0.9755483269691467, "learning_rate": 2.269097273823287e-07, "loss": 0.039, "reward": 1.0833333730697632, "reward_std": 0.38832154870033264, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9583333730697632, "step": 877 }, { "completion_length": 250.0, "epoch": 0.07316666666666667, "grad_norm": 0.28373217582702637, "kl": 1.0492498874664307, "learning_rate": 2.2329084798455747e-07, "loss": 0.042, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 878 }, { "completion_length": 250.0, "epoch": 0.07325, "grad_norm": 0.7796637415885925, "kl": 1.0222169160842896, "learning_rate": 2.1969970956931762e-07, "loss": 0.0409, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 879 }, { "completion_length": 208.0, "epoch": 0.07333333333333333, "grad_norm": 0.17579619586467743, "kl": 1.2056773900985718, "learning_rate": 2.1613635589349756e-07, "loss": 0.0482, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 880 }, { "completion_length": 250.0, "epoch": 0.07341666666666667, "grad_norm": 0.3150039315223694, "kl": 1.0645179748535156, "learning_rate": 2.1260083037543817e-07, "loss": 0.0426, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 881 }, { "completion_length": 250.0, "epoch": 0.0735, "grad_norm": 0.029675384983420372, "kl": 1.4976853132247925, "learning_rate": 2.0909317609440093e-07, "loss": 0.0599, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 882 }, { "completion_length": 250.0, "epoch": 0.07358333333333333, "grad_norm": 0.7188115119934082, "kl": 1.3928179740905762, "learning_rate": 2.0561343579004716e-07, "loss": 0.0557, "reward": 1.5833333730697632, "reward_std": 0.49601584672927856, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 883 }, { "completion_length": 250.0, "epoch": 0.07366666666666667, "grad_norm": 0.27707529067993164, "kl": 1.0864824056625366, "learning_rate": 2.0216165186191406e-07, "loss": 0.0435, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 884 }, { "completion_length": 250.0, "epoch": 0.07375, "grad_norm": 0.27343204617500305, "kl": 1.5687239170074463, "learning_rate": 1.9873786636889908e-07, "loss": 0.0627, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.875, "step": 885 }, { "completion_length": 250.0, "epoch": 0.07383333333333333, "grad_norm": 0.3359210789203644, "kl": 0.9341368079185486, "learning_rate": 1.95342121028749e-07, "loss": 0.0374, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 886 }, { "completion_length": 250.0, "epoch": 0.07391666666666667, "grad_norm": 0.32620811462402344, "kl": 1.4301745891571045, "learning_rate": 1.9197445721754777e-07, "loss": 0.0572, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 887 }, { "completion_length": 250.0, "epoch": 0.074, "grad_norm": 0.24947281181812286, "kl": 0.3620012700557709, "learning_rate": 1.8863491596921745e-07, "loss": 0.0145, "reward": 1.4583333730697632, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.5833333730697632, "step": 888 }, { "completion_length": 250.0, "epoch": 0.07408333333333333, "grad_norm": 0.33198386430740356, "kl": 1.7999809980392456, "learning_rate": 1.8532353797501318e-07, "loss": 0.072, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 889 }, { "completion_length": 250.0, "epoch": 0.07416666666666667, "grad_norm": 1.02660071849823, "kl": 1.4582207202911377, "learning_rate": 1.8204036358303173e-07, "loss": 0.0583, "reward": 0.8333333730697632, "reward_std": 0.835710883140564, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.5833333730697632, "step": 890 }, { "completion_length": 250.0, "epoch": 0.07425, "grad_norm": 1.7326771020889282, "kl": 1.4054831266403198, "learning_rate": 1.787854327977162e-07, "loss": 0.0562, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 891 }, { "completion_length": 250.0, "epoch": 0.07433333333333333, "grad_norm": 0.3918047249317169, "kl": 1.9732359647750854, "learning_rate": 1.7555878527937164e-07, "loss": 0.0789, "reward": 1.7916667461395264, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166667461395264, "step": 892 }, { "completion_length": 250.0, "epoch": 0.07441666666666667, "grad_norm": 0.0223043542355299, "kl": 1.2435429096221924, "learning_rate": 1.7236046034367959e-07, "loss": 0.0497, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 1.0, "step": 893 }, { "completion_length": 250.0, "epoch": 0.0745, "grad_norm": 0.21901412308216095, "kl": 0.5204886794090271, "learning_rate": 1.6919049696121957e-07, "loss": 0.0208, "reward": 1.6666667461395264, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166667461395264, "step": 894 }, { "completion_length": 250.0, "epoch": 0.07458333333333333, "grad_norm": 0.291716068983078, "kl": 0.719805896282196, "learning_rate": 1.6604893375699594e-07, "loss": 0.0288, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 895 }, { "completion_length": 250.0, "epoch": 0.07466666666666667, "grad_norm": 0.3257061541080475, "kl": 1.1170681715011597, "learning_rate": 1.629358090099639e-07, "loss": 0.0447, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 896 }, { "completion_length": 250.0, "epoch": 0.07475, "grad_norm": 0.2914160192012787, "kl": 0.9540635943412781, "learning_rate": 1.5985116065256683e-07, "loss": 0.0382, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 897 }, { "completion_length": 250.0, "epoch": 0.07483333333333334, "grad_norm": 0.20414304733276367, "kl": 1.3840910196304321, "learning_rate": 1.567950262702714e-07, "loss": 0.0554, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 898 }, { "completion_length": 250.0, "epoch": 0.07491666666666667, "grad_norm": 0.28207582235336304, "kl": 0.8242446780204773, "learning_rate": 1.5376744310111019e-07, "loss": 0.033, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.875, "step": 899 }, { "completion_length": 250.0, "epoch": 0.075, "grad_norm": 0.2571064829826355, "kl": 0.4384586811065674, "learning_rate": 1.507684480352292e-07, "loss": 0.0175, "reward": 1.0, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.625, "step": 900 }, { "completion_length": 250.0, "epoch": 0.07508333333333334, "grad_norm": 0.2975660562515259, "kl": 0.8587030172348022, "learning_rate": 1.4779807761443638e-07, "loss": 0.0343, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 901 }, { "completion_length": 250.0, "epoch": 0.07516666666666667, "grad_norm": 1.0238752365112305, "kl": 1.3424298763275146, "learning_rate": 1.4485636803175828e-07, "loss": 0.0537, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 902 }, { "completion_length": 250.0, "epoch": 0.07525, "grad_norm": 0.27268996834754944, "kl": 1.614228367805481, "learning_rate": 1.419433551309976e-07, "loss": 0.0646, "reward": 1.8333333730697632, "reward_std": 0.47140446305274963, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 903 }, { "completion_length": 250.0, "epoch": 0.07533333333333334, "grad_norm": 0.2508178651332855, "kl": 0.6932893991470337, "learning_rate": 1.3905907440629752e-07, "loss": 0.0277, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 904 }, { "completion_length": 250.0, "epoch": 0.07541666666666667, "grad_norm": 0.5284227132797241, "kl": 1.5996646881103516, "learning_rate": 1.362035610017079e-07, "loss": 0.064, "reward": 1.25, "reward_std": 0.6606875061988831, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 905 }, { "completion_length": 250.0, "epoch": 0.0755, "grad_norm": 0.290884792804718, "kl": 0.6067217588424683, "learning_rate": 1.3337684971075932e-07, "loss": 0.0243, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 906 }, { "completion_length": 250.0, "epoch": 0.07558333333333334, "grad_norm": 0.3576277792453766, "kl": 0.8207005858421326, "learning_rate": 1.305789749760361e-07, "loss": 0.0328, "reward": 1.1666667461395264, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.7916666865348816, "step": 907 }, { "completion_length": 250.0, "epoch": 0.07566666666666666, "grad_norm": 0.36113837361335754, "kl": 1.7893757820129395, "learning_rate": 1.278099708887587e-07, "loss": 0.0716, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 908 }, { "completion_length": 250.0, "epoch": 0.07575, "grad_norm": 0.3109591603279114, "kl": 1.1817129850387573, "learning_rate": 1.2506987118836912e-07, "loss": 0.0473, "reward": 1.2083333730697632, "reward_std": 0.9074209332466125, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 909 }, { "completion_length": 250.0, "epoch": 0.07583333333333334, "grad_norm": 756.9115600585938, "kl": 135.45335388183594, "learning_rate": 1.223587092621162e-07, "loss": 5.4181, "reward": 1.2083333730697632, "reward_std": 0.9074208736419678, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.7083333730697632, "step": 910 }, { "completion_length": 250.0, "epoch": 0.07591666666666666, "grad_norm": 0.8474955558776855, "kl": 1.7323054075241089, "learning_rate": 1.1967651814465353e-07, "loss": 0.0693, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 911 }, { "completion_length": 250.0, "epoch": 0.076, "grad_norm": 0.31559211015701294, "kl": 1.9210703372955322, "learning_rate": 1.1702333051763271e-07, "loss": 0.0768, "reward": 1.75, "reward_std": 0.5841830968856812, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 912 }, { "completion_length": 250.0, "epoch": 0.07608333333333334, "grad_norm": 0.30108439922332764, "kl": 1.042654037475586, "learning_rate": 1.1439917870930795e-07, "loss": 0.0417, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 913 }, { "completion_length": 250.0, "epoch": 0.07616666666666666, "grad_norm": 0.29974260926246643, "kl": 1.0419851541519165, "learning_rate": 1.1180409469414094e-07, "loss": 0.0417, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 914 }, { "completion_length": 250.0, "epoch": 0.07625, "grad_norm": 0.29832908511161804, "kl": 1.5337603092193604, "learning_rate": 1.0923811009241142e-07, "loss": 0.0614, "reward": 1.4166667461395264, "reward_std": 0.49601584672927856, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 915 }, { "completion_length": 250.0, "epoch": 0.07633333333333334, "grad_norm": 0.4468366205692291, "kl": 1.797929048538208, "learning_rate": 1.067012561698319e-07, "loss": 0.0719, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 916 }, { "completion_length": 250.0, "epoch": 0.07641666666666666, "grad_norm": 0.3717971444129944, "kl": 1.0258549451828003, "learning_rate": 1.041935638371669e-07, "loss": 0.041, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.75, "step": 917 }, { "completion_length": 250.0, "epoch": 0.0765, "grad_norm": 0.30195504426956177, "kl": 1.0266151428222656, "learning_rate": 1.0171506364985622e-07, "loss": 0.0411, "reward": 1.0, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 918 }, { "completion_length": 250.0, "epoch": 0.07658333333333334, "grad_norm": 0.29930129647254944, "kl": 1.6755496263504028, "learning_rate": 9.926578580764234e-08, "loss": 0.067, "reward": 1.625, "reward_std": 0.6283639073371887, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 919 }, { "completion_length": 250.0, "epoch": 0.07666666666666666, "grad_norm": 0.01916714385151863, "kl": 1.014693021774292, "learning_rate": 9.684576015420277e-08, "loss": 0.0406, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 920 }, { "completion_length": 164.0, "epoch": 0.07675, "grad_norm": 0.26526811718940735, "kl": 0.8480601906776428, "learning_rate": 9.445501617678654e-08, "loss": 0.0339, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 921 }, { "completion_length": 250.0, "epoch": 0.07683333333333334, "grad_norm": 0.41043245792388916, "kl": 0.912456214427948, "learning_rate": 9.209358300585474e-08, "loss": 0.0365, "reward": 1.4166667461395264, "reward_std": 0.7715167999267578, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.6666666865348816, "step": 922 }, { "completion_length": 250.0, "epoch": 0.07691666666666666, "grad_norm": 0.19834581017494202, "kl": 1.5605332851409912, "learning_rate": 8.9761489414725e-08, "loss": 0.0624, "reward": 1.4166666269302368, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.9166666865348816, "step": 923 }, { "completion_length": 250.0, "epoch": 0.077, "grad_norm": 0.27626922726631165, "kl": 1.0883418321609497, "learning_rate": 8.745876381922147e-08, "loss": 0.0435, "reward": 1.5, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7500000596046448, "step": 924 }, { "completion_length": 250.0, "epoch": 0.07708333333333334, "grad_norm": 0.41667699813842773, "kl": 1.4361419677734375, "learning_rate": 8.518543427732951e-08, "loss": 0.0574, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 925 }, { "completion_length": 250.0, "epoch": 0.07716666666666666, "grad_norm": 0.8117958903312683, "kl": 1.2131803035736084, "learning_rate": 8.294152848885156e-08, "loss": 0.0485, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 926 }, { "completion_length": 250.0, "epoch": 0.07725, "grad_norm": 0.023965315893292427, "kl": 0.9878235459327698, "learning_rate": 8.072707379507217e-08, "loss": 0.0395, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 927 }, { "completion_length": 250.0, "epoch": 0.07733333333333334, "grad_norm": 0.06923804432153702, "kl": 1.3249129056930542, "learning_rate": 7.854209717842231e-08, "loss": 0.053, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 928 }, { "completion_length": 250.0, "epoch": 0.07741666666666666, "grad_norm": 0.21536372601985931, "kl": 1.1641136407852173, "learning_rate": 7.638662526215284e-08, "loss": 0.0466, "reward": 1.0416667461395264, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9166666865348816, "step": 929 }, { "completion_length": 250.0, "epoch": 0.0775, "grad_norm": 0.3016450107097626, "kl": 1.2998625040054321, "learning_rate": 7.426068431000883e-08, "loss": 0.052, "reward": 0.9166667461395264, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.6666666865348816, "step": 930 }, { "completion_length": 250.0, "epoch": 0.07758333333333334, "grad_norm": 0.33901509642601013, "kl": 1.7370189428329468, "learning_rate": 7.216430022591009e-08, "loss": 0.0695, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9166666865348816, "step": 931 }, { "completion_length": 250.0, "epoch": 0.07766666666666666, "grad_norm": 0.33317700028419495, "kl": 1.2731457948684692, "learning_rate": 7.009749855363457e-08, "loss": 0.0509, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.75, "step": 932 }, { "completion_length": 250.0, "epoch": 0.07775, "grad_norm": 0.2928623855113983, "kl": 0.9490488171577454, "learning_rate": 6.806030447650879e-08, "loss": 0.038, "reward": 1.625, "reward_std": 0.602573812007904, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.75, "step": 933 }, { "completion_length": 250.0, "epoch": 0.07783333333333334, "grad_norm": 0.25610074400901794, "kl": 1.1091444492340088, "learning_rate": 6.605274281709929e-08, "loss": 0.0444, "reward": 1.6666667461395264, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 934 }, { "completion_length": 250.0, "epoch": 0.07791666666666666, "grad_norm": 0.2953529953956604, "kl": 1.096183180809021, "learning_rate": 6.407483803691216e-08, "loss": 0.0438, "reward": 1.0833333730697632, "reward_std": 0.6606874465942383, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 935 }, { "completion_length": 250.0, "epoch": 0.078, "grad_norm": 0.19404926896095276, "kl": 0.9853270053863525, "learning_rate": 6.212661423609184e-08, "loss": 0.0394, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 936 }, { "completion_length": 250.0, "epoch": 0.07808333333333334, "grad_norm": 0.26259902119636536, "kl": 1.091464877128601, "learning_rate": 6.020809515313141e-08, "loss": 0.0437, "reward": 1.3333333730697632, "reward_std": 0.7126966714859009, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7083333730697632, "step": 937 }, { "completion_length": 250.0, "epoch": 0.07816666666666666, "grad_norm": 0.3351936638355255, "kl": 0.9903098344802856, "learning_rate": 5.83193041645802e-08, "loss": 0.0396, "reward": 1.0833333730697632, "reward_std": 0.38832157850265503, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.8333333730697632, "step": 938 }, { "completion_length": 250.0, "epoch": 0.07825, "grad_norm": 0.2362779676914215, "kl": 1.3006746768951416, "learning_rate": 5.6460264284760316e-08, "loss": 0.052, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 939 }, { "completion_length": 203.0, "epoch": 0.07833333333333334, "grad_norm": 0.2249480038881302, "kl": 1.123484492301941, "learning_rate": 5.463099816548578e-08, "loss": 0.0449, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 940 }, { "completion_length": 250.0, "epoch": 0.07841666666666666, "grad_norm": 0.3026413321495056, "kl": 1.1142150163650513, "learning_rate": 5.283152809578751e-08, "loss": 0.0446, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.75, "step": 941 }, { "completion_length": 250.0, "epoch": 0.0785, "grad_norm": 0.2342066615819931, "kl": 0.8287395238876343, "learning_rate": 5.106187600163987e-08, "loss": 0.0331, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.75, "step": 942 }, { "completion_length": 250.0, "epoch": 0.07858333333333334, "grad_norm": 0.06259375810623169, "kl": 1.5354900360107422, "learning_rate": 4.932206344569562e-08, "loss": 0.0614, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 943 }, { "completion_length": 250.0, "epoch": 0.07866666666666666, "grad_norm": 0.24400244653224945, "kl": 1.0855783224105835, "learning_rate": 4.761211162702117e-08, "loss": 0.0434, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.875, "step": 944 }, { "completion_length": 250.0, "epoch": 0.07875, "grad_norm": 0.23698869347572327, "kl": 1.1598446369171143, "learning_rate": 4.593204138084006e-08, "loss": 0.0464, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 945 }, { "completion_length": 250.0, "epoch": 0.07883333333333334, "grad_norm": 0.02643449977040291, "kl": 1.023648738861084, "learning_rate": 4.428187317827848e-08, "loss": 0.0409, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 946 }, { "completion_length": 229.0, "epoch": 0.07891666666666666, "grad_norm": 0.3633557856082916, "kl": 1.4960824251174927, "learning_rate": 4.26616271261146e-08, "loss": 0.0598, "reward": 1.3333333730697632, "reward_std": 0.7346308827400208, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 947 }, { "completion_length": 250.0, "epoch": 0.079, "grad_norm": 0.049699533730745316, "kl": 1.670642614364624, "learning_rate": 4.1071322966535487e-08, "loss": 0.0668, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 948 }, { "completion_length": 250.0, "epoch": 0.07908333333333334, "grad_norm": 0.7215015292167664, "kl": 1.0973060131072998, "learning_rate": 3.95109800768953e-08, "loss": 0.0439, "reward": 1.5416667461395264, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 949 }, { "completion_length": 250.0, "epoch": 0.07916666666666666, "grad_norm": 0.5393422842025757, "kl": 1.3605879545211792, "learning_rate": 3.798061746947995e-08, "loss": 0.0544, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.875, "step": 950 }, { "completion_length": 250.0, "epoch": 0.07925, "grad_norm": 0.5147884488105774, "kl": 0.8889792561531067, "learning_rate": 3.648025379127479e-08, "loss": 0.0356, "reward": 1.5416667461395264, "reward_std": 0.501980185508728, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.6666666865348816, "step": 951 }, { "completion_length": 250.0, "epoch": 0.07933333333333334, "grad_norm": 0.23729416728019714, "kl": 0.8729484677314758, "learning_rate": 3.5009907323737826e-08, "loss": 0.0349, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 952 }, { "completion_length": 250.0, "epoch": 0.07941666666666666, "grad_norm": 0.35395750403404236, "kl": 1.2410624027252197, "learning_rate": 3.3569595982576584e-08, "loss": 0.0496, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 953 }, { "completion_length": 250.0, "epoch": 0.0795, "grad_norm": 0.29070597887039185, "kl": 1.1642013788223267, "learning_rate": 3.2159337317530234e-08, "loss": 0.0466, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 954 }, { "completion_length": 250.0, "epoch": 0.07958333333333334, "grad_norm": 0.24920472502708435, "kl": 1.0929770469665527, "learning_rate": 3.077914851215585e-08, "loss": 0.0437, "reward": 1.0416667461395264, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9166666865348816, "step": 955 }, { "completion_length": 250.0, "epoch": 0.07966666666666666, "grad_norm": 0.4327549338340759, "kl": 1.9852244853973389, "learning_rate": 2.9429046383618042e-08, "loss": 0.0794, "reward": 0.8333333730697632, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.7083333134651184, "step": 956 }, { "completion_length": 250.0, "epoch": 0.07975, "grad_norm": 0.27277326583862305, "kl": 1.475409984588623, "learning_rate": 2.810904738248549e-08, "loss": 0.059, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 957 }, { "completion_length": 250.0, "epoch": 0.07983333333333334, "grad_norm": 0.27285173535346985, "kl": 1.1164697408676147, "learning_rate": 2.681916759252917e-08, "loss": 0.0447, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9166666865348816, "step": 958 }, { "completion_length": 250.0, "epoch": 0.07991666666666666, "grad_norm": 3.066486358642578, "kl": 1.745069980621338, "learning_rate": 2.555942273052753e-08, "loss": 0.0698, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 959 }, { "completion_length": 250.0, "epoch": 0.08, "grad_norm": 0.3669474124908447, "kl": 1.6787079572677612, "learning_rate": 2.4329828146074096e-08, "loss": 0.0671, "reward": 1.3333333730697632, "reward_std": 0.7766431570053101, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.8333333730697632, "step": 960 }, { "completion_length": 250.0, "epoch": 0.08008333333333334, "grad_norm": 0.24523121118545532, "kl": 1.1602610349655151, "learning_rate": 2.313039882139101e-08, "loss": 0.0464, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 1.0, "step": 961 }, { "completion_length": 250.0, "epoch": 0.08016666666666666, "grad_norm": 0.2820650041103363, "kl": 1.0688502788543701, "learning_rate": 2.1961149371145795e-08, "loss": 0.0428, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 962 }, { "completion_length": 250.0, "epoch": 0.08025, "grad_norm": 0.34602880477905273, "kl": 1.1315749883651733, "learning_rate": 2.082209404227403e-08, "loss": 0.0453, "reward": 1.1666667461395264, "reward_std": 0.835710883140564, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.6666666865348816, "step": 963 }, { "completion_length": 250.0, "epoch": 0.08033333333333334, "grad_norm": 0.25703561305999756, "kl": 1.0106827020645142, "learning_rate": 1.9713246713805588e-08, "loss": 0.0404, "reward": 1.4166667461395264, "reward_std": 0.7292091846466064, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.7916666865348816, "step": 964 }, { "completion_length": 250.0, "epoch": 0.08041666666666666, "grad_norm": 0.28899118304252625, "kl": 0.9755896925926208, "learning_rate": 1.8634620896695044e-08, "loss": 0.039, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.75, "step": 965 }, { "completion_length": 250.0, "epoch": 0.0805, "grad_norm": 0.29632315039634705, "kl": 1.442091464996338, "learning_rate": 1.7586229733657646e-08, "loss": 0.0577, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 1.0, "step": 966 }, { "completion_length": 250.0, "epoch": 0.08058333333333334, "grad_norm": 0.21348384022712708, "kl": 1.338564395904541, "learning_rate": 1.6568085999008886e-08, "loss": 0.0535, "reward": 1.8333333730697632, "reward_std": 0.35634827613830566, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 967 }, { "completion_length": 250.0, "epoch": 0.08066666666666666, "grad_norm": 0.24184101819992065, "kl": 0.8153730034828186, "learning_rate": 1.5580202098509078e-08, "loss": 0.0326, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.75, "step": 968 }, { "completion_length": 250.0, "epoch": 0.08075, "grad_norm": 0.526576578617096, "kl": 1.7040972709655762, "learning_rate": 1.4622590069211517e-08, "loss": 0.0682, "reward": 1.7083332538604736, "reward_std": 0.5756294131278992, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.8333333730697632, "step": 969 }, { "completion_length": 250.0, "epoch": 0.08083333333333333, "grad_norm": 0.2984718680381775, "kl": 1.5154507160186768, "learning_rate": 1.3695261579316776e-08, "loss": 0.0606, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.875, "step": 970 }, { "completion_length": 250.0, "epoch": 0.08091666666666666, "grad_norm": 1.2827290296554565, "kl": 1.4101325273513794, "learning_rate": 1.2798227928029483e-08, "loss": 0.0564, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.7916666269302368, "step": 971 }, { "completion_length": 250.0, "epoch": 0.081, "grad_norm": 0.23937450349330902, "kl": 1.6009562015533447, "learning_rate": 1.193150004542204e-08, "loss": 0.064, "reward": 1.5833333730697632, "reward_std": 0.49601587653160095, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9583333730697632, "step": 972 }, { "completion_length": 250.0, "epoch": 0.08108333333333333, "grad_norm": 0.5152274370193481, "kl": 0.8297097086906433, "learning_rate": 1.109508849230001e-08, "loss": 0.0332, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.75, "step": 973 }, { "completion_length": 250.0, "epoch": 0.08116666666666666, "grad_norm": 0.28064191341400146, "kl": 1.0859336853027344, "learning_rate": 1.0289003460074165e-08, "loss": 0.0434, "reward": 1.625, "reward_std": 0.6283639669418335, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 974 }, { "completion_length": 250.0, "epoch": 0.08125, "grad_norm": 0.2897964417934418, "kl": 1.0955630540847778, "learning_rate": 9.513254770636138e-09, "loss": 0.0438, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.875, "step": 975 }, { "completion_length": 250.0, "epoch": 0.08133333333333333, "grad_norm": 0.22548621892929077, "kl": 0.9993997812271118, "learning_rate": 8.767851876239075e-09, "loss": 0.04, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 976 }, { "completion_length": 250.0, "epoch": 0.08141666666666666, "grad_norm": 0.326506108045578, "kl": 1.3311384916305542, "learning_rate": 8.052803859382174e-09, "loss": 0.0532, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 1.0, "step": 977 }, { "completion_length": 250.0, "epoch": 0.0815, "grad_norm": 0.26752278208732605, "kl": 0.6817623972892761, "learning_rate": 7.368119432699383e-09, "loss": 0.0273, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 978 }, { "completion_length": 250.0, "epoch": 0.08158333333333333, "grad_norm": 0.3232153058052063, "kl": 0.79771888256073, "learning_rate": 6.7138069388547614e-09, "loss": 0.0319, "reward": 1.5416667461395264, "reward_std": 0.5019802451133728, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 979 }, { "completion_length": 250.0, "epoch": 0.08166666666666667, "grad_norm": 0.35629838705062866, "kl": 1.6459451913833618, "learning_rate": 6.089874350439507e-09, "loss": 0.0658, "reward": 1.5416666269302368, "reward_std": 0.6651768684387207, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9166666865348816, "step": 980 }, { "completion_length": 250.0, "epoch": 0.08175, "grad_norm": 0.30297717452049255, "kl": 0.822544515132904, "learning_rate": 5.4963292698750896e-09, "loss": 0.0329, "reward": 1.5416667461395264, "reward_std": 0.7332792282104492, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.7916666865348816, "step": 981 }, { "completion_length": 250.0, "epoch": 0.08183333333333333, "grad_norm": 0.3641495406627655, "kl": 1.5490293502807617, "learning_rate": 4.933178929321103e-09, "loss": 0.062, "reward": 1.6666666269302368, "reward_std": 0.6424160599708557, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166666865348816, "step": 982 }, { "completion_length": 250.0, "epoch": 0.08191666666666667, "grad_norm": 0.19789476692676544, "kl": 1.1928259134292603, "learning_rate": 4.400430190586724e-09, "loss": 0.0477, "reward": 1.0833333730697632, "reward_std": 0.38832157850265503, "rewards/correctness_reward_func": 0.125, "rewards/format_reward_func": 0.9583333730697632, "step": 983 }, { "completion_length": 250.0, "epoch": 0.082, "grad_norm": 1.2451598644256592, "kl": 1.1428676843643188, "learning_rate": 3.8980895450474455e-09, "loss": 0.0457, "reward": 1.5833332538604736, "reward_std": 0.6362089514732361, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.7083333730697632, "step": 984 }, { "completion_length": 250.0, "epoch": 0.08208333333333333, "grad_norm": 0.6361198425292969, "kl": 0.45185351371765137, "learning_rate": 3.4261631135654174e-09, "loss": 0.0181, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.75, "step": 985 }, { "completion_length": 250.0, "epoch": 0.08216666666666667, "grad_norm": 0.3087112307548523, "kl": 1.8694123029708862, "learning_rate": 2.984656646415063e-09, "loss": 0.0748, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 986 }, { "completion_length": 250.0, "epoch": 0.08225, "grad_norm": 0.28350523114204407, "kl": 0.7043304443359375, "learning_rate": 2.573575523213412e-09, "loss": 0.0282, "reward": 0.7083333730697632, "reward_std": 0.4520675241947174, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.7083333730697632, "step": 987 }, { "completion_length": 250.0, "epoch": 0.08233333333333333, "grad_norm": 0.20463794469833374, "kl": 1.8880668878555298, "learning_rate": 2.192924752854042e-09, "loss": 0.0755, "reward": 1.6666667461395264, "reward_std": 0.4364357888698578, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.9166667461395264, "step": 988 }, { "completion_length": 250.0, "epoch": 0.08241666666666667, "grad_norm": 0.1875455379486084, "kl": 1.1098848581314087, "learning_rate": 1.842708973447127e-09, "loss": 0.0444, "reward": 1.7916667461395264, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9166666865348816, "step": 989 }, { "completion_length": 209.0, "epoch": 0.0825, "grad_norm": 0.30843275785446167, "kl": 1.1816316843032837, "learning_rate": 1.5229324522605949e-09, "loss": 0.0473, "reward": 2.0, "reward_std": 0.0, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 990 }, { "completion_length": 250.0, "epoch": 0.08258333333333333, "grad_norm": 0.2759556770324707, "kl": 0.9381877779960632, "learning_rate": 1.2335990856710001e-09, "loss": 0.0375, "reward": 0.7083333730697632, "reward_std": 0.4520675837993622, "rewards/correctness_reward_func": 0.0, "rewards/format_reward_func": 0.7083333730697632, "step": 991 }, { "completion_length": 250.0, "epoch": 0.08266666666666667, "grad_norm": 0.28256580233573914, "kl": 0.8221601843833923, "learning_rate": 9.747123991141193e-10, "loss": 0.0329, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.9583333730697632, "step": 992 }, { "completion_length": 250.0, "epoch": 0.08275, "grad_norm": 0.3341481685638428, "kl": 1.0547815561294556, "learning_rate": 7.462755470422078e-10, "loss": 0.0422, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 993 }, { "completion_length": 250.0, "epoch": 0.08283333333333333, "grad_norm": 0.3639756441116333, "kl": 0.9993078708648682, "learning_rate": 5.48291312886251e-10, "loss": 0.04, "reward": 1.0, "reward_std": 0.6424161195755005, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.7500000596046448, "step": 994 }, { "completion_length": 250.0, "epoch": 0.08291666666666667, "grad_norm": 0.25637567043304443, "kl": 0.9738581776618958, "learning_rate": 3.8076210902182607e-10, "loss": 0.039, "reward": 1.25, "reward_std": 0.7071068286895752, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.625, "step": 995 }, { "completion_length": 158.0, "epoch": 0.083, "grad_norm": 0.3529271185398102, "kl": 0.9513704776763916, "learning_rate": 2.43689976739403e-10, "loss": 0.0381, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 996 }, { "completion_length": 250.0, "epoch": 0.08308333333333333, "grad_norm": 0.34197887778282166, "kl": 1.2964738607406616, "learning_rate": 1.3707658621964216e-10, "loss": 0.0519, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.875, "step": 997 }, { "completion_length": 250.0, "epoch": 0.08316666666666667, "grad_norm": 0.29255440831184387, "kl": 0.7699793577194214, "learning_rate": 6.092323651313293e-11, "loss": 0.0308, "reward": 1.8333333730697632, "reward_std": 0.47140446305274963, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.9583333730697632, "step": 998 }, { "completion_length": 250.0, "epoch": 0.08325, "grad_norm": 4.424619197845459, "kl": 1.8411613702774048, "learning_rate": 1.5230855524017708e-11, "loss": 0.0736, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/correctness_reward_func": 0.375, "rewards/format_reward_func": 0.875, "step": 999 }, { "completion_length": 250.0, "epoch": 0.08333333333333333, "grad_norm": 0.2679225206375122, "kl": 0.8978027701377869, "learning_rate": 0.0, "loss": 0.0359, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.875, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }