diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08333333333333333, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 250.0, + "epoch": 8.333333333333333e-05, + "grad_norm": 5370.26904296875, + "kl": 1680.79296875, + "learning_rate": 5.0000000000000004e-08, + "loss": 67.2317, + "reward": 0.4166666865348816, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.0416666679084301, + "step": 1 + }, + { + "completion_length": 250.0, + "epoch": 0.00016666666666666666, + "grad_norm": 2.1227879524230957, + "kl": 0.3190357983112335, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0128, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 2 + }, + { + "completion_length": 250.0, + "epoch": 0.00025, + "grad_norm": 1779814.75, + "kl": 204788.625, + "learning_rate": 1.5000000000000002e-07, + "loss": 8191.5454, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0416666679084301, + "step": 3 + }, + { + "completion_length": 250.0, + "epoch": 0.0003333333333333333, + "grad_norm": 3.283656358718872, + "kl": 1.0824432373046875, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0433, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 4 + }, + { + "completion_length": 250.0, + "epoch": 0.0004166666666666667, + "grad_norm": 2.5271332263946533, + "kl": 0.6556969881057739, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.0262, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 5 + }, + { + "completion_length": 250.0, + "epoch": 0.0005, + "grad_norm": 1.6124398708343506, + "kl": 0.33115604519844055, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0132, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 6 + }, + { + "completion_length": 250.0, + "epoch": 0.0005833333333333334, + "grad_norm": 1.7685329914093018, + "kl": 0.4470013380050659, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.0179, + "reward": 0.2916666865348816, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 7 + }, + { + "completion_length": 250.0, + "epoch": 0.0006666666666666666, + "grad_norm": 0.2993689179420471, + "kl": 0.24829219281673431, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0099, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 8 + }, + { + "completion_length": 250.0, + "epoch": 0.00075, + "grad_norm": 995.0140991210938, + "kl": 280.3321533203125, + "learning_rate": 4.5000000000000003e-07, + "loss": 11.2133, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.125, + "step": 9 + }, + { + "completion_length": 250.0, + "epoch": 0.0008333333333333334, + "grad_norm": 0.5820446014404297, + "kl": 0.38791656494140625, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0155, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0, + "step": 10 + }, + { + "completion_length": 250.0, + "epoch": 0.0009166666666666666, + "grad_norm": 2.4436404705047607, + "kl": 0.6116478443145752, + "learning_rate": 5.5e-07, + "loss": 0.0245, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 11 + }, + { + "completion_length": 250.0, + "epoch": 0.001, + "grad_norm": 5655916544.0, + "kl": 771800640.0, + "learning_rate": 6.000000000000001e-07, + "loss": 30872024.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 12 + }, + { + "completion_length": 250.0, + "epoch": 0.0010833333333333333, + "grad_norm": 15.001657485961914, + "kl": 3.9553842544555664, + "learning_rate": 6.5e-07, + "loss": 0.1582, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 13 + }, + { + "completion_length": 250.0, + "epoch": 0.0011666666666666668, + "grad_norm": 24.87034034729004, + "kl": 6.287238121032715, + "learning_rate": 7.000000000000001e-07, + "loss": 0.2515, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 14 + }, + { + "completion_length": 250.0, + "epoch": 0.00125, + "grad_norm": 0.8860446214675903, + "kl": 0.7432097792625427, + "learning_rate": 7.5e-07, + "loss": 0.0297, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 15 + }, + { + "completion_length": 250.0, + "epoch": 0.0013333333333333333, + "grad_norm": 16639295488.0, + "kl": 2705503488.0, + "learning_rate": 8.000000000000001e-07, + "loss": 108220136.0, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0416666679084301, + "step": 16 + }, + { + "completion_length": 250.0, + "epoch": 0.0014166666666666668, + "grad_norm": 0.05391894653439522, + "kl": 0.284786581993103, + "learning_rate": 8.500000000000001e-07, + "loss": 0.0114, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 17 + }, + { + "completion_length": 250.0, + "epoch": 0.0015, + "grad_norm": 1350.6583251953125, + "kl": 165.6782989501953, + "learning_rate": 9.000000000000001e-07, + "loss": 6.6271, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 18 + }, + { + "completion_length": 250.0, + "epoch": 0.0015833333333333333, + "grad_norm": 26.208847045898438, + "kl": 4.027891635894775, + "learning_rate": 9.500000000000001e-07, + "loss": 0.1611, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.25, + "step": 19 + }, + { + "completion_length": 250.0, + "epoch": 0.0016666666666666668, + "grad_norm": 1.4429972171783447, + "kl": 0.31500595808029175, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0126, + "reward": 0.1666666716337204, + "reward_std": 0.30860671401023865, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 20 + }, + { + "completion_length": 250.0, + "epoch": 0.00175, + "grad_norm": 0.10234619677066803, + "kl": 0.3105039894580841, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.0124, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 21 + }, + { + "completion_length": 250.0, + "epoch": 0.0018333333333333333, + "grad_norm": 0.3241354525089264, + "kl": 0.26765021681785583, + "learning_rate": 1.1e-06, + "loss": 0.0107, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 22 + }, + { + "completion_length": 250.0, + "epoch": 0.0019166666666666666, + "grad_norm": 1.2852263450622559, + "kl": 0.31648027896881104, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.0127, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.0, + "step": 23 + }, + { + "completion_length": 250.0, + "epoch": 0.002, + "grad_norm": 2.508118152618408, + "kl": 0.2790696918964386, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0112, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 24 + }, + { + "completion_length": 250.0, + "epoch": 0.0020833333333333333, + "grad_norm": 254.1594696044922, + "kl": 28.16029930114746, + "learning_rate": 1.25e-06, + "loss": 1.1264, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 25 + }, + { + "completion_length": 250.0, + "epoch": 0.0021666666666666666, + "grad_norm": 47.175018310546875, + "kl": 3.506722927093506, + "learning_rate": 1.3e-06, + "loss": 0.1403, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 26 + }, + { + "completion_length": 250.0, + "epoch": 0.00225, + "grad_norm": 514.6583862304688, + "kl": 64.28365325927734, + "learning_rate": 1.3500000000000002e-06, + "loss": 2.5713, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 27 + }, + { + "completion_length": 250.0, + "epoch": 0.0023333333333333335, + "grad_norm": 0.4107680916786194, + "kl": 0.31204113364219666, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0125, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 28 + }, + { + "completion_length": 250.0, + "epoch": 0.002416666666666667, + "grad_norm": 242400784.0, + "kl": 28910430.0, + "learning_rate": 1.45e-06, + "loss": 1156417.25, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 29 + }, + { + "completion_length": 250.0, + "epoch": 0.0025, + "grad_norm": 0.3205853998661041, + "kl": 0.24721822142601013, + "learning_rate": 1.5e-06, + "loss": 0.0099, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 30 + }, + { + "completion_length": 250.0, + "epoch": 0.0025833333333333333, + "grad_norm": 4.041526794433594, + "kl": 1.1066405773162842, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.0443, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 31 + }, + { + "completion_length": 250.0, + "epoch": 0.0026666666666666666, + "grad_norm": 160318.890625, + "kl": 8970.6455078125, + "learning_rate": 1.6000000000000001e-06, + "loss": 358.8259, + "reward": 0.375, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.375, + "step": 32 + }, + { + "completion_length": 250.0, + "epoch": 0.00275, + "grad_norm": 0.11564349383115768, + "kl": 0.3156932294368744, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0126, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 33 + }, + { + "completion_length": 250.0, + "epoch": 0.0028333333333333335, + "grad_norm": 11.49873161315918, + "kl": 3.654203176498413, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.1462, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0416666679084301, + "step": 34 + }, + { + "completion_length": 250.0, + "epoch": 0.002916666666666667, + "grad_norm": 3.487309455871582, + "kl": 0.47985804080963135, + "learning_rate": 1.75e-06, + "loss": 0.0192, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 35 + }, + { + "completion_length": 250.0, + "epoch": 0.003, + "grad_norm": 1.3627883195877075, + "kl": 0.7914985418319702, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0317, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 36 + }, + { + "completion_length": 250.0, + "epoch": 0.0030833333333333333, + "grad_norm": 0.7565972208976746, + "kl": 0.2916364371776581, + "learning_rate": 1.85e-06, + "loss": 0.0117, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 37 + }, + { + "completion_length": 250.0, + "epoch": 0.0031666666666666666, + "grad_norm": 230.62786865234375, + "kl": 28.600061416625977, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.144, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 38 + }, + { + "completion_length": 250.0, + "epoch": 0.00325, + "grad_norm": 14.349813461303711, + "kl": 2.471902847290039, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0989, + "reward": 0.125, + "reward_std": 0.24800793826580048, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 39 + }, + { + "completion_length": 250.0, + "epoch": 0.0033333333333333335, + "grad_norm": 0.34092557430267334, + "kl": 0.2546476423740387, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0102, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 40 + }, + { + "completion_length": 250.0, + "epoch": 0.003416666666666667, + "grad_norm": 2.506922483444214, + "kl": 0.2611209452152252, + "learning_rate": 2.05e-06, + "loss": 0.0104, + "reward": 0.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.25, + "step": 41 + }, + { + "completion_length": 250.0, + "epoch": 0.0035, + "grad_norm": 2.241769790649414, + "kl": 0.6660223603248596, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0266, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 42 + }, + { + "completion_length": 250.0, + "epoch": 0.0035833333333333333, + "grad_norm": 0.8412100076675415, + "kl": 0.35636767745018005, + "learning_rate": 2.15e-06, + "loss": 0.0143, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 43 + }, + { + "completion_length": 250.0, + "epoch": 0.0036666666666666666, + "grad_norm": 3.74702525138855, + "kl": 1.0497126579284668, + "learning_rate": 2.2e-06, + "loss": 0.042, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 44 + }, + { + "completion_length": 250.0, + "epoch": 0.00375, + "grad_norm": 0.1603858321905136, + "kl": 0.2380465269088745, + "learning_rate": 2.25e-06, + "loss": 0.0095, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 45 + }, + { + "completion_length": 250.0, + "epoch": 0.003833333333333333, + "grad_norm": 3.948719024658203, + "kl": 0.958899974822998, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0384, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 46 + }, + { + "completion_length": 250.0, + "epoch": 0.003916666666666666, + "grad_norm": 0.3620174825191498, + "kl": 0.24766167998313904, + "learning_rate": 2.35e-06, + "loss": 0.0099, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 47 + }, + { + "completion_length": 250.0, + "epoch": 0.004, + "grad_norm": 0.1155853196978569, + "kl": 0.26840996742248535, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0107, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 48 + }, + { + "completion_length": 250.0, + "epoch": 0.004083333333333333, + "grad_norm": 19.22943878173828, + "kl": 2.2793242931365967, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0912, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 49 + }, + { + "completion_length": 250.0, + "epoch": 0.004166666666666667, + "grad_norm": 2.660940647125244, + "kl": 0.9060783386230469, + "learning_rate": 2.5e-06, + "loss": 0.0362, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 50 + }, + { + "completion_length": 250.0, + "epoch": 0.00425, + "grad_norm": 2.71610426902771, + "kl": 0.7481005191802979, + "learning_rate": 2.55e-06, + "loss": 0.0299, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 51 + }, + { + "completion_length": 250.0, + "epoch": 0.004333333333333333, + "grad_norm": 4604.9912109375, + "kl": 107.24817657470703, + "learning_rate": 2.6e-06, + "loss": 4.2899, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 52 + }, + { + "completion_length": 250.0, + "epoch": 0.004416666666666667, + "grad_norm": 0.5669422745704651, + "kl": 0.31511324644088745, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0126, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 53 + }, + { + "completion_length": 250.0, + "epoch": 0.0045, + "grad_norm": 669.4596557617188, + "kl": 16.526533126831055, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.6611, + "reward": 0.2083333432674408, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 54 + }, + { + "completion_length": 250.0, + "epoch": 0.004583333333333333, + "grad_norm": 1.862663745880127, + "kl": 0.26984715461730957, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0108, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 55 + }, + { + "completion_length": 250.0, + "epoch": 0.004666666666666667, + "grad_norm": 0.026774972677230835, + "kl": 0.22219908237457275, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0089, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 56 + }, + { + "completion_length": 250.0, + "epoch": 0.00475, + "grad_norm": 0.03144953399896622, + "kl": 0.2409312129020691, + "learning_rate": 2.85e-06, + "loss": 0.0096, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 57 + }, + { + "completion_length": 250.0, + "epoch": 0.004833333333333334, + "grad_norm": 1.8306984901428223, + "kl": 0.3393392562866211, + "learning_rate": 2.9e-06, + "loss": 0.0136, + "reward": 0.2083333432674408, + "reward_std": 0.589255690574646, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0833333358168602, + "step": 58 + }, + { + "completion_length": 250.0, + "epoch": 0.004916666666666666, + "grad_norm": 185.6766815185547, + "kl": 31.755769729614258, + "learning_rate": 2.95e-06, + "loss": 1.2702, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 59 + }, + { + "completion_length": 250.0, + "epoch": 0.005, + "grad_norm": 4095.82421875, + "kl": 48.20116424560547, + "learning_rate": 3e-06, + "loss": 1.928, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 60 + }, + { + "completion_length": 250.0, + "epoch": 0.005083333333333333, + "grad_norm": 85.37049102783203, + "kl": 12.757698059082031, + "learning_rate": 3.05e-06, + "loss": 0.5103, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 61 + }, + { + "completion_length": 250.0, + "epoch": 0.005166666666666667, + "grad_norm": 0.6013548374176025, + "kl": 0.2426263988018036, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0097, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0, + "step": 62 + }, + { + "completion_length": 250.0, + "epoch": 0.00525, + "grad_norm": 3.4427809715270996, + "kl": 0.9819362163543701, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0393, + "reward": 0.125, + "reward_std": 0.24800795316696167, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 63 + }, + { + "completion_length": 250.0, + "epoch": 0.005333333333333333, + "grad_norm": 0.37425747513771057, + "kl": 0.3634275197982788, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0145, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 64 + }, + { + "completion_length": 250.0, + "epoch": 0.005416666666666667, + "grad_norm": 2.08392596244812, + "kl": 0.30326148867607117, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0121, + "reward": 0.125, + "reward_std": 0.24800793826580048, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 65 + }, + { + "completion_length": 250.0, + "epoch": 0.0055, + "grad_norm": 12975.3134765625, + "kl": 1615.21484375, + "learning_rate": 3.3000000000000006e-06, + "loss": 64.6086, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 66 + }, + { + "completion_length": 250.0, + "epoch": 0.005583333333333333, + "grad_norm": 0.7209013104438782, + "kl": 0.3030403256416321, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0121, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 67 + }, + { + "completion_length": 250.0, + "epoch": 0.005666666666666667, + "grad_norm": 2.0693342685699463, + "kl": 0.3132570683956146, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0125, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 68 + }, + { + "completion_length": 250.0, + "epoch": 0.00575, + "grad_norm": 148.04296875, + "kl": 8.410460472106934, + "learning_rate": 3.45e-06, + "loss": 0.3364, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 69 + }, + { + "completion_length": 250.0, + "epoch": 0.005833333333333334, + "grad_norm": 2913710.75, + "kl": 355526.25, + "learning_rate": 3.5e-06, + "loss": 14221.0527, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 70 + }, + { + "completion_length": 250.0, + "epoch": 0.005916666666666666, + "grad_norm": 1.4818073511123657, + "kl": 0.28156542778015137, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0113, + "reward": 0.1666666716337204, + "reward_std": 0.35634827613830566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 71 + }, + { + "completion_length": 250.0, + "epoch": 0.006, + "grad_norm": 1.2178808450698853, + "kl": 0.42147043347358704, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0169, + "reward": 0.125, + "reward_std": 0.24800793826580048, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 72 + }, + { + "completion_length": 250.0, + "epoch": 0.006083333333333333, + "grad_norm": 9.053384780883789, + "kl": 2.088000535964966, + "learning_rate": 3.65e-06, + "loss": 0.0835, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 73 + }, + { + "completion_length": 250.0, + "epoch": 0.006166666666666667, + "grad_norm": 16.30586814880371, + "kl": 2.418818712234497, + "learning_rate": 3.7e-06, + "loss": 0.0968, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 74 + }, + { + "completion_length": 250.0, + "epoch": 0.00625, + "grad_norm": 0.9915629625320435, + "kl": 0.4261716902256012, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.017, + "reward": 0.2916666865348816, + "reward_std": 0.5473601818084717, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.1666666716337204, + "step": 75 + }, + { + "completion_length": 250.0, + "epoch": 0.006333333333333333, + "grad_norm": 35.2829704284668, + "kl": 4.973476409912109, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.1989, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 76 + }, + { + "completion_length": 250.0, + "epoch": 0.006416666666666667, + "grad_norm": 0.44628340005874634, + "kl": 0.3691655397415161, + "learning_rate": 3.85e-06, + "loss": 0.0148, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0, + "step": 77 + }, + { + "completion_length": 250.0, + "epoch": 0.0065, + "grad_norm": 1.1199471950531006, + "kl": 0.40812310576438904, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0163, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.25, + "step": 78 + }, + { + "completion_length": 250.0, + "epoch": 0.006583333333333333, + "grad_norm": 3.701707363128662, + "kl": 0.302755743265152, + "learning_rate": 3.95e-06, + "loss": 0.0121, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 79 + }, + { + "completion_length": 250.0, + "epoch": 0.006666666666666667, + "grad_norm": 173.58099365234375, + "kl": 35.22575378417969, + "learning_rate": 4.000000000000001e-06, + "loss": 1.409, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 80 + }, + { + "completion_length": 250.0, + "epoch": 0.00675, + "grad_norm": 0.9575639367103577, + "kl": 0.45123860239982605, + "learning_rate": 4.05e-06, + "loss": 0.018, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 81 + }, + { + "completion_length": 250.0, + "epoch": 0.006833333333333334, + "grad_norm": 0.15877264738082886, + "kl": 0.3092237710952759, + "learning_rate": 4.1e-06, + "loss": 0.0124, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 82 + }, + { + "completion_length": 250.0, + "epoch": 0.0069166666666666664, + "grad_norm": 4.570583820343018, + "kl": 0.6996808648109436, + "learning_rate": 4.15e-06, + "loss": 0.028, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 83 + }, + { + "completion_length": 250.0, + "epoch": 0.007, + "grad_norm": 1.1855798959732056, + "kl": 0.2818334698677063, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0113, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 84 + }, + { + "completion_length": 250.0, + "epoch": 0.007083333333333333, + "grad_norm": 0.5873503088951111, + "kl": 0.47469863295555115, + "learning_rate": 4.25e-06, + "loss": 0.019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 85 + }, + { + "completion_length": 250.0, + "epoch": 0.007166666666666667, + "grad_norm": 0.5640818476676941, + "kl": 0.4295016825199127, + "learning_rate": 4.3e-06, + "loss": 0.0172, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 86 + }, + { + "completion_length": 250.0, + "epoch": 0.00725, + "grad_norm": 115.95929718017578, + "kl": 21.73447036743164, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8694, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 87 + }, + { + "completion_length": 250.0, + "epoch": 0.007333333333333333, + "grad_norm": 47534.85546875, + "kl": 5644.39404296875, + "learning_rate": 4.4e-06, + "loss": 225.7758, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.0, + "step": 88 + }, + { + "completion_length": 250.0, + "epoch": 0.007416666666666667, + "grad_norm": 0.42040178179740906, + "kl": 0.3909342885017395, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0156, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 89 + }, + { + "completion_length": 250.0, + "epoch": 0.0075, + "grad_norm": 14.471317291259766, + "kl": 3.260627031326294, + "learning_rate": 4.5e-06, + "loss": 0.1304, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 90 + }, + { + "completion_length": 250.0, + "epoch": 0.007583333333333333, + "grad_norm": 0.6545644998550415, + "kl": 0.31067171692848206, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0124, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 91 + }, + { + "completion_length": 250.0, + "epoch": 0.007666666666666666, + "grad_norm": 2.0157968997955322, + "kl": 0.8457777500152588, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0338, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 92 + }, + { + "completion_length": 250.0, + "epoch": 0.00775, + "grad_norm": 0.27556928992271423, + "kl": 0.21060839295387268, + "learning_rate": 4.65e-06, + "loss": 0.0084, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 93 + }, + { + "completion_length": 250.0, + "epoch": 0.007833333333333333, + "grad_norm": 1.8622896671295166, + "kl": 0.38421598076820374, + "learning_rate": 4.7e-06, + "loss": 0.0154, + "reward": 0.125, + "reward_std": 0.24800795316696167, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 94 + }, + { + "completion_length": 250.0, + "epoch": 0.007916666666666667, + "grad_norm": 1.3327813148498535, + "kl": 0.26667675375938416, + "learning_rate": 4.75e-06, + "loss": 0.0107, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 95 + }, + { + "completion_length": 250.0, + "epoch": 0.008, + "grad_norm": 0.4855498969554901, + "kl": 0.2688427269458771, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0108, + "reward": 0.125, + "reward_std": 0.24800795316696167, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 96 + }, + { + "completion_length": 250.0, + "epoch": 0.008083333333333333, + "grad_norm": 0.4623148739337921, + "kl": 0.26257768273353577, + "learning_rate": 4.85e-06, + "loss": 0.0105, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 97 + }, + { + "completion_length": 250.0, + "epoch": 0.008166666666666666, + "grad_norm": 0.6498294472694397, + "kl": 0.3181808292865753, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0127, + "reward": 0.2083333432674408, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 98 + }, + { + "completion_length": 250.0, + "epoch": 0.00825, + "grad_norm": 0.8791100382804871, + "kl": 0.5302932262420654, + "learning_rate": 4.95e-06, + "loss": 0.0212, + "reward": 0.4583333134651184, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 99 + }, + { + "completion_length": 250.0, + "epoch": 0.008333333333333333, + "grad_norm": 78.05960845947266, + "kl": 18.444576263427734, + "learning_rate": 5e-06, + "loss": 0.7378, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0, + "step": 100 + }, + { + "completion_length": 250.0, + "epoch": 0.008416666666666666, + "grad_norm": 0.6424371004104614, + "kl": 0.3804784119129181, + "learning_rate": 4.999984769144476e-06, + "loss": 0.0152, + "reward": 0.3333333432674408, + "reward_std": 0.6900655627250671, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2083333432674408, + "step": 101 + }, + { + "completion_length": 250.0, + "epoch": 0.0085, + "grad_norm": 115.73678588867188, + "kl": 4.351790904998779, + "learning_rate": 4.999939076763487e-06, + "loss": 0.1741, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 102 + }, + { + "completion_length": 250.0, + "epoch": 0.008583333333333333, + "grad_norm": 0.12965546548366547, + "kl": 0.2807658612728119, + "learning_rate": 4.999862923413781e-06, + "loss": 0.0112, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 103 + }, + { + "completion_length": 250.0, + "epoch": 0.008666666666666666, + "grad_norm": 2.4104604721069336, + "kl": 0.5225258469581604, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0209, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 104 + }, + { + "completion_length": 250.0, + "epoch": 0.00875, + "grad_norm": 0.3326241075992584, + "kl": 0.29482659697532654, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.0118, + "reward": 0.2916666567325592, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.1666666716337204, + "step": 105 + }, + { + "completion_length": 250.0, + "epoch": 0.008833333333333334, + "grad_norm": 1.436844825744629, + "kl": 0.40061473846435547, + "learning_rate": 4.999451708687114e-06, + "loss": 0.016, + "reward": 0.4583333134651184, + "reward_std": 0.7113032937049866, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.2083333432674408, + "step": 106 + }, + { + "completion_length": 250.0, + "epoch": 0.008916666666666666, + "grad_norm": 95.7150650024414, + "kl": 2.640369176864624, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.1056, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 107 + }, + { + "completion_length": 250.0, + "epoch": 0.009, + "grad_norm": 108.41773223876953, + "kl": 6.058004856109619, + "learning_rate": 4.999025287600886e-06, + "loss": 0.2423, + "reward": 0.125, + "reward_std": 0.24800795316696167, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 108 + }, + { + "completion_length": 250.0, + "epoch": 0.009083333333333334, + "grad_norm": 25.03618049621582, + "kl": 3.463536262512207, + "learning_rate": 4.998766400914329e-06, + "loss": 0.1385, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 109 + }, + { + "completion_length": 250.0, + "epoch": 0.009166666666666667, + "grad_norm": 0.5373579263687134, + "kl": 0.3192852735519409, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0128, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 110 + }, + { + "completion_length": 250.0, + "epoch": 0.00925, + "grad_norm": 0.2514178454875946, + "kl": 0.4298870861530304, + "learning_rate": 4.998157291026553e-06, + "loss": 0.0172, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 111 + }, + { + "completion_length": 250.0, + "epoch": 0.009333333333333334, + "grad_norm": 49.19013214111328, + "kl": 8.058541297912598, + "learning_rate": 4.997807075247147e-06, + "loss": 0.3223, + "reward": 0.125, + "reward_std": 0.17251639068126678, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 112 + }, + { + "completion_length": 250.0, + "epoch": 0.009416666666666667, + "grad_norm": 1.157423973083496, + "kl": 0.41131162643432617, + "learning_rate": 4.997426424476787e-06, + "loss": 0.0165, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 113 + }, + { + "completion_length": 250.0, + "epoch": 0.0095, + "grad_norm": 122.4400863647461, + "kl": 6.790146350860596, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.2716, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 114 + }, + { + "completion_length": 250.0, + "epoch": 0.009583333333333333, + "grad_norm": 0.07149865478277206, + "kl": 0.3394841253757477, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.0136, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 115 + }, + { + "completion_length": 250.0, + "epoch": 0.009666666666666667, + "grad_norm": 1.3715343475341797, + "kl": 0.4577629864215851, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0183, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 116 + }, + { + "completion_length": 250.0, + "epoch": 0.00975, + "grad_norm": 0.551726222038269, + "kl": 0.3871065080165863, + "learning_rate": 4.995599569809414e-06, + "loss": 0.0155, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0, + "step": 117 + }, + { + "completion_length": 250.0, + "epoch": 0.009833333333333333, + "grad_norm": 2.083500623703003, + "kl": 0.7725878357887268, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0309, + "reward": 0.3333333432674408, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2083333432674408, + "step": 118 + }, + { + "completion_length": 250.0, + "epoch": 0.009916666666666667, + "grad_norm": 1.0463969707489014, + "kl": 0.4617312550544739, + "learning_rate": 4.994503670730126e-06, + "loss": 0.0185, + "reward": 0.1666666716337204, + "reward_std": 0.30860671401023865, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 119 + }, + { + "completion_length": 250.0, + "epoch": 0.01, + "grad_norm": 0.4884531795978546, + "kl": 0.3635513186454773, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0145, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.25, + "step": 120 + }, + { + "completion_length": 250.0, + "epoch": 0.010083333333333333, + "grad_norm": 0.30621615052223206, + "kl": 0.37473881244659424, + "learning_rate": 4.993286193061145e-06, + "loss": 0.015, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 121 + }, + { + "completion_length": 250.0, + "epoch": 0.010166666666666666, + "grad_norm": 88.8227767944336, + "kl": 24.4356689453125, + "learning_rate": 4.992631880567301e-06, + "loss": 0.9774, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 122 + }, + { + "completion_length": 250.0, + "epoch": 0.01025, + "grad_norm": 0.6021216511726379, + "kl": 0.4579870402812958, + "learning_rate": 4.991947196140619e-06, + "loss": 0.0183, + "reward": 0.25, + "reward_std": 0.38832157850265503, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.125, + "step": 123 + }, + { + "completion_length": 250.0, + "epoch": 0.010333333333333333, + "grad_norm": 1.0776395797729492, + "kl": 0.3739393353462219, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.015, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 124 + }, + { + "completion_length": 250.0, + "epoch": 0.010416666666666666, + "grad_norm": 0.704728364944458, + "kl": 0.33970534801483154, + "learning_rate": 4.990486745229364e-06, + "loss": 0.0136, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 125 + }, + { + "completion_length": 250.0, + "epoch": 0.0105, + "grad_norm": 0.4066079556941986, + "kl": 0.34832248091697693, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0139, + "reward": 0.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.25, + "step": 126 + }, + { + "completion_length": 250.0, + "epoch": 0.010583333333333333, + "grad_norm": 20332.4453125, + "kl": 2519.667236328125, + "learning_rate": 4.9889049115077e-06, + "loss": 100.7867, + "reward": 0.125, + "reward_std": 0.17251639068126678, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 127 + }, + { + "completion_length": 250.0, + "epoch": 0.010666666666666666, + "grad_norm": 1014.6124877929688, + "kl": 151.2786102294922, + "learning_rate": 4.988068499954578e-06, + "loss": 6.0511, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 128 + }, + { + "completion_length": 250.0, + "epoch": 0.01075, + "grad_norm": 2.2194466590881348, + "kl": 0.45352834463119507, + "learning_rate": 4.987201772071971e-06, + "loss": 0.0181, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 129 + }, + { + "completion_length": 250.0, + "epoch": 0.010833333333333334, + "grad_norm": 0.8629388213157654, + "kl": 0.4860643446445465, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0194, + "reward": 0.2083333432674408, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 130 + }, + { + "completion_length": 250.0, + "epoch": 0.010916666666666667, + "grad_norm": 3.691181182861328, + "kl": 0.396503210067749, + "learning_rate": 4.985377409930789e-06, + "loss": 0.0159, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 131 + }, + { + "completion_length": 250.0, + "epoch": 0.011, + "grad_norm": 20.175403594970703, + "kl": 3.6169893741607666, + "learning_rate": 4.984419797901491e-06, + "loss": 0.1447, + "reward": 0.2083333432674408, + "reward_std": 0.3535533547401428, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 132 + }, + { + "completion_length": 250.0, + "epoch": 0.011083333333333334, + "grad_norm": 0.6853134632110596, + "kl": 0.3578898012638092, + "learning_rate": 4.983431914000991e-06, + "loss": 0.0143, + "reward": 0.3333333432674408, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.3333333432674408, + "step": 133 + }, + { + "completion_length": 250.0, + "epoch": 0.011166666666666667, + "grad_norm": 2.199497699737549, + "kl": 1.075109839439392, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.043, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 134 + }, + { + "completion_length": 250.0, + "epoch": 0.01125, + "grad_norm": 0.43966594338417053, + "kl": 0.38842320442199707, + "learning_rate": 4.981365379103306e-06, + "loss": 0.0155, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 135 + }, + { + "completion_length": 250.0, + "epoch": 0.011333333333333334, + "grad_norm": 0.5732003450393677, + "kl": 0.3031355142593384, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0121, + "reward": 0.2083333432674408, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 136 + }, + { + "completion_length": 250.0, + "epoch": 0.011416666666666667, + "grad_norm": 1.1415404081344604, + "kl": 0.47937247157096863, + "learning_rate": 4.979177905957726e-06, + "loss": 0.0192, + "reward": 0.2083333432674408, + "reward_std": 0.3053751289844513, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 137 + }, + { + "completion_length": 250.0, + "epoch": 0.0115, + "grad_norm": 5.704109191894531, + "kl": 0.6407493352890015, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0256, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 138 + }, + { + "completion_length": 250.0, + "epoch": 0.011583333333333333, + "grad_norm": 1.1015725135803223, + "kl": 0.40485408902168274, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.0162, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 139 + }, + { + "completion_length": 250.0, + "epoch": 0.011666666666666667, + "grad_norm": 0.06077095866203308, + "kl": 0.2962065041065216, + "learning_rate": 4.975670171853926e-06, + "loss": 0.0118, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 140 + }, + { + "completion_length": 250.0, + "epoch": 0.01175, + "grad_norm": 0.030355358496308327, + "kl": 0.25768283009529114, + "learning_rate": 4.974440577269473e-06, + "loss": 0.0103, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 141 + }, + { + "completion_length": 250.0, + "epoch": 0.011833333333333333, + "grad_norm": 0.22492274641990662, + "kl": 0.4085735082626343, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0163, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 142 + }, + { + "completion_length": 250.0, + "epoch": 0.011916666666666667, + "grad_norm": 9.671571731567383, + "kl": 1.6705416440963745, + "learning_rate": 4.971890952617515e-06, + "loss": 0.0668, + "reward": 0.1666666716337204, + "reward_std": 0.4714045524597168, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.0416666679084301, + "step": 143 + }, + { + "completion_length": 250.0, + "epoch": 0.012, + "grad_norm": 0.4590764343738556, + "kl": 0.40203264355659485, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0161, + "reward": 0.5, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.375, + "step": 144 + }, + { + "completion_length": 250.0, + "epoch": 0.012083333333333333, + "grad_norm": 0.6464155912399292, + "kl": 0.4423307478427887, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.0177, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 145 + }, + { + "completion_length": 250.0, + "epoch": 0.012166666666666666, + "grad_norm": 0.4339180886745453, + "kl": 0.3777730166912079, + "learning_rate": 4.96784066268247e-06, + "loss": 0.0151, + "reward": 0.2916666567325592, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.1666666716337204, + "step": 146 + }, + { + "completion_length": 250.0, + "epoch": 0.01225, + "grad_norm": 1.3693031072616577, + "kl": 0.4329844117164612, + "learning_rate": 4.966430404017424e-06, + "loss": 0.0173, + "reward": 0.125, + "reward_std": 0.17251640558242798, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 147 + }, + { + "completion_length": 250.0, + "epoch": 0.012333333333333333, + "grad_norm": 0.5217644572257996, + "kl": 0.3636080324649811, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0145, + "reward": 0.2916666865348816, + "reward_std": 0.37533053755760193, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666567325592, + "step": 148 + }, + { + "completion_length": 250.0, + "epoch": 0.012416666666666666, + "grad_norm": 19.131145477294922, + "kl": 4.095874786376953, + "learning_rate": 4.963519746208726e-06, + "loss": 0.1638, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 149 + }, + { + "completion_length": 250.0, + "epoch": 0.0125, + "grad_norm": 1.2726384401321411, + "kl": 0.4131162762641907, + "learning_rate": 4.962019382530521e-06, + "loss": 0.0165, + "reward": 0.2916666865348816, + "reward_std": 0.41547447443008423, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.1666666716337204, + "step": 150 + }, + { + "completion_length": 250.0, + "epoch": 0.012583333333333334, + "grad_norm": 0.28327932953834534, + "kl": 0.32023757696151733, + "learning_rate": 4.960489019923105e-06, + "loss": 0.0128, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.125, + "step": 151 + }, + { + "completion_length": 250.0, + "epoch": 0.012666666666666666, + "grad_norm": 0.3637878894805908, + "kl": 0.36896905303001404, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0148, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 152 + }, + { + "completion_length": 250.0, + "epoch": 0.01275, + "grad_norm": 0.3663618862628937, + "kl": 0.3021296560764313, + "learning_rate": 4.957338372873886e-06, + "loss": 0.0121, + "reward": 0.0833333358168602, + "reward_std": 0.15430335700511932, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 153 + }, + { + "completion_length": 250.0, + "epoch": 0.012833333333333334, + "grad_norm": 1.0241296291351318, + "kl": 0.3188839554786682, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0128, + "reward": 0.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.125, + "step": 154 + }, + { + "completion_length": 250.0, + "epoch": 0.012916666666666667, + "grad_norm": 2.020049810409546, + "kl": 0.40295442938804626, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.0161, + "reward": 0.2916666865348816, + "reward_std": 0.2781743109226227, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 155 + }, + { + "completion_length": 250.0, + "epoch": 0.013, + "grad_norm": 0.6685758829116821, + "kl": 0.41572338342666626, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.0166, + "reward": 0.4583333432674408, + "reward_std": 0.6886264681816101, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 156 + }, + { + "completion_length": 250.0, + "epoch": 0.013083333333333334, + "grad_norm": 0.4234911799430847, + "kl": 0.3872148096561432, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.0155, + "reward": 0.2083333432674408, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 157 + }, + { + "completion_length": 250.0, + "epoch": 0.013166666666666667, + "grad_norm": 0.6146082878112793, + "kl": 0.3194940388202667, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0128, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 158 + }, + { + "completion_length": 250.0, + "epoch": 0.01325, + "grad_norm": 6.197954177856445, + "kl": 0.9764306545257568, + "learning_rate": 4.947168471904213e-06, + "loss": 0.0391, + "reward": 0.1666666716337204, + "reward_std": 0.2519763112068176, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 159 + }, + { + "completion_length": 250.0, + "epoch": 0.013333333333333334, + "grad_norm": 0.5794118046760559, + "kl": 0.42038655281066895, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.0168, + "reward": 0.75, + "reward_std": 1.0350983142852783, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.375, + "step": 160 + }, + { + "completion_length": 250.0, + "epoch": 0.013416666666666667, + "grad_norm": 23.13763999938965, + "kl": 2.790783405303955, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.1116, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 161 + }, + { + "completion_length": 250.0, + "epoch": 0.0135, + "grad_norm": 0.9066824913024902, + "kl": 0.4245196580886841, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.017, + "reward": 0.2083333432674408, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 162 + }, + { + "completion_length": 250.0, + "epoch": 0.013583333333333333, + "grad_norm": 0.6648077964782715, + "kl": 0.5795211791992188, + "learning_rate": 4.939791904846869e-06, + "loss": 0.0232, + "reward": 0.2916666865348816, + "reward_std": 0.4154745042324066, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 163 + }, + { + "completion_length": 250.0, + "epoch": 0.013666666666666667, + "grad_norm": 0.5751465559005737, + "kl": 0.3582766056060791, + "learning_rate": 4.937873385763909e-06, + "loss": 0.0143, + "reward": 0.2916666865348816, + "reward_std": 0.4154745042324066, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 164 + }, + { + "completion_length": 250.0, + "epoch": 0.01375, + "grad_norm": 0.407955139875412, + "kl": 0.31000596284866333, + "learning_rate": 4.935925161963089e-06, + "loss": 0.0124, + "reward": 0.2083333432674408, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 165 + }, + { + "completion_length": 250.0, + "epoch": 0.013833333333333333, + "grad_norm": 1.1364095211029053, + "kl": 0.3804260492324829, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0152, + "reward": 0.2083333432674408, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 166 + }, + { + "completion_length": 250.0, + "epoch": 0.013916666666666667, + "grad_norm": 5.946568489074707, + "kl": 0.4389808773994446, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.0176, + "reward": 0.4166666865348816, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666567325592, + "step": 167 + }, + { + "completion_length": 250.0, + "epoch": 0.014, + "grad_norm": 0.48689746856689453, + "kl": 0.4137730002403259, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0166, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 168 + }, + { + "completion_length": 250.0, + "epoch": 0.014083333333333333, + "grad_norm": 0.30347901582717896, + "kl": 0.2522311210632324, + "learning_rate": 4.92783569977409e-06, + "loss": 0.0101, + "reward": 0.1666666716337204, + "reward_std": 0.2519763112068176, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 169 + }, + { + "completion_length": 250.0, + "epoch": 0.014166666666666666, + "grad_norm": 0.7049718499183655, + "kl": 0.5534805655479431, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0221, + "reward": 0.25, + "reward_std": 0.3450327515602112, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.125, + "step": 170 + }, + { + "completion_length": 250.0, + "epoch": 0.01425, + "grad_norm": 3.0675125122070312, + "kl": 0.2963431477546692, + "learning_rate": 4.923613374737848e-06, + "loss": 0.0119, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 171 + }, + { + "completion_length": 250.0, + "epoch": 0.014333333333333333, + "grad_norm": 0.5279906392097473, + "kl": 0.276735782623291, + "learning_rate": 4.921457902821578e-06, + "loss": 0.0111, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 172 + }, + { + "completion_length": 250.0, + "epoch": 0.014416666666666666, + "grad_norm": 0.3988935947418213, + "kl": 0.34347009658813477, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.0137, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5, + "step": 173 + }, + { + "completion_length": 250.0, + "epoch": 0.0145, + "grad_norm": 1.7523726224899292, + "kl": 0.44115620851516724, + "learning_rate": 4.917058471511149e-06, + "loss": 0.0176, + "reward": 0.1666666716337204, + "reward_std": 0.2519763112068176, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 174 + }, + { + "completion_length": 250.0, + "epoch": 0.014583333333333334, + "grad_norm": 0.5499840378761292, + "kl": 0.37359291315078735, + "learning_rate": 4.914814565722671e-06, + "loss": 0.0149, + "reward": 0.0833333358168602, + "reward_std": 0.2357022762298584, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0833333358168602, + "step": 175 + }, + { + "completion_length": 250.0, + "epoch": 0.014666666666666666, + "grad_norm": 1.1000566482543945, + "kl": 0.5037091374397278, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0201, + "reward": 0.2916666567325592, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.1666666716337204, + "step": 176 + }, + { + "completion_length": 250.0, + "epoch": 0.01475, + "grad_norm": 0.3692109286785126, + "kl": 0.32822439074516296, + "learning_rate": 4.910238510585275e-06, + "loss": 0.0131, + "reward": 0.1666666716337204, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 177 + }, + { + "completion_length": 250.0, + "epoch": 0.014833333333333334, + "grad_norm": 0.38307639956474304, + "kl": 0.45367297530174255, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0181, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0, + "step": 178 + }, + { + "completion_length": 250.0, + "epoch": 0.014916666666666667, + "grad_norm": 0.8600552678108215, + "kl": 0.4305053949356079, + "learning_rate": 4.905544983823214e-06, + "loss": 0.0172, + "reward": 0.625, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5, + "step": 179 + }, + { + "completion_length": 250.0, + "epoch": 0.015, + "grad_norm": 0.8289276957511902, + "kl": 0.36276039481163025, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0145, + "reward": 0.2083333432674408, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 180 + }, + { + "completion_length": 250.0, + "epoch": 0.015083333333333334, + "grad_norm": 0.8905088901519775, + "kl": 0.42038679122924805, + "learning_rate": 4.900734214192358e-06, + "loss": 0.0168, + "reward": 0.375, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.25, + "step": 181 + }, + { + "completion_length": 250.0, + "epoch": 0.015166666666666667, + "grad_norm": 0.4034233093261719, + "kl": 0.35581687092781067, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0142, + "reward": 0.4166666567325592, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666865348816, + "step": 182 + }, + { + "completion_length": 250.0, + "epoch": 0.01525, + "grad_norm": 0.392464816570282, + "kl": 0.6867326498031616, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.0275, + "reward": 0.5, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5, + "step": 183 + }, + { + "completion_length": 250.0, + "epoch": 0.015333333333333332, + "grad_norm": 0.7030049562454224, + "kl": 0.3916150629520416, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0157, + "reward": 0.3333333432674408, + "reward_std": 0.6900655627250671, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2083333432674408, + "step": 184 + }, + { + "completion_length": 250.0, + "epoch": 0.015416666666666667, + "grad_norm": 0.7821568846702576, + "kl": 0.7238714098930359, + "learning_rate": 4.890761889907589e-06, + "loss": 0.029, + "reward": 0.75, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.625, + "step": 185 + }, + { + "completion_length": 250.0, + "epoch": 0.0155, + "grad_norm": 0.8786028027534485, + "kl": 0.4036749303340912, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0161, + "reward": 0.5416666865348816, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.4166666865348816, + "step": 186 + }, + { + "completion_length": 250.0, + "epoch": 0.015583333333333333, + "grad_norm": 0.5510672926902771, + "kl": 0.4351351857185364, + "learning_rate": 4.885600821290692e-06, + "loss": 0.0174, + "reward": 0.4166666865348816, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666865348816, + "step": 187 + }, + { + "completion_length": 250.0, + "epoch": 0.015666666666666666, + "grad_norm": 0.2654396891593933, + "kl": 0.2575778365135193, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0103, + "reward": 0.9166666865348816, + "reward_std": 0.8498365879058838, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.4166666567325592, + "step": 188 + }, + { + "completion_length": 250.0, + "epoch": 0.01575, + "grad_norm": 0.6470081210136414, + "kl": 0.43255674839019775, + "learning_rate": 4.880323481855347e-06, + "loss": 0.0173, + "reward": 0.4583333730697632, + "reward_std": 0.5616726875305176, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.2083333432674408, + "step": 189 + }, + { + "completion_length": 250.0, + "epoch": 0.015833333333333335, + "grad_norm": 0.6133238673210144, + "kl": 0.370347797870636, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0148, + "reward": 0.0416666679084301, + "reward_std": 0.1178511381149292, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.0416666679084301, + "step": 190 + }, + { + "completion_length": 250.0, + "epoch": 0.015916666666666666, + "grad_norm": 3.239405393600464, + "kl": 0.5021273493766785, + "learning_rate": 4.874930128811631e-06, + "loss": 0.0201, + "reward": 0.5416666269302368, + "reward_std": 0.6651768684387207, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.2916666567325592, + "step": 191 + }, + { + "completion_length": 250.0, + "epoch": 0.016, + "grad_norm": 0.8710654973983765, + "kl": 0.5387289524078369, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0215, + "reward": 0.625, + "reward_std": 0.6770032048225403, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5, + "step": 192 + }, + { + "completion_length": 250.0, + "epoch": 0.016083333333333335, + "grad_norm": 1.41133451461792, + "kl": 0.5218385457992554, + "learning_rate": 4.869421025023965e-06, + "loss": 0.0209, + "reward": 0.5416666865348816, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.4166666865348816, + "step": 193 + }, + { + "completion_length": 250.0, + "epoch": 0.016166666666666666, + "grad_norm": 0.9869065880775452, + "kl": 0.5528762340545654, + "learning_rate": 4.866623150289241e-06, + "loss": 0.0221, + "reward": 0.4166666865348816, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666865348816, + "step": 194 + }, + { + "completion_length": 250.0, + "epoch": 0.01625, + "grad_norm": 0.347484827041626, + "kl": 0.3075355589389801, + "learning_rate": 4.863796438998293e-06, + "loss": 0.0123, + "reward": 0.375, + "reward_std": 0.4154745042324066, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.375, + "step": 195 + }, + { + "completion_length": 250.0, + "epoch": 0.01633333333333333, + "grad_norm": 0.5246497988700867, + "kl": 0.47872331738471985, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0191, + "reward": 0.4166666567325592, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4166666865348816, + "step": 196 + }, + { + "completion_length": 250.0, + "epoch": 0.016416666666666666, + "grad_norm": 0.5665456056594849, + "kl": 0.40581831336021423, + "learning_rate": 4.858056644869002e-06, + "loss": 0.0162, + "reward": 0.7916666865348816, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.4166666865348816, + "step": 197 + }, + { + "completion_length": 250.0, + "epoch": 0.0165, + "grad_norm": 0.37036922574043274, + "kl": 0.4440433382987976, + "learning_rate": 4.855143631968242e-06, + "loss": 0.0178, + "reward": 0.625, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.625, + "step": 198 + }, + { + "completion_length": 250.0, + "epoch": 0.016583333333333332, + "grad_norm": 1.1761598587036133, + "kl": 0.5070418119430542, + "learning_rate": 4.852201922385564e-06, + "loss": 0.0203, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.25, + "step": 199 + }, + { + "completion_length": 250.0, + "epoch": 0.016666666666666666, + "grad_norm": 0.3598565459251404, + "kl": 0.38801202178001404, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0155, + "reward": 0.4583333134651184, + "reward_std": 0.7753647565841675, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.2083333432674408, + "step": 200 + }, + { + "completion_length": 250.0, + "epoch": 0.01675, + "grad_norm": 6.092010974884033, + "kl": 0.5048520565032959, + "learning_rate": 4.84623255689889e-06, + "loss": 0.0202, + "reward": 0.2083333432674408, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 201 + }, + { + "completion_length": 250.0, + "epoch": 0.016833333333333332, + "grad_norm": 2.679046392440796, + "kl": 0.5951515436172485, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0238, + "reward": 0.5, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.25, + "step": 202 + }, + { + "completion_length": 250.0, + "epoch": 0.016916666666666667, + "grad_norm": 1.4502449035644531, + "kl": 0.6235775351524353, + "learning_rate": 4.840148839347434e-06, + "loss": 0.0249, + "reward": 0.5416666269302368, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.4166666865348816, + "step": 203 + }, + { + "completion_length": 250.0, + "epoch": 0.017, + "grad_norm": 0.34813621640205383, + "kl": 0.3194099962711334, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0128, + "reward": 0.6666666269302368, + "reward_std": 0.8908708095550537, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.2916666567325592, + "step": 204 + }, + { + "completion_length": 250.0, + "epoch": 0.017083333333333332, + "grad_norm": 0.7955384254455566, + "kl": 0.6405460238456726, + "learning_rate": 4.833951066243004e-06, + "loss": 0.0256, + "reward": 0.4583333432674408, + "reward_std": 0.46929529309272766, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4583333134651184, + "step": 205 + }, + { + "completion_length": 250.0, + "epoch": 0.017166666666666667, + "grad_norm": 0.6468902826309204, + "kl": 0.4588083326816559, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0184, + "reward": 0.2083333432674408, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 206 + }, + { + "completion_length": 250.0, + "epoch": 0.01725, + "grad_norm": 0.6557303071022034, + "kl": 0.4808078706264496, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.0192, + "reward": 0.4583333432674408, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 207 + }, + { + "completion_length": 250.0, + "epoch": 0.017333333333333333, + "grad_norm": 0.8586503267288208, + "kl": 0.7754402756690979, + "learning_rate": 4.824441214720629e-06, + "loss": 0.031, + "reward": 0.5416666865348816, + "reward_std": 0.43415671586990356, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5416666865348816, + "step": 208 + }, + { + "completion_length": 250.0, + "epoch": 0.017416666666666667, + "grad_norm": 0.2894943356513977, + "kl": 0.2780013978481293, + "learning_rate": 4.821214567202284e-06, + "loss": 0.0111, + "reward": 0.3333333432674408, + "reward_std": 0.4364357590675354, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.3333333432674408, + "step": 209 + }, + { + "completion_length": 250.0, + "epoch": 0.0175, + "grad_norm": 0.4333679676055908, + "kl": 0.4271698594093323, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0171, + "reward": 0.5416666865348816, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.4166666865348816, + "step": 210 + }, + { + "completion_length": 250.0, + "epoch": 0.017583333333333333, + "grad_norm": 2.1452853679656982, + "kl": 0.41794443130493164, + "learning_rate": 4.814676462024988e-06, + "loss": 0.0167, + "reward": 0.375, + "reward_std": 0.41547447443008423, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.25, + "step": 211 + }, + { + "completion_length": 250.0, + "epoch": 0.017666666666666667, + "grad_norm": 0.37924328446388245, + "kl": 0.5044135451316833, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0202, + "reward": 0.4166666567325592, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666865348816, + "step": 212 + }, + { + "completion_length": 250.0, + "epoch": 0.01775, + "grad_norm": 1.1589990854263306, + "kl": 0.5812166333198547, + "learning_rate": 4.808025542782453e-06, + "loss": 0.0232, + "reward": 0.4166666865348816, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666865348816, + "step": 213 + }, + { + "completion_length": 250.0, + "epoch": 0.017833333333333333, + "grad_norm": 2.28521728515625, + "kl": 0.5571350455284119, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0223, + "reward": 0.4583333432674408, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 214 + }, + { + "completion_length": 250.0, + "epoch": 0.017916666666666668, + "grad_norm": 0.861594557762146, + "kl": 0.6830026507377625, + "learning_rate": 4.801262133631101e-06, + "loss": 0.0273, + "reward": 0.8333333134651184, + "reward_std": 0.8164966106414795, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333730697632, + "step": 215 + }, + { + "completion_length": 250.0, + "epoch": 0.018, + "grad_norm": 1.8360395431518555, + "kl": 1.10158371925354, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0441, + "reward": 0.4583333730697632, + "reward_std": 0.43415671586990356, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 216 + }, + { + "completion_length": 250.0, + "epoch": 0.018083333333333333, + "grad_norm": 0.957562267780304, + "kl": 0.542965292930603, + "learning_rate": 4.794386564209953e-06, + "loss": 0.0217, + "reward": 1.1666666269302368, + "reward_std": 0.9428090453147888, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.6666666269302368, + "step": 217 + }, + { + "completion_length": 250.0, + "epoch": 0.018166666666666668, + "grad_norm": 0.33301106095314026, + "kl": 0.3330531120300293, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0133, + "reward": 0.2916666567325592, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 218 + }, + { + "completion_length": 250.0, + "epoch": 0.01825, + "grad_norm": 0.9023637771606445, + "kl": 0.5449975728988647, + "learning_rate": 4.787399169624562e-06, + "loss": 0.0218, + "reward": 0.2083333432674408, + "reward_std": 0.24800793826580048, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 219 + }, + { + "completion_length": 250.0, + "epoch": 0.018333333333333333, + "grad_norm": 4.706124782562256, + "kl": 0.884423553943634, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0354, + "reward": 0.75, + "reward_std": 0.7918233275413513, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5, + "step": 220 + }, + { + "completion_length": 250.0, + "epoch": 0.018416666666666668, + "grad_norm": 0.7690190076828003, + "kl": 0.6677464246749878, + "learning_rate": 4.780300290430683e-06, + "loss": 0.0267, + "reward": 0.4583333432674408, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 221 + }, + { + "completion_length": 250.0, + "epoch": 0.0185, + "grad_norm": 0.7956643104553223, + "kl": 0.4805806577205658, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0192, + "reward": 0.375, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.3750000298023224, + "step": 222 + }, + { + "completion_length": 250.0, + "epoch": 0.018583333333333334, + "grad_norm": 0.3264180123806, + "kl": 0.46136581897735596, + "learning_rate": 4.773090272617672e-06, + "loss": 0.0185, + "reward": 0.2916666865348816, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 223 + }, + { + "completion_length": 250.0, + "epoch": 0.018666666666666668, + "grad_norm": 0.33508577942848206, + "kl": 0.33475160598754883, + "learning_rate": 4.769443696332272e-06, + "loss": 0.0134, + "reward": 0.1666666716337204, + "reward_std": 0.17817416787147522, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.1666666716337204, + "step": 224 + }, + { + "completion_length": 250.0, + "epoch": 0.01875, + "grad_norm": 0.35157206654548645, + "kl": 0.48729386925697327, + "learning_rate": 4.765769467591626e-06, + "loss": 0.0195, + "reward": 0.4166666865348816, + "reward_std": 0.42724665999412537, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4166666865348816, + "step": 225 + }, + { + "completion_length": 250.0, + "epoch": 0.018833333333333334, + "grad_norm": 0.44434380531311035, + "kl": 0.27789148688316345, + "learning_rate": 4.762067631165049e-06, + "loss": 0.0111, + "reward": 0.25, + "reward_std": 0.34503278136253357, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.125, + "step": 226 + }, + { + "completion_length": 250.0, + "epoch": 0.018916666666666665, + "grad_norm": 1.0383962392807007, + "kl": 0.6592018604278564, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.0264, + "reward": 0.75, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5, + "step": 227 + }, + { + "completion_length": 250.0, + "epoch": 0.019, + "grad_norm": 0.7657462954521179, + "kl": 0.45288151502609253, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0181, + "reward": 0.375, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.375, + "step": 228 + }, + { + "completion_length": 250.0, + "epoch": 0.019083333333333334, + "grad_norm": 0.35769039392471313, + "kl": 0.4359067380428314, + "learning_rate": 4.750796928505484e-06, + "loss": 0.0174, + "reward": 0.3333333432674408, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.3333333432674408, + "step": 229 + }, + { + "completion_length": 250.0, + "epoch": 0.019166666666666665, + "grad_norm": 0.6205803155899048, + "kl": 0.43002450466156006, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0172, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.375, + "step": 230 + }, + { + "completion_length": 250.0, + "epoch": 0.01925, + "grad_norm": 0.44161456823349, + "kl": 0.4217735230922699, + "learning_rate": 4.743145924185821e-06, + "loss": 0.0169, + "reward": 0.375, + "reward_std": 0.4154745042324066, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.375, + "step": 231 + }, + { + "completion_length": 250.0, + "epoch": 0.019333333333333334, + "grad_norm": 1.402330994606018, + "kl": 0.5382075309753418, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0215, + "reward": 0.5416666865348816, + "reward_std": 0.43415671586990356, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5416666865348816, + "step": 232 + }, + { + "completion_length": 250.0, + "epoch": 0.019416666666666665, + "grad_norm": 0.30393776297569275, + "kl": 0.41637468338012695, + "learning_rate": 4.735385592098421e-06, + "loss": 0.0167, + "reward": 0.2083333432674408, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2083333432674408, + "step": 233 + }, + { + "completion_length": 250.0, + "epoch": 0.0195, + "grad_norm": 0.29830071330070496, + "kl": 0.38214111328125, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0153, + "reward": 0.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.4583333432674408, + "step": 234 + }, + { + "completion_length": 250.0, + "epoch": 0.019583333333333335, + "grad_norm": 0.7765196561813354, + "kl": 0.6346302032470703, + "learning_rate": 4.72751631047092e-06, + "loss": 0.0254, + "reward": 0.6250000596046448, + "reward_std": 0.8249579668045044, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.3750000298023224, + "step": 235 + }, + { + "completion_length": 250.0, + "epoch": 0.019666666666666666, + "grad_norm": 0.5595923662185669, + "kl": 0.5578448176383972, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0223, + "reward": 0.4583333432674408, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 236 + }, + { + "completion_length": 250.0, + "epoch": 0.01975, + "grad_norm": 3.7788710594177246, + "kl": 0.4666965901851654, + "learning_rate": 4.719538462841003e-06, + "loss": 0.0187, + "reward": 0.3333333432674408, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.3333333432674408, + "step": 237 + }, + { + "completion_length": 250.0, + "epoch": 0.019833333333333335, + "grad_norm": 0.2744882106781006, + "kl": 0.49767276644706726, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0199, + "reward": 0.5833333730697632, + "reward_std": 0.34503278136253357, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5833333730697632, + "step": 238 + }, + { + "completion_length": 250.0, + "epoch": 0.019916666666666666, + "grad_norm": 37.09307098388672, + "kl": 2.531292676925659, + "learning_rate": 4.71145243803771e-06, + "loss": 0.1013, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5, + "step": 239 + }, + { + "completion_length": 250.0, + "epoch": 0.02, + "grad_norm": 0.5616350173950195, + "kl": 0.8509740829467773, + "learning_rate": 4.707368982147318e-06, + "loss": 0.034, + "reward": 0.6666666865348816, + "reward_std": 0.6900655627250671, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5416666865348816, + "step": 240 + }, + { + "completion_length": 250.0, + "epoch": 0.020083333333333335, + "grad_norm": 0.3082992136478424, + "kl": 0.4213227331638336, + "learning_rate": 4.703258630162481e-06, + "loss": 0.0169, + "reward": 0.2916666567325592, + "reward_std": 0.37533050775527954, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 241 + }, + { + "completion_length": 250.0, + "epoch": 0.020166666666666666, + "grad_norm": 0.35087594389915466, + "kl": 0.34501180052757263, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0138, + "reward": 0.4583333432674408, + "reward_std": 0.43415671586990356, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4583333432674408, + "step": 242 + }, + { + "completion_length": 250.0, + "epoch": 0.02025, + "grad_norm": 1.2634652853012085, + "kl": 0.8103005290031433, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.0324, + "reward": 1.1666666269302368, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 243 + }, + { + "completion_length": 250.0, + "epoch": 0.02033333333333333, + "grad_norm": 0.8633314967155457, + "kl": 0.8408775925636292, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0336, + "reward": 0.8333333730697632, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.7083333730697632, + "step": 244 + }, + { + "completion_length": 250.0, + "epoch": 0.020416666666666666, + "grad_norm": 0.34025847911834717, + "kl": 0.3001478612422943, + "learning_rate": 4.68654926784849e-06, + "loss": 0.012, + "reward": 0.2916666865348816, + "reward_std": 0.33034375309944153, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 245 + }, + { + "completion_length": 250.0, + "epoch": 0.0205, + "grad_norm": 0.6269357800483704, + "kl": 0.6774225831031799, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0271, + "reward": 0.5, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5, + "step": 246 + }, + { + "completion_length": 250.0, + "epoch": 0.020583333333333332, + "grad_norm": 0.261139839887619, + "kl": 0.628397524356842, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.0251, + "reward": 0.9583333730697632, + "reward_std": 0.7650604248046875, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.5833333730697632, + "step": 247 + }, + { + "completion_length": 250.0, + "epoch": 0.020666666666666667, + "grad_norm": 0.3640219569206238, + "kl": 0.35570889711380005, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0142, + "reward": 0.4166666567325592, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4166666567325592, + "step": 248 + }, + { + "completion_length": 250.0, + "epoch": 0.02075, + "grad_norm": 0.3684917688369751, + "kl": 0.3530455231666565, + "learning_rate": 4.669413633422322e-06, + "loss": 0.0141, + "reward": 0.2916666865348816, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.2916666865348816, + "step": 249 + }, + { + "completion_length": 250.0, + "epoch": 0.020833333333333332, + "grad_norm": 0.4557255804538727, + "kl": 0.6199475526809692, + "learning_rate": 4.665063509461098e-06, + "loss": 0.0248, + "reward": 1.1666667461395264, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 250 + }, + { + "completion_length": 250.0, + "epoch": 0.020916666666666667, + "grad_norm": 0.2662737965583801, + "kl": 0.4792260229587555, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.0192, + "reward": 0.4583333134651184, + "reward_std": 0.7113032937049866, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 251 + }, + { + "completion_length": 250.0, + "epoch": 0.021, + "grad_norm": 0.3056776821613312, + "kl": 0.6897392868995667, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0276, + "reward": 1.2916667461395264, + "reward_std": 0.6770032048225403, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.6666666865348816, + "step": 252 + }, + { + "completion_length": 250.0, + "epoch": 0.021083333333333332, + "grad_norm": 0.33795520663261414, + "kl": 0.3045516312122345, + "learning_rate": 4.65185506750986e-06, + "loss": 0.0122, + "reward": 0.4583333134651184, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4583333730697632, + "step": 253 + }, + { + "completion_length": 250.0, + "epoch": 0.021166666666666667, + "grad_norm": 0.24205689132213593, + "kl": 0.42878177762031555, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0172, + "reward": 1.125, + "reward_std": 0.589255690574646, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7500000596046448, + "step": 254 + }, + { + "completion_length": 250.0, + "epoch": 0.02125, + "grad_norm": 1.4344029426574707, + "kl": 0.7129054665565491, + "learning_rate": 4.642918251755281e-06, + "loss": 0.0285, + "reward": 0.5416666269302368, + "reward_std": 0.5019802451133728, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5416666865348816, + "step": 255 + }, + { + "completion_length": 250.0, + "epoch": 0.021333333333333333, + "grad_norm": 2.667504072189331, + "kl": 1.6249091625213623, + "learning_rate": 4.638410650401267e-06, + "loss": 0.065, + "reward": 0.6666666269302368, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5416666269302368, + "step": 256 + }, + { + "completion_length": 250.0, + "epoch": 0.021416666666666667, + "grad_norm": 0.5786775946617126, + "kl": 0.8081812262535095, + "learning_rate": 4.633876993188319e-06, + "loss": 0.0323, + "reward": 0.8333333730697632, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333730697632, + "step": 257 + }, + { + "completion_length": 250.0, + "epoch": 0.0215, + "grad_norm": 4.823422908782959, + "kl": 0.7295472025871277, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0292, + "reward": 0.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.4583333432674408, + "step": 258 + }, + { + "completion_length": 250.0, + "epoch": 0.021583333333333333, + "grad_norm": 0.7597243785858154, + "kl": 0.7165347337722778, + "learning_rate": 4.62473173246716e-06, + "loss": 0.0287, + "reward": 0.7083333730697632, + "reward_std": 0.6770032048225403, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5833333730697632, + "step": 259 + }, + { + "completion_length": 250.0, + "epoch": 0.021666666666666667, + "grad_norm": 0.6498469710350037, + "kl": 0.6449453830718994, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0258, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.375, + "step": 260 + }, + { + "completion_length": 250.0, + "epoch": 0.02175, + "grad_norm": 1.3547675609588623, + "kl": 0.7189813256263733, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.0288, + "reward": 0.5, + "reward_std": 0.4364357888698578, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5, + "step": 261 + }, + { + "completion_length": 250.0, + "epoch": 0.021833333333333333, + "grad_norm": 0.38959622383117676, + "kl": 0.6248072981834412, + "learning_rate": 4.610819813755038e-06, + "loss": 0.025, + "reward": 0.6666666865348816, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5416666865348816, + "step": 262 + }, + { + "completion_length": 250.0, + "epoch": 0.021916666666666668, + "grad_norm": 0.5569881200790405, + "kl": 0.5907061696052551, + "learning_rate": 4.60613099251787e-06, + "loss": 0.0236, + "reward": 0.625, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.625, + "step": 263 + }, + { + "completion_length": 250.0, + "epoch": 0.022, + "grad_norm": 0.9613457322120667, + "kl": 0.7936864495277405, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0317, + "reward": 0.7083333730697632, + "reward_std": 0.6770032048225403, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5833333730697632, + "step": 264 + }, + { + "completion_length": 250.0, + "epoch": 0.022083333333333333, + "grad_norm": 0.9828578233718872, + "kl": 0.6290979981422424, + "learning_rate": 4.596676419863561e-06, + "loss": 0.0252, + "reward": 0.75, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5, + "step": 265 + }, + { + "completion_length": 250.0, + "epoch": 0.022166666666666668, + "grad_norm": 20.848907470703125, + "kl": 2.7031993865966797, + "learning_rate": 4.591910783647405e-06, + "loss": 0.1081, + "reward": 0.7083333730697632, + "reward_std": 0.6531364917755127, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5833333730697632, + "step": 266 + }, + { + "completion_length": 250.0, + "epoch": 0.02225, + "grad_norm": 0.6771628260612488, + "kl": 0.762807309627533, + "learning_rate": 4.587119658158517e-06, + "loss": 0.0305, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.625, + "step": 267 + }, + { + "completion_length": 250.0, + "epoch": 0.022333333333333334, + "grad_norm": 1.2213846445083618, + "kl": 0.7535417675971985, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0301, + "reward": 0.8333333730697632, + "reward_std": 0.5909367799758911, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.7083333730697632, + "step": 268 + }, + { + "completion_length": 250.0, + "epoch": 0.022416666666666668, + "grad_norm": 0.3612246513366699, + "kl": 0.4376307725906372, + "learning_rate": 4.577461173185821e-06, + "loss": 0.0175, + "reward": 0.4583333432674408, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.3333333432674408, + "step": 269 + }, + { + "completion_length": 250.0, + "epoch": 0.0225, + "grad_norm": 0.5589926242828369, + "kl": 0.43723782896995544, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0175, + "reward": 0.375, + "reward_std": 0.3753305673599243, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.375, + "step": 270 + }, + { + "completion_length": 250.0, + "epoch": 0.022583333333333334, + "grad_norm": 0.4354017972946167, + "kl": 0.6601411700248718, + "learning_rate": 4.567701435686405e-06, + "loss": 0.0264, + "reward": 0.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.7083333730697632, + "step": 271 + }, + { + "completion_length": 250.0, + "epoch": 0.02266666666666667, + "grad_norm": 0.31577983498573303, + "kl": 0.5072486996650696, + "learning_rate": 4.562783745695738e-06, + "loss": 0.0203, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.625, + "step": 272 + }, + { + "completion_length": 250.0, + "epoch": 0.02275, + "grad_norm": 0.44634732604026794, + "kl": 0.631403386592865, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.0253, + "reward": 1.125, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 273 + }, + { + "completion_length": 250.0, + "epoch": 0.022833333333333334, + "grad_norm": 2.6211862564086914, + "kl": 0.8090041875839233, + "learning_rate": 4.55287302283426e-06, + "loss": 0.0324, + "reward": 0.8333333134651184, + "reward_std": 0.8164964914321899, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333730697632, + "step": 274 + }, + { + "completion_length": 250.0, + "epoch": 0.022916666666666665, + "grad_norm": 5.3712663650512695, + "kl": 1.6774547100067139, + "learning_rate": 4.54788011072248e-06, + "loss": 0.0671, + "reward": 0.7916666865348816, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5416666865348816, + "step": 275 + }, + { + "completion_length": 250.0, + "epoch": 0.023, + "grad_norm": 0.9446873068809509, + "kl": 0.5351336002349854, + "learning_rate": 4.542862245837821e-06, + "loss": 0.0214, + "reward": 0.4166666865348816, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.2916666567325592, + "step": 276 + }, + { + "completion_length": 250.0, + "epoch": 0.023083333333333334, + "grad_norm": 2.5900323390960693, + "kl": 1.0092023611068726, + "learning_rate": 4.537819489321385e-06, + "loss": 0.0404, + "reward": 0.9166666865348816, + "reward_std": 0.5841830372810364, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.7916666269302368, + "step": 277 + }, + { + "completion_length": 250.0, + "epoch": 0.023166666666666665, + "grad_norm": 0.4755576550960541, + "kl": 0.756645917892456, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.0303, + "reward": 0.8333333730697632, + "reward_std": 0.4364357888698578, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333730697632, + "step": 278 + }, + { + "completion_length": 250.0, + "epoch": 0.02325, + "grad_norm": 0.26933223009109497, + "kl": 0.7350080013275146, + "learning_rate": 4.527659547473317e-06, + "loss": 0.0294, + "reward": 1.0, + "reward_std": 0.6900655031204224, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 279 + }, + { + "completion_length": 189.0, + "epoch": 0.023333333333333334, + "grad_norm": 0.3307403028011322, + "kl": 0.6269306540489197, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0251, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 280 + }, + { + "completion_length": 250.0, + "epoch": 0.023416666666666665, + "grad_norm": 0.4250253140926361, + "kl": 0.7219645977020264, + "learning_rate": 4.517400780359505e-06, + "loss": 0.0289, + "reward": 1.0, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 281 + }, + { + "completion_length": 250.0, + "epoch": 0.0235, + "grad_norm": 0.2644083499908447, + "kl": 0.7858296632766724, + "learning_rate": 4.512234493389785e-06, + "loss": 0.0314, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9583333730697632, + "step": 282 + }, + { + "completion_length": 250.0, + "epoch": 0.023583333333333335, + "grad_norm": 0.9490629434585571, + "kl": 1.0001749992370605, + "learning_rate": 4.507043687977787e-06, + "loss": 0.04, + "reward": 0.8333333730697632, + "reward_std": 0.6900655627250671, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333730697632, + "step": 283 + }, + { + "completion_length": 250.0, + "epoch": 0.023666666666666666, + "grad_norm": 2.613924741744995, + "kl": 0.9775833487510681, + "learning_rate": 4.501828427371834e-06, + "loss": 0.0391, + "reward": 0.875, + "reward_std": 0.9074209332466125, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.375, + "step": 284 + }, + { + "completion_length": 250.0, + "epoch": 0.02375, + "grad_norm": 0.32292431592941284, + "kl": 0.39211633801460266, + "learning_rate": 4.496588775118232e-06, + "loss": 0.0157, + "reward": 0.4583333432674408, + "reward_std": 0.43415671586990356, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.4583333432674408, + "step": 285 + }, + { + "completion_length": 250.0, + "epoch": 0.023833333333333335, + "grad_norm": 3.7025341987609863, + "kl": 0.681837260723114, + "learning_rate": 4.491324795060491e-06, + "loss": 0.0273, + "reward": 0.7916666865348816, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.6666666865348816, + "step": 286 + }, + { + "completion_length": 250.0, + "epoch": 0.023916666666666666, + "grad_norm": 0.4885029196739197, + "kl": 0.9224212765693665, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.0369, + "reward": 0.9583333730697632, + "reward_std": 0.7650604248046875, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.5833333730697632, + "step": 287 + }, + { + "completion_length": 250.0, + "epoch": 0.024, + "grad_norm": 0.3269827365875244, + "kl": 0.5056464672088623, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.0202, + "reward": 0.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5833333134651184, + "step": 288 + }, + { + "completion_length": 250.0, + "epoch": 0.024083333333333335, + "grad_norm": 1.628382682800293, + "kl": 0.7768495678901672, + "learning_rate": 4.475387530939226e-06, + "loss": 0.0311, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 289 + }, + { + "completion_length": 250.0, + "epoch": 0.024166666666666666, + "grad_norm": 0.42525872588157654, + "kl": 0.44087234139442444, + "learning_rate": 4.470026884016805e-06, + "loss": 0.0176, + "reward": 0.6666666865348816, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5416666269302368, + "step": 290 + }, + { + "completion_length": 250.0, + "epoch": 0.02425, + "grad_norm": 1.2830743789672852, + "kl": 0.5833079814910889, + "learning_rate": 4.464642232938505e-06, + "loss": 0.0233, + "reward": 0.5416666865348816, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.4166666865348816, + "step": 291 + }, + { + "completion_length": 250.0, + "epoch": 0.024333333333333332, + "grad_norm": 0.2711179256439209, + "kl": 0.6251975297927856, + "learning_rate": 4.4592336433146e-06, + "loss": 0.025, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333134651184, + "step": 292 + }, + { + "completion_length": 250.0, + "epoch": 0.024416666666666666, + "grad_norm": 1.7788009643554688, + "kl": 1.1422821283340454, + "learning_rate": 4.453801181047047e-06, + "loss": 0.0457, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 293 + }, + { + "completion_length": 250.0, + "epoch": 0.0245, + "grad_norm": 0.6440579295158386, + "kl": 0.7067055106163025, + "learning_rate": 4.448344912328686e-06, + "loss": 0.0283, + "reward": 0.9583333730697632, + "reward_std": 0.5473601818084717, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.8333333730697632, + "step": 294 + }, + { + "completion_length": 250.0, + "epoch": 0.024583333333333332, + "grad_norm": 1.1637884378433228, + "kl": 1.3202813863754272, + "learning_rate": 4.442864903642428e-06, + "loss": 0.0528, + "reward": 1.2083333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 295 + }, + { + "completion_length": 250.0, + "epoch": 0.024666666666666667, + "grad_norm": 0.35194841027259827, + "kl": 0.6548830270767212, + "learning_rate": 4.437361221760449e-06, + "loss": 0.0262, + "reward": 1.0, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 296 + }, + { + "completion_length": 250.0, + "epoch": 0.02475, + "grad_norm": 0.7272174954414368, + "kl": 0.7445266842842102, + "learning_rate": 4.431833933743378e-06, + "loss": 0.0298, + "reward": 1.0416666269302368, + "reward_std": 1.0302951335906982, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.5416666865348816, + "step": 297 + }, + { + "completion_length": 250.0, + "epoch": 0.024833333333333332, + "grad_norm": 0.36612656712532043, + "kl": 0.6318120956420898, + "learning_rate": 4.426283106939474e-06, + "loss": 0.0253, + "reward": 0.7916666269302368, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5416666269302368, + "step": 298 + }, + { + "completion_length": 250.0, + "epoch": 0.024916666666666667, + "grad_norm": 4.444971561431885, + "kl": 1.4804537296295166, + "learning_rate": 4.420708808983809e-06, + "loss": 0.0592, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 299 + }, + { + "completion_length": 250.0, + "epoch": 0.025, + "grad_norm": 0.3847966194152832, + "kl": 0.6287730932235718, + "learning_rate": 4.415111107797445e-06, + "loss": 0.0252, + "reward": 1.0416667461395264, + "reward_std": 0.9332908391952515, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.4166666865348816, + "step": 300 + }, + { + "completion_length": 250.0, + "epoch": 0.025083333333333332, + "grad_norm": 1.1420190334320068, + "kl": 0.900063157081604, + "learning_rate": 4.409490071586606e-06, + "loss": 0.036, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 301 + }, + { + "completion_length": 250.0, + "epoch": 0.025166666666666667, + "grad_norm": 1.9523299932479858, + "kl": 1.0245622396469116, + "learning_rate": 4.403845768841842e-06, + "loss": 0.041, + "reward": 0.8333333134651184, + "reward_std": 0.8164965510368347, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333134651184, + "step": 302 + }, + { + "completion_length": 250.0, + "epoch": 0.02525, + "grad_norm": 0.2748126685619354, + "kl": 0.49438124895095825, + "learning_rate": 4.398178268337202e-06, + "loss": 0.0198, + "reward": 0.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.4583333432674408, + "step": 303 + }, + { + "completion_length": 250.0, + "epoch": 0.025333333333333333, + "grad_norm": 0.9594418406486511, + "kl": 0.8395068049430847, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.0336, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 304 + }, + { + "completion_length": 250.0, + "epoch": 0.025416666666666667, + "grad_norm": 0.33724528551101685, + "kl": 0.4758547246456146, + "learning_rate": 4.386773950556931e-06, + "loss": 0.019, + "reward": 0.3750000298023224, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.375, + "step": 305 + }, + { + "completion_length": 250.0, + "epoch": 0.0255, + "grad_norm": 0.8724843859672546, + "kl": 0.882912278175354, + "learning_rate": 4.381037272239311e-06, + "loss": 0.0353, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 306 + }, + { + "completion_length": 250.0, + "epoch": 0.025583333333333333, + "grad_norm": 0.30091574788093567, + "kl": 0.7076762318611145, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.0283, + "reward": 1.0833333730697632, + "reward_std": 0.5841830968856812, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7083333730697632, + "step": 307 + }, + { + "completion_length": 250.0, + "epoch": 0.025666666666666667, + "grad_norm": 0.3942156434059143, + "kl": 1.1950002908706665, + "learning_rate": 4.36949522624633e-06, + "loss": 0.0478, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 308 + }, + { + "completion_length": 250.0, + "epoch": 0.02575, + "grad_norm": 1.6910313367843628, + "kl": 1.2377409934997559, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.0495, + "reward": 1.125, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8750000596046448, + "step": 309 + }, + { + "completion_length": 250.0, + "epoch": 0.025833333333333333, + "grad_norm": 3.3976781368255615, + "kl": 1.8446786403656006, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0738, + "reward": 1.3333332538604736, + "reward_std": 0.9428090453147888, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 310 + }, + { + "completion_length": 250.0, + "epoch": 0.025916666666666668, + "grad_norm": 0.49139174818992615, + "kl": 1.5378676652908325, + "learning_rate": 4.352011490716875e-06, + "loss": 0.0615, + "reward": 1.2916666269302368, + "reward_std": 0.8249579071998596, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.6666666865348816, + "step": 311 + }, + { + "completion_length": 250.0, + "epoch": 0.026, + "grad_norm": 0.49455150961875916, + "kl": 1.1211459636688232, + "learning_rate": 4.346138351564711e-06, + "loss": 0.0448, + "reward": 0.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5833333730697632, + "step": 312 + }, + { + "completion_length": 250.0, + "epoch": 0.026083333333333333, + "grad_norm": 1.1779581308364868, + "kl": 1.5087566375732422, + "learning_rate": 4.340242717799337e-06, + "loss": 0.0604, + "reward": 0.75, + "reward_std": 0.8498365879058838, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5, + "step": 313 + }, + { + "completion_length": 250.0, + "epoch": 0.026166666666666668, + "grad_norm": 0.3693522810935974, + "kl": 0.5869612693786621, + "learning_rate": 4.334324661257191e-06, + "loss": 0.0235, + "reward": 1.0, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.625, + "step": 314 + }, + { + "completion_length": 250.0, + "epoch": 0.02625, + "grad_norm": 2.1444251537323, + "kl": 1.3732556104660034, + "learning_rate": 4.328384254047927e-06, + "loss": 0.0549, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 315 + }, + { + "completion_length": 250.0, + "epoch": 0.026333333333333334, + "grad_norm": 0.30390700697898865, + "kl": 0.9706352949142456, + "learning_rate": 4.322421568553529e-06, + "loss": 0.0388, + "reward": 1.0416667461395264, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9166666865348816, + "step": 316 + }, + { + "completion_length": 250.0, + "epoch": 0.026416666666666668, + "grad_norm": 1.256115436553955, + "kl": 0.730143129825592, + "learning_rate": 4.316436677427441e-06, + "loss": 0.0292, + "reward": 1.375, + "reward_std": 0.8807914853096008, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 317 + }, + { + "completion_length": 250.0, + "epoch": 0.0265, + "grad_norm": 0.3741249442100525, + "kl": 0.9804970622062683, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.0392, + "reward": 1.125, + "reward_std": 0.589255690574646, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 318 + }, + { + "completion_length": 250.0, + "epoch": 0.026583333333333334, + "grad_norm": 2.2285826206207275, + "kl": 1.3444833755493164, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.0538, + "reward": 1.0416666269302368, + "reward_std": 0.7223747968673706, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.6666666865348816, + "step": 319 + }, + { + "completion_length": 250.0, + "epoch": 0.02666666666666667, + "grad_norm": 1.4974360466003418, + "kl": 0.8706364035606384, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.0348, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 320 + }, + { + "completion_length": 250.0, + "epoch": 0.02675, + "grad_norm": 0.8530201315879822, + "kl": 0.9074989557266235, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.0363, + "reward": 0.75, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.625, + "step": 321 + }, + { + "completion_length": 250.0, + "epoch": 0.026833333333333334, + "grad_norm": 0.4051268398761749, + "kl": 0.6288662552833557, + "learning_rate": 4.286181699082008e-06, + "loss": 0.0252, + "reward": 0.875, + "reward_std": 0.7753647565841675, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.625, + "step": 322 + }, + { + "completion_length": 250.0, + "epoch": 0.026916666666666665, + "grad_norm": 0.430800199508667, + "kl": 0.9511799216270447, + "learning_rate": 4.280065114977492e-06, + "loss": 0.038, + "reward": 1.0833333730697632, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.4583333432674408, + "step": 323 + }, + { + "completion_length": 250.0, + "epoch": 0.027, + "grad_norm": 0.48526695370674133, + "kl": 1.0858272314071655, + "learning_rate": 4.273926841341303e-06, + "loss": 0.0434, + "reward": 1.0833332538604736, + "reward_std": 0.9880235195159912, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.5833333730697632, + "step": 324 + }, + { + "completion_length": 222.0, + "epoch": 0.027083333333333334, + "grad_norm": 0.22421292960643768, + "kl": 0.834428071975708, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0334, + "reward": 1.5416667461395264, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 325 + }, + { + "completion_length": 250.0, + "epoch": 0.027166666666666665, + "grad_norm": 0.3456939458847046, + "kl": 1.3001376390457153, + "learning_rate": 4.261585524908987e-06, + "loss": 0.052, + "reward": 1.375, + "reward_std": 0.602573812007904, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 326 + }, + { + "completion_length": 250.0, + "epoch": 0.02725, + "grad_norm": 0.3079489469528198, + "kl": 0.6641181111335754, + "learning_rate": 4.255382632487907e-06, + "loss": 0.0266, + "reward": 0.9583333730697632, + "reward_std": 0.7855339050292969, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.4583333730697632, + "step": 327 + }, + { + "completion_length": 250.0, + "epoch": 0.027333333333333334, + "grad_norm": 0.6239248514175415, + "kl": 0.5432109236717224, + "learning_rate": 4.249158351283414e-06, + "loss": 0.0217, + "reward": 1.0416667461395264, + "reward_std": 0.6770032048225403, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7916666865348816, + "step": 328 + }, + { + "completion_length": 250.0, + "epoch": 0.027416666666666666, + "grad_norm": 2.2812557220458984, + "kl": 1.1748898029327393, + "learning_rate": 4.242912757136412e-06, + "loss": 0.047, + "reward": 1.0416667461395264, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7916666865348816, + "step": 329 + }, + { + "completion_length": 250.0, + "epoch": 0.0275, + "grad_norm": 0.3581920266151428, + "kl": 0.6617559790611267, + "learning_rate": 4.236645926147493e-06, + "loss": 0.0265, + "reward": 1.125, + "reward_std": 0.7955730557441711, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7500000596046448, + "step": 330 + }, + { + "completion_length": 250.0, + "epoch": 0.027583333333333335, + "grad_norm": 1.4809401035308838, + "kl": 1.5578365325927734, + "learning_rate": 4.230357934676017e-06, + "loss": 0.0623, + "reward": 1.4583333730697632, + "reward_std": 0.7546154260635376, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 331 + }, + { + "completion_length": 250.0, + "epoch": 0.027666666666666666, + "grad_norm": 0.35725685954093933, + "kl": 1.0120397806167603, + "learning_rate": 4.224048859339175e-06, + "loss": 0.0405, + "reward": 1.2083333730697632, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 332 + }, + { + "completion_length": 250.0, + "epoch": 0.02775, + "grad_norm": 0.28768303990364075, + "kl": 0.9968042969703674, + "learning_rate": 4.217718777011058e-06, + "loss": 0.0399, + "reward": 1.2916667461395264, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9166666865348816, + "step": 333 + }, + { + "completion_length": 250.0, + "epoch": 0.027833333333333335, + "grad_norm": 0.8159481287002563, + "kl": 0.9098507761955261, + "learning_rate": 4.211367764821722e-06, + "loss": 0.0364, + "reward": 1.0833333730697632, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7083333730697632, + "step": 334 + }, + { + "completion_length": 250.0, + "epoch": 0.027916666666666666, + "grad_norm": 28.54840087890625, + "kl": 6.921156406402588, + "learning_rate": 4.204995900156247e-06, + "loss": 0.2768, + "reward": 1.6666667461395264, + "reward_std": 0.6900655627250671, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916667461395264, + "step": 335 + }, + { + "completion_length": 250.0, + "epoch": 0.028, + "grad_norm": 3.128087282180786, + "kl": 1.198431134223938, + "learning_rate": 4.198603260653792e-06, + "loss": 0.0479, + "reward": 1.5833333730697632, + "reward_std": 0.527046263217926, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 336 + }, + { + "completion_length": 250.0, + "epoch": 0.02808333333333333, + "grad_norm": 8.092500686645508, + "kl": 4.037166595458984, + "learning_rate": 4.192189924206652e-06, + "loss": 0.1615, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.625, + "step": 337 + }, + { + "completion_length": 250.0, + "epoch": 0.028166666666666666, + "grad_norm": 2.2210144996643066, + "kl": 1.3574161529541016, + "learning_rate": 4.185755968959308e-06, + "loss": 0.0543, + "reward": 1.3333333730697632, + "reward_std": 0.8164965510368347, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 338 + }, + { + "completion_length": 250.0, + "epoch": 0.02825, + "grad_norm": 1.8085874319076538, + "kl": 0.9977900385856628, + "learning_rate": 4.179301473307476e-06, + "loss": 0.0399, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 339 + }, + { + "completion_length": 250.0, + "epoch": 0.028333333333333332, + "grad_norm": 0.8611170649528503, + "kl": 0.8231365084648132, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0329, + "reward": 1.1666666269302368, + "reward_std": 0.7968190312385559, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5416666865348816, + "step": 340 + }, + { + "completion_length": 250.0, + "epoch": 0.028416666666666666, + "grad_norm": 0.2523643970489502, + "kl": 1.1845202445983887, + "learning_rate": 4.166331175623631e-06, + "loss": 0.0474, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 341 + }, + { + "completion_length": 250.0, + "epoch": 0.0285, + "grad_norm": 0.3289211690425873, + "kl": 0.9523929953575134, + "learning_rate": 4.159815531630604e-06, + "loss": 0.0381, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 342 + }, + { + "completion_length": 250.0, + "epoch": 0.028583333333333332, + "grad_norm": 1.336962103843689, + "kl": 1.3120596408843994, + "learning_rate": 4.15327966330913e-06, + "loss": 0.0525, + "reward": 1.1666667461395264, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 343 + }, + { + "completion_length": 240.0, + "epoch": 0.028666666666666667, + "grad_norm": 0.3630130887031555, + "kl": 1.1895368099212646, + "learning_rate": 4.146723650296701e-06, + "loss": 0.0476, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 344 + }, + { + "completion_length": 250.0, + "epoch": 0.02875, + "grad_norm": 0.3989483416080475, + "kl": 0.6941573619842529, + "learning_rate": 4.140147572476269e-06, + "loss": 0.0278, + "reward": 0.7083333730697632, + "reward_std": 0.6531365513801575, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.4583333432674408, + "step": 345 + }, + { + "completion_length": 250.0, + "epoch": 0.028833333333333332, + "grad_norm": 0.315608412027359, + "kl": 0.5940113067626953, + "learning_rate": 4.133551509975264e-06, + "loss": 0.0238, + "reward": 1.0416666269302368, + "reward_std": 0.8439795970916748, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.6666666865348816, + "step": 346 + }, + { + "completion_length": 250.0, + "epoch": 0.028916666666666667, + "grad_norm": 0.28037476539611816, + "kl": 0.553116500377655, + "learning_rate": 4.126935543164628e-06, + "loss": 0.0221, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.625, + "step": 347 + }, + { + "completion_length": 250.0, + "epoch": 0.029, + "grad_norm": 0.23763324320316315, + "kl": 0.6913818717002869, + "learning_rate": 4.120299752657828e-06, + "loss": 0.0277, + "reward": 1.6666666269302368, + "reward_std": 0.6424161195755005, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 348 + }, + { + "completion_length": 250.0, + "epoch": 0.029083333333333333, + "grad_norm": 0.367928147315979, + "kl": 0.6871733665466309, + "learning_rate": 4.113644219309877e-06, + "loss": 0.0275, + "reward": 1.375, + "reward_std": 0.8054870963096619, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 349 + }, + { + "completion_length": 250.0, + "epoch": 0.029166666666666667, + "grad_norm": 0.2744889557361603, + "kl": 0.7716153860092163, + "learning_rate": 4.106969024216348e-06, + "loss": 0.0309, + "reward": 0.9583333730697632, + "reward_std": 0.8054871559143066, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.5833333134651184, + "step": 350 + }, + { + "completion_length": 250.0, + "epoch": 0.02925, + "grad_norm": 1.142672061920166, + "kl": 1.1665054559707642, + "learning_rate": 4.1002742487123896e-06, + "loss": 0.0467, + "reward": 1.5833333730697632, + "reward_std": 0.5841830372810364, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 351 + }, + { + "completion_length": 250.0, + "epoch": 0.029333333333333333, + "grad_norm": 0.0448664054274559, + "kl": 1.181343674659729, + "learning_rate": 4.093559974371725e-06, + "loss": 0.0473, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 352 + }, + { + "completion_length": 250.0, + "epoch": 0.029416666666666667, + "grad_norm": 0.2805432975292206, + "kl": 1.1471784114837646, + "learning_rate": 4.086826283005669e-06, + "loss": 0.0459, + "reward": 1.2916666269302368, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9166666865348816, + "step": 353 + }, + { + "completion_length": 250.0, + "epoch": 0.0295, + "grad_norm": 0.36402520537376404, + "kl": 0.9837138056755066, + "learning_rate": 4.080073256662128e-06, + "loss": 0.0393, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 354 + }, + { + "completion_length": 250.0, + "epoch": 0.029583333333333333, + "grad_norm": 0.24651949107646942, + "kl": 0.8240635991096497, + "learning_rate": 4.073300977624594e-06, + "loss": 0.033, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 355 + }, + { + "completion_length": 250.0, + "epoch": 0.029666666666666668, + "grad_norm": 0.8997929692268372, + "kl": 1.0346089601516724, + "learning_rate": 4.066509528411151e-06, + "loss": 0.0414, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 356 + }, + { + "completion_length": 213.0, + "epoch": 0.02975, + "grad_norm": 0.2964262068271637, + "kl": 1.3979225158691406, + "learning_rate": 4.059698991773466e-06, + "loss": 0.0559, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 357 + }, + { + "completion_length": 250.0, + "epoch": 0.029833333333333333, + "grad_norm": 0.5031925439834595, + "kl": 1.1336249113082886, + "learning_rate": 4.052869450695776e-06, + "loss": 0.0453, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 358 + }, + { + "completion_length": 250.0, + "epoch": 0.029916666666666668, + "grad_norm": 0.4681895971298218, + "kl": 0.8900135159492493, + "learning_rate": 4.046020988393886e-06, + "loss": 0.0356, + "reward": 1.0, + "reward_std": 0.8908708095550537, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.625, + "step": 359 + }, + { + "completion_length": 250.0, + "epoch": 0.03, + "grad_norm": 0.320224791765213, + "kl": 0.7603616714477539, + "learning_rate": 4.039153688314146e-06, + "loss": 0.0304, + "reward": 1.25, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 360 + }, + { + "completion_length": 250.0, + "epoch": 0.030083333333333333, + "grad_norm": 0.6727330684661865, + "kl": 0.9661973714828491, + "learning_rate": 4.032267634132442e-06, + "loss": 0.0386, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 361 + }, + { + "completion_length": 250.0, + "epoch": 0.030166666666666668, + "grad_norm": 0.8547607064247131, + "kl": 1.3019939661026, + "learning_rate": 4.02536290975317e-06, + "loss": 0.0521, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 362 + }, + { + "completion_length": 250.0, + "epoch": 0.03025, + "grad_norm": 0.27684468030929565, + "kl": 0.980901300907135, + "learning_rate": 4.018439599308217e-06, + "loss": 0.0392, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.625, + "step": 363 + }, + { + "completion_length": 250.0, + "epoch": 0.030333333333333334, + "grad_norm": 0.6026430726051331, + "kl": 0.8164455890655518, + "learning_rate": 4.011497787155938e-06, + "loss": 0.0327, + "reward": 1.2083333730697632, + "reward_std": 0.7546154260635376, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333134651184, + "step": 364 + }, + { + "completion_length": 250.0, + "epoch": 0.030416666666666668, + "grad_norm": 0.24824610352516174, + "kl": 0.7272346019744873, + "learning_rate": 4.0045375578801216e-06, + "loss": 0.0291, + "reward": 1.2083332538604736, + "reward_std": 0.8897565007209778, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 365 + }, + { + "completion_length": 250.0, + "epoch": 0.0305, + "grad_norm": 0.419993132352829, + "kl": 0.791063666343689, + "learning_rate": 3.997558996288965e-06, + "loss": 0.0316, + "reward": 0.875, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.625, + "step": 366 + }, + { + "completion_length": 250.0, + "epoch": 0.030583333333333334, + "grad_norm": 0.24291253089904785, + "kl": 0.8568480014801025, + "learning_rate": 3.9905621874140396e-06, + "loss": 0.0343, + "reward": 1.125, + "reward_std": 0.5019802451133728, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 367 + }, + { + "completion_length": 250.0, + "epoch": 0.030666666666666665, + "grad_norm": 1.9887789487838745, + "kl": 1.3834556341171265, + "learning_rate": 3.983547216509254e-06, + "loss": 0.0553, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 368 + }, + { + "completion_length": 250.0, + "epoch": 0.03075, + "grad_norm": 1.0339287519454956, + "kl": 1.5022096633911133, + "learning_rate": 3.976514169049814e-06, + "loss": 0.0601, + "reward": 0.8333333134651184, + "reward_std": 0.8728715777397156, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.3333333432674408, + "step": 369 + }, + { + "completion_length": 250.0, + "epoch": 0.030833333333333334, + "grad_norm": 1.1466647386550903, + "kl": 0.6712841987609863, + "learning_rate": 3.969463130731183e-06, + "loss": 0.0269, + "reward": 0.5416666865348816, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.5416666865348816, + "step": 370 + }, + { + "completion_length": 250.0, + "epoch": 0.030916666666666665, + "grad_norm": 0.8428452014923096, + "kl": 1.1327028274536133, + "learning_rate": 3.96239418746804e-06, + "loss": 0.0453, + "reward": 1.6666666269302368, + "reward_std": 0.6424161195755005, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 371 + }, + { + "completion_length": 250.0, + "epoch": 0.031, + "grad_norm": 1.3757325410842896, + "kl": 0.7787545919418335, + "learning_rate": 3.955307425393224e-06, + "loss": 0.0312, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 372 + }, + { + "completion_length": 250.0, + "epoch": 0.031083333333333334, + "grad_norm": 0.7574729323387146, + "kl": 1.0990623235702515, + "learning_rate": 3.948202930856697e-06, + "loss": 0.044, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9583333730697632, + "step": 373 + }, + { + "completion_length": 250.0, + "epoch": 0.031166666666666665, + "grad_norm": 4.400579452514648, + "kl": 1.3642206192016602, + "learning_rate": 3.941080790424483e-06, + "loss": 0.0546, + "reward": 1.2083333730697632, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 374 + }, + { + "completion_length": 250.0, + "epoch": 0.03125, + "grad_norm": 0.5696729421615601, + "kl": 1.0298165082931519, + "learning_rate": 3.933941090877615e-06, + "loss": 0.0412, + "reward": 1.6666667461395264, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 375 + }, + { + "completion_length": 250.0, + "epoch": 0.03133333333333333, + "grad_norm": 0.22187651693820953, + "kl": 0.9323797225952148, + "learning_rate": 3.92678391921108e-06, + "loss": 0.0373, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 376 + }, + { + "completion_length": 250.0, + "epoch": 0.03141666666666667, + "grad_norm": 0.35391122102737427, + "kl": 1.0090752840042114, + "learning_rate": 3.9196093626327535e-06, + "loss": 0.0404, + "reward": 1.6666667461395264, + "reward_std": 0.6900655627250671, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 377 + }, + { + "completion_length": 250.0, + "epoch": 0.0315, + "grad_norm": 3.709547758102417, + "kl": 1.3984549045562744, + "learning_rate": 3.912417508562345e-06, + "loss": 0.0559, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 378 + }, + { + "completion_length": 250.0, + "epoch": 0.03158333333333333, + "grad_norm": 2.316877603530884, + "kl": 1.1878688335418701, + "learning_rate": 3.905208444630326e-06, + "loss": 0.0475, + "reward": 1.0833333730697632, + "reward_std": 0.8498365879058838, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7083333730697632, + "step": 379 + }, + { + "completion_length": 250.0, + "epoch": 0.03166666666666667, + "grad_norm": 0.40123090147972107, + "kl": 1.305446982383728, + "learning_rate": 3.897982258676867e-06, + "loss": 0.0522, + "reward": 1.0833333730697632, + "reward_std": 0.771516740322113, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.5833333134651184, + "step": 380 + }, + { + "completion_length": 250.0, + "epoch": 0.03175, + "grad_norm": 0.3199678659439087, + "kl": 1.1264021396636963, + "learning_rate": 3.890739038750763e-06, + "loss": 0.0451, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 381 + }, + { + "completion_length": 250.0, + "epoch": 0.03183333333333333, + "grad_norm": 0.27256831526756287, + "kl": 1.2236875295639038, + "learning_rate": 3.88347887310836e-06, + "loss": 0.0489, + "reward": 1.6666667461395264, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 382 + }, + { + "completion_length": 232.0, + "epoch": 0.03191666666666667, + "grad_norm": 0.2633911669254303, + "kl": 1.0527544021606445, + "learning_rate": 3.876201850212489e-06, + "loss": 0.0421, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 383 + }, + { + "completion_length": 250.0, + "epoch": 0.032, + "grad_norm": 0.2916422486305237, + "kl": 0.9229851365089417, + "learning_rate": 3.868908058731376e-06, + "loss": 0.0369, + "reward": 1.2083332538604736, + "reward_std": 0.7753647565841675, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5833333730697632, + "step": 384 + }, + { + "completion_length": 185.0, + "epoch": 0.03208333333333333, + "grad_norm": 0.30826249718666077, + "kl": 0.9985988140106201, + "learning_rate": 3.861597587537568e-06, + "loss": 0.0399, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 385 + }, + { + "completion_length": 250.0, + "epoch": 0.03216666666666667, + "grad_norm": 0.0261689443141222, + "kl": 1.0752596855163574, + "learning_rate": 3.85427052570685e-06, + "loss": 0.043, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 386 + }, + { + "completion_length": 250.0, + "epoch": 0.03225, + "grad_norm": 0.3096613883972168, + "kl": 1.0192590951919556, + "learning_rate": 3.846926962517158e-06, + "loss": 0.0408, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 387 + }, + { + "completion_length": 250.0, + "epoch": 0.03233333333333333, + "grad_norm": 1.7860721349716187, + "kl": 0.6369985342025757, + "learning_rate": 3.839566987447492e-06, + "loss": 0.0255, + "reward": 1.2083333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 388 + }, + { + "completion_length": 250.0, + "epoch": 0.03241666666666667, + "grad_norm": 0.26386263966560364, + "kl": 0.7966631650924683, + "learning_rate": 3.832190690176825e-06, + "loss": 0.0319, + "reward": 1.2083333730697632, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 389 + }, + { + "completion_length": 250.0, + "epoch": 0.0325, + "grad_norm": 0.32119181752204895, + "kl": 0.9594342708587646, + "learning_rate": 3.824798160583012e-06, + "loss": 0.0384, + "reward": 1.0416667461395264, + "reward_std": 0.8807914853096008, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.5416666269302368, + "step": 390 + }, + { + "completion_length": 250.0, + "epoch": 0.03258333333333333, + "grad_norm": 0.47729289531707764, + "kl": 1.0643929243087769, + "learning_rate": 3.817389488741694e-06, + "loss": 0.0426, + "reward": 1.5, + "reward_std": 0.6172133684158325, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 391 + }, + { + "completion_length": 250.0, + "epoch": 0.03266666666666666, + "grad_norm": 1.1122965812683105, + "kl": 1.5673550367355347, + "learning_rate": 3.8099647649251984e-06, + "loss": 0.0627, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 392 + }, + { + "completion_length": 250.0, + "epoch": 0.03275, + "grad_norm": 2.794879198074341, + "kl": 1.3167065382003784, + "learning_rate": 3.802524079601442e-06, + "loss": 0.0527, + "reward": 1.4166667461395264, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916666865348816, + "step": 393 + }, + { + "completion_length": 250.0, + "epoch": 0.03283333333333333, + "grad_norm": 0.9184636473655701, + "kl": 1.0699944496154785, + "learning_rate": 3.795067523432826e-06, + "loss": 0.0428, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 394 + }, + { + "completion_length": 250.0, + "epoch": 0.032916666666666664, + "grad_norm": 0.21204939484596252, + "kl": 1.1911050081253052, + "learning_rate": 3.787595187275136e-06, + "loss": 0.0476, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.875, + "step": 395 + }, + { + "completion_length": 250.0, + "epoch": 0.033, + "grad_norm": 0.6764863729476929, + "kl": 0.9250705242156982, + "learning_rate": 3.780107162176429e-06, + "loss": 0.037, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 396 + }, + { + "completion_length": 250.0, + "epoch": 0.03308333333333333, + "grad_norm": 5.473031997680664, + "kl": 1.4827823638916016, + "learning_rate": 3.772603539375929e-06, + "loss": 0.0593, + "reward": 1.0416666269302368, + "reward_std": 0.8807914853096008, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.6666666865348816, + "step": 397 + }, + { + "completion_length": 250.0, + "epoch": 0.033166666666666664, + "grad_norm": 3.915686845779419, + "kl": 1.3783940076828003, + "learning_rate": 3.7650844103029093e-06, + "loss": 0.0551, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333134651184, + "step": 398 + }, + { + "completion_length": 250.0, + "epoch": 0.03325, + "grad_norm": 0.343504935503006, + "kl": 0.7900997400283813, + "learning_rate": 3.7575498665755884e-06, + "loss": 0.0316, + "reward": 1.0, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.625, + "step": 399 + }, + { + "completion_length": 250.0, + "epoch": 0.03333333333333333, + "grad_norm": 0.32992246747016907, + "kl": 0.8160569071769714, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0326, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 400 + }, + { + "completion_length": 250.0, + "epoch": 0.033416666666666664, + "grad_norm": 0.726000964641571, + "kl": 1.2352204322814941, + "learning_rate": 3.742434902568889e-06, + "loss": 0.0494, + "reward": 1.5833332538604736, + "reward_std": 0.6362089514732361, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7083333134651184, + "step": 401 + }, + { + "completion_length": 250.0, + "epoch": 0.0335, + "grad_norm": 0.31690701842308044, + "kl": 0.9032598733901978, + "learning_rate": 3.7348546664605777e-06, + "loss": 0.0361, + "reward": 0.9583333730697632, + "reward_std": 0.7650604248046875, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7083333730697632, + "step": 402 + }, + { + "completion_length": 250.0, + "epoch": 0.03358333333333333, + "grad_norm": 0.3370535969734192, + "kl": 0.8206230998039246, + "learning_rate": 3.7272593840378526e-06, + "loss": 0.0328, + "reward": 0.7916666865348816, + "reward_std": 0.6651768684387207, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.6666666865348816, + "step": 403 + }, + { + "completion_length": 250.0, + "epoch": 0.033666666666666664, + "grad_norm": 0.3390369415283203, + "kl": 1.0624157190322876, + "learning_rate": 3.7196491478468322e-06, + "loss": 0.0425, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 404 + }, + { + "completion_length": 250.0, + "epoch": 0.03375, + "grad_norm": 0.2972298860549927, + "kl": 1.257109522819519, + "learning_rate": 3.7120240506158433e-06, + "loss": 0.0503, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 405 + }, + { + "completion_length": 250.0, + "epoch": 0.03383333333333333, + "grad_norm": 0.38008397817611694, + "kl": 0.9990907907485962, + "learning_rate": 3.7043841852542884e-06, + "loss": 0.04, + "reward": 1.4166667461395264, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 406 + }, + { + "completion_length": 250.0, + "epoch": 0.033916666666666664, + "grad_norm": 0.5591140389442444, + "kl": 1.0515403747558594, + "learning_rate": 3.6967296448515176e-06, + "loss": 0.0421, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 407 + }, + { + "completion_length": 250.0, + "epoch": 0.034, + "grad_norm": 0.6368426084518433, + "kl": 0.9551165103912354, + "learning_rate": 3.689060522675689e-06, + "loss": 0.0382, + "reward": 1.2916666269302368, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7916666865348816, + "step": 408 + }, + { + "completion_length": 250.0, + "epoch": 0.034083333333333334, + "grad_norm": 0.30579978227615356, + "kl": 0.6054512858390808, + "learning_rate": 3.6813769121726356e-06, + "loss": 0.0242, + "reward": 1.0416667461395264, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9166666865348816, + "step": 409 + }, + { + "completion_length": 250.0, + "epoch": 0.034166666666666665, + "grad_norm": 0.6442110538482666, + "kl": 0.9826160073280334, + "learning_rate": 3.6736789069647273e-06, + "loss": 0.0393, + "reward": 1.5416667461395264, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 410 + }, + { + "completion_length": 250.0, + "epoch": 0.03425, + "grad_norm": 0.264870285987854, + "kl": 0.5808348059654236, + "learning_rate": 3.6659666008497287e-06, + "loss": 0.0232, + "reward": 1.0833332538604736, + "reward_std": 0.8309490084648132, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7083333730697632, + "step": 411 + }, + { + "completion_length": 250.0, + "epoch": 0.034333333333333334, + "grad_norm": 0.5558726787567139, + "kl": 1.1091468334197998, + "learning_rate": 3.658240087799655e-06, + "loss": 0.0444, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 412 + }, + { + "completion_length": 250.0, + "epoch": 0.034416666666666665, + "grad_norm": 0.5184251666069031, + "kl": 0.9304683804512024, + "learning_rate": 3.6504994619596295e-06, + "loss": 0.0372, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 413 + }, + { + "completion_length": 250.0, + "epoch": 0.0345, + "grad_norm": 1.4748616218566895, + "kl": 1.3676832914352417, + "learning_rate": 3.642744817646736e-06, + "loss": 0.0547, + "reward": 1.4583332538604736, + "reward_std": 0.8345229029655457, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7083333730697632, + "step": 414 + }, + { + "completion_length": 250.0, + "epoch": 0.034583333333333334, + "grad_norm": 0.43191081285476685, + "kl": 1.3092877864837646, + "learning_rate": 3.634976249348867e-06, + "loss": 0.0524, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 415 + }, + { + "completion_length": 250.0, + "epoch": 0.034666666666666665, + "grad_norm": 0.4750445485115051, + "kl": 1.3486921787261963, + "learning_rate": 3.627193851723577e-06, + "loss": 0.0539, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 416 + }, + { + "completion_length": 250.0, + "epoch": 0.03475, + "grad_norm": 0.5661317110061646, + "kl": 1.1146034002304077, + "learning_rate": 3.6193977195969243e-06, + "loss": 0.0446, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 417 + }, + { + "completion_length": 250.0, + "epoch": 0.034833333333333334, + "grad_norm": 0.36562928557395935, + "kl": 1.1447898149490356, + "learning_rate": 3.611587947962319e-06, + "loss": 0.0458, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 418 + }, + { + "completion_length": 250.0, + "epoch": 0.034916666666666665, + "grad_norm": 0.2762870788574219, + "kl": 0.48091429471969604, + "learning_rate": 3.6037646319793635e-06, + "loss": 0.0192, + "reward": 1.5833333730697632, + "reward_std": 0.5841830372810364, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 419 + }, + { + "completion_length": 250.0, + "epoch": 0.035, + "grad_norm": 0.668820321559906, + "kl": 1.2909846305847168, + "learning_rate": 3.595927866972694e-06, + "loss": 0.0516, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7083333730697632, + "step": 420 + }, + { + "completion_length": 250.0, + "epoch": 0.035083333333333334, + "grad_norm": 0.4891945421695709, + "kl": 1.1140190362930298, + "learning_rate": 3.5880777484308193e-06, + "loss": 0.0446, + "reward": 1.1666667461395264, + "reward_std": 0.8728715181350708, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5416666865348816, + "step": 421 + }, + { + "completion_length": 250.0, + "epoch": 0.035166666666666666, + "grad_norm": 0.33090484142303467, + "kl": 0.6019576191902161, + "learning_rate": 3.5802143720049565e-06, + "loss": 0.0241, + "reward": 1.0833333730697632, + "reward_std": 0.6606875061988831, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 422 + }, + { + "completion_length": 250.0, + "epoch": 0.03525, + "grad_norm": 0.6281843781471252, + "kl": 1.1191685199737549, + "learning_rate": 3.5723378335078653e-06, + "loss": 0.0448, + "reward": 1.2083333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 423 + }, + { + "completion_length": 250.0, + "epoch": 0.035333333333333335, + "grad_norm": 0.24378234148025513, + "kl": 0.8178219795227051, + "learning_rate": 3.564448228912682e-06, + "loss": 0.0327, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 424 + }, + { + "completion_length": 250.0, + "epoch": 0.035416666666666666, + "grad_norm": 0.24927818775177002, + "kl": 1.2881877422332764, + "learning_rate": 3.556545654351749e-06, + "loss": 0.0515, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 425 + }, + { + "completion_length": 250.0, + "epoch": 0.0355, + "grad_norm": 0.4163737893104553, + "kl": 0.7852768301963806, + "learning_rate": 3.5486302061154433e-06, + "loss": 0.0314, + "reward": 1.2083332538604736, + "reward_std": 0.9074209332466125, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 426 + }, + { + "completion_length": 250.0, + "epoch": 0.035583333333333335, + "grad_norm": 0.38066983222961426, + "kl": 1.0970571041107178, + "learning_rate": 3.5407019806510035e-06, + "loss": 0.0439, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 427 + }, + { + "completion_length": 250.0, + "epoch": 0.035666666666666666, + "grad_norm": 1.0730751752853394, + "kl": 0.7307629585266113, + "learning_rate": 3.532761074561355e-06, + "loss": 0.0292, + "reward": 1.5416667461395264, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 428 + }, + { + "completion_length": 250.0, + "epoch": 0.03575, + "grad_norm": 0.2747225761413574, + "kl": 0.9983835816383362, + "learning_rate": 3.524807584603932e-06, + "loss": 0.0399, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 429 + }, + { + "completion_length": 250.0, + "epoch": 0.035833333333333335, + "grad_norm": 0.3085501492023468, + "kl": 1.0624085664749146, + "learning_rate": 3.516841607689501e-06, + "loss": 0.0425, + "reward": 1.4583333730697632, + "reward_std": 0.7955730557441711, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 430 + }, + { + "completion_length": 250.0, + "epoch": 0.035916666666666666, + "grad_norm": 0.3820912539958954, + "kl": 0.8280222415924072, + "learning_rate": 3.5088632408809757e-06, + "loss": 0.0331, + "reward": 1.0833333730697632, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 431 + }, + { + "completion_length": 250.0, + "epoch": 0.036, + "grad_norm": 0.31436631083488464, + "kl": 0.5407013893127441, + "learning_rate": 3.5008725813922383e-06, + "loss": 0.0216, + "reward": 1.0416666269302368, + "reward_std": 0.8807914853096008, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.5416666865348816, + "step": 432 + }, + { + "completion_length": 250.0, + "epoch": 0.036083333333333335, + "grad_norm": 1.559922695159912, + "kl": 1.1256322860717773, + "learning_rate": 3.4928697265869516e-06, + "loss": 0.045, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 433 + }, + { + "completion_length": 250.0, + "epoch": 0.036166666666666666, + "grad_norm": 0.36709415912628174, + "kl": 0.810982346534729, + "learning_rate": 3.4848547739773782e-06, + "loss": 0.0324, + "reward": 0.7916666269302368, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5416666865348816, + "step": 434 + }, + { + "completion_length": 250.0, + "epoch": 0.03625, + "grad_norm": 0.38521692156791687, + "kl": 1.0653852224349976, + "learning_rate": 3.476827821223184e-06, + "loss": 0.0426, + "reward": 1.1666667461395264, + "reward_std": 0.8728715181350708, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.6666666865348816, + "step": 435 + }, + { + "completion_length": 250.0, + "epoch": 0.036333333333333336, + "grad_norm": 0.4810712933540344, + "kl": 1.0313137769699097, + "learning_rate": 3.4687889661302577e-06, + "loss": 0.0413, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 436 + }, + { + "completion_length": 250.0, + "epoch": 0.03641666666666667, + "grad_norm": 1.5242727994918823, + "kl": 1.0612870454788208, + "learning_rate": 3.460738306649509e-06, + "loss": 0.0425, + "reward": 0.7083333730697632, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.5833333730697632, + "step": 437 + }, + { + "completion_length": 250.0, + "epoch": 0.0365, + "grad_norm": 0.27597489953041077, + "kl": 0.7890688180923462, + "learning_rate": 3.452675940875686e-06, + "loss": 0.0316, + "reward": 1.6666667461395264, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 438 + }, + { + "completion_length": 250.0, + "epoch": 0.036583333333333336, + "grad_norm": 0.35958653688430786, + "kl": 0.9022118449211121, + "learning_rate": 3.4446019670461684e-06, + "loss": 0.0361, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 439 + }, + { + "completion_length": 250.0, + "epoch": 0.03666666666666667, + "grad_norm": 0.28990885615348816, + "kl": 0.7540520429611206, + "learning_rate": 3.436516483539781e-06, + "loss": 0.0302, + "reward": 1.2916667461395264, + "reward_std": 0.6770032048225403, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.5416666865348816, + "step": 440 + }, + { + "completion_length": 250.0, + "epoch": 0.03675, + "grad_norm": 0.4592471420764923, + "kl": 1.2466713190078735, + "learning_rate": 3.4284195888755877e-06, + "loss": 0.0499, + "reward": 1.25, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 441 + }, + { + "completion_length": 250.0, + "epoch": 0.036833333333333336, + "grad_norm": 0.2260519117116928, + "kl": 1.1097904443740845, + "learning_rate": 3.4203113817116955e-06, + "loss": 0.0444, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 442 + }, + { + "completion_length": 250.0, + "epoch": 0.03691666666666667, + "grad_norm": 0.3158986568450928, + "kl": 0.7736285924911499, + "learning_rate": 3.412191960844049e-06, + "loss": 0.0309, + "reward": 1.1666667461395264, + "reward_std": 0.942808985710144, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5416666865348816, + "step": 443 + }, + { + "completion_length": 250.0, + "epoch": 0.037, + "grad_norm": 0.35627302527427673, + "kl": 0.7449521422386169, + "learning_rate": 3.4040614252052305e-06, + "loss": 0.0298, + "reward": 1.1666667461395264, + "reward_std": 0.835710883140564, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5416666865348816, + "step": 444 + }, + { + "completion_length": 250.0, + "epoch": 0.037083333333333336, + "grad_norm": 1.3280401229858398, + "kl": 0.9883641600608826, + "learning_rate": 3.39591987386325e-06, + "loss": 0.0395, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5, + "step": 445 + }, + { + "completion_length": 250.0, + "epoch": 0.03716666666666667, + "grad_norm": 0.3993532359600067, + "kl": 1.3265860080718994, + "learning_rate": 3.387767406020343e-06, + "loss": 0.0531, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 446 + }, + { + "completion_length": 250.0, + "epoch": 0.03725, + "grad_norm": 0.3469555079936981, + "kl": 0.6623489260673523, + "learning_rate": 3.3796041210117545e-06, + "loss": 0.0265, + "reward": 1.1666667461395264, + "reward_std": 0.7968190908432007, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.6666666269302368, + "step": 447 + }, + { + "completion_length": 250.0, + "epoch": 0.037333333333333336, + "grad_norm": 0.7821484804153442, + "kl": 1.5054004192352295, + "learning_rate": 3.3714301183045382e-06, + "loss": 0.0602, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 448 + }, + { + "completion_length": 250.0, + "epoch": 0.03741666666666667, + "grad_norm": 0.6973661780357361, + "kl": 1.2113670110702515, + "learning_rate": 3.3632454974963368e-06, + "loss": 0.0485, + "reward": 1.2083333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 449 + }, + { + "completion_length": 250.0, + "epoch": 0.0375, + "grad_norm": 0.32803142070770264, + "kl": 0.9755523204803467, + "learning_rate": 3.3550503583141726e-06, + "loss": 0.039, + "reward": 1.0, + "reward_std": 0.7126965522766113, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.625, + "step": 450 + }, + { + "completion_length": 250.0, + "epoch": 0.03758333333333334, + "grad_norm": 0.3818299472332001, + "kl": 0.9919738173484802, + "learning_rate": 3.346844800613229e-06, + "loss": 0.0397, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 451 + }, + { + "completion_length": 250.0, + "epoch": 0.03766666666666667, + "grad_norm": 0.3395150303840637, + "kl": 1.0552934408187866, + "learning_rate": 3.338628924375638e-06, + "loss": 0.0422, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 452 + }, + { + "completion_length": 250.0, + "epoch": 0.03775, + "grad_norm": 0.22457285225391388, + "kl": 1.2115274667739868, + "learning_rate": 3.3304028297092583e-06, + "loss": 0.0485, + "reward": 1.5833333730697632, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 453 + }, + { + "completion_length": 250.0, + "epoch": 0.03783333333333333, + "grad_norm": 0.32445210218429565, + "kl": 0.7955477237701416, + "learning_rate": 3.3221666168464584e-06, + "loss": 0.0318, + "reward": 0.875, + "reward_std": 0.7955731153488159, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.625, + "step": 454 + }, + { + "completion_length": 250.0, + "epoch": 0.03791666666666667, + "grad_norm": 1.0360915660858154, + "kl": 1.115738034248352, + "learning_rate": 3.313920386142892e-06, + "loss": 0.0446, + "reward": 1.4583333730697632, + "reward_std": 0.7955730557441711, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 455 + }, + { + "completion_length": 250.0, + "epoch": 0.038, + "grad_norm": 0.26157450675964355, + "kl": 1.0352263450622559, + "learning_rate": 3.3056642380762783e-06, + "loss": 0.0414, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 456 + }, + { + "completion_length": 250.0, + "epoch": 0.03808333333333333, + "grad_norm": 0.33227723836898804, + "kl": 0.9722086191177368, + "learning_rate": 3.2973982732451753e-06, + "loss": 0.0389, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 457 + }, + { + "completion_length": 250.0, + "epoch": 0.03816666666666667, + "grad_norm": 0.48032528162002563, + "kl": 1.262650966644287, + "learning_rate": 3.2891225923677565e-06, + "loss": 0.0505, + "reward": 1.1666667461395264, + "reward_std": 0.6424161195755005, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5416666865348816, + "step": 458 + }, + { + "completion_length": 250.0, + "epoch": 0.03825, + "grad_norm": 0.2080181986093521, + "kl": 1.3035730123519897, + "learning_rate": 3.280837296280582e-06, + "loss": 0.0521, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 459 + }, + { + "completion_length": 250.0, + "epoch": 0.03833333333333333, + "grad_norm": 0.2657634913921356, + "kl": 1.0605506896972656, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0424, + "reward": 1.0833333730697632, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333134651184, + "step": 460 + }, + { + "completion_length": 250.0, + "epoch": 0.03841666666666667, + "grad_norm": 0.32230645418167114, + "kl": 1.6198720932006836, + "learning_rate": 3.2642382624077647e-06, + "loss": 0.0648, + "reward": 1.6666667461395264, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 461 + }, + { + "completion_length": 250.0, + "epoch": 0.0385, + "grad_norm": 0.31350037455558777, + "kl": 1.1919293403625488, + "learning_rate": 3.2559247268761117e-06, + "loss": 0.0477, + "reward": 1.6666667461395264, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 462 + }, + { + "completion_length": 250.0, + "epoch": 0.03858333333333333, + "grad_norm": 0.30738067626953125, + "kl": 0.9954556226730347, + "learning_rate": 3.247601980640217e-06, + "loss": 0.0398, + "reward": 1.625, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8750000596046448, + "step": 463 + }, + { + "completion_length": 236.0, + "epoch": 0.03866666666666667, + "grad_norm": 1.2310477495193481, + "kl": 1.2627601623535156, + "learning_rate": 3.2392701251101172e-06, + "loss": 0.0505, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 464 + }, + { + "completion_length": 250.0, + "epoch": 0.03875, + "grad_norm": 0.2840711772441864, + "kl": 0.8784255981445312, + "learning_rate": 3.230929261806842e-06, + "loss": 0.0351, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333134651184, + "step": 465 + }, + { + "completion_length": 250.0, + "epoch": 0.03883333333333333, + "grad_norm": 0.22466637194156647, + "kl": 1.233144760131836, + "learning_rate": 3.222579492361179e-06, + "loss": 0.0493, + "reward": 1.5416667461395264, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 466 + }, + { + "completion_length": 250.0, + "epoch": 0.03891666666666667, + "grad_norm": 0.31554660201072693, + "kl": 0.8954232335090637, + "learning_rate": 3.214220918512434e-06, + "loss": 0.0358, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 467 + }, + { + "completion_length": 250.0, + "epoch": 0.039, + "grad_norm": 0.35043928027153015, + "kl": 0.9754191040992737, + "learning_rate": 3.205853642107192e-06, + "loss": 0.039, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 468 + }, + { + "completion_length": 250.0, + "epoch": 0.03908333333333333, + "grad_norm": 0.3606829047203064, + "kl": 1.6161555051803589, + "learning_rate": 3.1974777650980737e-06, + "loss": 0.0646, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 469 + }, + { + "completion_length": 250.0, + "epoch": 0.03916666666666667, + "grad_norm": 0.3894639015197754, + "kl": 0.9157559871673584, + "learning_rate": 3.189093389542498e-06, + "loss": 0.0366, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 470 + }, + { + "completion_length": 250.0, + "epoch": 0.03925, + "grad_norm": 0.5538480281829834, + "kl": 0.9871796369552612, + "learning_rate": 3.180700617601436e-06, + "loss": 0.0395, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 471 + }, + { + "completion_length": 250.0, + "epoch": 0.03933333333333333, + "grad_norm": 0.03941356763243675, + "kl": 1.603979468345642, + "learning_rate": 3.1722995515381644e-06, + "loss": 0.0642, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 472 + }, + { + "completion_length": 250.0, + "epoch": 0.03941666666666667, + "grad_norm": 0.36027953028678894, + "kl": 1.4444609880447388, + "learning_rate": 3.1638902937170224e-06, + "loss": 0.0578, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 473 + }, + { + "completion_length": 250.0, + "epoch": 0.0395, + "grad_norm": 0.2484903782606125, + "kl": 1.535847783088684, + "learning_rate": 3.155472946602162e-06, + "loss": 0.0614, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 474 + }, + { + "completion_length": 250.0, + "epoch": 0.03958333333333333, + "grad_norm": 0.43076378107070923, + "kl": 1.3793715238571167, + "learning_rate": 3.147047612756302e-06, + "loss": 0.0552, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 475 + }, + { + "completion_length": 250.0, + "epoch": 0.03966666666666667, + "grad_norm": 0.24676185846328735, + "kl": 1.1927064657211304, + "learning_rate": 3.1386143948394764e-06, + "loss": 0.0477, + "reward": 1.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 476 + }, + { + "completion_length": 250.0, + "epoch": 0.03975, + "grad_norm": 0.27644336223602295, + "kl": 1.0569162368774414, + "learning_rate": 3.130173395607785e-06, + "loss": 0.0423, + "reward": 1.25, + "reward_std": 0.8498365879058838, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7500000596046448, + "step": 477 + }, + { + "completion_length": 250.0, + "epoch": 0.03983333333333333, + "grad_norm": 1.9261839389801025, + "kl": 1.384006381034851, + "learning_rate": 3.121724717912138e-06, + "loss": 0.0554, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 478 + }, + { + "completion_length": 250.0, + "epoch": 0.03991666666666667, + "grad_norm": 0.25609302520751953, + "kl": 1.1626935005187988, + "learning_rate": 3.1132684646970068e-06, + "loss": 0.0465, + "reward": 1.1666667461395264, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9166666865348816, + "step": 479 + }, + { + "completion_length": 250.0, + "epoch": 0.04, + "grad_norm": 1.250272512435913, + "kl": 1.0145455598831177, + "learning_rate": 3.1048047389991693e-06, + "loss": 0.0406, + "reward": 1.4583333730697632, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 480 + }, + { + "completion_length": 250.0, + "epoch": 0.04008333333333333, + "grad_norm": 0.2401411086320877, + "kl": 0.8946865200996399, + "learning_rate": 3.0963336439464527e-06, + "loss": 0.0358, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 481 + }, + { + "completion_length": 250.0, + "epoch": 0.04016666666666667, + "grad_norm": 0.23744302988052368, + "kl": 0.5881522297859192, + "learning_rate": 3.087855282756475e-06, + "loss": 0.0235, + "reward": 1.2916666269302368, + "reward_std": 0.8249579071998596, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7916666865348816, + "step": 482 + }, + { + "completion_length": 250.0, + "epoch": 0.04025, + "grad_norm": 0.5861865878105164, + "kl": 1.260177493095398, + "learning_rate": 3.079369758735393e-06, + "loss": 0.0504, + "reward": 1.5416667461395264, + "reward_std": 0.7113032937049866, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.6666666865348816, + "step": 483 + }, + { + "completion_length": 250.0, + "epoch": 0.04033333333333333, + "grad_norm": 0.3826155364513397, + "kl": 1.2853862047195435, + "learning_rate": 3.0708771752766397e-06, + "loss": 0.0514, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 484 + }, + { + "completion_length": 250.0, + "epoch": 0.04041666666666666, + "grad_norm": 0.22148284316062927, + "kl": 1.1608880758285522, + "learning_rate": 3.062377635859663e-06, + "loss": 0.0464, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 485 + }, + { + "completion_length": 250.0, + "epoch": 0.0405, + "grad_norm": 0.3896584212779999, + "kl": 1.1302204132080078, + "learning_rate": 3.053871244048669e-06, + "loss": 0.0452, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 486 + }, + { + "completion_length": 250.0, + "epoch": 0.04058333333333333, + "grad_norm": 0.8122249245643616, + "kl": 1.505839467048645, + "learning_rate": 3.045358103491357e-06, + "loss": 0.0602, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 487 + }, + { + "completion_length": 250.0, + "epoch": 0.04066666666666666, + "grad_norm": 0.2632407248020172, + "kl": 1.4302340745925903, + "learning_rate": 3.0368383179176584e-06, + "loss": 0.0572, + "reward": 1.7916666269302368, + "reward_std": 0.5892555713653564, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 488 + }, + { + "completion_length": 250.0, + "epoch": 0.04075, + "grad_norm": 0.30897876620292664, + "kl": 0.74764084815979, + "learning_rate": 3.0283119911384724e-06, + "loss": 0.0299, + "reward": 1.0833333730697632, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 489 + }, + { + "completion_length": 250.0, + "epoch": 0.04083333333333333, + "grad_norm": 0.25567445158958435, + "kl": 1.0470243692398071, + "learning_rate": 3.019779227044398e-06, + "loss": 0.0419, + "reward": 1.8333333730697632, + "reward_std": 0.35634827613830566, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 490 + }, + { + "completion_length": 250.0, + "epoch": 0.040916666666666664, + "grad_norm": 0.3491830825805664, + "kl": 1.0824482440948486, + "learning_rate": 3.0112401296044756e-06, + "loss": 0.0433, + "reward": 1.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 491 + }, + { + "completion_length": 248.0, + "epoch": 0.041, + "grad_norm": 0.26145610213279724, + "kl": 1.5022797584533691, + "learning_rate": 3.002694802864912e-06, + "loss": 0.0601, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 492 + }, + { + "completion_length": 250.0, + "epoch": 0.04108333333333333, + "grad_norm": 2.2558088302612305, + "kl": 1.2809425592422485, + "learning_rate": 2.9941433509478157e-06, + "loss": 0.0512, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 493 + }, + { + "completion_length": 250.0, + "epoch": 0.041166666666666664, + "grad_norm": 0.24117594957351685, + "kl": 1.0864267349243164, + "learning_rate": 2.98558587804993e-06, + "loss": 0.0435, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 494 + }, + { + "completion_length": 250.0, + "epoch": 0.04125, + "grad_norm": 1.2394402027130127, + "kl": 1.355527400970459, + "learning_rate": 2.9770224884413625e-06, + "loss": 0.0542, + "reward": 1.4583333730697632, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 495 + }, + { + "completion_length": 250.0, + "epoch": 0.04133333333333333, + "grad_norm": 0.3969794511795044, + "kl": 1.4105161428451538, + "learning_rate": 2.9684532864643123e-06, + "loss": 0.0564, + "reward": 1.25, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 496 + }, + { + "completion_length": 250.0, + "epoch": 0.041416666666666664, + "grad_norm": 0.2702740728855133, + "kl": 1.046398639678955, + "learning_rate": 2.9598783765318005e-06, + "loss": 0.0419, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.875, + "step": 497 + }, + { + "completion_length": 250.0, + "epoch": 0.0415, + "grad_norm": 0.22273120284080505, + "kl": 0.8685632944107056, + "learning_rate": 2.9512978631264006e-06, + "loss": 0.0347, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 498 + }, + { + "completion_length": 250.0, + "epoch": 0.04158333333333333, + "grad_norm": 0.5162345767021179, + "kl": 1.2586992979049683, + "learning_rate": 2.942711850798959e-06, + "loss": 0.0503, + "reward": 1.4166667461395264, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916666865348816, + "step": 499 + }, + { + "completion_length": 250.0, + "epoch": 0.041666666666666664, + "grad_norm": 0.41288647055625916, + "kl": 1.1819934844970703, + "learning_rate": 2.9341204441673267e-06, + "loss": 0.0473, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 500 + }, + { + "completion_length": 250.0, + "epoch": 0.04175, + "grad_norm": 0.8625938892364502, + "kl": 0.8906731009483337, + "learning_rate": 2.9255237479150815e-06, + "loss": 0.0356, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 501 + }, + { + "completion_length": 250.0, + "epoch": 0.041833333333333333, + "grad_norm": 0.8455320000648499, + "kl": 1.6155681610107422, + "learning_rate": 2.9169218667902562e-06, + "loss": 0.0646, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 502 + }, + { + "completion_length": 250.0, + "epoch": 0.041916666666666665, + "grad_norm": 0.2829776406288147, + "kl": 1.0219650268554688, + "learning_rate": 2.908314905604056e-06, + "loss": 0.0409, + "reward": 1.2083333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 503 + }, + { + "completion_length": 250.0, + "epoch": 0.042, + "grad_norm": 0.5874464511871338, + "kl": 1.6597095727920532, + "learning_rate": 2.8997029692295875e-06, + "loss": 0.0664, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 504 + }, + { + "completion_length": 250.0, + "epoch": 0.042083333333333334, + "grad_norm": 0.9483749270439148, + "kl": 1.8449759483337402, + "learning_rate": 2.8910861626005774e-06, + "loss": 0.0738, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 505 + }, + { + "completion_length": 250.0, + "epoch": 0.042166666666666665, + "grad_norm": 0.2268337458372116, + "kl": 0.8647147417068481, + "learning_rate": 2.8824645907100957e-06, + "loss": 0.0346, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 506 + }, + { + "completion_length": 250.0, + "epoch": 0.04225, + "grad_norm": 0.2466856986284256, + "kl": 0.6964913010597229, + "learning_rate": 2.8738383586092745e-06, + "loss": 0.0279, + "reward": 1.375, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 507 + }, + { + "completion_length": 250.0, + "epoch": 0.042333333333333334, + "grad_norm": 1.2652459144592285, + "kl": 0.9631326794624329, + "learning_rate": 2.8652075714060296e-06, + "loss": 0.0385, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 508 + }, + { + "completion_length": 250.0, + "epoch": 0.042416666666666665, + "grad_norm": 0.27476221323013306, + "kl": 0.9217783212661743, + "learning_rate": 2.8565723342637797e-06, + "loss": 0.0369, + "reward": 0.8333333730697632, + "reward_std": 0.9920317530632019, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.4583333432674408, + "step": 509 + }, + { + "completion_length": 250.0, + "epoch": 0.0425, + "grad_norm": 0.21916547417640686, + "kl": 1.3772886991500854, + "learning_rate": 2.847932752400164e-06, + "loss": 0.0551, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 510 + }, + { + "completion_length": 250.0, + "epoch": 0.042583333333333334, + "grad_norm": 0.33844467997550964, + "kl": 1.338585376739502, + "learning_rate": 2.8392889310857615e-06, + "loss": 0.0535, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 511 + }, + { + "completion_length": 250.0, + "epoch": 0.042666666666666665, + "grad_norm": 0.29742342233657837, + "kl": 1.344675898551941, + "learning_rate": 2.8306409756428067e-06, + "loss": 0.0538, + "reward": 1.7916666269302368, + "reward_std": 0.589255690574646, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 512 + }, + { + "completion_length": 250.0, + "epoch": 0.04275, + "grad_norm": 0.3239578902721405, + "kl": 1.0606533288955688, + "learning_rate": 2.8219889914439073e-06, + "loss": 0.0424, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 513 + }, + { + "completion_length": 250.0, + "epoch": 0.042833333333333334, + "grad_norm": 0.3085322678089142, + "kl": 0.7539732456207275, + "learning_rate": 2.813333083910761e-06, + "loss": 0.0302, + "reward": 1.375, + "reward_std": 0.7855339050292969, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 514 + }, + { + "completion_length": 250.0, + "epoch": 0.042916666666666665, + "grad_norm": 0.645790159702301, + "kl": 0.8328765630722046, + "learning_rate": 2.804673358512869e-06, + "loss": 0.0333, + "reward": 0.875, + "reward_std": 0.7955730557441711, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.625, + "step": 515 + }, + { + "completion_length": 250.0, + "epoch": 0.043, + "grad_norm": 0.8226057291030884, + "kl": 1.01832914352417, + "learning_rate": 2.7960099207662535e-06, + "loss": 0.0407, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 516 + }, + { + "completion_length": 212.0, + "epoch": 0.043083333333333335, + "grad_norm": 0.24604718387126923, + "kl": 1.0776610374450684, + "learning_rate": 2.7873428762321667e-06, + "loss": 0.0431, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 517 + }, + { + "completion_length": 250.0, + "epoch": 0.043166666666666666, + "grad_norm": 0.33582380414009094, + "kl": 0.8894166946411133, + "learning_rate": 2.778672330515814e-06, + "loss": 0.0356, + "reward": 1.2083333730697632, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.4583333432674408, + "step": 518 + }, + { + "completion_length": 250.0, + "epoch": 0.04325, + "grad_norm": 0.4405062198638916, + "kl": 1.2915191650390625, + "learning_rate": 2.769998389265057e-06, + "loss": 0.0517, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 519 + }, + { + "completion_length": 232.0, + "epoch": 0.043333333333333335, + "grad_norm": 1.5345605611801147, + "kl": 1.553789734840393, + "learning_rate": 2.761321158169134e-06, + "loss": 0.0622, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 520 + }, + { + "completion_length": 206.0, + "epoch": 0.043416666666666666, + "grad_norm": 0.25380414724349976, + "kl": 1.496401071548462, + "learning_rate": 2.752640742957366e-06, + "loss": 0.0599, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 521 + }, + { + "completion_length": 250.0, + "epoch": 0.0435, + "grad_norm": 0.2672164738178253, + "kl": 0.8028614521026611, + "learning_rate": 2.743957249397874e-06, + "loss": 0.0321, + "reward": 1.5416666269302368, + "reward_std": 0.6651768684387207, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 522 + }, + { + "completion_length": 250.0, + "epoch": 0.043583333333333335, + "grad_norm": 0.2636195123195648, + "kl": 1.4980413913726807, + "learning_rate": 2.7352707832962865e-06, + "loss": 0.0599, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 523 + }, + { + "completion_length": 250.0, + "epoch": 0.043666666666666666, + "grad_norm": 1.0833460092544556, + "kl": 1.3643357753753662, + "learning_rate": 2.726581450494451e-06, + "loss": 0.0546, + "reward": 1.3333332538604736, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 524 + }, + { + "completion_length": 250.0, + "epoch": 0.04375, + "grad_norm": 0.22439569234848022, + "kl": 1.132871150970459, + "learning_rate": 2.717889356869146e-06, + "loss": 0.0453, + "reward": 1.3333333730697632, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 525 + }, + { + "completion_length": 250.0, + "epoch": 0.043833333333333335, + "grad_norm": 0.2796757221221924, + "kl": 1.0327954292297363, + "learning_rate": 2.70919460833079e-06, + "loss": 0.0413, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 526 + }, + { + "completion_length": 250.0, + "epoch": 0.043916666666666666, + "grad_norm": 0.31149035692214966, + "kl": 1.374606966972351, + "learning_rate": 2.700497310822147e-06, + "loss": 0.055, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 527 + }, + { + "completion_length": 250.0, + "epoch": 0.044, + "grad_norm": 0.7924208641052246, + "kl": 0.987855076789856, + "learning_rate": 2.6917975703170466e-06, + "loss": 0.0395, + "reward": 1.125, + "reward_std": 0.9910312294960022, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.625, + "step": 528 + }, + { + "completion_length": 250.0, + "epoch": 0.044083333333333335, + "grad_norm": 0.25965285301208496, + "kl": 0.8640234470367432, + "learning_rate": 2.6830954928190795e-06, + "loss": 0.0346, + "reward": 1.6666667461395264, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 529 + }, + { + "completion_length": 250.0, + "epoch": 0.04416666666666667, + "grad_norm": 0.38326141238212585, + "kl": 0.8228756189346313, + "learning_rate": 2.6743911843603134e-06, + "loss": 0.0329, + "reward": 1.2916666269302368, + "reward_std": 0.8249579071998596, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7916666865348816, + "step": 530 + }, + { + "completion_length": 250.0, + "epoch": 0.04425, + "grad_norm": 1.5146892070770264, + "kl": 1.9087010622024536, + "learning_rate": 2.6656847510000013e-06, + "loss": 0.0763, + "reward": 1.5416667461395264, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 531 + }, + { + "completion_length": 215.0, + "epoch": 0.044333333333333336, + "grad_norm": 0.2388104796409607, + "kl": 1.3102915287017822, + "learning_rate": 2.6569762988232838e-06, + "loss": 0.0524, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 532 + }, + { + "completion_length": 250.0, + "epoch": 0.04441666666666667, + "grad_norm": 0.33024802803993225, + "kl": 0.8110081553459167, + "learning_rate": 2.6482659339399047e-06, + "loss": 0.0324, + "reward": 1.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 533 + }, + { + "completion_length": 250.0, + "epoch": 0.0445, + "grad_norm": 0.28449299931526184, + "kl": 0.9894317388534546, + "learning_rate": 2.63955376248291e-06, + "loss": 0.0396, + "reward": 1.4583333730697632, + "reward_std": 0.7955730557441711, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 534 + }, + { + "completion_length": 250.0, + "epoch": 0.044583333333333336, + "grad_norm": 0.24825812876224518, + "kl": 0.8840400576591492, + "learning_rate": 2.6308398906073603e-06, + "loss": 0.0354, + "reward": 1.0416667461395264, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9166666865348816, + "step": 535 + }, + { + "completion_length": 250.0, + "epoch": 0.04466666666666667, + "grad_norm": 1.5011301040649414, + "kl": 1.2059112787246704, + "learning_rate": 2.6221244244890336e-06, + "loss": 0.0482, + "reward": 1.2083333730697632, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9583333730697632, + "step": 536 + }, + { + "completion_length": 250.0, + "epoch": 0.04475, + "grad_norm": 0.7638510465621948, + "kl": 1.5593329668045044, + "learning_rate": 2.613407470323134e-06, + "loss": 0.0624, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 537 + }, + { + "completion_length": 168.0, + "epoch": 0.044833333333333336, + "grad_norm": 0.23012110590934753, + "kl": 0.8535071611404419, + "learning_rate": 2.604689134322999e-06, + "loss": 0.0341, + "reward": 1.4583333730697632, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 538 + }, + { + "completion_length": 250.0, + "epoch": 0.04491666666666667, + "grad_norm": 0.3940114676952362, + "kl": 1.290722370147705, + "learning_rate": 2.5959695227188e-06, + "loss": 0.0516, + "reward": 1.5416666269302368, + "reward_std": 0.853331446647644, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 539 + }, + { + "completion_length": 250.0, + "epoch": 0.045, + "grad_norm": 0.2533872723579407, + "kl": 1.0061720609664917, + "learning_rate": 2.587248741756253e-06, + "loss": 0.0402, + "reward": 1.8333333730697632, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 540 + }, + { + "completion_length": 250.0, + "epoch": 0.045083333333333336, + "grad_norm": 0.2459549903869629, + "kl": 1.3281100988388062, + "learning_rate": 2.578526897695321e-06, + "loss": 0.0531, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 541 + }, + { + "completion_length": 250.0, + "epoch": 0.04516666666666667, + "grad_norm": 0.45357388257980347, + "kl": 1.2036126852035522, + "learning_rate": 2.569804096808923e-06, + "loss": 0.0481, + "reward": 1.0416667461395264, + "reward_std": 0.8807914853096008, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.6666666269302368, + "step": 542 + }, + { + "completion_length": 250.0, + "epoch": 0.04525, + "grad_norm": 0.639238715171814, + "kl": 1.46484375, + "learning_rate": 2.5610804453816333e-06, + "loss": 0.0586, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 543 + }, + { + "completion_length": 250.0, + "epoch": 0.04533333333333334, + "grad_norm": 0.2612769305706024, + "kl": 0.8998706340789795, + "learning_rate": 2.5523560497083927e-06, + "loss": 0.036, + "reward": 1.0833333730697632, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 544 + }, + { + "completion_length": 250.0, + "epoch": 0.04541666666666667, + "grad_norm": 0.3391062021255493, + "kl": 0.9011841416358948, + "learning_rate": 2.543631016093209e-06, + "loss": 0.036, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 545 + }, + { + "completion_length": 250.0, + "epoch": 0.0455, + "grad_norm": 0.909966230392456, + "kl": 1.1737316846847534, + "learning_rate": 2.5349054508478636e-06, + "loss": 0.0469, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 546 + }, + { + "completion_length": 250.0, + "epoch": 0.04558333333333333, + "grad_norm": 0.7492355704307556, + "kl": 1.3316445350646973, + "learning_rate": 2.526179460290615e-06, + "loss": 0.0533, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 547 + }, + { + "completion_length": 250.0, + "epoch": 0.04566666666666667, + "grad_norm": 0.2507287263870239, + "kl": 0.7561590671539307, + "learning_rate": 2.517453150744904e-06, + "loss": 0.0302, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 548 + }, + { + "completion_length": 250.0, + "epoch": 0.04575, + "grad_norm": 1.408830165863037, + "kl": 1.2084333896636963, + "learning_rate": 2.5087266285380597e-06, + "loss": 0.0483, + "reward": 1.2083333730697632, + "reward_std": 0.9074209332466125, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 549 + }, + { + "completion_length": 250.0, + "epoch": 0.04583333333333333, + "grad_norm": 0.5603641867637634, + "kl": 1.1831409931182861, + "learning_rate": 2.5e-06, + "loss": 0.0473, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 550 + }, + { + "completion_length": 250.0, + "epoch": 0.04591666666666667, + "grad_norm": 0.24828428030014038, + "kl": 1.2554258108139038, + "learning_rate": 2.4912733714619415e-06, + "loss": 0.0502, + "reward": 1.3333332538604736, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 551 + }, + { + "completion_length": 250.0, + "epoch": 0.046, + "grad_norm": 0.2216569483280182, + "kl": 1.5270963907241821, + "learning_rate": 2.482546849255096e-06, + "loss": 0.0611, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 552 + }, + { + "completion_length": 250.0, + "epoch": 0.04608333333333333, + "grad_norm": 0.7645318508148193, + "kl": 0.9473637342453003, + "learning_rate": 2.4738205397093863e-06, + "loss": 0.0379, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.75, + "step": 553 + }, + { + "completion_length": 250.0, + "epoch": 0.04616666666666667, + "grad_norm": 0.2752268612384796, + "kl": 0.5936354994773865, + "learning_rate": 2.4650945491521372e-06, + "loss": 0.0237, + "reward": 1.5, + "reward_std": 0.471404492855072, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 554 + }, + { + "completion_length": 250.0, + "epoch": 0.04625, + "grad_norm": 0.029000254347920418, + "kl": 1.6099191904067993, + "learning_rate": 2.4563689839067913e-06, + "loss": 0.0644, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 555 + }, + { + "completion_length": 250.0, + "epoch": 0.04633333333333333, + "grad_norm": 0.21292659640312195, + "kl": 1.1784905195236206, + "learning_rate": 2.447643950291608e-06, + "loss": 0.0471, + "reward": 1.1666667461395264, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9166666865348816, + "step": 556 + }, + { + "completion_length": 250.0, + "epoch": 0.04641666666666667, + "grad_norm": 0.23645764589309692, + "kl": 1.1839085817337036, + "learning_rate": 2.4389195546183676e-06, + "loss": 0.0474, + "reward": 1.7083333730697632, + "reward_std": 0.48591262102127075, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 557 + }, + { + "completion_length": 250.0, + "epoch": 0.0465, + "grad_norm": 0.38307103514671326, + "kl": 1.4955880641937256, + "learning_rate": 2.4301959031910785e-06, + "loss": 0.0598, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 558 + }, + { + "completion_length": 250.0, + "epoch": 0.04658333333333333, + "grad_norm": 0.2519403398036957, + "kl": 1.4596502780914307, + "learning_rate": 2.4214731023046795e-06, + "loss": 0.0584, + "reward": 1.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9583333730697632, + "step": 559 + }, + { + "completion_length": 250.0, + "epoch": 0.04666666666666667, + "grad_norm": 0.8129304647445679, + "kl": 1.21884024143219, + "learning_rate": 2.4127512582437486e-06, + "loss": 0.0488, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 560 + }, + { + "completion_length": 250.0, + "epoch": 0.04675, + "grad_norm": 0.6133831143379211, + "kl": 1.6948648691177368, + "learning_rate": 2.4040304772812002e-06, + "loss": 0.0678, + "reward": 1.5, + "reward_std": 0.8357109427452087, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 561 + }, + { + "completion_length": 250.0, + "epoch": 0.04683333333333333, + "grad_norm": 0.30969029664993286, + "kl": 1.4396021366119385, + "learning_rate": 2.3953108656770018e-06, + "loss": 0.0576, + "reward": 1.25, + "reward_std": 0.7918233275413513, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 562 + }, + { + "completion_length": 250.0, + "epoch": 0.04691666666666667, + "grad_norm": 0.2306750863790512, + "kl": 1.3534696102142334, + "learning_rate": 2.3865925296768658e-06, + "loss": 0.0541, + "reward": 1.2916666269302368, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9166666865348816, + "step": 563 + }, + { + "completion_length": 250.0, + "epoch": 0.047, + "grad_norm": 0.2677202820777893, + "kl": 1.3551838397979736, + "learning_rate": 2.377875575510967e-06, + "loss": 0.0542, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 564 + }, + { + "completion_length": 250.0, + "epoch": 0.04708333333333333, + "grad_norm": 0.7604575157165527, + "kl": 1.264426350593567, + "learning_rate": 2.3691601093926406e-06, + "loss": 0.0506, + "reward": 1.1666667461395264, + "reward_std": 0.50395268201828, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 565 + }, + { + "completion_length": 250.0, + "epoch": 0.04716666666666667, + "grad_norm": 1.7140240669250488, + "kl": 1.1938304901123047, + "learning_rate": 2.3604462375170905e-06, + "loss": 0.0478, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 566 + }, + { + "completion_length": 250.0, + "epoch": 0.04725, + "grad_norm": 0.3368653655052185, + "kl": 1.5280756950378418, + "learning_rate": 2.3517340660600965e-06, + "loss": 0.0611, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 567 + }, + { + "completion_length": 250.0, + "epoch": 0.04733333333333333, + "grad_norm": 0.4907490015029907, + "kl": 1.3557684421539307, + "learning_rate": 2.3430237011767166e-06, + "loss": 0.0542, + "reward": 1.5416666269302368, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 568 + }, + { + "completion_length": 250.0, + "epoch": 0.04741666666666667, + "grad_norm": 0.4219551086425781, + "kl": 1.487517237663269, + "learning_rate": 2.3343152490000004e-06, + "loss": 0.0595, + "reward": 1.625, + "reward_std": 0.6283639073371887, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 569 + }, + { + "completion_length": 250.0, + "epoch": 0.0475, + "grad_norm": 0.37886741757392883, + "kl": 0.992394745349884, + "learning_rate": 2.325608815639687e-06, + "loss": 0.0397, + "reward": 1.1666667461395264, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 570 + }, + { + "completion_length": 250.0, + "epoch": 0.04758333333333333, + "grad_norm": 0.2531243562698364, + "kl": 1.362313985824585, + "learning_rate": 2.3169045071809217e-06, + "loss": 0.0545, + "reward": 1.75, + "reward_std": 0.38832157850265503, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 571 + }, + { + "completion_length": 250.0, + "epoch": 0.04766666666666667, + "grad_norm": 0.27313292026519775, + "kl": 1.0673291683197021, + "learning_rate": 2.3082024296829538e-06, + "loss": 0.0427, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 572 + }, + { + "completion_length": 250.0, + "epoch": 0.04775, + "grad_norm": 0.3057982325553894, + "kl": 1.4214304685592651, + "learning_rate": 2.2995026891778533e-06, + "loss": 0.0569, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 573 + }, + { + "completion_length": 250.0, + "epoch": 0.04783333333333333, + "grad_norm": 0.27380916476249695, + "kl": 1.0907694101333618, + "learning_rate": 2.290805391669212e-06, + "loss": 0.0436, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 574 + }, + { + "completion_length": 250.0, + "epoch": 0.04791666666666667, + "grad_norm": 0.4117473065853119, + "kl": 1.1665431261062622, + "learning_rate": 2.2821106431308546e-06, + "loss": 0.0467, + "reward": 1.1666667461395264, + "reward_std": 0.5345224142074585, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 575 + }, + { + "completion_length": 250.0, + "epoch": 0.048, + "grad_norm": 0.6448284983634949, + "kl": 1.6899546384811401, + "learning_rate": 2.2734185495055503e-06, + "loss": 0.0676, + "reward": 1.1666667461395264, + "reward_std": 0.7766432166099548, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 576 + }, + { + "completion_length": 250.0, + "epoch": 0.04808333333333333, + "grad_norm": 0.2402116358280182, + "kl": 0.989196240901947, + "learning_rate": 2.2647292167037143e-06, + "loss": 0.0396, + "reward": 1.5, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 577 + }, + { + "completion_length": 250.0, + "epoch": 0.04816666666666667, + "grad_norm": 0.23641373217105865, + "kl": 1.2374866008758545, + "learning_rate": 2.256042750602127e-06, + "loss": 0.0495, + "reward": 1.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 578 + }, + { + "completion_length": 250.0, + "epoch": 0.04825, + "grad_norm": 0.4009597897529602, + "kl": 1.3696413040161133, + "learning_rate": 2.2473592570426343e-06, + "loss": 0.0548, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 579 + }, + { + "completion_length": 250.0, + "epoch": 0.04833333333333333, + "grad_norm": 0.47816938161849976, + "kl": 1.0150160789489746, + "learning_rate": 2.238678841830867e-06, + "loss": 0.0406, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 580 + }, + { + "completion_length": 250.0, + "epoch": 0.04841666666666666, + "grad_norm": 0.2902098000049591, + "kl": 1.2081258296966553, + "learning_rate": 2.230001610734943e-06, + "loss": 0.0483, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 581 + }, + { + "completion_length": 250.0, + "epoch": 0.0485, + "grad_norm": 0.5224894285202026, + "kl": 1.5874559879302979, + "learning_rate": 2.2213276694841866e-06, + "loss": 0.0635, + "reward": 1.2916667461395264, + "reward_std": 0.8249579668045044, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7916666865348816, + "step": 582 + }, + { + "completion_length": 250.0, + "epoch": 0.04858333333333333, + "grad_norm": 0.3580199182033539, + "kl": 0.8156141638755798, + "learning_rate": 2.212657123767834e-06, + "loss": 0.0326, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 583 + }, + { + "completion_length": 250.0, + "epoch": 0.048666666666666664, + "grad_norm": 0.28623875975608826, + "kl": 1.2345472574234009, + "learning_rate": 2.2039900792337477e-06, + "loss": 0.0494, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 584 + }, + { + "completion_length": 250.0, + "epoch": 0.04875, + "grad_norm": 0.340573251247406, + "kl": 1.5262372493743896, + "learning_rate": 2.195326641487132e-06, + "loss": 0.061, + "reward": 1.7083333730697632, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 585 + }, + { + "completion_length": 250.0, + "epoch": 0.04883333333333333, + "grad_norm": 0.425148069858551, + "kl": 1.291420340538025, + "learning_rate": 2.186666916089239e-06, + "loss": 0.0517, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 586 + }, + { + "completion_length": 250.0, + "epoch": 0.048916666666666664, + "grad_norm": 0.21294091641902924, + "kl": 1.372128963470459, + "learning_rate": 2.1780110085560935e-06, + "loss": 0.0549, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 587 + }, + { + "completion_length": 250.0, + "epoch": 0.049, + "grad_norm": 0.8173606395721436, + "kl": 1.2428475618362427, + "learning_rate": 2.1693590243571937e-06, + "loss": 0.0497, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 588 + }, + { + "completion_length": 250.0, + "epoch": 0.04908333333333333, + "grad_norm": 0.3507099449634552, + "kl": 0.9352976083755493, + "learning_rate": 2.1607110689142393e-06, + "loss": 0.0374, + "reward": 1.2083333730697632, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.5833333730697632, + "step": 589 + }, + { + "completion_length": 250.0, + "epoch": 0.049166666666666664, + "grad_norm": 0.6860688924789429, + "kl": 1.3327147960662842, + "learning_rate": 2.1520672475998374e-06, + "loss": 0.0533, + "reward": 1.4583333730697632, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 590 + }, + { + "completion_length": 250.0, + "epoch": 0.04925, + "grad_norm": 0.1836400032043457, + "kl": 1.7783141136169434, + "learning_rate": 2.143427665736221e-06, + "loss": 0.0711, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 591 + }, + { + "completion_length": 250.0, + "epoch": 0.04933333333333333, + "grad_norm": 1.7451122999191284, + "kl": 1.7091467380523682, + "learning_rate": 2.134792428593971e-06, + "loss": 0.0684, + "reward": 1.3333332538604736, + "reward_std": 0.6666666269302368, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333134651184, + "step": 592 + }, + { + "completion_length": 250.0, + "epoch": 0.049416666666666664, + "grad_norm": 0.23946592211723328, + "kl": 1.4415324926376343, + "learning_rate": 2.1261616413907267e-06, + "loss": 0.0577, + "reward": 1.2916667461395264, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9166666865348816, + "step": 593 + }, + { + "completion_length": 250.0, + "epoch": 0.0495, + "grad_norm": 0.02028987742960453, + "kl": 1.161889910697937, + "learning_rate": 2.117535409289905e-06, + "loss": 0.0465, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 594 + }, + { + "completion_length": 250.0, + "epoch": 0.04958333333333333, + "grad_norm": 0.23525746166706085, + "kl": 1.4621292352676392, + "learning_rate": 2.1089138373994226e-06, + "loss": 0.0585, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 595 + }, + { + "completion_length": 250.0, + "epoch": 0.049666666666666665, + "grad_norm": 0.2379079908132553, + "kl": 0.9912834763526917, + "learning_rate": 2.1002970307704134e-06, + "loss": 0.0397, + "reward": 1.0416667461395264, + "reward_std": 0.5473601818084717, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7916666865348816, + "step": 596 + }, + { + "completion_length": 250.0, + "epoch": 0.04975, + "grad_norm": 1.0380606651306152, + "kl": 1.8222761154174805, + "learning_rate": 2.0916850943959453e-06, + "loss": 0.0729, + "reward": 1.5416667461395264, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 597 + }, + { + "completion_length": 250.0, + "epoch": 0.049833333333333334, + "grad_norm": 0.2623489797115326, + "kl": 0.9441961646080017, + "learning_rate": 2.0830781332097446e-06, + "loss": 0.0378, + "reward": 1.5, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 598 + }, + { + "completion_length": 250.0, + "epoch": 0.049916666666666665, + "grad_norm": 0.2724643647670746, + "kl": 1.209511637687683, + "learning_rate": 2.0744762520849193e-06, + "loss": 0.0484, + "reward": 1.5833333730697632, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 599 + }, + { + "completion_length": 250.0, + "epoch": 0.05, + "grad_norm": 0.02976626716554165, + "kl": 1.7429208755493164, + "learning_rate": 2.0658795558326745e-06, + "loss": 0.0697, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 600 + }, + { + "completion_length": 250.0, + "epoch": 0.050083333333333334, + "grad_norm": 0.2598009407520294, + "kl": 1.0037543773651123, + "learning_rate": 2.0572881492010423e-06, + "loss": 0.0402, + "reward": 1.3333333730697632, + "reward_std": 0.854493260383606, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 601 + }, + { + "completion_length": 250.0, + "epoch": 0.050166666666666665, + "grad_norm": 0.2374367117881775, + "kl": 1.5422213077545166, + "learning_rate": 2.0487021368736002e-06, + "loss": 0.0617, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 602 + }, + { + "completion_length": 202.0, + "epoch": 0.05025, + "grad_norm": 0.23875918984413147, + "kl": 1.7543926239013672, + "learning_rate": 2.0401216234682e-06, + "loss": 0.0702, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 603 + }, + { + "completion_length": 250.0, + "epoch": 0.050333333333333334, + "grad_norm": 0.03209434449672699, + "kl": 1.4833548069000244, + "learning_rate": 2.031546713535688e-06, + "loss": 0.0593, + "reward": 1.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 1.0, + "step": 604 + }, + { + "completion_length": 250.0, + "epoch": 0.050416666666666665, + "grad_norm": 0.27828800678253174, + "kl": 1.3084222078323364, + "learning_rate": 2.022977511558638e-06, + "loss": 0.0523, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 605 + }, + { + "completion_length": 250.0, + "epoch": 0.0505, + "grad_norm": 1.4770981073379517, + "kl": 1.1198890209197998, + "learning_rate": 2.0144141219500707e-06, + "loss": 0.0448, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 606 + }, + { + "completion_length": 250.0, + "epoch": 0.050583333333333334, + "grad_norm": 0.33261868357658386, + "kl": 1.2287256717681885, + "learning_rate": 2.0058566490521848e-06, + "loss": 0.0491, + "reward": 1.0416666269302368, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7916666865348816, + "step": 607 + }, + { + "completion_length": 250.0, + "epoch": 0.050666666666666665, + "grad_norm": 0.28096532821655273, + "kl": 1.3212240934371948, + "learning_rate": 1.997305197135089e-06, + "loss": 0.0528, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 608 + }, + { + "completion_length": 250.0, + "epoch": 0.05075, + "grad_norm": 0.7251350283622742, + "kl": 2.047295331954956, + "learning_rate": 1.9887598703955244e-06, + "loss": 0.0819, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 609 + }, + { + "completion_length": 250.0, + "epoch": 0.050833333333333335, + "grad_norm": 0.22892563045024872, + "kl": 1.1812776327133179, + "learning_rate": 1.9802207729556023e-06, + "loss": 0.0473, + "reward": 1.7916667461395264, + "reward_std": 0.46929529309272766, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166667461395264, + "step": 610 + }, + { + "completion_length": 250.0, + "epoch": 0.050916666666666666, + "grad_norm": 0.446869432926178, + "kl": 1.157643437385559, + "learning_rate": 1.971688008861529e-06, + "loss": 0.0463, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 611 + }, + { + "completion_length": 250.0, + "epoch": 0.051, + "grad_norm": 0.889122486114502, + "kl": 0.7201449275016785, + "learning_rate": 1.963161682082342e-06, + "loss": 0.0288, + "reward": 0.9166666865348816, + "reward_std": 0.5563486218452454, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.7916666865348816, + "step": 612 + }, + { + "completion_length": 250.0, + "epoch": 0.051083333333333335, + "grad_norm": 0.3091413378715515, + "kl": 1.346756935119629, + "learning_rate": 1.9546418965086444e-06, + "loss": 0.0539, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9166666865348816, + "step": 613 + }, + { + "completion_length": 250.0, + "epoch": 0.051166666666666666, + "grad_norm": 0.40438079833984375, + "kl": 1.2623693943023682, + "learning_rate": 1.946128755951332e-06, + "loss": 0.0505, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 614 + }, + { + "completion_length": 244.0, + "epoch": 0.05125, + "grad_norm": 0.31069228053092957, + "kl": 1.4803262948989868, + "learning_rate": 1.937622364140338e-06, + "loss": 0.0592, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 615 + }, + { + "completion_length": 250.0, + "epoch": 0.051333333333333335, + "grad_norm": 0.34841683506965637, + "kl": 1.1213486194610596, + "learning_rate": 1.9291228247233607e-06, + "loss": 0.0449, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 616 + }, + { + "completion_length": 250.0, + "epoch": 0.051416666666666666, + "grad_norm": 0.2804429829120636, + "kl": 1.3213424682617188, + "learning_rate": 1.9206302412646074e-06, + "loss": 0.0529, + "reward": 0.9583333730697632, + "reward_std": 0.11785111576318741, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 617 + }, + { + "completion_length": 250.0, + "epoch": 0.0515, + "grad_norm": 0.2578163146972656, + "kl": 1.5600756406784058, + "learning_rate": 1.912144717243525e-06, + "loss": 0.0624, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 618 + }, + { + "completion_length": 250.0, + "epoch": 0.051583333333333335, + "grad_norm": 0.27005648612976074, + "kl": 1.027485728263855, + "learning_rate": 1.9036663560535484e-06, + "loss": 0.0411, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 619 + }, + { + "completion_length": 250.0, + "epoch": 0.051666666666666666, + "grad_norm": 0.3639249801635742, + "kl": 1.0344536304473877, + "learning_rate": 1.895195261000831e-06, + "loss": 0.0414, + "reward": 1.6666667461395264, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 620 + }, + { + "completion_length": 250.0, + "epoch": 0.05175, + "grad_norm": 0.23271331191062927, + "kl": 0.8880565166473389, + "learning_rate": 1.8867315353029937e-06, + "loss": 0.0355, + "reward": 1.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9583333730697632, + "step": 621 + }, + { + "completion_length": 250.0, + "epoch": 0.051833333333333335, + "grad_norm": 0.2460673302412033, + "kl": 2.0310378074645996, + "learning_rate": 1.8782752820878636e-06, + "loss": 0.0812, + "reward": 1.6666667461395264, + "reward_std": 0.4714045226573944, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 622 + }, + { + "completion_length": 250.0, + "epoch": 0.051916666666666667, + "grad_norm": 0.3399658501148224, + "kl": 1.346755027770996, + "learning_rate": 1.8698266043922159e-06, + "loss": 0.0539, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 623 + }, + { + "completion_length": 250.0, + "epoch": 0.052, + "grad_norm": 0.2878936529159546, + "kl": 1.044472575187683, + "learning_rate": 1.8613856051605242e-06, + "loss": 0.0418, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 624 + }, + { + "completion_length": 250.0, + "epoch": 0.052083333333333336, + "grad_norm": 0.30562835931777954, + "kl": 1.278110384941101, + "learning_rate": 1.852952387243698e-06, + "loss": 0.0511, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 625 + }, + { + "completion_length": 250.0, + "epoch": 0.05216666666666667, + "grad_norm": 0.30309000611305237, + "kl": 1.116170048713684, + "learning_rate": 1.8445270533978387e-06, + "loss": 0.0446, + "reward": 0.7916666865348816, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.7916666865348816, + "step": 626 + }, + { + "completion_length": 250.0, + "epoch": 0.05225, + "grad_norm": 0.29444682598114014, + "kl": 1.0253041982650757, + "learning_rate": 1.836109706282978e-06, + "loss": 0.041, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 627 + }, + { + "completion_length": 250.0, + "epoch": 0.052333333333333336, + "grad_norm": 0.3537648320198059, + "kl": 0.7810394167900085, + "learning_rate": 1.827700448461836e-06, + "loss": 0.0312, + "reward": 1.2083333730697632, + "reward_std": 0.8717342615127563, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 628 + }, + { + "completion_length": 250.0, + "epoch": 0.05241666666666667, + "grad_norm": 0.1647401601076126, + "kl": 1.585963249206543, + "learning_rate": 1.8192993823985643e-06, + "loss": 0.0634, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 629 + }, + { + "completion_length": 250.0, + "epoch": 0.0525, + "grad_norm": 0.9263207316398621, + "kl": 1.3402163982391357, + "learning_rate": 1.8109066104575023e-06, + "loss": 0.0536, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9583333730697632, + "step": 630 + }, + { + "completion_length": 250.0, + "epoch": 0.052583333333333336, + "grad_norm": 0.29536283016204834, + "kl": 1.0759024620056152, + "learning_rate": 1.8025222349019273e-06, + "loss": 0.043, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 631 + }, + { + "completion_length": 250.0, + "epoch": 0.05266666666666667, + "grad_norm": 0.7649843692779541, + "kl": 0.7642300724983215, + "learning_rate": 1.7941463578928088e-06, + "loss": 0.0306, + "reward": 1.5, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 632 + }, + { + "completion_length": 250.0, + "epoch": 0.05275, + "grad_norm": 0.26629337668418884, + "kl": 1.3809568881988525, + "learning_rate": 1.7857790814875665e-06, + "loss": 0.0552, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 633 + }, + { + "completion_length": 250.0, + "epoch": 0.052833333333333336, + "grad_norm": 0.2140216827392578, + "kl": 0.8888575434684753, + "learning_rate": 1.7774205076388207e-06, + "loss": 0.0356, + "reward": 1.7083333730697632, + "reward_std": 0.5473601818084717, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9583333730697632, + "step": 634 + }, + { + "completion_length": 250.0, + "epoch": 0.05291666666666667, + "grad_norm": 0.27039581537246704, + "kl": 0.562234103679657, + "learning_rate": 1.7690707381931585e-06, + "loss": 0.0225, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.75, + "step": 635 + }, + { + "completion_length": 250.0, + "epoch": 0.053, + "grad_norm": 0.2814823091030121, + "kl": 1.5715206861495972, + "learning_rate": 1.7607298748898844e-06, + "loss": 0.0629, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 636 + }, + { + "completion_length": 250.0, + "epoch": 0.05308333333333334, + "grad_norm": 0.21937192976474762, + "kl": 0.5414950251579285, + "learning_rate": 1.7523980193597837e-06, + "loss": 0.0217, + "reward": 1.5833333730697632, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 637 + }, + { + "completion_length": 250.0, + "epoch": 0.05316666666666667, + "grad_norm": 0.28082460165023804, + "kl": 1.2513439655303955, + "learning_rate": 1.744075273123889e-06, + "loss": 0.0501, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 638 + }, + { + "completion_length": 250.0, + "epoch": 0.05325, + "grad_norm": 0.029399115592241287, + "kl": 1.4624956846237183, + "learning_rate": 1.735761737592236e-06, + "loss": 0.0585, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 639 + }, + { + "completion_length": 250.0, + "epoch": 0.05333333333333334, + "grad_norm": 0.39860522747039795, + "kl": 1.6155064105987549, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.0646, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.75, + "step": 640 + }, + { + "completion_length": 250.0, + "epoch": 0.05341666666666667, + "grad_norm": 0.25739023089408875, + "kl": 1.2121546268463135, + "learning_rate": 1.7191627037194187e-06, + "loss": 0.0485, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 641 + }, + { + "completion_length": 250.0, + "epoch": 0.0535, + "grad_norm": 0.3668891191482544, + "kl": 1.4224536418914795, + "learning_rate": 1.7108774076322443e-06, + "loss": 0.0569, + "reward": 1.5833333730697632, + "reward_std": 0.5841829776763916, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 642 + }, + { + "completion_length": 212.0, + "epoch": 0.05358333333333333, + "grad_norm": 0.2385721206665039, + "kl": 1.3418517112731934, + "learning_rate": 1.702601726754825e-06, + "loss": 0.0537, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 643 + }, + { + "completion_length": 250.0, + "epoch": 0.05366666666666667, + "grad_norm": 0.34340476989746094, + "kl": 1.1666271686553955, + "learning_rate": 1.6943357619237227e-06, + "loss": 0.0467, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5, + "step": 644 + }, + { + "completion_length": 250.0, + "epoch": 0.05375, + "grad_norm": 0.22176310420036316, + "kl": 1.4181245565414429, + "learning_rate": 1.686079613857109e-06, + "loss": 0.0567, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 645 + }, + { + "completion_length": 250.0, + "epoch": 0.05383333333333333, + "grad_norm": 0.27106255292892456, + "kl": 0.7385506629943848, + "learning_rate": 1.677833383153542e-06, + "loss": 0.0295, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.875, + "step": 646 + }, + { + "completion_length": 250.0, + "epoch": 0.05391666666666667, + "grad_norm": 0.23391051590442657, + "kl": 1.1499696969985962, + "learning_rate": 1.6695971702907425e-06, + "loss": 0.046, + "reward": 1.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 647 + }, + { + "completion_length": 250.0, + "epoch": 0.054, + "grad_norm": 0.29008620977401733, + "kl": 1.0525387525558472, + "learning_rate": 1.661371075624363e-06, + "loss": 0.0421, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 648 + }, + { + "completion_length": 250.0, + "epoch": 0.05408333333333333, + "grad_norm": 0.2697657644748688, + "kl": 0.778380811214447, + "learning_rate": 1.6531551993867717e-06, + "loss": 0.0311, + "reward": 1.9166667461395264, + "reward_std": 0.15430331230163574, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9166666865348816, + "step": 649 + }, + { + "completion_length": 250.0, + "epoch": 0.05416666666666667, + "grad_norm": 0.2077609896659851, + "kl": 0.8145649433135986, + "learning_rate": 1.6449496416858285e-06, + "loss": 0.0326, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 650 + }, + { + "completion_length": 250.0, + "epoch": 0.05425, + "grad_norm": 0.2946789562702179, + "kl": 1.0525617599487305, + "learning_rate": 1.6367545025036634e-06, + "loss": 0.0421, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9166666865348816, + "step": 651 + }, + { + "completion_length": 250.0, + "epoch": 0.05433333333333333, + "grad_norm": 0.2999846041202545, + "kl": 1.0172115564346313, + "learning_rate": 1.6285698816954626e-06, + "loss": 0.0407, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 652 + }, + { + "completion_length": 250.0, + "epoch": 0.05441666666666667, + "grad_norm": 0.3423196077346802, + "kl": 1.435135006904602, + "learning_rate": 1.6203958789882457e-06, + "loss": 0.0574, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 653 + }, + { + "completion_length": 250.0, + "epoch": 0.0545, + "grad_norm": 0.4042535424232483, + "kl": 0.9854827523231506, + "learning_rate": 1.612232593979658e-06, + "loss": 0.0394, + "reward": 1.5416666269302368, + "reward_std": 0.6651769280433655, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 654 + }, + { + "completion_length": 250.0, + "epoch": 0.05458333333333333, + "grad_norm": 0.5451151132583618, + "kl": 1.6612234115600586, + "learning_rate": 1.6040801261367494e-06, + "loss": 0.0664, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.7916666865348816, + "step": 655 + }, + { + "completion_length": 250.0, + "epoch": 0.05466666666666667, + "grad_norm": 0.28427278995513916, + "kl": 0.8143528699874878, + "learning_rate": 1.5959385747947697e-06, + "loss": 0.0326, + "reward": 1.5416666269302368, + "reward_std": 0.853331446647644, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 656 + }, + { + "completion_length": 250.0, + "epoch": 0.05475, + "grad_norm": 0.26303741335868835, + "kl": 1.1400749683380127, + "learning_rate": 1.5878080391559507e-06, + "loss": 0.0456, + "reward": 1.4583333730697632, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 657 + }, + { + "completion_length": 250.0, + "epoch": 0.05483333333333333, + "grad_norm": 2.4117088317871094, + "kl": 1.7515506744384766, + "learning_rate": 1.5796886182883053e-06, + "loss": 0.0701, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 658 + }, + { + "completion_length": 250.0, + "epoch": 0.05491666666666667, + "grad_norm": 0.37918296456336975, + "kl": 1.5612022876739502, + "learning_rate": 1.5715804111244138e-06, + "loss": 0.0624, + "reward": 1.6666667461395264, + "reward_std": 0.4364357888698578, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 659 + }, + { + "completion_length": 250.0, + "epoch": 0.055, + "grad_norm": 0.3990735709667206, + "kl": 0.9025101065635681, + "learning_rate": 1.56348351646022e-06, + "loss": 0.0361, + "reward": 1.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 660 + }, + { + "completion_length": 250.0, + "epoch": 0.05508333333333333, + "grad_norm": 0.2836557924747467, + "kl": 1.0622992515563965, + "learning_rate": 1.5553980329538326e-06, + "loss": 0.0425, + "reward": 1.5416666269302368, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 661 + }, + { + "completion_length": 249.0, + "epoch": 0.05516666666666667, + "grad_norm": 0.4254859685897827, + "kl": 1.7287112474441528, + "learning_rate": 1.547324059124315e-06, + "loss": 0.0691, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 662 + }, + { + "completion_length": 250.0, + "epoch": 0.05525, + "grad_norm": 0.517515242099762, + "kl": 1.6896018981933594, + "learning_rate": 1.539261693350491e-06, + "loss": 0.0676, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 663 + }, + { + "completion_length": 250.0, + "epoch": 0.05533333333333333, + "grad_norm": 0.03853216394782066, + "kl": 1.4163974523544312, + "learning_rate": 1.5312110338697427e-06, + "loss": 0.0567, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 664 + }, + { + "completion_length": 250.0, + "epoch": 0.05541666666666667, + "grad_norm": 5.298870086669922, + "kl": 1.7589707374572754, + "learning_rate": 1.5231721787768162e-06, + "loss": 0.0704, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 665 + }, + { + "completion_length": 250.0, + "epoch": 0.0555, + "grad_norm": 0.28842854499816895, + "kl": 1.1976323127746582, + "learning_rate": 1.5151452260226224e-06, + "loss": 0.0479, + "reward": 1.6666667461395264, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 666 + }, + { + "completion_length": 250.0, + "epoch": 0.05558333333333333, + "grad_norm": 0.4191083610057831, + "kl": 1.463090419769287, + "learning_rate": 1.5071302734130488e-06, + "loss": 0.0585, + "reward": 1.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 667 + }, + { + "completion_length": 250.0, + "epoch": 0.05566666666666667, + "grad_norm": 0.2382550835609436, + "kl": 1.061838150024414, + "learning_rate": 1.4991274186077632e-06, + "loss": 0.0425, + "reward": 1.5833332538604736, + "reward_std": 0.6362090110778809, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 668 + }, + { + "completion_length": 250.0, + "epoch": 0.05575, + "grad_norm": 0.2757638394832611, + "kl": 1.002281665802002, + "learning_rate": 1.491136759119025e-06, + "loss": 0.0401, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 669 + }, + { + "completion_length": 250.0, + "epoch": 0.05583333333333333, + "grad_norm": 0.4550880193710327, + "kl": 0.8322465419769287, + "learning_rate": 1.4831583923105e-06, + "loss": 0.0333, + "reward": 1.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7083333730697632, + "step": 670 + }, + { + "completion_length": 250.0, + "epoch": 0.05591666666666667, + "grad_norm": 0.52930748462677, + "kl": 1.6172356605529785, + "learning_rate": 1.4751924153960681e-06, + "loss": 0.0647, + "reward": 1.5833333730697632, + "reward_std": 0.5841830372810364, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 671 + }, + { + "completion_length": 250.0, + "epoch": 0.056, + "grad_norm": 0.35954219102859497, + "kl": 1.1327468156814575, + "learning_rate": 1.467238925438646e-06, + "loss": 0.0453, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 672 + }, + { + "completion_length": 250.0, + "epoch": 0.05608333333333333, + "grad_norm": 0.29655808210372925, + "kl": 1.7345565557479858, + "learning_rate": 1.4592980193489975e-06, + "loss": 0.0694, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 673 + }, + { + "completion_length": 250.0, + "epoch": 0.05616666666666666, + "grad_norm": 0.2724839746952057, + "kl": 0.983393669128418, + "learning_rate": 1.4513697938845571e-06, + "loss": 0.0393, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 674 + }, + { + "completion_length": 250.0, + "epoch": 0.05625, + "grad_norm": 0.2653176784515381, + "kl": 1.4598757028579712, + "learning_rate": 1.443454345648252e-06, + "loss": 0.0584, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.875, + "step": 675 + }, + { + "completion_length": 250.0, + "epoch": 0.05633333333333333, + "grad_norm": 0.4087188243865967, + "kl": 1.2594470977783203, + "learning_rate": 1.4355517710873184e-06, + "loss": 0.0504, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 676 + }, + { + "completion_length": 250.0, + "epoch": 0.056416666666666664, + "grad_norm": 0.25172901153564453, + "kl": 0.8408035635948181, + "learning_rate": 1.4276621664921358e-06, + "loss": 0.0336, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 677 + }, + { + "completion_length": 250.0, + "epoch": 0.0565, + "grad_norm": 0.26606500148773193, + "kl": 1.4946765899658203, + "learning_rate": 1.419785627995044e-06, + "loss": 0.0598, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 678 + }, + { + "completion_length": 250.0, + "epoch": 0.05658333333333333, + "grad_norm": 0.2562119662761688, + "kl": 1.0255094766616821, + "learning_rate": 1.4119222515691817e-06, + "loss": 0.041, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 679 + }, + { + "completion_length": 250.0, + "epoch": 0.056666666666666664, + "grad_norm": 0.2506910264492035, + "kl": 1.0600122213363647, + "learning_rate": 1.4040721330273063e-06, + "loss": 0.0424, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 680 + }, + { + "completion_length": 250.0, + "epoch": 0.05675, + "grad_norm": 0.2096366286277771, + "kl": 0.6006632447242737, + "learning_rate": 1.3962353680206372e-06, + "loss": 0.024, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 681 + }, + { + "completion_length": 250.0, + "epoch": 0.05683333333333333, + "grad_norm": 0.032199203968048096, + "kl": 1.5546534061431885, + "learning_rate": 1.388412052037682e-06, + "loss": 0.0622, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 682 + }, + { + "completion_length": 214.0, + "epoch": 0.056916666666666664, + "grad_norm": 0.5028219819068909, + "kl": 1.4313411712646484, + "learning_rate": 1.380602280403076e-06, + "loss": 0.0573, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 683 + }, + { + "completion_length": 250.0, + "epoch": 0.057, + "grad_norm": 0.19551897048950195, + "kl": 0.8340595960617065, + "learning_rate": 1.3728061482764238e-06, + "loss": 0.0334, + "reward": 1.2083333730697632, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9583333730697632, + "step": 684 + }, + { + "completion_length": 250.0, + "epoch": 0.05708333333333333, + "grad_norm": 0.4893221855163574, + "kl": 1.325058102607727, + "learning_rate": 1.3650237506511333e-06, + "loss": 0.053, + "reward": 1.8333333730697632, + "reward_std": 0.35634827613830566, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 685 + }, + { + "completion_length": 250.0, + "epoch": 0.057166666666666664, + "grad_norm": 0.18707111477851868, + "kl": 1.2680920362472534, + "learning_rate": 1.3572551823532654e-06, + "loss": 0.0507, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 686 + }, + { + "completion_length": 250.0, + "epoch": 0.05725, + "grad_norm": 0.2992132306098938, + "kl": 1.0152003765106201, + "learning_rate": 1.349500538040371e-06, + "loss": 0.0406, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 687 + }, + { + "completion_length": 250.0, + "epoch": 0.05733333333333333, + "grad_norm": 0.24606172740459442, + "kl": 1.0200669765472412, + "learning_rate": 1.3417599122003464e-06, + "loss": 0.0408, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 688 + }, + { + "completion_length": 250.0, + "epoch": 0.057416666666666664, + "grad_norm": 0.269815593957901, + "kl": 1.2791739702224731, + "learning_rate": 1.3340333991502723e-06, + "loss": 0.0512, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 689 + }, + { + "completion_length": 250.0, + "epoch": 0.0575, + "grad_norm": 0.3196219205856323, + "kl": 1.3466354608535767, + "learning_rate": 1.3263210930352737e-06, + "loss": 0.0539, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 690 + }, + { + "completion_length": 250.0, + "epoch": 0.057583333333333334, + "grad_norm": 0.397535115480423, + "kl": 1.0252505540847778, + "learning_rate": 1.3186230878273654e-06, + "loss": 0.041, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.625, + "step": 691 + }, + { + "completion_length": 250.0, + "epoch": 0.057666666666666665, + "grad_norm": 0.29124775528907776, + "kl": 1.024878740310669, + "learning_rate": 1.3109394773243117e-06, + "loss": 0.041, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 692 + }, + { + "completion_length": 250.0, + "epoch": 0.05775, + "grad_norm": 0.2738453447818756, + "kl": 1.3520758152008057, + "learning_rate": 1.3032703551484832e-06, + "loss": 0.0541, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 693 + }, + { + "completion_length": 250.0, + "epoch": 0.057833333333333334, + "grad_norm": 0.2758381962776184, + "kl": 1.5181013345718384, + "learning_rate": 1.2956158147457116e-06, + "loss": 0.0607, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 694 + }, + { + "completion_length": 250.0, + "epoch": 0.057916666666666665, + "grad_norm": 0.7927420139312744, + "kl": 0.9507368803024292, + "learning_rate": 1.2879759493841577e-06, + "loss": 0.038, + "reward": 1.2916666269302368, + "reward_std": 0.8249579071998596, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7916666865348816, + "step": 695 + }, + { + "completion_length": 250.0, + "epoch": 0.058, + "grad_norm": 0.2301512509584427, + "kl": 1.0944889783859253, + "learning_rate": 1.280350852153168e-06, + "loss": 0.0438, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 696 + }, + { + "completion_length": 250.0, + "epoch": 0.058083333333333334, + "grad_norm": 0.2712516784667969, + "kl": 0.8457162976264954, + "learning_rate": 1.272740615962148e-06, + "loss": 0.0338, + "reward": 1.8333333730697632, + "reward_std": 0.25197628140449524, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.8333333730697632, + "step": 697 + }, + { + "completion_length": 250.0, + "epoch": 0.058166666666666665, + "grad_norm": 0.32763615250587463, + "kl": 0.9605115652084351, + "learning_rate": 1.2651453335394232e-06, + "loss": 0.0384, + "reward": 1.2916667461395264, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.6666666865348816, + "step": 698 + }, + { + "completion_length": 250.0, + "epoch": 0.05825, + "grad_norm": 0.29685455560684204, + "kl": 1.2807537317276, + "learning_rate": 1.2575650974311118e-06, + "loss": 0.0512, + "reward": 1.4166667461395264, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 699 + }, + { + "completion_length": 250.0, + "epoch": 0.058333333333333334, + "grad_norm": 0.3393094837665558, + "kl": 1.317583441734314, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.0527, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 700 + }, + { + "completion_length": 250.0, + "epoch": 0.058416666666666665, + "grad_norm": 0.24073301255702972, + "kl": 1.267662763595581, + "learning_rate": 1.2424501334244124e-06, + "loss": 0.0507, + "reward": 1.0416667461395264, + "reward_std": 0.41547447443008423, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9166667461395264, + "step": 701 + }, + { + "completion_length": 250.0, + "epoch": 0.0585, + "grad_norm": 0.2873644530773163, + "kl": 1.1300421953201294, + "learning_rate": 1.234915589697091e-06, + "loss": 0.0452, + "reward": 1.7083333730697632, + "reward_std": 0.602573812007904, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 702 + }, + { + "completion_length": 250.0, + "epoch": 0.058583333333333334, + "grad_norm": 0.2742319703102112, + "kl": 0.9622892141342163, + "learning_rate": 1.2273964606240718e-06, + "loss": 0.0385, + "reward": 1.6666666269302368, + "reward_std": 0.6424161195755005, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 703 + }, + { + "completion_length": 250.0, + "epoch": 0.058666666666666666, + "grad_norm": 0.21177729964256287, + "kl": 0.9296140670776367, + "learning_rate": 1.2198928378235717e-06, + "loss": 0.0372, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 704 + }, + { + "completion_length": 250.0, + "epoch": 0.05875, + "grad_norm": 0.6481500267982483, + "kl": 1.4008327722549438, + "learning_rate": 1.2124048127248644e-06, + "loss": 0.056, + "reward": 1.2083333730697632, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9583333730697632, + "step": 705 + }, + { + "completion_length": 250.0, + "epoch": 0.058833333333333335, + "grad_norm": 0.21664856374263763, + "kl": 1.1676733493804932, + "learning_rate": 1.204932476567175e-06, + "loss": 0.0467, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 706 + }, + { + "completion_length": 250.0, + "epoch": 0.058916666666666666, + "grad_norm": 0.33540189266204834, + "kl": 1.3017733097076416, + "learning_rate": 1.19747592039856e-06, + "loss": 0.0521, + "reward": 1.8333333730697632, + "reward_std": 0.35634833574295044, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 707 + }, + { + "completion_length": 250.0, + "epoch": 0.059, + "grad_norm": 0.2600402235984802, + "kl": 1.0948798656463623, + "learning_rate": 1.1900352350748026e-06, + "loss": 0.0438, + "reward": 1.7916666269302368, + "reward_std": 0.5892555713653564, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 708 + }, + { + "completion_length": 250.0, + "epoch": 0.059083333333333335, + "grad_norm": 0.20079267024993896, + "kl": 1.0153428316116333, + "learning_rate": 1.1826105112583061e-06, + "loss": 0.0406, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 709 + }, + { + "completion_length": 250.0, + "epoch": 0.059166666666666666, + "grad_norm": 0.29417306184768677, + "kl": 0.6207944750785828, + "learning_rate": 1.1752018394169882e-06, + "loss": 0.0248, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 710 + }, + { + "completion_length": 250.0, + "epoch": 0.05925, + "grad_norm": 0.40088775753974915, + "kl": 1.1724050045013428, + "learning_rate": 1.1678093098231748e-06, + "loss": 0.0469, + "reward": 1.1666666269302368, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 711 + }, + { + "completion_length": 250.0, + "epoch": 0.059333333333333335, + "grad_norm": 0.4337019920349121, + "kl": 1.4313944578170776, + "learning_rate": 1.160433012552508e-06, + "loss": 0.0573, + "reward": 1.6666666269302368, + "reward_std": 0.6424161195755005, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 712 + }, + { + "completion_length": 250.0, + "epoch": 0.059416666666666666, + "grad_norm": 0.25332921743392944, + "kl": 1.3762283325195312, + "learning_rate": 1.1530730374828422e-06, + "loss": 0.055, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 713 + }, + { + "completion_length": 250.0, + "epoch": 0.0595, + "grad_norm": 0.4357444941997528, + "kl": 1.0639551877975464, + "learning_rate": 1.1457294742931508e-06, + "loss": 0.0426, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 714 + }, + { + "completion_length": 250.0, + "epoch": 0.059583333333333335, + "grad_norm": 0.3148086369037628, + "kl": 1.8188680410385132, + "learning_rate": 1.1384024124624324e-06, + "loss": 0.0728, + "reward": 1.4166667461395264, + "reward_std": 0.771516740322113, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916666865348816, + "step": 715 + }, + { + "completion_length": 250.0, + "epoch": 0.059666666666666666, + "grad_norm": 0.3978966176509857, + "kl": 2.0942423343658447, + "learning_rate": 1.1310919412686248e-06, + "loss": 0.0838, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 716 + }, + { + "completion_length": 250.0, + "epoch": 0.05975, + "grad_norm": 0.3257802128791809, + "kl": 1.4454894065856934, + "learning_rate": 1.1237981497875112e-06, + "loss": 0.0578, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 717 + }, + { + "completion_length": 250.0, + "epoch": 0.059833333333333336, + "grad_norm": 0.29710352420806885, + "kl": 0.89763343334198, + "learning_rate": 1.11652112689164e-06, + "loss": 0.0359, + "reward": 1.5, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 718 + }, + { + "completion_length": 250.0, + "epoch": 0.05991666666666667, + "grad_norm": 0.34548911452293396, + "kl": 1.2456828355789185, + "learning_rate": 1.109260961249238e-06, + "loss": 0.0498, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 719 + }, + { + "completion_length": 250.0, + "epoch": 0.06, + "grad_norm": 0.46419206261634827, + "kl": 1.6931712627410889, + "learning_rate": 1.1020177413231334e-06, + "loss": 0.0677, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 720 + }, + { + "completion_length": 250.0, + "epoch": 0.060083333333333336, + "grad_norm": 0.2870257794857025, + "kl": 0.8886799216270447, + "learning_rate": 1.0947915553696742e-06, + "loss": 0.0355, + "reward": 1.8333333730697632, + "reward_std": 0.35634827613830566, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 721 + }, + { + "completion_length": 250.0, + "epoch": 0.06016666666666667, + "grad_norm": 0.334460973739624, + "kl": 1.3340364694595337, + "learning_rate": 1.0875824914376555e-06, + "loss": 0.0534, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 722 + }, + { + "completion_length": 250.0, + "epoch": 0.06025, + "grad_norm": 0.24705474078655243, + "kl": 0.7932687997817993, + "learning_rate": 1.0803906373672477e-06, + "loss": 0.0317, + "reward": 1.0416666269302368, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7916666865348816, + "step": 723 + }, + { + "completion_length": 250.0, + "epoch": 0.060333333333333336, + "grad_norm": 0.27795547246932983, + "kl": 0.7931269407272339, + "learning_rate": 1.073216080788921e-06, + "loss": 0.0317, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.625, + "step": 724 + }, + { + "completion_length": 202.0, + "epoch": 0.06041666666666667, + "grad_norm": 0.1886208951473236, + "kl": 1.4156323671340942, + "learning_rate": 1.0660589091223854e-06, + "loss": 0.0566, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 725 + }, + { + "completion_length": 250.0, + "epoch": 0.0605, + "grad_norm": 0.23371046781539917, + "kl": 0.7740318179130554, + "learning_rate": 1.0589192095755172e-06, + "loss": 0.031, + "reward": 1.4583333730697632, + "reward_std": 0.6886264085769653, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 726 + }, + { + "completion_length": 250.0, + "epoch": 0.060583333333333336, + "grad_norm": 1.0416193008422852, + "kl": 1.2525185346603394, + "learning_rate": 1.0517970691433035e-06, + "loss": 0.0501, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 727 + }, + { + "completion_length": 250.0, + "epoch": 0.06066666666666667, + "grad_norm": 0.335510790348053, + "kl": 0.8215186595916748, + "learning_rate": 1.0446925746067768e-06, + "loss": 0.0329, + "reward": 1.4583332538604736, + "reward_std": 0.7955730557441711, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 728 + }, + { + "completion_length": 250.0, + "epoch": 0.06075, + "grad_norm": 0.4560522735118866, + "kl": 0.985789954662323, + "learning_rate": 1.0376058125319614e-06, + "loss": 0.0394, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 729 + }, + { + "completion_length": 227.0, + "epoch": 0.060833333333333336, + "grad_norm": 0.25721290707588196, + "kl": 1.056433916091919, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.0423, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9583333730697632, + "step": 730 + }, + { + "completion_length": 250.0, + "epoch": 0.06091666666666667, + "grad_norm": 1.3237224817276, + "kl": 0.6960461735725403, + "learning_rate": 1.0234858309501864e-06, + "loss": 0.0278, + "reward": 1.0, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 731 + }, + { + "completion_length": 250.0, + "epoch": 0.061, + "grad_norm": 0.2881050407886505, + "kl": 0.5406643152236938, + "learning_rate": 1.0164527834907468e-06, + "loss": 0.0216, + "reward": 1.3333333730697632, + "reward_std": 0.8908708095550537, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 732 + }, + { + "completion_length": 250.0, + "epoch": 0.06108333333333334, + "grad_norm": 0.21133320033550262, + "kl": 1.2629066705703735, + "learning_rate": 1.0094378125859602e-06, + "loss": 0.0505, + "reward": 1.2083333730697632, + "reward_std": 0.5019802451133728, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9583333730697632, + "step": 733 + }, + { + "completion_length": 250.0, + "epoch": 0.06116666666666667, + "grad_norm": 0.22745351493358612, + "kl": 0.8365185856819153, + "learning_rate": 1.0024410037110358e-06, + "loss": 0.0335, + "reward": 1.2083333730697632, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.9583333730697632, + "step": 734 + }, + { + "completion_length": 250.0, + "epoch": 0.06125, + "grad_norm": 0.2476133406162262, + "kl": 1.2012953758239746, + "learning_rate": 9.95462442119879e-07, + "loss": 0.0481, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 735 + }, + { + "completion_length": 250.0, + "epoch": 0.06133333333333333, + "grad_norm": 0.19638416171073914, + "kl": 1.6485852003097534, + "learning_rate": 9.88502212844063e-07, + "loss": 0.0659, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 736 + }, + { + "completion_length": 250.0, + "epoch": 0.06141666666666667, + "grad_norm": 0.31301435828208923, + "kl": 1.480404257774353, + "learning_rate": 9.815604006917839e-07, + "loss": 0.0592, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 737 + }, + { + "completion_length": 250.0, + "epoch": 0.0615, + "grad_norm": 0.27803361415863037, + "kl": 0.7003533244132996, + "learning_rate": 9.746370902468311e-07, + "loss": 0.028, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 738 + }, + { + "completion_length": 250.0, + "epoch": 0.06158333333333333, + "grad_norm": 0.2773330807685852, + "kl": 1.3522332906723022, + "learning_rate": 9.677323658675594e-07, + "loss": 0.0541, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 739 + }, + { + "completion_length": 250.0, + "epoch": 0.06166666666666667, + "grad_norm": 0.6077278852462769, + "kl": 1.1051663160324097, + "learning_rate": 9.608463116858544e-07, + "loss": 0.0442, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 740 + }, + { + "completion_length": 250.0, + "epoch": 0.06175, + "grad_norm": 0.3956521451473236, + "kl": 1.057028889656067, + "learning_rate": 9.53979011606115e-07, + "loss": 0.0423, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 741 + }, + { + "completion_length": 250.0, + "epoch": 0.06183333333333333, + "grad_norm": 0.2606249451637268, + "kl": 1.6235839128494263, + "learning_rate": 9.471305493042243e-07, + "loss": 0.0649, + "reward": 1.5833333730697632, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 742 + }, + { + "completion_length": 250.0, + "epoch": 0.06191666666666667, + "grad_norm": 0.20523270964622498, + "kl": 1.068432092666626, + "learning_rate": 9.403010082265351e-07, + "loss": 0.0427, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9583333730697632, + "step": 743 + }, + { + "completion_length": 250.0, + "epoch": 0.062, + "grad_norm": 0.2961874306201935, + "kl": 1.2741352319717407, + "learning_rate": 9.334904715888496e-07, + "loss": 0.051, + "reward": 1.4583333730697632, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 744 + }, + { + "completion_length": 250.0, + "epoch": 0.06208333333333333, + "grad_norm": 0.2894919216632843, + "kl": 1.0481557846069336, + "learning_rate": 9.266990223754069e-07, + "loss": 0.0419, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 745 + }, + { + "completion_length": 250.0, + "epoch": 0.06216666666666667, + "grad_norm": 0.2562686800956726, + "kl": 1.0808312892913818, + "learning_rate": 9.199267433378728e-07, + "loss": 0.0432, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 746 + }, + { + "completion_length": 250.0, + "epoch": 0.06225, + "grad_norm": 0.28722333908081055, + "kl": 1.1899094581604004, + "learning_rate": 9.131737169943314e-07, + "loss": 0.0476, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 747 + }, + { + "completion_length": 250.0, + "epoch": 0.06233333333333333, + "grad_norm": 0.18329280614852905, + "kl": 1.901811957359314, + "learning_rate": 9.064400256282757e-07, + "loss": 0.0761, + "reward": 1.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9583333730697632, + "step": 748 + }, + { + "completion_length": 250.0, + "epoch": 0.06241666666666667, + "grad_norm": 0.22949326038360596, + "kl": 1.2201263904571533, + "learning_rate": 8.99725751287611e-07, + "loss": 0.0488, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 749 + }, + { + "completion_length": 250.0, + "epoch": 0.0625, + "grad_norm": 0.32113534212112427, + "kl": 1.5129036903381348, + "learning_rate": 8.930309757836517e-07, + "loss": 0.0605, + "reward": 1.8333333730697632, + "reward_std": 0.35634827613830566, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 750 + }, + { + "completion_length": 250.0, + "epoch": 0.06258333333333334, + "grad_norm": 0.41787517070770264, + "kl": 1.3719799518585205, + "learning_rate": 8.863557806901233e-07, + "loss": 0.0549, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 751 + }, + { + "completion_length": 250.0, + "epoch": 0.06266666666666666, + "grad_norm": 0.2127346247434616, + "kl": 1.2286632061004639, + "learning_rate": 8.797002473421729e-07, + "loss": 0.0491, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 752 + }, + { + "completion_length": 250.0, + "epoch": 0.06275, + "grad_norm": 0.3302915096282959, + "kl": 0.7858455777168274, + "learning_rate": 8.73064456835373e-07, + "loss": 0.0314, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 753 + }, + { + "completion_length": 250.0, + "epoch": 0.06283333333333334, + "grad_norm": 0.21576067805290222, + "kl": 1.1705996990203857, + "learning_rate": 8.664484900247363e-07, + "loss": 0.0468, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 754 + }, + { + "completion_length": 250.0, + "epoch": 0.06291666666666666, + "grad_norm": 0.22521759569644928, + "kl": 1.7442842721939087, + "learning_rate": 8.598524275237321e-07, + "loss": 0.0698, + "reward": 1.7916666269302368, + "reward_std": 0.589255690574646, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 755 + }, + { + "completion_length": 250.0, + "epoch": 0.063, + "grad_norm": 1.9343209266662598, + "kl": 1.2830684185028076, + "learning_rate": 8.532763497032987e-07, + "loss": 0.0513, + "reward": 1.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 756 + }, + { + "completion_length": 250.0, + "epoch": 0.06308333333333334, + "grad_norm": 0.2488812506198883, + "kl": 1.0733072757720947, + "learning_rate": 8.467203366908708e-07, + "loss": 0.0429, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 757 + }, + { + "completion_length": 250.0, + "epoch": 0.06316666666666666, + "grad_norm": 0.310738742351532, + "kl": 1.2989583015441895, + "learning_rate": 8.40184468369396e-07, + "loss": 0.052, + "reward": 1.625, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8750000596046448, + "step": 758 + }, + { + "completion_length": 250.0, + "epoch": 0.06325, + "grad_norm": 0.25620657205581665, + "kl": 1.613889217376709, + "learning_rate": 8.336688243763691e-07, + "loss": 0.0646, + "reward": 1.5416667461395264, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 759 + }, + { + "completion_length": 250.0, + "epoch": 0.06333333333333334, + "grad_norm": 0.30064281821250916, + "kl": 1.306907296180725, + "learning_rate": 8.271734841028553e-07, + "loss": 0.0523, + "reward": 1.2083333730697632, + "reward_std": 0.7753646969795227, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 760 + }, + { + "completion_length": 250.0, + "epoch": 0.06341666666666666, + "grad_norm": 0.3926166594028473, + "kl": 1.2214906215667725, + "learning_rate": 8.206985266925249e-07, + "loss": 0.0489, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 761 + }, + { + "completion_length": 250.0, + "epoch": 0.0635, + "grad_norm": 0.23394834995269775, + "kl": 1.3625099658966064, + "learning_rate": 8.142440310406923e-07, + "loss": 0.0545, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 762 + }, + { + "completion_length": 250.0, + "epoch": 0.06358333333333334, + "grad_norm": 1.0904709100723267, + "kl": 1.4420005083084106, + "learning_rate": 8.078100757933486e-07, + "loss": 0.0577, + "reward": 1.5833333730697632, + "reward_std": 0.5841830372810364, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 763 + }, + { + "completion_length": 250.0, + "epoch": 0.06366666666666666, + "grad_norm": 0.4322141110897064, + "kl": 1.1168162822723389, + "learning_rate": 8.013967393462094e-07, + "loss": 0.0447, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 764 + }, + { + "completion_length": 250.0, + "epoch": 0.06375, + "grad_norm": 0.26758918166160583, + "kl": 1.202095627784729, + "learning_rate": 7.950040998437541e-07, + "loss": 0.0481, + "reward": 1.6666667461395264, + "reward_std": 0.7126966118812561, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 765 + }, + { + "completion_length": 250.0, + "epoch": 0.06383333333333334, + "grad_norm": 0.30354562401771545, + "kl": 1.4736483097076416, + "learning_rate": 7.886322351782782e-07, + "loss": 0.0589, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 766 + }, + { + "completion_length": 250.0, + "epoch": 0.06391666666666666, + "grad_norm": 0.26811686158180237, + "kl": 1.1674681901931763, + "learning_rate": 7.822812229889429e-07, + "loss": 0.0467, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.625, + "step": 767 + }, + { + "completion_length": 250.0, + "epoch": 0.064, + "grad_norm": 0.3494676351547241, + "kl": 0.9113569259643555, + "learning_rate": 7.759511406608255e-07, + "loss": 0.0365, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 768 + }, + { + "completion_length": 250.0, + "epoch": 0.06408333333333334, + "grad_norm": 1.503426194190979, + "kl": 1.3052912950515747, + "learning_rate": 7.696420653239834e-07, + "loss": 0.0522, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 769 + }, + { + "completion_length": 250.0, + "epoch": 0.06416666666666666, + "grad_norm": 0.27594801783561707, + "kl": 0.7944636940956116, + "learning_rate": 7.633540738525066e-07, + "loss": 0.0318, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 770 + }, + { + "completion_length": 250.0, + "epoch": 0.06425, + "grad_norm": 0.48417186737060547, + "kl": 1.5408587455749512, + "learning_rate": 7.57087242863589e-07, + "loss": 0.0616, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 771 + }, + { + "completion_length": 250.0, + "epoch": 0.06433333333333334, + "grad_norm": 0.21317587792873383, + "kl": 0.9737959504127502, + "learning_rate": 7.508416487165862e-07, + "loss": 0.039, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 772 + }, + { + "completion_length": 250.0, + "epoch": 0.06441666666666666, + "grad_norm": 0.22758537530899048, + "kl": 1.2084604501724243, + "learning_rate": 7.44617367512094e-07, + "loss": 0.0483, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 773 + }, + { + "completion_length": 250.0, + "epoch": 0.0645, + "grad_norm": 0.42783284187316895, + "kl": 1.0768017768859863, + "learning_rate": 7.384144750910133e-07, + "loss": 0.0431, + "reward": 1.2083333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333134651184, + "step": 774 + }, + { + "completion_length": 225.0, + "epoch": 0.06458333333333334, + "grad_norm": 0.22378872334957123, + "kl": 1.7128205299377441, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0685, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 775 + }, + { + "completion_length": 250.0, + "epoch": 0.06466666666666666, + "grad_norm": 0.6742827296257019, + "kl": 1.4242603778839111, + "learning_rate": 7.260731586586983e-07, + "loss": 0.057, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 776 + }, + { + "completion_length": 250.0, + "epoch": 0.06475, + "grad_norm": 0.3610439896583557, + "kl": 1.3908641338348389, + "learning_rate": 7.199348850225091e-07, + "loss": 0.0556, + "reward": 1.5, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.625, + "step": 777 + }, + { + "completion_length": 250.0, + "epoch": 0.06483333333333334, + "grad_norm": 0.30944564938545227, + "kl": 1.004869818687439, + "learning_rate": 7.138183009179922e-07, + "loss": 0.0402, + "reward": 1.4583333730697632, + "reward_std": 0.9074209332466125, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7083333730697632, + "step": 778 + }, + { + "completion_length": 250.0, + "epoch": 0.06491666666666666, + "grad_norm": 0.36364126205444336, + "kl": 0.7205187082290649, + "learning_rate": 7.077234808737932e-07, + "loss": 0.0288, + "reward": 1.375, + "reward_std": 0.6283640265464783, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 779 + }, + { + "completion_length": 250.0, + "epoch": 0.065, + "grad_norm": 3.758852958679199, + "kl": 1.4535225629806519, + "learning_rate": 7.016504991533727e-07, + "loss": 0.0581, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 780 + }, + { + "completion_length": 250.0, + "epoch": 0.06508333333333334, + "grad_norm": 0.2152343988418579, + "kl": 1.338955283164978, + "learning_rate": 6.955994297540947e-07, + "loss": 0.0536, + "reward": 1.8333333730697632, + "reward_std": 0.47140446305274963, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 781 + }, + { + "completion_length": 250.0, + "epoch": 0.06516666666666666, + "grad_norm": 0.4497411549091339, + "kl": 1.6065069437026978, + "learning_rate": 6.895703464063319e-07, + "loss": 0.0643, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 782 + }, + { + "completion_length": 250.0, + "epoch": 0.06525, + "grad_norm": 0.350917249917984, + "kl": 1.1863226890563965, + "learning_rate": 6.835633225725604e-07, + "loss": 0.0475, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 783 + }, + { + "completion_length": 250.0, + "epoch": 0.06533333333333333, + "grad_norm": 0.2987426817417145, + "kl": 1.2886359691619873, + "learning_rate": 6.775784314464717e-07, + "loss": 0.0515, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 784 + }, + { + "completion_length": 250.0, + "epoch": 0.06541666666666666, + "grad_norm": 0.27569258213043213, + "kl": 1.205812931060791, + "learning_rate": 6.716157459520739e-07, + "loss": 0.0482, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7916666865348816, + "step": 785 + }, + { + "completion_length": 224.0, + "epoch": 0.0655, + "grad_norm": 0.4730622470378876, + "kl": 1.1610169410705566, + "learning_rate": 6.656753387428089e-07, + "loss": 0.0464, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 786 + }, + { + "completion_length": 250.0, + "epoch": 0.06558333333333333, + "grad_norm": 0.36808550357818604, + "kl": 0.9821891784667969, + "learning_rate": 6.597572822006643e-07, + "loss": 0.0393, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.7916666865348816, + "step": 787 + }, + { + "completion_length": 250.0, + "epoch": 0.06566666666666666, + "grad_norm": 1.0250049829483032, + "kl": 1.1594293117523193, + "learning_rate": 6.538616484352902e-07, + "loss": 0.0464, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 788 + }, + { + "completion_length": 250.0, + "epoch": 0.06575, + "grad_norm": 0.32371285557746887, + "kl": 1.193271517753601, + "learning_rate": 6.479885092831251e-07, + "loss": 0.0477, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 789 + }, + { + "completion_length": 240.0, + "epoch": 0.06583333333333333, + "grad_norm": 0.3552028238773346, + "kl": 1.2126344442367554, + "learning_rate": 6.421379363065142e-07, + "loss": 0.0485, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 790 + }, + { + "completion_length": 250.0, + "epoch": 0.06591666666666667, + "grad_norm": 0.2879108190536499, + "kl": 1.4960048198699951, + "learning_rate": 6.363100007928447e-07, + "loss": 0.0598, + "reward": 1.7916666269302368, + "reward_std": 0.5892555713653564, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 791 + }, + { + "completion_length": 250.0, + "epoch": 0.066, + "grad_norm": 0.30861926078796387, + "kl": 0.9252521395683289, + "learning_rate": 6.305047737536707e-07, + "loss": 0.037, + "reward": 1.4583333730697632, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 792 + }, + { + "completion_length": 250.0, + "epoch": 0.06608333333333333, + "grad_norm": 0.3545387089252472, + "kl": 1.508813738822937, + "learning_rate": 6.247223259238511e-07, + "loss": 0.0604, + "reward": 0.875, + "reward_std": 0.39591166377067566, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.75, + "step": 793 + }, + { + "completion_length": 250.0, + "epoch": 0.06616666666666667, + "grad_norm": 0.2610399127006531, + "kl": 0.8176184296607971, + "learning_rate": 6.189627277606894e-07, + "loss": 0.0327, + "reward": 1.5833333730697632, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 794 + }, + { + "completion_length": 250.0, + "epoch": 0.06625, + "grad_norm": 0.2623574137687683, + "kl": 1.1873376369476318, + "learning_rate": 6.1322604944307e-07, + "loss": 0.0475, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 795 + }, + { + "completion_length": 250.0, + "epoch": 0.06633333333333333, + "grad_norm": 0.2947082817554474, + "kl": 1.858878254890442, + "learning_rate": 6.075123608706093e-07, + "loss": 0.0744, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 796 + }, + { + "completion_length": 250.0, + "epoch": 0.06641666666666667, + "grad_norm": 0.3278063237667084, + "kl": 1.1497657299041748, + "learning_rate": 6.01821731662798e-07, + "loss": 0.046, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 797 + }, + { + "completion_length": 250.0, + "epoch": 0.0665, + "grad_norm": 0.24040259420871735, + "kl": 1.2203922271728516, + "learning_rate": 5.961542311581586e-07, + "loss": 0.0488, + "reward": 1.5416667461395264, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 798 + }, + { + "completion_length": 250.0, + "epoch": 0.06658333333333333, + "grad_norm": 0.342872679233551, + "kl": 1.2977404594421387, + "learning_rate": 5.905099284133953e-07, + "loss": 0.0519, + "reward": 1.5416666269302368, + "reward_std": 0.6651768684387207, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 799 + }, + { + "completion_length": 233.0, + "epoch": 0.06666666666666667, + "grad_norm": 0.22665053606033325, + "kl": 1.3235529661178589, + "learning_rate": 5.848888922025553e-07, + "loss": 0.0529, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 800 + }, + { + "completion_length": 250.0, + "epoch": 0.06675, + "grad_norm": 0.248734250664711, + "kl": 0.8164328932762146, + "learning_rate": 5.792911910161922e-07, + "loss": 0.0327, + "reward": 1.4166667461395264, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 801 + }, + { + "completion_length": 250.0, + "epoch": 0.06683333333333333, + "grad_norm": 0.26447823643684387, + "kl": 1.291729211807251, + "learning_rate": 5.737168930605272e-07, + "loss": 0.0517, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 802 + }, + { + "completion_length": 250.0, + "epoch": 0.06691666666666667, + "grad_norm": 0.2176593393087387, + "kl": 0.8029844164848328, + "learning_rate": 5.681660662566225e-07, + "loss": 0.0321, + "reward": 1.0416667461395264, + "reward_std": 0.6770031452178955, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7916666865348816, + "step": 803 + }, + { + "completion_length": 250.0, + "epoch": 0.067, + "grad_norm": 0.2643926739692688, + "kl": 0.7234349846839905, + "learning_rate": 5.626387782395512e-07, + "loss": 0.0289, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 804 + }, + { + "completion_length": 250.0, + "epoch": 0.06708333333333333, + "grad_norm": 0.2635612189769745, + "kl": 1.7153481245040894, + "learning_rate": 5.571350963575728e-07, + "loss": 0.0686, + "reward": 1.625, + "reward_std": 0.5473601818084717, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8750000596046448, + "step": 805 + }, + { + "completion_length": 250.0, + "epoch": 0.06716666666666667, + "grad_norm": 0.28257104754447937, + "kl": 1.6129707098007202, + "learning_rate": 5.516550876713142e-07, + "loss": 0.0645, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 806 + }, + { + "completion_length": 250.0, + "epoch": 0.06725, + "grad_norm": 0.3367782533168793, + "kl": 1.151456356048584, + "learning_rate": 5.461988189529529e-07, + "loss": 0.0461, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 807 + }, + { + "completion_length": 250.0, + "epoch": 0.06733333333333333, + "grad_norm": 0.4734201431274414, + "kl": 1.5179961919784546, + "learning_rate": 5.407663566854008e-07, + "loss": 0.0607, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 808 + }, + { + "completion_length": 250.0, + "epoch": 0.06741666666666667, + "grad_norm": 0.32155460119247437, + "kl": 1.1060431003570557, + "learning_rate": 5.353577670614951e-07, + "loss": 0.0442, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 809 + }, + { + "completion_length": 250.0, + "epoch": 0.0675, + "grad_norm": 0.3959408104419708, + "kl": 1.4056496620178223, + "learning_rate": 5.299731159831953e-07, + "loss": 0.0562, + "reward": 1.5833333730697632, + "reward_std": 0.49601584672927856, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 810 + }, + { + "completion_length": 250.0, + "epoch": 0.06758333333333333, + "grad_norm": 2.5059800148010254, + "kl": 2.0833823680877686, + "learning_rate": 5.24612469060774e-07, + "loss": 0.0833, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 811 + }, + { + "completion_length": 230.0, + "epoch": 0.06766666666666667, + "grad_norm": 0.48341092467308044, + "kl": 1.4132311344146729, + "learning_rate": 5.192758916120236e-07, + "loss": 0.0565, + "reward": 1.7916666269302368, + "reward_std": 0.589255690574646, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 812 + }, + { + "completion_length": 250.0, + "epoch": 0.06775, + "grad_norm": 0.32052579522132874, + "kl": 0.961887776851654, + "learning_rate": 5.139634486614544e-07, + "loss": 0.0385, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 813 + }, + { + "completion_length": 250.0, + "epoch": 0.06783333333333333, + "grad_norm": 0.3565872013568878, + "kl": 1.3793160915374756, + "learning_rate": 5.086752049395094e-07, + "loss": 0.0552, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 814 + }, + { + "completion_length": 250.0, + "epoch": 0.06791666666666667, + "grad_norm": 0.5205422043800354, + "kl": 1.2932004928588867, + "learning_rate": 5.034112248817685e-07, + "loss": 0.0517, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.875, + "step": 815 + }, + { + "completion_length": 250.0, + "epoch": 0.068, + "grad_norm": 0.3367973864078522, + "kl": 1.2375197410583496, + "learning_rate": 4.981715726281666e-07, + "loss": 0.0495, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 816 + }, + { + "completion_length": 250.0, + "epoch": 0.06808333333333333, + "grad_norm": 0.298949658870697, + "kl": 1.3202153444290161, + "learning_rate": 4.929563120222142e-07, + "loss": 0.0528, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 817 + }, + { + "completion_length": 250.0, + "epoch": 0.06816666666666667, + "grad_norm": 0.27566391229629517, + "kl": 1.1884781122207642, + "learning_rate": 4.87765506610215e-07, + "loss": 0.0475, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 818 + }, + { + "completion_length": 250.0, + "epoch": 0.06825, + "grad_norm": 0.3612639009952545, + "kl": 0.9719309210777283, + "learning_rate": 4.825992196404958e-07, + "loss": 0.0389, + "reward": 1.125, + "reward_std": 0.9910312294960022, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.625, + "step": 819 + }, + { + "completion_length": 250.0, + "epoch": 0.06833333333333333, + "grad_norm": 0.19263684749603271, + "kl": 0.9239329695701599, + "learning_rate": 4.774575140626317e-07, + "loss": 0.037, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 820 + }, + { + "completion_length": 250.0, + "epoch": 0.06841666666666667, + "grad_norm": 0.5026285648345947, + "kl": 0.8872776627540588, + "learning_rate": 4.7234045252668393e-07, + "loss": 0.0355, + "reward": 1.5416667461395264, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 821 + }, + { + "completion_length": 250.0, + "epoch": 0.0685, + "grad_norm": 0.3337138891220093, + "kl": 1.1307857036590576, + "learning_rate": 4.672480973824312e-07, + "loss": 0.0452, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 822 + }, + { + "completion_length": 250.0, + "epoch": 0.06858333333333333, + "grad_norm": 0.23775134980678558, + "kl": 1.0065653324127197, + "learning_rate": 4.6218051067861423e-07, + "loss": 0.0403, + "reward": 1.7083333730697632, + "reward_std": 0.5473601818084717, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9583333730697632, + "step": 823 + }, + { + "completion_length": 250.0, + "epoch": 0.06866666666666667, + "grad_norm": 0.2261497527360916, + "kl": 0.4448733925819397, + "learning_rate": 4.5713775416217884e-07, + "loss": 0.0178, + "reward": 1.4166667461395264, + "reward_std": 0.771516740322113, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916667461395264, + "step": 824 + }, + { + "completion_length": 250.0, + "epoch": 0.06875, + "grad_norm": 0.4021577537059784, + "kl": 1.8268983364105225, + "learning_rate": 4.5211988927752026e-07, + "loss": 0.0731, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 825 + }, + { + "completion_length": 250.0, + "epoch": 0.06883333333333333, + "grad_norm": 0.745639979839325, + "kl": 1.327449083328247, + "learning_rate": 4.4712697716573994e-07, + "loss": 0.0531, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 826 + }, + { + "completion_length": 250.0, + "epoch": 0.06891666666666667, + "grad_norm": 0.25842320919036865, + "kl": 1.019197940826416, + "learning_rate": 4.421590786638952e-07, + "loss": 0.0408, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 827 + }, + { + "completion_length": 250.0, + "epoch": 0.069, + "grad_norm": 0.6545089483261108, + "kl": 1.8898916244506836, + "learning_rate": 4.372162543042624e-07, + "loss": 0.0756, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 828 + }, + { + "completion_length": 250.0, + "epoch": 0.06908333333333333, + "grad_norm": 0.2776699364185333, + "kl": 1.2687866687774658, + "learning_rate": 4.3229856431359516e-07, + "loss": 0.0508, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 829 + }, + { + "completion_length": 250.0, + "epoch": 0.06916666666666667, + "grad_norm": 1.2333475351333618, + "kl": 0.7890111207962036, + "learning_rate": 4.27406068612396e-07, + "loss": 0.0316, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 830 + }, + { + "completion_length": 250.0, + "epoch": 0.06925, + "grad_norm": 47.50356674194336, + "kl": 13.761457443237305, + "learning_rate": 4.225388268141797e-07, + "loss": 0.5505, + "reward": 1.2083333730697632, + "reward_std": 0.43415671586990356, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.8333333730697632, + "step": 831 + }, + { + "completion_length": 250.0, + "epoch": 0.06933333333333333, + "grad_norm": 0.26879197359085083, + "kl": 0.6461024284362793, + "learning_rate": 4.1769689822475147e-07, + "loss": 0.0258, + "reward": 1.2916666269302368, + "reward_std": 0.9829902648925781, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.6666666865348816, + "step": 832 + }, + { + "completion_length": 250.0, + "epoch": 0.06941666666666667, + "grad_norm": 0.27685660123825073, + "kl": 1.392545461654663, + "learning_rate": 4.12880341841484e-07, + "loss": 0.0557, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 833 + }, + { + "completion_length": 250.0, + "epoch": 0.0695, + "grad_norm": 0.3339894115924835, + "kl": 0.9655144214630127, + "learning_rate": 4.0808921635259595e-07, + "loss": 0.0386, + "reward": 1.2083333730697632, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 834 + }, + { + "completion_length": 250.0, + "epoch": 0.06958333333333333, + "grad_norm": 0.2692475914955139, + "kl": 2.0277373790740967, + "learning_rate": 4.033235801364402e-07, + "loss": 0.0811, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 835 + }, + { + "completion_length": 250.0, + "epoch": 0.06966666666666667, + "grad_norm": 0.27213945984840393, + "kl": 0.46007007360458374, + "learning_rate": 3.9858349126078945e-07, + "loss": 0.0184, + "reward": 1.4166667461395264, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916666865348816, + "step": 836 + }, + { + "completion_length": 250.0, + "epoch": 0.06975, + "grad_norm": 0.2522222101688385, + "kl": 1.3139028549194336, + "learning_rate": 3.938690074821314e-07, + "loss": 0.0526, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 837 + }, + { + "completion_length": 250.0, + "epoch": 0.06983333333333333, + "grad_norm": 0.2399037927389145, + "kl": 0.7940414547920227, + "learning_rate": 3.891801862449629e-07, + "loss": 0.0318, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 1.0, + "step": 838 + }, + { + "completion_length": 250.0, + "epoch": 0.06991666666666667, + "grad_norm": 0.22458046674728394, + "kl": 1.2483919858932495, + "learning_rate": 3.8451708468109026e-07, + "loss": 0.0499, + "reward": 1.4583333730697632, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.8333333730697632, + "step": 839 + }, + { + "completion_length": 250.0, + "epoch": 0.07, + "grad_norm": 0.211387500166893, + "kl": 0.9435220956802368, + "learning_rate": 3.798797596089351e-07, + "loss": 0.0377, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9583333730697632, + "step": 840 + }, + { + "completion_length": 250.0, + "epoch": 0.07008333333333333, + "grad_norm": 0.2669267952442169, + "kl": 1.431569218635559, + "learning_rate": 3.7526826753284065e-07, + "loss": 0.0573, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 841 + }, + { + "completion_length": 250.0, + "epoch": 0.07016666666666667, + "grad_norm": 0.2529590129852295, + "kl": 1.039304256439209, + "learning_rate": 3.7068266464238085e-07, + "loss": 0.0416, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 1.0, + "step": 842 + }, + { + "completion_length": 250.0, + "epoch": 0.07025, + "grad_norm": 0.24863839149475098, + "kl": 0.9364717602729797, + "learning_rate": 3.661230068116811e-07, + "loss": 0.0375, + "reward": 1.1666666269302368, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666269302368, + "step": 843 + }, + { + "completion_length": 250.0, + "epoch": 0.07033333333333333, + "grad_norm": 0.22837644815444946, + "kl": 0.9734100103378296, + "learning_rate": 3.615893495987335e-07, + "loss": 0.0389, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 844 + }, + { + "completion_length": 250.0, + "epoch": 0.07041666666666667, + "grad_norm": 0.2933385372161865, + "kl": 0.6663645505905151, + "learning_rate": 3.5708174824471947e-07, + "loss": 0.0267, + "reward": 1.5, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 845 + }, + { + "completion_length": 250.0, + "epoch": 0.0705, + "grad_norm": 0.29375800490379333, + "kl": 1.2523351907730103, + "learning_rate": 3.5260025767333894e-07, + "loss": 0.0501, + "reward": 1.5, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 846 + }, + { + "completion_length": 250.0, + "epoch": 0.07058333333333333, + "grad_norm": 0.2230493724346161, + "kl": 1.5347055196762085, + "learning_rate": 3.481449324901412e-07, + "loss": 0.0614, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 847 + }, + { + "completion_length": 250.0, + "epoch": 0.07066666666666667, + "grad_norm": 0.2553749680519104, + "kl": 1.2786747217178345, + "learning_rate": 3.4371582698185636e-07, + "loss": 0.0511, + "reward": 1.4583333730697632, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 848 + }, + { + "completion_length": 250.0, + "epoch": 0.07075, + "grad_norm": 1.1723978519439697, + "kl": 1.5742748975753784, + "learning_rate": 3.393129951157384e-07, + "loss": 0.063, + "reward": 1.4583333730697632, + "reward_std": 0.5019802451133728, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 849 + }, + { + "completion_length": 250.0, + "epoch": 0.07083333333333333, + "grad_norm": 1.294960618019104, + "kl": 1.270159363746643, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.0508, + "reward": 1.5833333730697632, + "reward_std": 0.7918232679367065, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.8333333730697632, + "step": 850 + }, + { + "completion_length": 250.0, + "epoch": 0.07091666666666667, + "grad_norm": 0.32427558302879333, + "kl": 1.2746331691741943, + "learning_rate": 3.3058636657767927e-07, + "loss": 0.051, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 851 + }, + { + "completion_length": 250.0, + "epoch": 0.071, + "grad_norm": 0.28006064891815186, + "kl": 1.1999834775924683, + "learning_rate": 3.262626762369525e-07, + "loss": 0.048, + "reward": 1.3333333730697632, + "reward_std": 0.6666666269302368, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 852 + }, + { + "completion_length": 250.0, + "epoch": 0.07108333333333333, + "grad_norm": 0.3267779052257538, + "kl": 1.2184315919876099, + "learning_rate": 3.219654721995266e-07, + "loss": 0.0487, + "reward": 1.2916667461395264, + "reward_std": 0.8807914853096008, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.5416666865348816, + "step": 853 + }, + { + "completion_length": 250.0, + "epoch": 0.07116666666666667, + "grad_norm": 0.2615717947483063, + "kl": 0.463012158870697, + "learning_rate": 3.176948068254762e-07, + "loss": 0.0185, + "reward": 1.3333333730697632, + "reward_std": 0.9428090453147888, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333134651184, + "step": 854 + }, + { + "completion_length": 250.0, + "epoch": 0.07125, + "grad_norm": 0.20684708654880524, + "kl": 1.38583242893219, + "learning_rate": 3.134507321515107e-07, + "loss": 0.0554, + "reward": 1.2916666269302368, + "reward_std": 0.6283639073371887, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9166666865348816, + "step": 855 + }, + { + "completion_length": 250.0, + "epoch": 0.07133333333333333, + "grad_norm": 0.2460877001285553, + "kl": 0.6953861117362976, + "learning_rate": 3.092332998903416e-07, + "loss": 0.0278, + "reward": 1.2916666269302368, + "reward_std": 0.7223747968673706, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7916666865348816, + "step": 856 + }, + { + "completion_length": 242.0, + "epoch": 0.07141666666666667, + "grad_norm": 0.030763499438762665, + "kl": 1.842020869255066, + "learning_rate": 3.050425614300487e-07, + "loss": 0.0737, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 857 + }, + { + "completion_length": 250.0, + "epoch": 0.0715, + "grad_norm": 0.3020068407058716, + "kl": 1.5266697406768799, + "learning_rate": 3.0087856783345916e-07, + "loss": 0.0611, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 858 + }, + { + "completion_length": 250.0, + "epoch": 0.07158333333333333, + "grad_norm": 0.2564064860343933, + "kl": 1.2908424139022827, + "learning_rate": 2.967413698375196e-07, + "loss": 0.0516, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 859 + }, + { + "completion_length": 250.0, + "epoch": 0.07166666666666667, + "grad_norm": 1.984915852546692, + "kl": 1.585814118385315, + "learning_rate": 2.9263101785268253e-07, + "loss": 0.0634, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 860 + }, + { + "completion_length": 250.0, + "epoch": 0.07175, + "grad_norm": 0.22617851197719574, + "kl": 0.9019542336463928, + "learning_rate": 2.8854756196229017e-07, + "loss": 0.0361, + "reward": 1.2916666269302368, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.9166666865348816, + "step": 861 + }, + { + "completion_length": 250.0, + "epoch": 0.07183333333333333, + "grad_norm": 0.24866744875907898, + "kl": 0.7339966893196106, + "learning_rate": 2.844910519219632e-07, + "loss": 0.0294, + "reward": 1.2083333730697632, + "reward_std": 0.8533315062522888, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 862 + }, + { + "completion_length": 250.0, + "epoch": 0.07191666666666667, + "grad_norm": 0.2894342839717865, + "kl": 0.7534470558166504, + "learning_rate": 2.8046153715899695e-07, + "loss": 0.0301, + "reward": 1.0833333730697632, + "reward_std": 0.29546844959259033, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 863 + }, + { + "completion_length": 250.0, + "epoch": 0.072, + "grad_norm": 0.24600763618946075, + "kl": 1.024735450744629, + "learning_rate": 2.764590667717562e-07, + "loss": 0.041, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 864 + }, + { + "completion_length": 250.0, + "epoch": 0.07208333333333333, + "grad_norm": 0.3743076026439667, + "kl": 1.0410536527633667, + "learning_rate": 2.7248368952908055e-07, + "loss": 0.0416, + "reward": 1.375, + "reward_std": 0.7000566720962524, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 865 + }, + { + "completion_length": 250.0, + "epoch": 0.07216666666666667, + "grad_norm": 0.2570086121559143, + "kl": 1.0770304203033447, + "learning_rate": 2.6853545386968607e-07, + "loss": 0.0431, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 866 + }, + { + "completion_length": 250.0, + "epoch": 0.07225, + "grad_norm": 0.31037428975105286, + "kl": 0.8965320587158203, + "learning_rate": 2.6461440790157974e-07, + "loss": 0.0359, + "reward": 1.4166667461395264, + "reward_std": 0.8498366475105286, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916666865348816, + "step": 867 + }, + { + "completion_length": 250.0, + "epoch": 0.07233333333333333, + "grad_norm": 0.2683153450489044, + "kl": 0.8436253666877747, + "learning_rate": 2.6072059940146775e-07, + "loss": 0.0337, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 868 + }, + { + "completion_length": 250.0, + "epoch": 0.07241666666666667, + "grad_norm": 0.2785882353782654, + "kl": 0.9701066017150879, + "learning_rate": 2.568540758141791e-07, + "loss": 0.0388, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 869 + }, + { + "completion_length": 250.0, + "epoch": 0.0725, + "grad_norm": 0.4102500379085541, + "kl": 1.3558361530303955, + "learning_rate": 2.53014884252083e-07, + "loss": 0.0542, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 870 + }, + { + "completion_length": 250.0, + "epoch": 0.07258333333333333, + "grad_norm": 0.23862656950950623, + "kl": 0.8607050776481628, + "learning_rate": 2.492030714945162e-07, + "loss": 0.0344, + "reward": 0.9166666865348816, + "reward_std": 0.2357022613286972, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.9166666865348816, + "step": 871 + }, + { + "completion_length": 250.0, + "epoch": 0.07266666666666667, + "grad_norm": 0.31678903102874756, + "kl": 0.5520573258399963, + "learning_rate": 2.454186839872158e-07, + "loss": 0.0221, + "reward": 0.9166666865348816, + "reward_std": 0.6606875061988831, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.6666666865348816, + "step": 872 + }, + { + "completion_length": 250.0, + "epoch": 0.07275, + "grad_norm": 0.297191321849823, + "kl": 0.6400251984596252, + "learning_rate": 2.4166176784174795e-07, + "loss": 0.0256, + "reward": 1.5416667461395264, + "reward_std": 0.46929532289505005, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166667461395264, + "step": 873 + }, + { + "completion_length": 250.0, + "epoch": 0.07283333333333333, + "grad_norm": 0.46804097294807434, + "kl": 1.2724863290786743, + "learning_rate": 2.3793236883495164e-07, + "loss": 0.0509, + "reward": 1.3333333730697632, + "reward_std": 0.8164966106414795, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 874 + }, + { + "completion_length": 250.0, + "epoch": 0.07291666666666667, + "grad_norm": 0.2768522799015045, + "kl": 1.2393690347671509, + "learning_rate": 2.3423053240837518e-07, + "loss": 0.0496, + "reward": 1.4583333730697632, + "reward_std": 0.5892556309700012, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9583333730697632, + "step": 875 + }, + { + "completion_length": 250.0, + "epoch": 0.073, + "grad_norm": 0.4209572672843933, + "kl": 0.9548825025558472, + "learning_rate": 2.3055630366772857e-07, + "loss": 0.0382, + "reward": 1.4166667461395264, + "reward_std": 0.9041350483894348, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.6666666865348816, + "step": 876 + }, + { + "completion_length": 250.0, + "epoch": 0.07308333333333333, + "grad_norm": 0.29563507437705994, + "kl": 0.9755483269691467, + "learning_rate": 2.269097273823287e-07, + "loss": 0.039, + "reward": 1.0833333730697632, + "reward_std": 0.38832154870033264, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9583333730697632, + "step": 877 + }, + { + "completion_length": 250.0, + "epoch": 0.07316666666666667, + "grad_norm": 0.28373217582702637, + "kl": 1.0492498874664307, + "learning_rate": 2.2329084798455747e-07, + "loss": 0.042, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 878 + }, + { + "completion_length": 250.0, + "epoch": 0.07325, + "grad_norm": 0.7796637415885925, + "kl": 1.0222169160842896, + "learning_rate": 2.1969970956931762e-07, + "loss": 0.0409, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 879 + }, + { + "completion_length": 208.0, + "epoch": 0.07333333333333333, + "grad_norm": 0.17579619586467743, + "kl": 1.2056773900985718, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.0482, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 880 + }, + { + "completion_length": 250.0, + "epoch": 0.07341666666666667, + "grad_norm": 0.3150039315223694, + "kl": 1.0645179748535156, + "learning_rate": 2.1260083037543817e-07, + "loss": 0.0426, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 881 + }, + { + "completion_length": 250.0, + "epoch": 0.0735, + "grad_norm": 0.029675384983420372, + "kl": 1.4976853132247925, + "learning_rate": 2.0909317609440093e-07, + "loss": 0.0599, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 882 + }, + { + "completion_length": 250.0, + "epoch": 0.07358333333333333, + "grad_norm": 0.7188115119934082, + "kl": 1.3928179740905762, + "learning_rate": 2.0561343579004716e-07, + "loss": 0.0557, + "reward": 1.5833333730697632, + "reward_std": 0.49601584672927856, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 883 + }, + { + "completion_length": 250.0, + "epoch": 0.07366666666666667, + "grad_norm": 0.27707529067993164, + "kl": 1.0864824056625366, + "learning_rate": 2.0216165186191406e-07, + "loss": 0.0435, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 884 + }, + { + "completion_length": 250.0, + "epoch": 0.07375, + "grad_norm": 0.27343204617500305, + "kl": 1.5687239170074463, + "learning_rate": 1.9873786636889908e-07, + "loss": 0.0627, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.875, + "step": 885 + }, + { + "completion_length": 250.0, + "epoch": 0.07383333333333333, + "grad_norm": 0.3359210789203644, + "kl": 0.9341368079185486, + "learning_rate": 1.95342121028749e-07, + "loss": 0.0374, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 886 + }, + { + "completion_length": 250.0, + "epoch": 0.07391666666666667, + "grad_norm": 0.32620811462402344, + "kl": 1.4301745891571045, + "learning_rate": 1.9197445721754777e-07, + "loss": 0.0572, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 887 + }, + { + "completion_length": 250.0, + "epoch": 0.074, + "grad_norm": 0.24947281181812286, + "kl": 0.3620012700557709, + "learning_rate": 1.8863491596921745e-07, + "loss": 0.0145, + "reward": 1.4583333730697632, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.5833333730697632, + "step": 888 + }, + { + "completion_length": 250.0, + "epoch": 0.07408333333333333, + "grad_norm": 0.33198386430740356, + "kl": 1.7999809980392456, + "learning_rate": 1.8532353797501318e-07, + "loss": 0.072, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 889 + }, + { + "completion_length": 250.0, + "epoch": 0.07416666666666667, + "grad_norm": 1.02660071849823, + "kl": 1.4582207202911377, + "learning_rate": 1.8204036358303173e-07, + "loss": 0.0583, + "reward": 0.8333333730697632, + "reward_std": 0.835710883140564, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.5833333730697632, + "step": 890 + }, + { + "completion_length": 250.0, + "epoch": 0.07425, + "grad_norm": 1.7326771020889282, + "kl": 1.4054831266403198, + "learning_rate": 1.787854327977162e-07, + "loss": 0.0562, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 891 + }, + { + "completion_length": 250.0, + "epoch": 0.07433333333333333, + "grad_norm": 0.3918047249317169, + "kl": 1.9732359647750854, + "learning_rate": 1.7555878527937164e-07, + "loss": 0.0789, + "reward": 1.7916667461395264, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166667461395264, + "step": 892 + }, + { + "completion_length": 250.0, + "epoch": 0.07441666666666667, + "grad_norm": 0.0223043542355299, + "kl": 1.2435429096221924, + "learning_rate": 1.7236046034367959e-07, + "loss": 0.0497, + "reward": 1.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 1.0, + "step": 893 + }, + { + "completion_length": 250.0, + "epoch": 0.0745, + "grad_norm": 0.21901412308216095, + "kl": 0.5204886794090271, + "learning_rate": 1.6919049696121957e-07, + "loss": 0.0208, + "reward": 1.6666667461395264, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166667461395264, + "step": 894 + }, + { + "completion_length": 250.0, + "epoch": 0.07458333333333333, + "grad_norm": 0.291716068983078, + "kl": 0.719805896282196, + "learning_rate": 1.6604893375699594e-07, + "loss": 0.0288, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 895 + }, + { + "completion_length": 250.0, + "epoch": 0.07466666666666667, + "grad_norm": 0.3257061541080475, + "kl": 1.1170681715011597, + "learning_rate": 1.629358090099639e-07, + "loss": 0.0447, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 896 + }, + { + "completion_length": 250.0, + "epoch": 0.07475, + "grad_norm": 0.2914160192012787, + "kl": 0.9540635943412781, + "learning_rate": 1.5985116065256683e-07, + "loss": 0.0382, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 897 + }, + { + "completion_length": 250.0, + "epoch": 0.07483333333333334, + "grad_norm": 0.20414304733276367, + "kl": 1.3840910196304321, + "learning_rate": 1.567950262702714e-07, + "loss": 0.0554, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 898 + }, + { + "completion_length": 250.0, + "epoch": 0.07491666666666667, + "grad_norm": 0.28207582235336304, + "kl": 0.8242446780204773, + "learning_rate": 1.5376744310111019e-07, + "loss": 0.033, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.875, + "step": 899 + }, + { + "completion_length": 250.0, + "epoch": 0.075, + "grad_norm": 0.2571064829826355, + "kl": 0.4384586811065674, + "learning_rate": 1.507684480352292e-07, + "loss": 0.0175, + "reward": 1.0, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.625, + "step": 900 + }, + { + "completion_length": 250.0, + "epoch": 0.07508333333333334, + "grad_norm": 0.2975660562515259, + "kl": 0.8587030172348022, + "learning_rate": 1.4779807761443638e-07, + "loss": 0.0343, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 901 + }, + { + "completion_length": 250.0, + "epoch": 0.07516666666666667, + "grad_norm": 1.0238752365112305, + "kl": 1.3424298763275146, + "learning_rate": 1.4485636803175828e-07, + "loss": 0.0537, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 902 + }, + { + "completion_length": 250.0, + "epoch": 0.07525, + "grad_norm": 0.27268996834754944, + "kl": 1.614228367805481, + "learning_rate": 1.419433551309976e-07, + "loss": 0.0646, + "reward": 1.8333333730697632, + "reward_std": 0.47140446305274963, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 903 + }, + { + "completion_length": 250.0, + "epoch": 0.07533333333333334, + "grad_norm": 0.2508178651332855, + "kl": 0.6932893991470337, + "learning_rate": 1.3905907440629752e-07, + "loss": 0.0277, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 904 + }, + { + "completion_length": 250.0, + "epoch": 0.07541666666666667, + "grad_norm": 0.5284227132797241, + "kl": 1.5996646881103516, + "learning_rate": 1.362035610017079e-07, + "loss": 0.064, + "reward": 1.25, + "reward_std": 0.6606875061988831, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 905 + }, + { + "completion_length": 250.0, + "epoch": 0.0755, + "grad_norm": 0.290884792804718, + "kl": 0.6067217588424683, + "learning_rate": 1.3337684971075932e-07, + "loss": 0.0243, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 906 + }, + { + "completion_length": 250.0, + "epoch": 0.07558333333333334, + "grad_norm": 0.3576277792453766, + "kl": 0.8207005858421326, + "learning_rate": 1.305789749760361e-07, + "loss": 0.0328, + "reward": 1.1666667461395264, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.7916666865348816, + "step": 907 + }, + { + "completion_length": 250.0, + "epoch": 0.07566666666666666, + "grad_norm": 0.36113837361335754, + "kl": 1.7893757820129395, + "learning_rate": 1.278099708887587e-07, + "loss": 0.0716, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 908 + }, + { + "completion_length": 250.0, + "epoch": 0.07575, + "grad_norm": 0.3109591603279114, + "kl": 1.1817129850387573, + "learning_rate": 1.2506987118836912e-07, + "loss": 0.0473, + "reward": 1.2083333730697632, + "reward_std": 0.9074209332466125, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 909 + }, + { + "completion_length": 250.0, + "epoch": 0.07583333333333334, + "grad_norm": 756.9115600585938, + "kl": 135.45335388183594, + "learning_rate": 1.223587092621162e-07, + "loss": 5.4181, + "reward": 1.2083333730697632, + "reward_std": 0.9074208736419678, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.7083333730697632, + "step": 910 + }, + { + "completion_length": 250.0, + "epoch": 0.07591666666666666, + "grad_norm": 0.8474955558776855, + "kl": 1.7323054075241089, + "learning_rate": 1.1967651814465353e-07, + "loss": 0.0693, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 911 + }, + { + "completion_length": 250.0, + "epoch": 0.076, + "grad_norm": 0.31559211015701294, + "kl": 1.9210703372955322, + "learning_rate": 1.1702333051763271e-07, + "loss": 0.0768, + "reward": 1.75, + "reward_std": 0.5841830968856812, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 912 + }, + { + "completion_length": 250.0, + "epoch": 0.07608333333333334, + "grad_norm": 0.30108439922332764, + "kl": 1.042654037475586, + "learning_rate": 1.1439917870930795e-07, + "loss": 0.0417, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 913 + }, + { + "completion_length": 250.0, + "epoch": 0.07616666666666666, + "grad_norm": 0.29974260926246643, + "kl": 1.0419851541519165, + "learning_rate": 1.1180409469414094e-07, + "loss": 0.0417, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 914 + }, + { + "completion_length": 250.0, + "epoch": 0.07625, + "grad_norm": 0.29832908511161804, + "kl": 1.5337603092193604, + "learning_rate": 1.0923811009241142e-07, + "loss": 0.0614, + "reward": 1.4166667461395264, + "reward_std": 0.49601584672927856, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 915 + }, + { + "completion_length": 250.0, + "epoch": 0.07633333333333334, + "grad_norm": 0.4468366205692291, + "kl": 1.797929048538208, + "learning_rate": 1.067012561698319e-07, + "loss": 0.0719, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 916 + }, + { + "completion_length": 250.0, + "epoch": 0.07641666666666666, + "grad_norm": 0.3717971444129944, + "kl": 1.0258549451828003, + "learning_rate": 1.041935638371669e-07, + "loss": 0.041, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.75, + "step": 917 + }, + { + "completion_length": 250.0, + "epoch": 0.0765, + "grad_norm": 0.30195504426956177, + "kl": 1.0266151428222656, + "learning_rate": 1.0171506364985622e-07, + "loss": 0.0411, + "reward": 1.0, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 918 + }, + { + "completion_length": 250.0, + "epoch": 0.07658333333333334, + "grad_norm": 0.29930129647254944, + "kl": 1.6755496263504028, + "learning_rate": 9.926578580764234e-08, + "loss": 0.067, + "reward": 1.625, + "reward_std": 0.6283639073371887, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 919 + }, + { + "completion_length": 250.0, + "epoch": 0.07666666666666666, + "grad_norm": 0.01916714385151863, + "kl": 1.014693021774292, + "learning_rate": 9.684576015420277e-08, + "loss": 0.0406, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 920 + }, + { + "completion_length": 164.0, + "epoch": 0.07675, + "grad_norm": 0.26526811718940735, + "kl": 0.8480601906776428, + "learning_rate": 9.445501617678654e-08, + "loss": 0.0339, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 921 + }, + { + "completion_length": 250.0, + "epoch": 0.07683333333333334, + "grad_norm": 0.41043245792388916, + "kl": 0.912456214427948, + "learning_rate": 9.209358300585474e-08, + "loss": 0.0365, + "reward": 1.4166667461395264, + "reward_std": 0.7715167999267578, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.6666666865348816, + "step": 922 + }, + { + "completion_length": 250.0, + "epoch": 0.07691666666666666, + "grad_norm": 0.19834581017494202, + "kl": 1.5605332851409912, + "learning_rate": 8.9761489414725e-08, + "loss": 0.0624, + "reward": 1.4166666269302368, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.9166666865348816, + "step": 923 + }, + { + "completion_length": 250.0, + "epoch": 0.077, + "grad_norm": 0.27626922726631165, + "kl": 1.0883418321609497, + "learning_rate": 8.745876381922147e-08, + "loss": 0.0435, + "reward": 1.5, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7500000596046448, + "step": 924 + }, + { + "completion_length": 250.0, + "epoch": 0.07708333333333334, + "grad_norm": 0.41667699813842773, + "kl": 1.4361419677734375, + "learning_rate": 8.518543427732951e-08, + "loss": 0.0574, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 925 + }, + { + "completion_length": 250.0, + "epoch": 0.07716666666666666, + "grad_norm": 0.8117958903312683, + "kl": 1.2131803035736084, + "learning_rate": 8.294152848885156e-08, + "loss": 0.0485, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 926 + }, + { + "completion_length": 250.0, + "epoch": 0.07725, + "grad_norm": 0.023965315893292427, + "kl": 0.9878235459327698, + "learning_rate": 8.072707379507217e-08, + "loss": 0.0395, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 927 + }, + { + "completion_length": 250.0, + "epoch": 0.07733333333333334, + "grad_norm": 0.06923804432153702, + "kl": 1.3249129056930542, + "learning_rate": 7.854209717842231e-08, + "loss": 0.053, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 928 + }, + { + "completion_length": 250.0, + "epoch": 0.07741666666666666, + "grad_norm": 0.21536372601985931, + "kl": 1.1641136407852173, + "learning_rate": 7.638662526215284e-08, + "loss": 0.0466, + "reward": 1.0416667461395264, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9166666865348816, + "step": 929 + }, + { + "completion_length": 250.0, + "epoch": 0.0775, + "grad_norm": 0.3016450107097626, + "kl": 1.2998625040054321, + "learning_rate": 7.426068431000883e-08, + "loss": 0.052, + "reward": 0.9166667461395264, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.6666666865348816, + "step": 930 + }, + { + "completion_length": 250.0, + "epoch": 0.07758333333333334, + "grad_norm": 0.33901509642601013, + "kl": 1.7370189428329468, + "learning_rate": 7.216430022591009e-08, + "loss": 0.0695, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9166666865348816, + "step": 931 + }, + { + "completion_length": 250.0, + "epoch": 0.07766666666666666, + "grad_norm": 0.33317700028419495, + "kl": 1.2731457948684692, + "learning_rate": 7.009749855363457e-08, + "loss": 0.0509, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.75, + "step": 932 + }, + { + "completion_length": 250.0, + "epoch": 0.07775, + "grad_norm": 0.2928623855113983, + "kl": 0.9490488171577454, + "learning_rate": 6.806030447650879e-08, + "loss": 0.038, + "reward": 1.625, + "reward_std": 0.602573812007904, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.75, + "step": 933 + }, + { + "completion_length": 250.0, + "epoch": 0.07783333333333334, + "grad_norm": 0.25610074400901794, + "kl": 1.1091444492340088, + "learning_rate": 6.605274281709929e-08, + "loss": 0.0444, + "reward": 1.6666667461395264, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 934 + }, + { + "completion_length": 250.0, + "epoch": 0.07791666666666666, + "grad_norm": 0.2953529953956604, + "kl": 1.096183180809021, + "learning_rate": 6.407483803691216e-08, + "loss": 0.0438, + "reward": 1.0833333730697632, + "reward_std": 0.6606874465942383, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 935 + }, + { + "completion_length": 250.0, + "epoch": 0.078, + "grad_norm": 0.19404926896095276, + "kl": 0.9853270053863525, + "learning_rate": 6.212661423609184e-08, + "loss": 0.0394, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 936 + }, + { + "completion_length": 250.0, + "epoch": 0.07808333333333334, + "grad_norm": 0.26259902119636536, + "kl": 1.091464877128601, + "learning_rate": 6.020809515313141e-08, + "loss": 0.0437, + "reward": 1.3333333730697632, + "reward_std": 0.7126966714859009, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7083333730697632, + "step": 937 + }, + { + "completion_length": 250.0, + "epoch": 0.07816666666666666, + "grad_norm": 0.3351936638355255, + "kl": 0.9903098344802856, + "learning_rate": 5.83193041645802e-08, + "loss": 0.0396, + "reward": 1.0833333730697632, + "reward_std": 0.38832157850265503, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.8333333730697632, + "step": 938 + }, + { + "completion_length": 250.0, + "epoch": 0.07825, + "grad_norm": 0.2362779676914215, + "kl": 1.3006746768951416, + "learning_rate": 5.6460264284760316e-08, + "loss": 0.052, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 939 + }, + { + "completion_length": 203.0, + "epoch": 0.07833333333333334, + "grad_norm": 0.2249480038881302, + "kl": 1.123484492301941, + "learning_rate": 5.463099816548578e-08, + "loss": 0.0449, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 940 + }, + { + "completion_length": 250.0, + "epoch": 0.07841666666666666, + "grad_norm": 0.3026413321495056, + "kl": 1.1142150163650513, + "learning_rate": 5.283152809578751e-08, + "loss": 0.0446, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.75, + "step": 941 + }, + { + "completion_length": 250.0, + "epoch": 0.0785, + "grad_norm": 0.2342066615819931, + "kl": 0.8287395238876343, + "learning_rate": 5.106187600163987e-08, + "loss": 0.0331, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.75, + "step": 942 + }, + { + "completion_length": 250.0, + "epoch": 0.07858333333333334, + "grad_norm": 0.06259375810623169, + "kl": 1.5354900360107422, + "learning_rate": 4.932206344569562e-08, + "loss": 0.0614, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 943 + }, + { + "completion_length": 250.0, + "epoch": 0.07866666666666666, + "grad_norm": 0.24400244653224945, + "kl": 1.0855783224105835, + "learning_rate": 4.761211162702117e-08, + "loss": 0.0434, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.875, + "step": 944 + }, + { + "completion_length": 250.0, + "epoch": 0.07875, + "grad_norm": 0.23698869347572327, + "kl": 1.1598446369171143, + "learning_rate": 4.593204138084006e-08, + "loss": 0.0464, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 945 + }, + { + "completion_length": 250.0, + "epoch": 0.07883333333333334, + "grad_norm": 0.02643449977040291, + "kl": 1.023648738861084, + "learning_rate": 4.428187317827848e-08, + "loss": 0.0409, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 946 + }, + { + "completion_length": 229.0, + "epoch": 0.07891666666666666, + "grad_norm": 0.3633557856082916, + "kl": 1.4960824251174927, + "learning_rate": 4.26616271261146e-08, + "loss": 0.0598, + "reward": 1.3333333730697632, + "reward_std": 0.7346308827400208, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 947 + }, + { + "completion_length": 250.0, + "epoch": 0.079, + "grad_norm": 0.049699533730745316, + "kl": 1.670642614364624, + "learning_rate": 4.1071322966535487e-08, + "loss": 0.0668, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 948 + }, + { + "completion_length": 250.0, + "epoch": 0.07908333333333334, + "grad_norm": 0.7215015292167664, + "kl": 1.0973060131072998, + "learning_rate": 3.95109800768953e-08, + "loss": 0.0439, + "reward": 1.5416667461395264, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 949 + }, + { + "completion_length": 250.0, + "epoch": 0.07916666666666666, + "grad_norm": 0.5393422842025757, + "kl": 1.3605879545211792, + "learning_rate": 3.798061746947995e-08, + "loss": 0.0544, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.875, + "step": 950 + }, + { + "completion_length": 250.0, + "epoch": 0.07925, + "grad_norm": 0.5147884488105774, + "kl": 0.8889792561531067, + "learning_rate": 3.648025379127479e-08, + "loss": 0.0356, + "reward": 1.5416667461395264, + "reward_std": 0.501980185508728, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.6666666865348816, + "step": 951 + }, + { + "completion_length": 250.0, + "epoch": 0.07933333333333334, + "grad_norm": 0.23729416728019714, + "kl": 0.8729484677314758, + "learning_rate": 3.5009907323737826e-08, + "loss": 0.0349, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 1.0, + "step": 952 + }, + { + "completion_length": 250.0, + "epoch": 0.07941666666666666, + "grad_norm": 0.35395750403404236, + "kl": 1.2410624027252197, + "learning_rate": 3.3569595982576584e-08, + "loss": 0.0496, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 953 + }, + { + "completion_length": 250.0, + "epoch": 0.0795, + "grad_norm": 0.29070597887039185, + "kl": 1.1642013788223267, + "learning_rate": 3.2159337317530234e-08, + "loss": 0.0466, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 954 + }, + { + "completion_length": 250.0, + "epoch": 0.07958333333333334, + "grad_norm": 0.24920472502708435, + "kl": 1.0929770469665527, + "learning_rate": 3.077914851215585e-08, + "loss": 0.0437, + "reward": 1.0416667461395264, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9166666865348816, + "step": 955 + }, + { + "completion_length": 250.0, + "epoch": 0.07966666666666666, + "grad_norm": 0.4327549338340759, + "kl": 1.9852244853973389, + "learning_rate": 2.9429046383618042e-08, + "loss": 0.0794, + "reward": 0.8333333730697632, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.7083333134651184, + "step": 956 + }, + { + "completion_length": 250.0, + "epoch": 0.07975, + "grad_norm": 0.27277326583862305, + "kl": 1.475409984588623, + "learning_rate": 2.810904738248549e-08, + "loss": 0.059, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 957 + }, + { + "completion_length": 250.0, + "epoch": 0.07983333333333334, + "grad_norm": 0.27285173535346985, + "kl": 1.1164697408676147, + "learning_rate": 2.681916759252917e-08, + "loss": 0.0447, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9166666865348816, + "step": 958 + }, + { + "completion_length": 250.0, + "epoch": 0.07991666666666666, + "grad_norm": 3.066486358642578, + "kl": 1.745069980621338, + "learning_rate": 2.555942273052753e-08, + "loss": 0.0698, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 959 + }, + { + "completion_length": 250.0, + "epoch": 0.08, + "grad_norm": 0.3669474124908447, + "kl": 1.6787079572677612, + "learning_rate": 2.4329828146074096e-08, + "loss": 0.0671, + "reward": 1.3333333730697632, + "reward_std": 0.7766431570053101, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.8333333730697632, + "step": 960 + }, + { + "completion_length": 250.0, + "epoch": 0.08008333333333334, + "grad_norm": 0.24523121118545532, + "kl": 1.1602610349655151, + "learning_rate": 2.313039882139101e-08, + "loss": 0.0464, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 1.0, + "step": 961 + }, + { + "completion_length": 250.0, + "epoch": 0.08016666666666666, + "grad_norm": 0.2820650041103363, + "kl": 1.0688502788543701, + "learning_rate": 2.1961149371145795e-08, + "loss": 0.0428, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 962 + }, + { + "completion_length": 250.0, + "epoch": 0.08025, + "grad_norm": 0.34602880477905273, + "kl": 1.1315749883651733, + "learning_rate": 2.082209404227403e-08, + "loss": 0.0453, + "reward": 1.1666667461395264, + "reward_std": 0.835710883140564, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.6666666865348816, + "step": 963 + }, + { + "completion_length": 250.0, + "epoch": 0.08033333333333334, + "grad_norm": 0.25703561305999756, + "kl": 1.0106827020645142, + "learning_rate": 1.9713246713805588e-08, + "loss": 0.0404, + "reward": 1.4166667461395264, + "reward_std": 0.7292091846466064, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.7916666865348816, + "step": 964 + }, + { + "completion_length": 250.0, + "epoch": 0.08041666666666666, + "grad_norm": 0.28899118304252625, + "kl": 0.9755896925926208, + "learning_rate": 1.8634620896695044e-08, + "loss": 0.039, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/correctness_reward_func": 0.5, + "rewards/format_reward_func": 0.75, + "step": 965 + }, + { + "completion_length": 250.0, + "epoch": 0.0805, + "grad_norm": 0.29632315039634705, + "kl": 1.442091464996338, + "learning_rate": 1.7586229733657646e-08, + "loss": 0.0577, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 1.0, + "step": 966 + }, + { + "completion_length": 250.0, + "epoch": 0.08058333333333334, + "grad_norm": 0.21348384022712708, + "kl": 1.338564395904541, + "learning_rate": 1.6568085999008886e-08, + "loss": 0.0535, + "reward": 1.8333333730697632, + "reward_std": 0.35634827613830566, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 967 + }, + { + "completion_length": 250.0, + "epoch": 0.08066666666666666, + "grad_norm": 0.24184101819992065, + "kl": 0.8153730034828186, + "learning_rate": 1.5580202098509078e-08, + "loss": 0.0326, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.75, + "step": 968 + }, + { + "completion_length": 250.0, + "epoch": 0.08075, + "grad_norm": 0.526576578617096, + "kl": 1.7040972709655762, + "learning_rate": 1.4622590069211517e-08, + "loss": 0.0682, + "reward": 1.7083332538604736, + "reward_std": 0.5756294131278992, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.8333333730697632, + "step": 969 + }, + { + "completion_length": 250.0, + "epoch": 0.08083333333333333, + "grad_norm": 0.2984718680381775, + "kl": 1.5154507160186768, + "learning_rate": 1.3695261579316776e-08, + "loss": 0.0606, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.875, + "step": 970 + }, + { + "completion_length": 250.0, + "epoch": 0.08091666666666666, + "grad_norm": 1.2827290296554565, + "kl": 1.4101325273513794, + "learning_rate": 1.2798227928029483e-08, + "loss": 0.0564, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.7916666269302368, + "step": 971 + }, + { + "completion_length": 250.0, + "epoch": 0.081, + "grad_norm": 0.23937450349330902, + "kl": 1.6009562015533447, + "learning_rate": 1.193150004542204e-08, + "loss": 0.064, + "reward": 1.5833333730697632, + "reward_std": 0.49601587653160095, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9583333730697632, + "step": 972 + }, + { + "completion_length": 250.0, + "epoch": 0.08108333333333333, + "grad_norm": 0.5152274370193481, + "kl": 0.8297097086906433, + "learning_rate": 1.109508849230001e-08, + "loss": 0.0332, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.75, + "step": 973 + }, + { + "completion_length": 250.0, + "epoch": 0.08116666666666666, + "grad_norm": 0.28064191341400146, + "kl": 1.0859336853027344, + "learning_rate": 1.0289003460074165e-08, + "loss": 0.0434, + "reward": 1.625, + "reward_std": 0.6283639669418335, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 974 + }, + { + "completion_length": 250.0, + "epoch": 0.08125, + "grad_norm": 0.2897964417934418, + "kl": 1.0955630540847778, + "learning_rate": 9.513254770636138e-09, + "loss": 0.0438, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.875, + "step": 975 + }, + { + "completion_length": 250.0, + "epoch": 0.08133333333333333, + "grad_norm": 0.22548621892929077, + "kl": 0.9993997812271118, + "learning_rate": 8.767851876239075e-09, + "loss": 0.04, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 976 + }, + { + "completion_length": 250.0, + "epoch": 0.08141666666666666, + "grad_norm": 0.326506108045578, + "kl": 1.3311384916305542, + "learning_rate": 8.052803859382174e-09, + "loss": 0.0532, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 1.0, + "step": 977 + }, + { + "completion_length": 250.0, + "epoch": 0.0815, + "grad_norm": 0.26752278208732605, + "kl": 0.6817623972892761, + "learning_rate": 7.368119432699383e-09, + "loss": 0.0273, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.75, + "step": 978 + }, + { + "completion_length": 250.0, + "epoch": 0.08158333333333333, + "grad_norm": 0.3232153058052063, + "kl": 0.79771888256073, + "learning_rate": 6.7138069388547614e-09, + "loss": 0.0319, + "reward": 1.5416667461395264, + "reward_std": 0.5019802451133728, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 979 + }, + { + "completion_length": 250.0, + "epoch": 0.08166666666666667, + "grad_norm": 0.35629838705062866, + "kl": 1.6459451913833618, + "learning_rate": 6.089874350439507e-09, + "loss": 0.0658, + "reward": 1.5416666269302368, + "reward_std": 0.6651768684387207, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.9166666865348816, + "step": 980 + }, + { + "completion_length": 250.0, + "epoch": 0.08175, + "grad_norm": 0.30297717452049255, + "kl": 0.822544515132904, + "learning_rate": 5.4963292698750896e-09, + "loss": 0.0329, + "reward": 1.5416667461395264, + "reward_std": 0.7332792282104492, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.7916666865348816, + "step": 981 + }, + { + "completion_length": 250.0, + "epoch": 0.08183333333333333, + "grad_norm": 0.3641495406627655, + "kl": 1.5490293502807617, + "learning_rate": 4.933178929321103e-09, + "loss": 0.062, + "reward": 1.6666666269302368, + "reward_std": 0.6424160599708557, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166666865348816, + "step": 982 + }, + { + "completion_length": 250.0, + "epoch": 0.08191666666666667, + "grad_norm": 0.19789476692676544, + "kl": 1.1928259134292603, + "learning_rate": 4.400430190586724e-09, + "loss": 0.0477, + "reward": 1.0833333730697632, + "reward_std": 0.38832157850265503, + "rewards/correctness_reward_func": 0.125, + "rewards/format_reward_func": 0.9583333730697632, + "step": 983 + }, + { + "completion_length": 250.0, + "epoch": 0.082, + "grad_norm": 1.2451598644256592, + "kl": 1.1428676843643188, + "learning_rate": 3.8980895450474455e-09, + "loss": 0.0457, + "reward": 1.5833332538604736, + "reward_std": 0.6362089514732361, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.7083333730697632, + "step": 984 + }, + { + "completion_length": 250.0, + "epoch": 0.08208333333333333, + "grad_norm": 0.6361198425292969, + "kl": 0.45185351371765137, + "learning_rate": 3.4261631135654174e-09, + "loss": 0.0181, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.75, + "step": 985 + }, + { + "completion_length": 250.0, + "epoch": 0.08216666666666667, + "grad_norm": 0.3087112307548523, + "kl": 1.8694123029708862, + "learning_rate": 2.984656646415063e-09, + "loss": 0.0748, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 986 + }, + { + "completion_length": 250.0, + "epoch": 0.08225, + "grad_norm": 0.28350523114204407, + "kl": 0.7043304443359375, + "learning_rate": 2.573575523213412e-09, + "loss": 0.0282, + "reward": 0.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.7083333730697632, + "step": 987 + }, + { + "completion_length": 250.0, + "epoch": 0.08233333333333333, + "grad_norm": 0.20463794469833374, + "kl": 1.8880668878555298, + "learning_rate": 2.192924752854042e-09, + "loss": 0.0755, + "reward": 1.6666667461395264, + "reward_std": 0.4364357888698578, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.9166667461395264, + "step": 988 + }, + { + "completion_length": 250.0, + "epoch": 0.08241666666666667, + "grad_norm": 0.1875455379486084, + "kl": 1.1098848581314087, + "learning_rate": 1.842708973447127e-09, + "loss": 0.0444, + "reward": 1.7916667461395264, + "reward_std": 0.3535533845424652, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9166666865348816, + "step": 989 + }, + { + "completion_length": 209.0, + "epoch": 0.0825, + "grad_norm": 0.30843275785446167, + "kl": 1.1816316843032837, + "learning_rate": 1.5229324522605949e-09, + "loss": 0.0473, + "reward": 2.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 1.0, + "step": 990 + }, + { + "completion_length": 250.0, + "epoch": 0.08258333333333333, + "grad_norm": 0.2759556770324707, + "kl": 0.9381877779960632, + "learning_rate": 1.2335990856710001e-09, + "loss": 0.0375, + "reward": 0.7083333730697632, + "reward_std": 0.4520675837993622, + "rewards/correctness_reward_func": 0.0, + "rewards/format_reward_func": 0.7083333730697632, + "step": 991 + }, + { + "completion_length": 250.0, + "epoch": 0.08266666666666667, + "grad_norm": 0.28256580233573914, + "kl": 0.8221601843833923, + "learning_rate": 9.747123991141193e-10, + "loss": 0.0329, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/correctness_reward_func": 1.0, + "rewards/format_reward_func": 0.9583333730697632, + "step": 992 + }, + { + "completion_length": 250.0, + "epoch": 0.08275, + "grad_norm": 0.3341481685638428, + "kl": 1.0547815561294556, + "learning_rate": 7.462755470422078e-10, + "loss": 0.0422, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 993 + }, + { + "completion_length": 250.0, + "epoch": 0.08283333333333333, + "grad_norm": 0.3639756441116333, + "kl": 0.9993078708648682, + "learning_rate": 5.48291312886251e-10, + "loss": 0.04, + "reward": 1.0, + "reward_std": 0.6424161195755005, + "rewards/correctness_reward_func": 0.25, + "rewards/format_reward_func": 0.7500000596046448, + "step": 994 + }, + { + "completion_length": 250.0, + "epoch": 0.08291666666666667, + "grad_norm": 0.25637567043304443, + "kl": 0.9738581776618958, + "learning_rate": 3.8076210902182607e-10, + "loss": 0.039, + "reward": 1.25, + "reward_std": 0.7071068286895752, + "rewards/correctness_reward_func": 0.625, + "rewards/format_reward_func": 0.625, + "step": 995 + }, + { + "completion_length": 158.0, + "epoch": 0.083, + "grad_norm": 0.3529271185398102, + "kl": 0.9513704776763916, + "learning_rate": 2.43689976739403e-10, + "loss": 0.0381, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 1.0, + "step": 996 + }, + { + "completion_length": 250.0, + "epoch": 0.08308333333333333, + "grad_norm": 0.34197887778282166, + "kl": 1.2964738607406616, + "learning_rate": 1.3707658621964216e-10, + "loss": 0.0519, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.875, + "step": 997 + }, + { + "completion_length": 250.0, + "epoch": 0.08316666666666667, + "grad_norm": 0.29255440831184387, + "kl": 0.7699793577194214, + "learning_rate": 6.092323651313293e-11, + "loss": 0.0308, + "reward": 1.8333333730697632, + "reward_std": 0.47140446305274963, + "rewards/correctness_reward_func": 0.875, + "rewards/format_reward_func": 0.9583333730697632, + "step": 998 + }, + { + "completion_length": 250.0, + "epoch": 0.08325, + "grad_norm": 4.424619197845459, + "kl": 1.8411613702774048, + "learning_rate": 1.5230855524017708e-11, + "loss": 0.0736, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/correctness_reward_func": 0.375, + "rewards/format_reward_func": 0.875, + "step": 999 + }, + { + "completion_length": 250.0, + "epoch": 0.08333333333333333, + "grad_norm": 0.2679225206375122, + "kl": 0.8978027701377869, + "learning_rate": 0.0, + "loss": 0.0359, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/correctness_reward_func": 0.75, + "rewards/format_reward_func": 0.875, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}