{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 200.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 500.0, "epoch": 0.2, "grad_norm": 144.23707580566406, "kl": 51.48179244995117, "learning_rate": 3.1250000000000005e-08, "loss": 2.0593, "reward": 0.9761996865272522, "reward_std": 3.3251326084136963, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.148800253868103, "rewards/wrapped_format_reward": 0.625, "step": 1 }, { "completion_length": 500.0, "epoch": 0.4, "grad_norm": 974.2139892578125, "kl": 216.24957275390625, "learning_rate": 6.250000000000001e-08, "loss": 8.65, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 2 }, { "completion_length": 500.0, "epoch": 0.6, "grad_norm": 41697.91015625, "kl": 3837.41943359375, "learning_rate": 9.375e-08, "loss": 153.4967, "reward": -0.7961921691894531, "reward_std": 3.700653076171875, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.7336921691894531, "rewards/wrapped_format_reward": 0.0, "step": 3 }, { "completion_length": 500.0, "epoch": 0.8, "grad_norm": 10122959.0, "kl": 511094.90625, "learning_rate": 1.2500000000000002e-07, "loss": 20443.7988, "reward": -2.338921546936035, "reward_std": 3.322157144546509, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.838921546936035, "rewards/wrapped_format_reward": 0.0, "step": 4 }, { "completion_length": 500.0, "epoch": 1.0, "grad_norm": 100702232.0, "kl": 5416315.0, "learning_rate": 1.5625e-07, "loss": 216652.5938, "reward": -0.16450506448745728, "reward_std": 3.8515079021453857, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6645050048828125, "rewards/wrapped_format_reward": 0.5, "step": 5 }, { "completion_length": 500.0, "epoch": 1.2, "grad_norm": 17.33243751525879, "kl": 7.672175884246826, "learning_rate": 1.875e-07, "loss": 0.3069, "reward": 0.9893605709075928, "reward_std": 1.5257619619369507, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.3856394290924072, "rewards/wrapped_format_reward": 0.375, "step": 6 }, { "completion_length": 500.0, "epoch": 1.4, "grad_norm": 70.34513092041016, "kl": 17.917146682739258, "learning_rate": 2.1875e-07, "loss": 0.7167, "reward": 1.2267450094223022, "reward_std": 3.4932949542999268, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5232549905776978, "rewards/wrapped_format_reward": 0.25, "step": 7 }, { "completion_length": 500.0, "epoch": 1.6, "grad_norm": 69448.7734375, "kl": 9786.2802734375, "learning_rate": 2.5000000000000004e-07, "loss": 391.4512, "reward": -0.967779815196991, "reward_std": 3.5180184841156006, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.2177798748016357, "rewards/wrapped_format_reward": 0.25, "step": 8 }, { "completion_length": 500.0, "epoch": 1.8, "grad_norm": 7205431.5, "kl": 363326.15625, "learning_rate": 2.8125e-07, "loss": 14533.0439, "reward": -0.4434952139854431, "reward_std": 4.112156867980957, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5684951543807983, "rewards/wrapped_format_reward": 0.125, "step": 9 }, { "completion_length": 500.0, "epoch": 2.0, "grad_norm": 1344.39306640625, "kl": 182.57179260253906, "learning_rate": 3.125e-07, "loss": 7.3029, "reward": -0.5283111929893494, "reward_std": 3.7256903648376465, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6533112525939941, "rewards/wrapped_format_reward": 0.125, "step": 10 }, { "completion_length": 500.0, "epoch": 2.2, "grad_norm": 663.510498046875, "kl": 125.65680694580078, "learning_rate": 3.4375000000000004e-07, "loss": 5.0263, "reward": -2.449397563934326, "reward_std": 3.1012051105499268, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.949397563934326, "rewards/wrapped_format_reward": 0.0, "step": 11 }, { "completion_length": 500.0, "epoch": 2.4, "grad_norm": 31.331226348876953, "kl": 10.755382537841797, "learning_rate": 3.75e-07, "loss": 0.4302, "reward": -2.1561226844787598, "reward_std": 3.36269211769104, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.7811226844787598, "rewards/wrapped_format_reward": 0.125, "step": 12 }, { "completion_length": 500.0, "epoch": 2.6, "grad_norm": 10.003079414367676, "kl": 3.8625946044921875, "learning_rate": 4.0625000000000003e-07, "loss": 0.1545, "reward": -2.1760454177856445, "reward_std": 3.647908926010132, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.8010454177856445, "rewards/wrapped_format_reward": 0.125, "step": 13 }, { "completion_length": 500.0, "epoch": 2.8, "grad_norm": 62.2325325012207, "kl": 13.510702133178711, "learning_rate": 4.375e-07, "loss": 0.5404, "reward": -2.294018030166626, "reward_std": 3.0876402854919434, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.044018030166626, "rewards/wrapped_format_reward": 0.25, "step": 14 }, { "completion_length": 500.0, "epoch": 3.0, "grad_norm": 96.1074447631836, "kl": 10.679292678833008, "learning_rate": 4.6875000000000006e-07, "loss": 0.4272, "reward": -0.6371059417724609, "reward_std": 3.885227680206299, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.637105941772461, "rewards/wrapped_format_reward": 0.0, "step": 15 }, { "completion_length": 500.0, "epoch": 3.2, "grad_norm": 1650.8782958984375, "kl": 208.3596954345703, "learning_rate": 5.000000000000001e-07, "loss": 8.3344, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 16 }, { "completion_length": 500.0, "epoch": 3.4, "grad_norm": 17.093393325805664, "kl": 5.905396461486816, "learning_rate": 5.3125e-07, "loss": 0.2362, "reward": -2.4352118968963623, "reward_std": 2.806159257888794, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.060211658477783, "rewards/wrapped_format_reward": 0.125, "step": 17 }, { "completion_length": 500.0, "epoch": 3.6, "grad_norm": 78087.4140625, "kl": 7675.3564453125, "learning_rate": 5.625e-07, "loss": 307.0142, "reward": -0.4786604046821594, "reward_std": 4.071903228759766, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8536603450775146, "rewards/wrapped_format_reward": 0.375, "step": 18 }, { "completion_length": 500.0, "epoch": 3.8, "grad_norm": 2062.067626953125, "kl": 105.56303405761719, "learning_rate": 5.9375e-07, "loss": 4.2225, "reward": -0.4283701777458191, "reward_std": 4.145442008972168, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8033702373504639, "rewards/wrapped_format_reward": 0.375, "step": 19 }, { "completion_length": 500.0, "epoch": 4.0, "grad_norm": 513992.65625, "kl": 39077.08984375, "learning_rate": 6.25e-07, "loss": 1563.0836, "reward": -2.0327651500701904, "reward_std": 3.3016297817230225, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.7827651500701904, "rewards/wrapped_format_reward": 0.25, "step": 20 }, { "completion_length": 500.0, "epoch": 4.2, "grad_norm": 271.5398254394531, "kl": 37.08869934082031, "learning_rate": 6.562500000000001e-07, "loss": 1.4835, "reward": -3.5, "reward_std": 0.5773502588272095, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 21 }, { "completion_length": 500.0, "epoch": 4.4, "grad_norm": 33774.453125, "kl": 4115.1591796875, "learning_rate": 6.875000000000001e-07, "loss": 164.6064, "reward": -1.1355788707733154, "reward_std": 3.42315673828125, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.5105788707733154, "rewards/wrapped_format_reward": 0.375, "step": 22 }, { "completion_length": 500.0, "epoch": 4.6, "grad_norm": 52.09832000732422, "kl": 14.069100379943848, "learning_rate": 7.1875e-07, "loss": 0.5628, "reward": 0.9047523736953735, "reward_std": 3.2798702716827393, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5952475666999817, "rewards/wrapped_format_reward": 0.0, "step": 23 }, { "completion_length": 500.0, "epoch": 4.8, "grad_norm": 75.83870697021484, "kl": 16.262989044189453, "learning_rate": 7.5e-07, "loss": 0.6505, "reward": -0.5572073459625244, "reward_std": 2.9710958003997803, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.0572073459625244, "rewards/wrapped_format_reward": 0.0, "step": 24 }, { "completion_length": 500.0, "epoch": 5.0, "grad_norm": 8.57257080078125, "kl": 3.3865182399749756, "learning_rate": 7.8125e-07, "loss": 0.1355, "reward": -2.110412120819092, "reward_std": 3.7791755199432373, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.860412120819092, "rewards/wrapped_format_reward": 0.25, "step": 25 }, { "completion_length": 500.0, "epoch": 5.2, "grad_norm": 978403.0, "kl": 89647.2578125, "learning_rate": 8.125000000000001e-07, "loss": 3585.8899, "reward": -0.2944529056549072, "reward_std": 4.2804718017578125, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6694529056549072, "rewards/wrapped_format_reward": 0.375, "step": 26 }, { "completion_length": 500.0, "epoch": 5.4, "grad_norm": 168.73036193847656, "kl": 29.724079132080078, "learning_rate": 8.437500000000001e-07, "loss": 1.189, "reward": -0.6913368701934814, "reward_std": 3.5795211791992188, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0663368701934814, "rewards/wrapped_format_reward": 0.375, "step": 27 }, { "completion_length": 500.0, "epoch": 5.6, "grad_norm": 75324.5234375, "kl": 7936.74267578125, "learning_rate": 8.75e-07, "loss": 317.4697, "reward": 0.9355948567390442, "reward_std": 3.3464736938476562, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6894051432609558, "rewards/wrapped_format_reward": 0.125, "step": 28 }, { "completion_length": 500.0, "epoch": 5.8, "grad_norm": 469.9671630859375, "kl": 71.2878189086914, "learning_rate": 9.0625e-07, "loss": 2.8515, "reward": -2.547950267791748, "reward_std": 2.904099464416504, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.047950267791748, "rewards/wrapped_format_reward": 0.0, "step": 29 }, { "completion_length": 500.0, "epoch": 6.0, "grad_norm": 36.652645111083984, "kl": 13.148932456970215, "learning_rate": 9.375000000000001e-07, "loss": 0.526, "reward": -3.375, "reward_std": 0.9464846849441528, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 30 }, { "completion_length": 500.0, "epoch": 6.2, "grad_norm": 30.199684143066406, "kl": 9.480849266052246, "learning_rate": 9.6875e-07, "loss": 0.3792, "reward": -2.0530290603637695, "reward_std": 3.2615222930908203, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9280290603637695, "rewards/wrapped_format_reward": 0.375, "step": 31 }, { "completion_length": 500.0, "epoch": 6.4, "grad_norm": 78.3298568725586, "kl": 26.2161865234375, "learning_rate": 1.0000000000000002e-06, "loss": 1.0486, "reward": 1.33430814743042, "reward_std": 3.5583572387695312, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.41569197177886963, "rewards/wrapped_format_reward": 0.25, "step": 32 }, { "completion_length": 500.0, "epoch": 6.6, "grad_norm": 19.472774505615234, "kl": 7.02009391784668, "learning_rate": 1.03125e-06, "loss": 0.2808, "reward": -0.6657888889312744, "reward_std": 3.853987216949463, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6657888889312744, "rewards/wrapped_format_reward": 0.0, "step": 33 }, { "completion_length": 500.0, "epoch": 6.8, "grad_norm": 65.95396423339844, "kl": 18.14912223815918, "learning_rate": 1.0625e-06, "loss": 0.726, "reward": -0.8661626577377319, "reward_std": 3.6189870834350586, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.1161625385284424, "rewards/wrapped_format_reward": 0.25, "step": 34 }, { "completion_length": 500.0, "epoch": 7.0, "grad_norm": 1.4008393287658691, "kl": 0.8411699533462524, "learning_rate": 1.0937500000000001e-06, "loss": 0.0336, "reward": -0.38928359746932983, "reward_std": 4.171046257019043, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6392836570739746, "rewards/wrapped_format_reward": 0.25, "step": 35 }, { "completion_length": 500.0, "epoch": 7.2, "grad_norm": 7.80991268157959, "kl": 4.218427658081055, "learning_rate": 1.125e-06, "loss": 0.1687, "reward": -1.1417465209960938, "reward_std": 3.381352186203003, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.1417465209960938, "rewards/wrapped_format_reward": 0.0, "step": 36 }, { "completion_length": 500.0, "epoch": 7.4, "grad_norm": 4.478097438812256, "kl": 2.112290143966675, "learning_rate": 1.1562500000000002e-06, "loss": 0.0845, "reward": -2.102426528930664, "reward_std": 3.1373467445373535, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.852426528930664, "rewards/wrapped_format_reward": 0.25, "step": 37 }, { "completion_length": 500.0, "epoch": 7.6, "grad_norm": 9.182758331298828, "kl": 4.63405179977417, "learning_rate": 1.1875e-06, "loss": 0.1854, "reward": -2.2051236629486084, "reward_std": 3.2649383544921875, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9551236629486084, "rewards/wrapped_format_reward": 0.25, "step": 38 }, { "completion_length": 500.0, "epoch": 7.8, "grad_norm": 203072.515625, "kl": 20056.82421875, "learning_rate": 1.21875e-06, "loss": 802.2729, "reward": -2.125791311264038, "reward_std": 3.0907511711120605, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.000791311264038, "rewards/wrapped_format_reward": 0.375, "step": 39 }, { "completion_length": 500.0, "epoch": 8.0, "grad_norm": 5.843477725982666, "kl": 3.436691999435425, "learning_rate": 1.25e-06, "loss": 0.1375, "reward": -0.9351435899734497, "reward_std": 3.608586072921753, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.3101437091827393, "rewards/wrapped_format_reward": 0.375, "step": 40 }, { "completion_length": 500.0, "epoch": 8.2, "grad_norm": 7742784.5, "kl": 650933.375, "learning_rate": 1.28125e-06, "loss": 26037.334, "reward": 1.5518302917480469, "reward_std": 3.7290942668914795, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4481697678565979, "rewards/wrapped_format_reward": 0.5, "step": 41 }, { "completion_length": 500.0, "epoch": 8.4, "grad_norm": 8.937725067138672, "kl": 3.0701639652252197, "learning_rate": 1.3125000000000001e-06, "loss": 0.1228, "reward": -2.1904397010803223, "reward_std": 3.2942306995391846, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.0654397010803223, "rewards/wrapped_format_reward": 0.375, "step": 42 }, { "completion_length": 500.0, "epoch": 8.6, "grad_norm": 14.36681079864502, "kl": 5.88793420791626, "learning_rate": 1.34375e-06, "loss": 0.2355, "reward": 0.6519123315811157, "reward_std": 3.105113983154297, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.973087728023529, "rewards/wrapped_format_reward": 0.125, "step": 43 }, { "completion_length": 500.0, "epoch": 8.8, "grad_norm": 180.1724395751953, "kl": 33.255760192871094, "learning_rate": 1.3750000000000002e-06, "loss": 1.3302, "reward": -0.6511552333831787, "reward_std": 3.9030916690826416, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.7761552333831787, "rewards/wrapped_format_reward": 0.125, "step": 44 }, { "completion_length": 500.0, "epoch": 9.0, "grad_norm": 32.81709671020508, "kl": 6.7791428565979, "learning_rate": 1.40625e-06, "loss": 0.2712, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 45 }, { "completion_length": 500.0, "epoch": 9.2, "grad_norm": 4151.640625, "kl": 350.8572082519531, "learning_rate": 1.4375e-06, "loss": 14.0343, "reward": -1.1032943725585938, "reward_std": 3.3453028202056885, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.1032943725585938, "rewards/wrapped_format_reward": 0.0, "step": 46 }, { "completion_length": 500.0, "epoch": 9.4, "grad_norm": 15.409821510314941, "kl": 5.346187114715576, "learning_rate": 1.4687500000000001e-06, "loss": 0.2138, "reward": -1.1791430711746216, "reward_std": 2.8540005683898926, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.929143190383911, "rewards/wrapped_format_reward": 0.25, "step": 47 }, { "completion_length": 500.0, "epoch": 9.6, "grad_norm": 144.20443725585938, "kl": 23.608051300048828, "learning_rate": 1.5e-06, "loss": 0.9443, "reward": -2.530029058456421, "reward_std": 2.939941883087158, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -2.9800291061401367, "rewards/wrapped_format_reward": 0.0, "step": 48 }, { "completion_length": 500.0, "epoch": 9.8, "grad_norm": 7.547443866729736, "kl": 3.824962615966797, "learning_rate": 1.5312500000000002e-06, "loss": 0.153, "reward": -0.7816690802574158, "reward_std": 3.7201201915740967, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.906669020652771, "rewards/wrapped_format_reward": 0.125, "step": 49 }, { "completion_length": 500.0, "epoch": 10.0, "grad_norm": 77.945556640625, "kl": 16.699840545654297, "learning_rate": 1.5625e-06, "loss": 0.668, "reward": -0.2709696292877197, "reward_std": 4.022421360015869, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6459696292877197, "rewards/wrapped_format_reward": 0.375, "step": 50 }, { "completion_length": 500.0, "epoch": 10.2, "grad_norm": 11738.6953125, "kl": 798.2957763671875, "learning_rate": 1.59375e-06, "loss": 31.9318, "reward": -2.244354724884033, "reward_std": 3.1866860389709473, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.869354724884033, "rewards/wrapped_format_reward": 0.125, "step": 51 }, { "completion_length": 500.0, "epoch": 10.4, "grad_norm": 14.563969612121582, "kl": 5.301497936248779, "learning_rate": 1.6250000000000001e-06, "loss": 0.2121, "reward": 2.282280445098877, "reward_std": 0.7978482246398926, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21771956980228424, "rewards/wrapped_format_reward": 0.5, "step": 52 }, { "completion_length": 500.0, "epoch": 10.6, "grad_norm": 6.280083656311035, "kl": 3.3535187244415283, "learning_rate": 1.65625e-06, "loss": 0.1341, "reward": 0.2996126413345337, "reward_std": 2.949772357940674, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3253873586654663, "rewards/wrapped_format_reward": 0.125, "step": 53 }, { "completion_length": 500.0, "epoch": 10.8, "grad_norm": 223484.046875, "kl": 25810.041015625, "learning_rate": 1.6875000000000001e-06, "loss": 1032.4015, "reward": -0.344623327255249, "reward_std": 3.944869041442871, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.719623327255249, "rewards/wrapped_format_reward": 0.375, "step": 54 }, { "completion_length": 500.0, "epoch": 11.0, "grad_norm": 593881.0, "kl": 99004.265625, "learning_rate": 1.71875e-06, "loss": 3960.1709, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 55 }, { "completion_length": 500.0, "epoch": 11.2, "grad_norm": 6.934082508087158, "kl": 2.1647584438323975, "learning_rate": 1.75e-06, "loss": 0.0866, "reward": -1.2983622550964355, "reward_std": 2.1767022609710693, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.1733622550964355, "rewards/wrapped_format_reward": 0.375, "step": 56 }, { "completion_length": 500.0, "epoch": 11.4, "grad_norm": 11660.44921875, "kl": 1039.497802734375, "learning_rate": 1.78125e-06, "loss": 41.5799, "reward": 1.111867070198059, "reward_std": 3.4727301597595215, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6381329298019409, "rewards/wrapped_format_reward": 0.25, "step": 57 }, { "completion_length": 500.0, "epoch": 11.6, "grad_norm": 116.61476135253906, "kl": 30.001558303833008, "learning_rate": 1.8125e-06, "loss": 1.2001, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 58 }, { "completion_length": 500.0, "epoch": 11.8, "grad_norm": 7.750627040863037, "kl": 3.2049598693847656, "learning_rate": 1.8437500000000003e-06, "loss": 0.1282, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 59 }, { "completion_length": 500.0, "epoch": 12.0, "grad_norm": 6.483101844787598, "kl": 2.8182482719421387, "learning_rate": 1.8750000000000003e-06, "loss": 0.1127, "reward": 0.4944196343421936, "reward_std": 2.8224055767059326, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3805804252624512, "rewards/wrapped_format_reward": 0.375, "step": 60 }, { "completion_length": 500.0, "epoch": 12.2, "grad_norm": 2.41937255859375, "kl": 1.5243698358535767, "learning_rate": 1.90625e-06, "loss": 0.061, "reward": -1.5549238920211792, "reward_std": 3.3336286544799805, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.9299237728118896, "rewards/wrapped_format_reward": 0.375, "step": 61 }, { "completion_length": 500.0, "epoch": 12.4, "grad_norm": 1.8835395574569702, "kl": 1.3928029537200928, "learning_rate": 1.9375e-06, "loss": 0.0557, "reward": 0.5334538817405701, "reward_std": 3.0702548027038574, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9665461778640747, "rewards/wrapped_format_reward": 0.0, "step": 62 }, { "completion_length": 500.0, "epoch": 12.6, "grad_norm": 82.16962432861328, "kl": 9.488784790039062, "learning_rate": 1.96875e-06, "loss": 0.3796, "reward": -0.4010847806930542, "reward_std": 4.155675411224365, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.7760847806930542, "rewards/wrapped_format_reward": 0.375, "step": 63 }, { "completion_length": 500.0, "epoch": 12.8, "grad_norm": 1.6330454349517822, "kl": 0.7770444750785828, "learning_rate": 2.0000000000000003e-06, "loss": 0.0311, "reward": -1.0365194082260132, "reward_std": 3.1399788856506348, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -2.2552695274353027, "rewards/wrapped_format_reward": 0.25, "step": 64 }, { "completion_length": 500.0, "epoch": 13.0, "grad_norm": 226.30535888671875, "kl": 45.10585021972656, "learning_rate": 2.0312500000000002e-06, "loss": 1.8042, "reward": -1.9248578548431396, "reward_std": 3.5153682231903076, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.7998578548431396, "rewards/wrapped_format_reward": 0.375, "step": 65 }, { "completion_length": 500.0, "epoch": 13.2, "grad_norm": 62.87789535522461, "kl": 7.880831718444824, "learning_rate": 2.0625e-06, "loss": 0.3152, "reward": 3.1509861946105957, "reward_std": 0.29935261607170105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.987500011920929, "rewards/wrapped_driving_reward": 0.5384860038757324, "rewards/wrapped_format_reward": 0.625, "step": 66 }, { "completion_length": 500.0, "epoch": 13.4, "grad_norm": 41.82035446166992, "kl": 9.357061386108398, "learning_rate": 2.09375e-06, "loss": 0.3743, "reward": -2.576190948486328, "reward_std": 2.8476178646087646, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.201190948486328, "rewards/wrapped_format_reward": 0.125, "step": 67 }, { "completion_length": 500.0, "epoch": 13.6, "grad_norm": 4.275550365447998, "kl": 2.5297634601593018, "learning_rate": 2.125e-06, "loss": 0.1012, "reward": -2.036945104598999, "reward_std": 3.926109790802002, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.786945104598999, "rewards/wrapped_format_reward": 0.25, "step": 68 }, { "completion_length": 500.0, "epoch": 13.8, "grad_norm": 12.396137237548828, "kl": 3.0427801609039307, "learning_rate": 2.1562500000000003e-06, "loss": 0.1217, "reward": 0.7283755540847778, "reward_std": 3.323927879333496, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2716244459152222, "rewards/wrapped_format_reward": 0.5, "step": 69 }, { "completion_length": 500.0, "epoch": 14.0, "grad_norm": 1.5282281637191772, "kl": 1.092595100402832, "learning_rate": 2.1875000000000002e-06, "loss": 0.0437, "reward": -3.2356114387512207, "reward_std": 1.528777003288269, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.8606114387512207, "rewards/wrapped_format_reward": 0.125, "step": 70 }, { "completion_length": 500.0, "epoch": 14.2, "grad_norm": 1.2480015754699707, "kl": 0.7834239602088928, "learning_rate": 2.21875e-06, "loss": 0.0313, "reward": -2.0355887413024902, "reward_std": 3.270659923553467, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.7855887413024902, "rewards/wrapped_format_reward": 0.25, "step": 71 }, { "completion_length": 500.0, "epoch": 14.4, "grad_norm": 80.85037231445312, "kl": 9.716327667236328, "learning_rate": 2.25e-06, "loss": 0.3887, "reward": -3.375, "reward_std": 1.25, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 72 }, { "completion_length": 500.0, "epoch": 14.6, "grad_norm": 1.1875276565551758, "kl": 0.9156450629234314, "learning_rate": 2.28125e-06, "loss": 0.0366, "reward": 0.398318886756897, "reward_std": 2.9533674716949463, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3516812324523926, "rewards/wrapped_format_reward": 0.25, "step": 73 }, { "completion_length": 500.0, "epoch": 14.8, "grad_norm": 5.200348854064941, "kl": 2.256690502166748, "learning_rate": 2.3125000000000003e-06, "loss": 0.0903, "reward": -0.17332077026367188, "reward_std": 4.147373199462891, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.47727274894714355, "rewards/wrapped_driving_reward": -1.650593638420105, "rewards/wrapped_format_reward": 0.5, "step": 74 }, { "completion_length": 500.0, "epoch": 15.0, "grad_norm": 7008.42626953125, "kl": 815.8104248046875, "learning_rate": 2.3437500000000002e-06, "loss": 32.6324, "reward": -2.0281713008880615, "reward_std": 3.3107235431671143, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9031713008880615, "rewards/wrapped_format_reward": 0.375, "step": 75 }, { "completion_length": 500.0, "epoch": 15.2, "grad_norm": 0.9020519852638245, "kl": 0.9176801443099976, "learning_rate": 2.375e-06, "loss": 0.0367, "reward": -2.1365058422088623, "reward_std": 3.0964157581329346, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.1365058422088623, "rewards/wrapped_format_reward": 0.5, "step": 76 }, { "completion_length": 500.0, "epoch": 15.4, "grad_norm": 2.2519280910491943, "kl": 0.8236393332481384, "learning_rate": 2.40625e-06, "loss": 0.0329, "reward": -1.22861909866333, "reward_std": 2.9241857528686523, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.353619337081909, "rewards/wrapped_format_reward": 0.125, "step": 77 }, { "completion_length": 500.0, "epoch": 15.6, "grad_norm": 1.5832031965255737, "kl": 0.7527546286582947, "learning_rate": 2.4375e-06, "loss": 0.0301, "reward": -0.6693365573883057, "reward_std": 3.860503911972046, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.7943366765975952, "rewards/wrapped_format_reward": 0.125, "step": 78 }, { "completion_length": 500.0, "epoch": 15.8, "grad_norm": 1.108726143836975, "kl": 1.0248883962631226, "learning_rate": 2.4687500000000003e-06, "loss": 0.041, "reward": -3.125, "reward_std": 1.4361406564712524, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 79 }, { "completion_length": 500.0, "epoch": 16.0, "grad_norm": 1.0169743299484253, "kl": 0.7592311501502991, "learning_rate": 2.5e-06, "loss": 0.0304, "reward": 1.145168423652649, "reward_std": 3.56965708732605, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7298316359519958, "rewards/wrapped_format_reward": 0.375, "step": 80 }, { "completion_length": 500.0, "epoch": 16.2, "grad_norm": 0.7822604179382324, "kl": 0.560085117816925, "learning_rate": 2.53125e-06, "loss": 0.0224, "reward": 1.2056835889816284, "reward_std": 3.5178937911987305, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6693164110183716, "rewards/wrapped_format_reward": 0.375, "step": 81 }, { "completion_length": 500.0, "epoch": 16.4, "grad_norm": 0.8175077438354492, "kl": 0.5752599239349365, "learning_rate": 2.5625e-06, "loss": 0.023, "reward": 2.9252407550811768, "reward_std": 0.7892647385597229, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3002408742904663, "rewards/wrapped_format_reward": 0.625, "step": 82 }, { "completion_length": 500.0, "epoch": 16.6, "grad_norm": 6.47392463684082, "kl": 2.1055572032928467, "learning_rate": 2.5937500000000004e-06, "loss": 0.0842, "reward": -2.7708332538604736, "reward_std": 1.4678263664245605, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4791666865348816, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 83 }, { "completion_length": 500.0, "epoch": 16.8, "grad_norm": 0.9220647811889648, "kl": 0.7157851457595825, "learning_rate": 2.6250000000000003e-06, "loss": 0.0286, "reward": -2.987729549407959, "reward_std": 1.3781793117523193, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.987729549407959, "rewards/wrapped_format_reward": 0.5, "step": 84 }, { "completion_length": 500.0, "epoch": 17.0, "grad_norm": 62.959754943847656, "kl": 11.148348808288574, "learning_rate": 2.65625e-06, "loss": 0.4459, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 85 }, { "completion_length": 500.0, "epoch": 17.2, "grad_norm": 1.0555181503295898, "kl": 0.7983899712562561, "learning_rate": 2.6875e-06, "loss": 0.0319, "reward": -0.34822893142700195, "reward_std": 1.868666648864746, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.973228931427002, "rewards/wrapped_format_reward": 0.625, "step": 86 }, { "completion_length": 500.0, "epoch": 17.4, "grad_norm": 5.494264602661133, "kl": 1.303008794784546, "learning_rate": 2.71875e-06, "loss": 0.0521, "reward": -1.6160635948181152, "reward_std": 3.132638931274414, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.7410635948181152, "rewards/wrapped_format_reward": 0.125, "step": 87 }, { "completion_length": 500.0, "epoch": 17.6, "grad_norm": 4.55032205581665, "kl": 0.8609241247177124, "learning_rate": 2.7500000000000004e-06, "loss": 0.0344, "reward": -0.07164722681045532, "reward_std": 2.4449591636657715, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7045454978942871, "rewards/wrapped_driving_reward": -1.9011927843093872, "rewards/wrapped_format_reward": 0.375, "step": 88 }, { "completion_length": 500.0, "epoch": 17.8, "grad_norm": 0.9195135831832886, "kl": 0.683874785900116, "learning_rate": 2.7812500000000003e-06, "loss": 0.0274, "reward": -0.11890482902526855, "reward_std": 3.1548802852630615, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -2.0689048767089844, "rewards/wrapped_format_reward": 0.5, "step": 89 }, { "completion_length": 500.0, "epoch": 18.0, "grad_norm": 33.38914108276367, "kl": 7.705580234527588, "learning_rate": 2.8125e-06, "loss": 0.3082, "reward": 1.586517572402954, "reward_std": 3.729795217514038, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4134823679924011, "rewards/wrapped_format_reward": 0.5, "step": 90 }, { "completion_length": 500.0, "epoch": 18.2, "grad_norm": 57.7512092590332, "kl": 6.441009998321533, "learning_rate": 2.84375e-06, "loss": 0.2576, "reward": -0.8143091201782227, "reward_std": 3.7249650955200195, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4444444477558136, "rewards/wrapped_driving_reward": -2.008753776550293, "rewards/wrapped_format_reward": 0.25, "step": 91 }, { "completion_length": 500.0, "epoch": 18.4, "grad_norm": 1.0778491497039795, "kl": 0.7857025265693665, "learning_rate": 2.875e-06, "loss": 0.0314, "reward": -0.4593994617462158, "reward_std": 3.805197238922119, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8343994617462158, "rewards/wrapped_format_reward": 0.375, "step": 92 }, { "completion_length": 500.0, "epoch": 18.6, "grad_norm": 1.1437242031097412, "kl": 0.5162321925163269, "learning_rate": 2.9062500000000003e-06, "loss": 0.0206, "reward": -1.5748236179351807, "reward_std": 3.3948116302490234, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.8248236179351807, "rewards/wrapped_format_reward": 0.25, "step": 93 }, { "completion_length": 500.0, "epoch": 18.8, "grad_norm": 0.782410204410553, "kl": 0.4215336740016937, "learning_rate": 2.9375000000000003e-06, "loss": 0.0169, "reward": -1.7746977806091309, "reward_std": 3.3859715461730957, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.962197780609131, "rewards/wrapped_format_reward": 0.25, "step": 94 }, { "completion_length": 500.0, "epoch": 19.0, "grad_norm": 4.5705718994140625, "kl": 2.0152359008789062, "learning_rate": 2.96875e-06, "loss": 0.0806, "reward": 0.6353222131729126, "reward_std": 3.1809890270233154, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1146776676177979, "rewards/wrapped_format_reward": 0.25, "step": 95 }, { "completion_length": 500.0, "epoch": 19.2, "grad_norm": 0.7173673510551453, "kl": 0.5042878985404968, "learning_rate": 3e-06, "loss": 0.0202, "reward": 3.3471288681030273, "reward_std": 0.31114432215690613, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.722128689289093, "rewards/wrapped_format_reward": 0.625, "step": 96 }, { "completion_length": 500.0, "epoch": 19.4, "grad_norm": 2.924496650695801, "kl": 0.9709882736206055, "learning_rate": 3.03125e-06, "loss": 0.0388, "reward": 1.2860008478164673, "reward_std": 2.1523053646087646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -1.1306657791137695, "rewards/wrapped_format_reward": 0.5, "step": 97 }, { "completion_length": 500.0, "epoch": 19.6, "grad_norm": 1.3449220657348633, "kl": 0.776192843914032, "learning_rate": 3.0625000000000003e-06, "loss": 0.031, "reward": -1.2475244998931885, "reward_std": 3.184887170791626, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.2475244998931885, "rewards/wrapped_format_reward": 0.0, "step": 98 }, { "completion_length": 500.0, "epoch": 19.8, "grad_norm": 16.095233917236328, "kl": 1.6037352085113525, "learning_rate": 3.0937500000000002e-06, "loss": 0.0641, "reward": 0.7092133164405823, "reward_std": 3.2424275875091553, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.040786862373352, "rewards/wrapped_format_reward": 0.25, "step": 99 }, { "completion_length": 500.0, "epoch": 20.0, "grad_norm": 1.065063714981079, "kl": 0.6967657208442688, "learning_rate": 3.125e-06, "loss": 0.0279, "reward": 1.58005690574646, "reward_std": 3.7297377586364746, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.41994309425354004, "rewards/wrapped_format_reward": 0.5, "step": 100 }, { "completion_length": 500.0, "epoch": 20.2, "grad_norm": 0.7442240715026855, "kl": 0.5057598352432251, "learning_rate": 3.15625e-06, "loss": 0.0202, "reward": -0.4889770746231079, "reward_std": 3.2432987689971924, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.1139769554138184, "rewards/wrapped_format_reward": 0.625, "step": 101 }, { "completion_length": 500.0, "epoch": 20.4, "grad_norm": 0.6366997361183167, "kl": 0.44367504119873047, "learning_rate": 3.1875e-06, "loss": 0.0177, "reward": -2.3750693798065186, "reward_std": 2.5939254760742188, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.2500693798065186, "rewards/wrapped_format_reward": 0.375, "step": 102 }, { "completion_length": 500.0, "epoch": 20.6, "grad_norm": 2.0096611976623535, "kl": 0.4689376652240753, "learning_rate": 3.2187500000000003e-06, "loss": 0.0188, "reward": -0.5932518243789673, "reward_std": 3.942629814147949, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.9682518243789673, "rewards/wrapped_format_reward": 0.375, "step": 103 }, { "completion_length": 500.0, "epoch": 20.8, "grad_norm": 1.2003757953643799, "kl": 0.3688035309314728, "learning_rate": 3.2500000000000002e-06, "loss": 0.0148, "reward": 2.4348607063293457, "reward_std": 1.402535080909729, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.824999988079071, "rewards/wrapped_driving_reward": 0.1098608672618866, "rewards/wrapped_format_reward": 0.5, "step": 104 }, { "completion_length": 500.0, "epoch": 21.0, "grad_norm": 1.0951755046844482, "kl": 0.6130416393280029, "learning_rate": 3.28125e-06, "loss": 0.0245, "reward": 1.912153720855713, "reward_std": 2.661609649658203, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4628463387489319, "rewards/wrapped_format_reward": 0.375, "step": 105 }, { "completion_length": 500.0, "epoch": 21.2, "grad_norm": 0.7108362317085266, "kl": 0.4603574275970459, "learning_rate": 3.3125e-06, "loss": 0.0184, "reward": -0.09991639852523804, "reward_std": 2.701847791671753, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -1.674916386604309, "rewards/wrapped_format_reward": 0.125, "step": 106 }, { "completion_length": 500.0, "epoch": 21.4, "grad_norm": 0.6701599955558777, "kl": 0.40188899636268616, "learning_rate": 3.34375e-06, "loss": 0.0161, "reward": -0.8211934566497803, "reward_std": 3.6706011295318604, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0711934566497803, "rewards/wrapped_format_reward": 0.25, "step": 107 }, { "completion_length": 500.0, "epoch": 21.6, "grad_norm": 2.1547489166259766, "kl": 1.2839192152023315, "learning_rate": 3.3750000000000003e-06, "loss": 0.0514, "reward": 1.2595562934875488, "reward_std": 3.514036178588867, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.36544373631477356, "rewards/wrapped_format_reward": 0.125, "step": 108 }, { "completion_length": 500.0, "epoch": 21.8, "grad_norm": 1.838152527809143, "kl": 0.49252963066101074, "learning_rate": 3.40625e-06, "loss": 0.0197, "reward": 1.507678508758545, "reward_std": 3.675663948059082, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -0.46959418058395386, "rewards/wrapped_format_reward": 0.5, "step": 109 }, { "completion_length": 500.0, "epoch": 22.0, "grad_norm": 0.7439582347869873, "kl": 0.595367431640625, "learning_rate": 3.4375e-06, "loss": 0.0238, "reward": -3.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 110 }, { "completion_length": 500.0, "epoch": 22.2, "grad_norm": 0.5864555835723877, "kl": 0.3803044855594635, "learning_rate": 3.46875e-06, "loss": 0.0152, "reward": 1.1622142791748047, "reward_std": 3.198068618774414, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -1.0650583505630493, "rewards/wrapped_format_reward": 0.75, "step": 111 }, { "completion_length": 500.0, "epoch": 22.4, "grad_norm": 4.526273250579834, "kl": 0.46392467617988586, "learning_rate": 3.5e-06, "loss": 0.0186, "reward": -0.7247750163078308, "reward_std": 3.514815330505371, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0997748374938965, "rewards/wrapped_format_reward": 0.375, "step": 112 }, { "completion_length": 500.0, "epoch": 22.6, "grad_norm": 0.5704318881034851, "kl": 0.31563645601272583, "learning_rate": 3.5312500000000007e-06, "loss": 0.0126, "reward": -0.6594128608703613, "reward_std": 3.574946880340576, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0344128608703613, "rewards/wrapped_format_reward": 0.375, "step": 113 }, { "completion_length": 500.0, "epoch": 22.8, "grad_norm": 0.671708881855011, "kl": 0.44867807626724243, "learning_rate": 3.5625e-06, "loss": 0.0179, "reward": 1.6787878274917603, "reward_std": 0.9073445200920105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9462121725082397, "rewards/wrapped_format_reward": 0.625, "step": 114 }, { "completion_length": 500.0, "epoch": 23.0, "grad_norm": 1.2352200746536255, "kl": 0.46100977063179016, "learning_rate": 3.59375e-06, "loss": 0.0184, "reward": -0.30863749980926514, "reward_std": 3.9797427654266357, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8086374998092651, "rewards/wrapped_format_reward": 0.5, "step": 115 }, { "completion_length": 500.0, "epoch": 23.2, "grad_norm": 0.783157229423523, "kl": 0.44468817114830017, "learning_rate": 3.625e-06, "loss": 0.0178, "reward": 1.9851404428482056, "reward_std": 1.0498998165130615, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2648596167564392, "rewards/wrapped_format_reward": 0.25, "step": 116 }, { "completion_length": 500.0, "epoch": 23.4, "grad_norm": 0.6096097826957703, "kl": 0.36140069365501404, "learning_rate": 3.65625e-06, "loss": 0.0145, "reward": -0.9730753898620605, "reward_std": 2.724126100540161, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.0980753898620605, "rewards/wrapped_format_reward": 0.625, "step": 117 }, { "completion_length": 500.0, "epoch": 23.6, "grad_norm": 2.4817147254943848, "kl": 0.3356289267539978, "learning_rate": 3.6875000000000007e-06, "loss": 0.0134, "reward": 2.9043874740600586, "reward_std": 0.34505343437194824, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5293872952461243, "rewards/wrapped_format_reward": 0.375, "step": 118 }, { "completion_length": 500.0, "epoch": 23.8, "grad_norm": 1.3457905054092407, "kl": 0.32610735297203064, "learning_rate": 3.7187500000000006e-06, "loss": 0.013, "reward": -0.004844188690185547, "reward_std": 3.559382915496826, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.7548441886901855, "rewards/wrapped_format_reward": 0.25, "step": 119 }, { "completion_length": 500.0, "epoch": 24.0, "grad_norm": 0.8271002769470215, "kl": 0.592341423034668, "learning_rate": 3.7500000000000005e-06, "loss": 0.0237, "reward": 0.23737984895706177, "reward_std": 2.8921873569488525, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.2376201152801514, "rewards/wrapped_format_reward": 0.125, "step": 120 }, { "completion_length": 500.0, "epoch": 24.2, "grad_norm": 1.575377106666565, "kl": 0.31468361616134644, "learning_rate": 3.78125e-06, "loss": 0.0126, "reward": 0.08798408508300781, "reward_std": 3.364243984222412, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.9120157957077026, "rewards/wrapped_format_reward": 0.5, "step": 121 }, { "completion_length": 500.0, "epoch": 24.4, "grad_norm": 1.548542857170105, "kl": 0.7125066518783569, "learning_rate": 3.8125e-06, "loss": 0.0285, "reward": 3.202035903930664, "reward_std": 0.5515704154968262, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8958333134651184, "rewards/wrapped_driving_reward": 0.6812027096748352, "rewards/wrapped_format_reward": 0.625, "step": 122 }, { "completion_length": 500.0, "epoch": 24.6, "grad_norm": 0.6466585397720337, "kl": 0.33141595125198364, "learning_rate": 3.84375e-06, "loss": 0.0133, "reward": -0.8563422560691833, "reward_std": 2.9308738708496094, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.981342315673828, "rewards/wrapped_format_reward": 0.625, "step": 123 }, { "completion_length": 500.0, "epoch": 24.8, "grad_norm": 0.9053751826286316, "kl": 0.3941192626953125, "learning_rate": 3.875e-06, "loss": 0.0158, "reward": -0.9088470935821533, "reward_std": 2.4116313457489014, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.6588470935821533, "rewards/wrapped_format_reward": 0.25, "step": 124 }, { "completion_length": 500.0, "epoch": 25.0, "grad_norm": 0.7404253482818604, "kl": 0.3537856936454773, "learning_rate": 3.90625e-06, "loss": 0.0142, "reward": -0.08935052156448364, "reward_std": 4.237273693084717, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5893504619598389, "rewards/wrapped_format_reward": 0.5, "step": 125 }, { "completion_length": 500.0, "epoch": 25.2, "grad_norm": 0.5974608659744263, "kl": 0.31292691826820374, "learning_rate": 3.9375e-06, "loss": 0.0125, "reward": -0.5100458860397339, "reward_std": 3.746746063232422, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8850458860397339, "rewards/wrapped_format_reward": 0.375, "step": 126 }, { "completion_length": 500.0, "epoch": 25.4, "grad_norm": 0.9886866807937622, "kl": 0.3266676068305969, "learning_rate": 3.96875e-06, "loss": 0.0131, "reward": 3.5397558212280273, "reward_std": 0.24529722332954407, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7897558212280273, "rewards/wrapped_format_reward": 0.75, "step": 127 }, { "completion_length": 500.0, "epoch": 25.6, "grad_norm": 0.6569087505340576, "kl": 0.28314509987831116, "learning_rate": 4.000000000000001e-06, "loss": 0.0113, "reward": -0.5303106904029846, "reward_std": 4.0666303634643555, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.7803106307983398, "rewards/wrapped_format_reward": 0.375, "step": 128 }, { "completion_length": 500.0, "epoch": 25.8, "grad_norm": 1.4679771661758423, "kl": 0.4160246253013611, "learning_rate": 4.031250000000001e-06, "loss": 0.0166, "reward": -0.5868573188781738, "reward_std": 3.941485643386841, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8368571996688843, "rewards/wrapped_format_reward": 0.25, "step": 129 }, { "completion_length": 500.0, "epoch": 26.0, "grad_norm": 0.5985941290855408, "kl": 0.31736528873443604, "learning_rate": 4.0625000000000005e-06, "loss": 0.0127, "reward": 1.1113789081573486, "reward_std": 3.429651975631714, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0136209726333618, "rewards/wrapped_format_reward": 0.625, "step": 130 }, { "completion_length": 500.0, "epoch": 26.2, "grad_norm": 0.6327362656593323, "kl": 0.40226221084594727, "learning_rate": 4.09375e-06, "loss": 0.0161, "reward": 0.7944153547286987, "reward_std": 2.8826069831848145, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -1.3055846691131592, "rewards/wrapped_format_reward": 0.625, "step": 131 }, { "completion_length": 500.0, "epoch": 26.4, "grad_norm": 0.6797881722450256, "kl": 0.4582635164260864, "learning_rate": 4.125e-06, "loss": 0.0183, "reward": 2.8031327724456787, "reward_std": 0.7006269097328186, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.05313277989625931, "rewards/wrapped_format_reward": 0.75, "step": 132 }, { "completion_length": 500.0, "epoch": 26.6, "grad_norm": 0.5752917528152466, "kl": 0.36153456568717957, "learning_rate": 4.15625e-06, "loss": 0.0145, "reward": -2.564105987548828, "reward_std": 2.871788263320923, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.189105987548828, "rewards/wrapped_format_reward": 0.125, "step": 133 }, { "completion_length": 500.0, "epoch": 26.8, "grad_norm": 0.569823145866394, "kl": 0.3600581884384155, "learning_rate": 4.1875e-06, "loss": 0.0144, "reward": 3.2037861347198486, "reward_std": 0.1732039451599121, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8287861943244934, "rewards/wrapped_format_reward": 0.375, "step": 134 }, { "completion_length": 500.0, "epoch": 27.0, "grad_norm": 11.942618370056152, "kl": 2.177290678024292, "learning_rate": 4.21875e-06, "loss": 0.0871, "reward": -2.3714582920074463, "reward_std": 1.8921570777893066, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.7464582920074463, "rewards/wrapped_format_reward": 0.375, "step": 135 }, { "completion_length": 500.0, "epoch": 27.2, "grad_norm": 0.5660642385482788, "kl": 0.2908819019794464, "learning_rate": 4.25e-06, "loss": 0.0116, "reward": -0.6192033290863037, "reward_std": 3.6331570148468018, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.1192033290863037, "rewards/wrapped_format_reward": 0.5, "step": 136 }, { "completion_length": 500.0, "epoch": 27.4, "grad_norm": 1.4041975736618042, "kl": 0.463067889213562, "learning_rate": 4.28125e-06, "loss": 0.0185, "reward": -2.75, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 137 }, { "completion_length": 500.0, "epoch": 27.6, "grad_norm": 0.4801470637321472, "kl": 0.2532914876937866, "learning_rate": 4.312500000000001e-06, "loss": 0.0101, "reward": -2.304798126220703, "reward_std": 3.3904037475585938, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.054798126220703, "rewards/wrapped_format_reward": 0.25, "step": 138 }, { "completion_length": 500.0, "epoch": 27.8, "grad_norm": 0.6999854445457458, "kl": 0.4938638210296631, "learning_rate": 4.3437500000000006e-06, "loss": 0.0198, "reward": -1.9556548595428467, "reward_std": 3.430131196975708, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9556548595428467, "rewards/wrapped_format_reward": 0.5, "step": 139 }, { "completion_length": 500.0, "epoch": 28.0, "grad_norm": 1.7622352838516235, "kl": 0.32535520195961, "learning_rate": 4.3750000000000005e-06, "loss": 0.013, "reward": 3.048956871032715, "reward_std": 0.7497459053993225, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.42395687103271484, "rewards/wrapped_format_reward": 0.625, "step": 140 }, { "completion_length": 500.0, "epoch": 28.2, "grad_norm": 1.0910435914993286, "kl": 0.3166691064834595, "learning_rate": 4.40625e-06, "loss": 0.0127, "reward": 2.1717541217803955, "reward_std": 2.45133900642395, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.45324593782424927, "rewards/wrapped_format_reward": 0.625, "step": 141 }, { "completion_length": 500.0, "epoch": 28.4, "grad_norm": 0.563035249710083, "kl": 0.34334975481033325, "learning_rate": 4.4375e-06, "loss": 0.0137, "reward": -0.30545544624328613, "reward_std": 2.531362295150757, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -2.149205446243286, "rewards/wrapped_format_reward": 0.375, "step": 142 }, { "completion_length": 500.0, "epoch": 28.6, "grad_norm": 0.6513370871543884, "kl": 0.2893451154232025, "learning_rate": 4.46875e-06, "loss": 0.0116, "reward": -0.7655331492424011, "reward_std": 3.818908214569092, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.265533208847046, "rewards/wrapped_format_reward": 0.5, "step": 143 }, { "completion_length": 500.0, "epoch": 28.8, "grad_norm": 0.6747258305549622, "kl": 0.4012701213359833, "learning_rate": 4.5e-06, "loss": 0.0161, "reward": -1.631712794303894, "reward_std": 2.7382092475891113, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.2567129135131836, "rewards/wrapped_format_reward": 0.625, "step": 144 }, { "completion_length": 500.0, "epoch": 29.0, "grad_norm": 1.388415813446045, "kl": 0.3030587136745453, "learning_rate": 4.53125e-06, "loss": 0.0121, "reward": -0.3709021210670471, "reward_std": 3.0999691486358643, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -2.0375688076019287, "rewards/wrapped_format_reward": 0.25, "step": 145 }, { "completion_length": 500.0, "epoch": 29.2, "grad_norm": 1.3835958242416382, "kl": 0.5185285806655884, "learning_rate": 4.5625e-06, "loss": 0.0207, "reward": -0.25881457328796387, "reward_std": 2.5411531925201416, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9010416865348816, "rewards/wrapped_driving_reward": -2.6598563194274902, "rewards/wrapped_format_reward": 0.5, "step": 146 }, { "completion_length": 500.0, "epoch": 29.4, "grad_norm": 1.0529229640960693, "kl": 0.304034560918808, "learning_rate": 4.59375e-06, "loss": 0.0122, "reward": 0.5033270120620728, "reward_std": 3.8062596321105957, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.6216729879379272, "rewards/wrapped_format_reward": 0.625, "step": 147 }, { "completion_length": 500.0, "epoch": 29.6, "grad_norm": 1.3924496173858643, "kl": 0.3519279956817627, "learning_rate": 4.625000000000001e-06, "loss": 0.0141, "reward": 1.081155776977539, "reward_std": 2.07023286819458, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -1.1438441276550293, "rewards/wrapped_format_reward": 0.25, "step": 148 }, { "completion_length": 500.0, "epoch": 29.8, "grad_norm": 1.6642379760742188, "kl": 0.5595217347145081, "learning_rate": 4.6562500000000005e-06, "loss": 0.0224, "reward": 2.879631519317627, "reward_std": 0.5703426003456116, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.12963154911994934, "rewards/wrapped_format_reward": 0.75, "step": 149 }, { "completion_length": 500.0, "epoch": 30.0, "grad_norm": 0.5775982737541199, "kl": 0.2810514271259308, "learning_rate": 4.6875000000000004e-06, "loss": 0.0112, "reward": 0.10444420576095581, "reward_std": 3.2514774799346924, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.0205557346343994, "rewards/wrapped_format_reward": 0.625, "step": 150 }, { "completion_length": 500.0, "epoch": 30.2, "grad_norm": 0.9198185801506042, "kl": 0.28956174850463867, "learning_rate": 4.71875e-06, "loss": 0.0116, "reward": -0.260436087846756, "reward_std": 2.8927173614501953, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.2604360580444336, "rewards/wrapped_format_reward": 0.5, "step": 151 }, { "completion_length": 500.0, "epoch": 30.4, "grad_norm": 0.7754166722297668, "kl": 0.38463443517684937, "learning_rate": 4.75e-06, "loss": 0.0154, "reward": 0.4493406414985657, "reward_std": 2.646808385848999, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.300659418106079, "rewards/wrapped_format_reward": 0.25, "step": 152 }, { "completion_length": 500.0, "epoch": 30.6, "grad_norm": 0.5780096650123596, "kl": 0.3385607898235321, "learning_rate": 4.781250000000001e-06, "loss": 0.0135, "reward": -1.7927734851837158, "reward_std": 3.755190849304199, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.792773485183716, "rewards/wrapped_format_reward": 0.5, "step": 153 }, { "completion_length": 500.0, "epoch": 30.8, "grad_norm": 0.5552729964256287, "kl": 0.28436288237571716, "learning_rate": 4.8125e-06, "loss": 0.0114, "reward": 0.6222386360168457, "reward_std": 2.1850173473358154, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8777613639831543, "rewards/wrapped_format_reward": 0.5, "step": 154 }, { "completion_length": 500.0, "epoch": 31.0, "grad_norm": 0.9199939370155334, "kl": 0.37593454122543335, "learning_rate": 4.84375e-06, "loss": 0.015, "reward": 0.4306233525276184, "reward_std": 3.5992963314056396, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6607142686843872, "rewards/wrapped_driving_reward": -1.605090856552124, "rewards/wrapped_format_reward": 0.625, "step": 155 }, { "completion_length": 500.0, "epoch": 31.2, "grad_norm": 0.5603945851325989, "kl": 0.3141997754573822, "learning_rate": 4.875e-06, "loss": 0.0126, "reward": -2.0, "reward_std": 1.3540064096450806, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 156 }, { "completion_length": 500.0, "epoch": 31.4, "grad_norm": 0.6190344095230103, "kl": 0.27537742257118225, "learning_rate": 4.90625e-06, "loss": 0.011, "reward": 1.0282058715820312, "reward_std": 3.394833564758301, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.846794068813324, "rewards/wrapped_format_reward": 0.375, "step": 157 }, { "completion_length": 500.0, "epoch": 31.6, "grad_norm": 0.6877399682998657, "kl": 0.2958383858203888, "learning_rate": 4.937500000000001e-06, "loss": 0.0118, "reward": -0.755041241645813, "reward_std": 3.4646472930908203, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.1300413608551025, "rewards/wrapped_format_reward": 0.375, "step": 158 }, { "completion_length": 500.0, "epoch": 31.8, "grad_norm": 31.595932006835938, "kl": 6.78364372253418, "learning_rate": 4.9687500000000005e-06, "loss": 0.2713, "reward": 0.35857605934143066, "reward_std": 2.9158554077148438, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.7664239406585693, "rewards/wrapped_format_reward": 0.625, "step": 159 }, { "completion_length": 500.0, "epoch": 32.0, "grad_norm": 1.7048529386520386, "kl": 0.35252463817596436, "learning_rate": 5e-06, "loss": 0.0141, "reward": 2.64233136177063, "reward_std": 0.7985239624977112, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14233140647411346, "rewards/wrapped_format_reward": 0.5, "step": 160 }, { "completion_length": 500.0, "epoch": 32.2, "grad_norm": 0.9699507355690002, "kl": 0.3963090479373932, "learning_rate": 4.99999405044338e-06, "loss": 0.0159, "reward": 2.578547477722168, "reward_std": 0.16937123239040375, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.046452634036540985, "rewards/wrapped_format_reward": 0.625, "step": 161 }, { "completion_length": 500.0, "epoch": 32.4, "grad_norm": 0.6427643299102783, "kl": 0.2770542800426483, "learning_rate": 4.999976201801837e-06, "loss": 0.0111, "reward": 2.2058539390563965, "reward_std": 1.1022424697875977, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.41914597153663635, "rewards/wrapped_format_reward": 0.625, "step": 162 }, { "completion_length": 500.0, "epoch": 32.6, "grad_norm": 0.6896190047264099, "kl": 0.27440541982650757, "learning_rate": 4.999946454160323e-06, "loss": 0.011, "reward": 1.1675429344177246, "reward_std": 3.4587650299072266, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7142857313156128, "rewards/wrapped_driving_reward": -0.7967426180839539, "rewards/wrapped_format_reward": 0.5, "step": 163 }, { "completion_length": 500.0, "epoch": 32.8, "grad_norm": 0.7695831060409546, "kl": 0.4198772609233856, "learning_rate": 4.9999048076604286e-06, "loss": 0.0168, "reward": -1.912316918373108, "reward_std": 2.555265426635742, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.9123167991638184, "rewards/wrapped_format_reward": 0.0, "step": 164 }, { "completion_length": 500.0, "epoch": 33.0, "grad_norm": 0.5920954942703247, "kl": 0.28969520330429077, "learning_rate": 4.999851262500375e-06, "loss": 0.0116, "reward": 3.1377110481262207, "reward_std": 0.5497580170631409, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5127109289169312, "rewards/wrapped_format_reward": 0.625, "step": 165 }, { "completion_length": 500.0, "epoch": 33.2, "grad_norm": 6.077564716339111, "kl": 0.44747114181518555, "learning_rate": 4.999785818935018e-06, "loss": 0.0179, "reward": 2.047877073287964, "reward_std": 2.722182035446167, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.45212286710739136, "rewards/wrapped_format_reward": 0.5, "step": 166 }, { "completion_length": 500.0, "epoch": 33.4, "grad_norm": 0.6889002919197083, "kl": 0.37658053636550903, "learning_rate": 4.999708477275846e-06, "loss": 0.0151, "reward": -2.284590482711792, "reward_std": 3.106440782546997, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.159590482711792, "rewards/wrapped_format_reward": 0.375, "step": 167 }, { "completion_length": 500.0, "epoch": 33.6, "grad_norm": 1.8645473718643188, "kl": 0.3408987522125244, "learning_rate": 4.9996192378909785e-06, "loss": 0.0136, "reward": 0.917718231678009, "reward_std": 2.948974132537842, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -1.157281756401062, "rewards/wrapped_format_reward": 0.625, "step": 168 }, { "completion_length": 500.0, "epoch": 33.8, "grad_norm": 0.535763680934906, "kl": 0.25453072786331177, "learning_rate": 4.999518101205162e-06, "loss": 0.0102, "reward": 3.604552745819092, "reward_std": 0.45598289370536804, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7295528054237366, "rewards/wrapped_format_reward": 0.875, "step": 169 }, { "completion_length": 500.0, "epoch": 34.0, "grad_norm": 1.0853776931762695, "kl": 0.2871979773044586, "learning_rate": 4.999405067699773e-06, "loss": 0.0115, "reward": 0.7697337865829468, "reward_std": 3.0176069736480713, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3552662134170532, "rewards/wrapped_format_reward": 0.625, "step": 170 }, { "completion_length": 500.0, "epoch": 34.2, "grad_norm": 2.175551176071167, "kl": 0.7303879261016846, "learning_rate": 4.99928013791281e-06, "loss": 0.0292, "reward": 0.010015249252319336, "reward_std": 4.346557140350342, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -1.5837347507476807, "rewards/wrapped_format_reward": 0.625, "step": 171 }, { "completion_length": 500.0, "epoch": 34.4, "grad_norm": 1.3378883600234985, "kl": 0.2555471658706665, "learning_rate": 4.999143312438893e-06, "loss": 0.0102, "reward": 1.064118504524231, "reward_std": 1.464298963546753, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.4358813762664795, "rewards/wrapped_format_reward": 0.5, "step": 172 }, { "completion_length": 500.0, "epoch": 34.6, "grad_norm": 1.6005758047103882, "kl": 0.3272940516471863, "learning_rate": 4.998994591929266e-06, "loss": 0.0131, "reward": 3.320277214050293, "reward_std": 0.5942137241363525, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8202772736549377, "rewards/wrapped_format_reward": 0.5, "step": 173 }, { "completion_length": 500.0, "epoch": 34.8, "grad_norm": 0.8775622844696045, "kl": 0.3981474041938782, "learning_rate": 4.998833977091783e-06, "loss": 0.0159, "reward": 2.548191547393799, "reward_std": 0.13038182258605957, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.17319151759147644, "rewards/wrapped_format_reward": 0.375, "step": 174 }, { "completion_length": 500.0, "epoch": 35.0, "grad_norm": 0.5131356716156006, "kl": 0.26495081186294556, "learning_rate": 4.998661468690914e-06, "loss": 0.0106, "reward": 0.2881455421447754, "reward_std": 3.1594552993774414, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.7118544578552246, "rewards/wrapped_format_reward": 0.5, "step": 175 }, { "completion_length": 500.0, "epoch": 35.2, "grad_norm": 1.4990577697753906, "kl": 0.3656232953071594, "learning_rate": 4.99847706754774e-06, "loss": 0.0146, "reward": 2.0933961868286133, "reward_std": 0.39702948927879333, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.594103991985321, "rewards/wrapped_format_reward": 0.75, "step": 176 }, { "completion_length": 500.0, "epoch": 35.4, "grad_norm": 0.5740483999252319, "kl": 0.265653520822525, "learning_rate": 4.998280774539943e-06, "loss": 0.0106, "reward": 1.1700050830841064, "reward_std": 3.1657402515411377, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8299949765205383, "rewards/wrapped_format_reward": 0.5, "step": 177 }, { "completion_length": 500.0, "epoch": 35.6, "grad_norm": 0.6564896702766418, "kl": 0.265337198972702, "learning_rate": 4.998072590601808e-06, "loss": 0.0106, "reward": -0.852949857711792, "reward_std": 3.3822429180145264, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.102949857711792, "rewards/wrapped_format_reward": 0.25, "step": 178 }, { "completion_length": 500.0, "epoch": 35.8, "grad_norm": 23.83641242980957, "kl": 4.303451061248779, "learning_rate": 4.9978525167242176e-06, "loss": 0.1721, "reward": 0.764412522315979, "reward_std": 2.8684115409851074, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -1.085587501525879, "rewards/wrapped_format_reward": 0.375, "step": 179 }, { "completion_length": 500.0, "epoch": 36.0, "grad_norm": 0.7350974082946777, "kl": 0.30466321110725403, "learning_rate": 4.997620553954645e-06, "loss": 0.0122, "reward": -0.10997164249420166, "reward_std": 2.883012056350708, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.109971523284912, "rewards/wrapped_format_reward": 0.5, "step": 180 }, { "completion_length": 500.0, "epoch": 36.2, "grad_norm": 1.8978265523910522, "kl": 0.5050737857818604, "learning_rate": 4.997376703397151e-06, "loss": 0.0202, "reward": -0.35431569814682007, "reward_std": 4.209678649902344, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8543156385421753, "rewards/wrapped_format_reward": 0.5, "step": 181 }, { "completion_length": 500.0, "epoch": 36.4, "grad_norm": 0.6739000678062439, "kl": 0.3342580497264862, "learning_rate": 4.9971209662123774e-06, "loss": 0.0134, "reward": 1.24358332157135, "reward_std": 3.5022475719451904, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5064166188240051, "rewards/wrapped_format_reward": 0.25, "step": 182 }, { "completion_length": 500.0, "epoch": 36.6, "grad_norm": 0.8527255654335022, "kl": 0.44380900263786316, "learning_rate": 4.996853343617542e-06, "loss": 0.0178, "reward": 1.3519909381866455, "reward_std": 2.9203834533691406, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.6167589426040649, "rewards/wrapped_format_reward": 0.5, "step": 183 }, { "completion_length": 500.0, "epoch": 36.8, "grad_norm": 0.6037353277206421, "kl": 0.3514931797981262, "learning_rate": 4.9965738368864345e-06, "loss": 0.0141, "reward": 2.4617958068847656, "reward_std": 0.43256813287734985, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1632043421268463, "rewards/wrapped_format_reward": 0.625, "step": 184 }, { "completion_length": 500.0, "epoch": 37.0, "grad_norm": 0.6498645544052124, "kl": 0.39014145731925964, "learning_rate": 4.996282447349408e-06, "loss": 0.0156, "reward": 2.696049451828003, "reward_std": 0.6518055200576782, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17895053327083588, "rewards/wrapped_format_reward": 0.875, "step": 185 }, { "completion_length": 500.0, "epoch": 37.2, "grad_norm": 0.6228243708610535, "kl": 0.2633248567581177, "learning_rate": 4.995979176393372e-06, "loss": 0.0105, "reward": 1.1363269090652466, "reward_std": 3.4644434452056885, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9886730313301086, "rewards/wrapped_format_reward": 0.625, "step": 186 }, { "completion_length": 500.0, "epoch": 37.4, "grad_norm": 8.40079402923584, "kl": 1.8278297185897827, "learning_rate": 4.99566402546179e-06, "loss": 0.0731, "reward": -0.7244951725006104, "reward_std": 3.783473491668701, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3705357313156128, "rewards/wrapped_driving_reward": -2.0950307846069336, "rewards/wrapped_format_reward": 0.5, "step": 187 }, { "completion_length": 500.0, "epoch": 37.6, "grad_norm": 0.5168763399124146, "kl": 0.2395801991224289, "learning_rate": 4.995336996054668e-06, "loss": 0.0096, "reward": 1.9002426862716675, "reward_std": 2.223823070526123, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.7020300626754761, "rewards/wrapped_format_reward": 0.625, "step": 188 }, { "completion_length": 500.0, "epoch": 37.8, "grad_norm": 0.9863908290863037, "kl": 0.27976277470588684, "learning_rate": 4.99499808972855e-06, "loss": 0.0112, "reward": -0.028857052326202393, "reward_std": 2.8702406883239746, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.1538569927215576, "rewards/wrapped_format_reward": 0.625, "step": 189 }, { "completion_length": 500.0, "epoch": 38.0, "grad_norm": 0.8377166986465454, "kl": 0.48623228073120117, "learning_rate": 4.994647308096509e-06, "loss": 0.0194, "reward": 2.531177043914795, "reward_std": 0.5673744082450867, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.4895104467868805, "rewards/wrapped_format_reward": 0.125, "step": 190 }, { "completion_length": 500.0, "epoch": 38.2, "grad_norm": 0.9249876737594604, "kl": 0.4526787996292114, "learning_rate": 4.994284652828143e-06, "loss": 0.0181, "reward": 0.6909130215644836, "reward_std": 3.1517491340637207, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1840870380401611, "rewards/wrapped_format_reward": 0.375, "step": 191 }, { "completion_length": 500.0, "epoch": 38.4, "grad_norm": 0.5216014385223389, "kl": 0.2844958007335663, "learning_rate": 4.993910125649561e-06, "loss": 0.0114, "reward": 1.347219705581665, "reward_std": 3.583749771118164, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.527780294418335, "rewards/wrapped_format_reward": 0.375, "step": 192 }, { "completion_length": 500.0, "epoch": 38.6, "grad_norm": 0.7675309181213379, "kl": 0.46290096640586853, "learning_rate": 4.99352372834338e-06, "loss": 0.0185, "reward": 1.28756582736969, "reward_std": 3.200143337249756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.5457674860954285, "rewards/wrapped_format_reward": 0.375, "step": 193 }, { "completion_length": 500.0, "epoch": 38.8, "grad_norm": 0.5267873406410217, "kl": 0.27246928215026855, "learning_rate": 4.993125462748714e-06, "loss": 0.0109, "reward": 0.5119737386703491, "reward_std": 2.572335958480835, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9880262613296509, "rewards/wrapped_format_reward": 0.5, "step": 194 }, { "completion_length": 500.0, "epoch": 39.0, "grad_norm": 0.557345449924469, "kl": 0.33223679661750793, "learning_rate": 4.992715330761167e-06, "loss": 0.0133, "reward": 1.9005041122436523, "reward_std": 1.5405527353286743, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5994958281517029, "rewards/wrapped_format_reward": 0.5, "step": 195 }, { "completion_length": 500.0, "epoch": 39.2, "grad_norm": 0.5145586729049683, "kl": 0.27872464060783386, "learning_rate": 4.992293334332821e-06, "loss": 0.0111, "reward": 0.08070141077041626, "reward_std": 2.161402702331543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.9192986488342285, "rewards/wrapped_format_reward": 1.0, "step": 196 }, { "completion_length": 500.0, "epoch": 39.4, "grad_norm": 0.5731538534164429, "kl": 0.2947344481945038, "learning_rate": 4.9918594754722286e-06, "loss": 0.0118, "reward": 1.089212417602539, "reward_std": 3.5704760551452637, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9107875823974609, "rewards/wrapped_format_reward": 0.5, "step": 197 }, { "completion_length": 500.0, "epoch": 39.6, "grad_norm": 1.0262069702148438, "kl": 0.36793074011802673, "learning_rate": 4.991413756244404e-06, "loss": 0.0147, "reward": 2.804293632507324, "reward_std": 0.05172164365649223, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3042936325073242, "rewards/wrapped_format_reward": 0.5, "step": 198 }, { "completion_length": 500.0, "epoch": 39.8, "grad_norm": 0.7235340476036072, "kl": 0.4867457151412964, "learning_rate": 4.990956178770814e-06, "loss": 0.0195, "reward": 2.4924705028533936, "reward_std": 0.6009870767593384, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -0.08252956718206406, "rewards/wrapped_format_reward": 0.625, "step": 199 }, { "completion_length": 500.0, "epoch": 40.0, "grad_norm": 0.8564599752426147, "kl": 0.4650922119617462, "learning_rate": 4.990486745229364e-06, "loss": 0.0186, "reward": 2.757322311401367, "reward_std": 0.5960695743560791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9444444179534912, "rewards/wrapped_driving_reward": 0.18787765502929688, "rewards/wrapped_format_reward": 0.625, "step": 200 }, { "completion_length": 500.0, "epoch": 40.2, "grad_norm": 0.6181848645210266, "kl": 0.33555763959884644, "learning_rate": 4.990005457854392e-06, "loss": 0.0134, "reward": 0.935232937335968, "reward_std": 2.9882521629333496, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7222222089767456, "rewards/wrapped_driving_reward": -0.7869893312454224, "rewards/wrapped_format_reward": 0.25, "step": 201 }, { "completion_length": 500.0, "epoch": 40.4, "grad_norm": 0.8061473369598389, "kl": 0.3526011109352112, "learning_rate": 4.989512318936654e-06, "loss": 0.0141, "reward": 2.038607597351074, "reward_std": 1.286082148551941, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.4386652112007141, "rewards/wrapped_format_reward": 0.5, "step": 202 }, { "completion_length": 500.0, "epoch": 40.6, "grad_norm": 1.0745853185653687, "kl": 0.7225068807601929, "learning_rate": 4.989007330823319e-06, "loss": 0.0289, "reward": 3.327683210372925, "reward_std": 0.45302456617355347, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5776832103729248, "rewards/wrapped_format_reward": 0.75, "step": 203 }, { "completion_length": 500.0, "epoch": 40.8, "grad_norm": 0.6797990202903748, "kl": 0.49457883834838867, "learning_rate": 4.988490495917948e-06, "loss": 0.0198, "reward": 1.4564661979675293, "reward_std": 3.6745243072509766, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7935338020324707, "rewards/wrapped_format_reward": 0.75, "step": 204 }, { "completion_length": 500.0, "epoch": 41.0, "grad_norm": 0.5719887018203735, "kl": 0.3025702238082886, "learning_rate": 4.987961816680493e-06, "loss": 0.0121, "reward": 0.8813665509223938, "reward_std": 3.3135292530059814, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.685606062412262, "rewards/wrapped_driving_reward": -1.1792395114898682, "rewards/wrapped_format_reward": 0.625, "step": 205 }, { "completion_length": 500.0, "epoch": 41.2, "grad_norm": 0.7324315905570984, "kl": 0.387521356344223, "learning_rate": 4.987421295627279e-06, "loss": 0.0155, "reward": 3.60201358795166, "reward_std": 0.17326904833316803, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.7547914981842041, "rewards/wrapped_format_reward": 0.875, "step": 206 }, { "completion_length": 500.0, "epoch": 41.4, "grad_norm": 1.4426076412200928, "kl": 0.3239262104034424, "learning_rate": 4.986868935330998e-06, "loss": 0.013, "reward": 1.1451337337493896, "reward_std": 3.175523042678833, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9798662662506104, "rewards/wrapped_format_reward": 0.625, "step": 207 }, { "completion_length": 500.0, "epoch": 41.6, "grad_norm": 0.6265994310379028, "kl": 0.31086966395378113, "learning_rate": 4.986304738420684e-06, "loss": 0.0124, "reward": -0.08087223768234253, "reward_std": 3.9480772018432617, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5808722972869873, "rewards/wrapped_format_reward": 0.5, "step": 208 }, { "completion_length": 500.0, "epoch": 41.8, "grad_norm": 0.5122293829917908, "kl": 0.22147461771965027, "learning_rate": 4.985728707581717e-06, "loss": 0.0089, "reward": 2.2255654335021973, "reward_std": 0.4417201578617096, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9821428656578064, "rewards/wrapped_driving_reward": -0.2565774619579315, "rewards/wrapped_format_reward": 0.5, "step": 209 }, { "completion_length": 500.0, "epoch": 42.0, "grad_norm": 0.5366212725639343, "kl": 0.2860429286956787, "learning_rate": 4.985140845555799e-06, "loss": 0.0114, "reward": -1.875, "reward_std": 1.108677864074707, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 210 }, { "completion_length": 500.0, "epoch": 42.2, "grad_norm": 0.757074773311615, "kl": 0.5041708946228027, "learning_rate": 4.984541155140945e-06, "loss": 0.0202, "reward": 1.3050158023834229, "reward_std": 3.2698206901550293, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6428571343421936, "rewards/wrapped_driving_reward": -0.7128414511680603, "rewards/wrapped_format_reward": 0.625, "step": 211 }, { "completion_length": 500.0, "epoch": 42.4, "grad_norm": 0.5149911046028137, "kl": 0.24131189286708832, "learning_rate": 4.9839296391914696e-06, "loss": 0.0097, "reward": -0.5590072870254517, "reward_std": 3.6906325817108154, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.9340074062347412, "rewards/wrapped_format_reward": 0.375, "step": 212 }, { "completion_length": 500.0, "epoch": 42.6, "grad_norm": 0.7922428250312805, "kl": 0.4074100852012634, "learning_rate": 4.98330630061797e-06, "loss": 0.0163, "reward": 0.7251100540161133, "reward_std": 3.205897569656372, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1498900651931763, "rewards/wrapped_format_reward": 0.375, "step": 213 }, { "completion_length": 500.0, "epoch": 42.8, "grad_norm": 0.8499237298965454, "kl": 0.533706784248352, "learning_rate": 4.982671142387316e-06, "loss": 0.0213, "reward": 1.2264912128448486, "reward_std": 3.1925883293151855, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7735086679458618, "rewards/wrapped_format_reward": 0.5, "step": 214 }, { "completion_length": 500.0, "epoch": 43.0, "grad_norm": 0.5848891139030457, "kl": 0.4833756983280182, "learning_rate": 4.982024167522638e-06, "loss": 0.0193, "reward": 2.640871524810791, "reward_std": 0.3350675404071808, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.935606062412262, "rewards/wrapped_driving_reward": -0.1697344183921814, "rewards/wrapped_format_reward": 0.875, "step": 215 }, { "completion_length": 500.0, "epoch": 43.2, "grad_norm": 1.0190398693084717, "kl": 0.5212844014167786, "learning_rate": 4.981365379103306e-06, "loss": 0.0209, "reward": 1.518845796585083, "reward_std": 1.8981057405471802, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -1.0834269523620605, "rewards/wrapped_format_reward": 0.625, "step": 216 }, { "completion_length": 500.0, "epoch": 43.4, "grad_norm": 0.6003134250640869, "kl": 0.2476293295621872, "learning_rate": 4.980694780264918e-06, "loss": 0.0099, "reward": 2.3462984561920166, "reward_std": 0.5958766937255859, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4037014842033386, "rewards/wrapped_format_reward": 0.75, "step": 217 }, { "completion_length": 500.0, "epoch": 43.6, "grad_norm": 0.5352597832679749, "kl": 0.33760789036750793, "learning_rate": 4.980012374199288e-06, "loss": 0.0135, "reward": 1.1177078485488892, "reward_std": 3.422083854675293, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8822920918464661, "rewards/wrapped_format_reward": 0.5, "step": 218 }, { "completion_length": 500.0, "epoch": 43.8, "grad_norm": 0.78425532579422, "kl": 0.45192739367485046, "learning_rate": 4.979318164154426e-06, "loss": 0.0181, "reward": 3.3331549167633057, "reward_std": 0.4030221104621887, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8331548571586609, "rewards/wrapped_format_reward": 0.5, "step": 219 }, { "completion_length": 500.0, "epoch": 44.0, "grad_norm": 0.5511319041252136, "kl": 0.2625429630279541, "learning_rate": 4.978612153434527e-06, "loss": 0.0105, "reward": 3.4739222526550293, "reward_std": 0.35263335704803467, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5989223122596741, "rewards/wrapped_format_reward": 0.875, "step": 220 }, { "completion_length": 500.0, "epoch": 44.2, "grad_norm": 0.8345232009887695, "kl": 0.5118071436882019, "learning_rate": 4.97789434539995e-06, "loss": 0.0205, "reward": 1.788142442703247, "reward_std": 2.3180289268493652, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.086857557296753, "rewards/wrapped_format_reward": 0.875, "step": 221 }, { "completion_length": 500.0, "epoch": 44.4, "grad_norm": 0.8292976021766663, "kl": 0.5234676003456116, "learning_rate": 4.977164743467206e-06, "loss": 0.0209, "reward": 1.3859682083129883, "reward_std": 3.6182608604431152, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.5515317916870117, "rewards/wrapped_format_reward": 0.5, "step": 222 }, { "completion_length": 500.0, "epoch": 44.6, "grad_norm": 0.8200549483299255, "kl": 0.3950418531894684, "learning_rate": 4.976423351108943e-06, "loss": 0.0158, "reward": 1.9203238487243652, "reward_std": 1.1563453674316406, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7046762704849243, "rewards/wrapped_format_reward": 0.625, "step": 223 }, { "completion_length": 500.0, "epoch": 44.8, "grad_norm": 0.6968622207641602, "kl": 0.2271728217601776, "learning_rate": 4.975670171853926e-06, "loss": 0.0091, "reward": -0.3170052766799927, "reward_std": 2.1093220710754395, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -2.919278144836426, "rewards/wrapped_format_reward": 0.625, "step": 224 }, { "completion_length": 500.0, "epoch": 45.0, "grad_norm": 0.7795050144195557, "kl": 0.4355601966381073, "learning_rate": 4.97490520928702e-06, "loss": 0.0174, "reward": 2.6324033737182617, "reward_std": 0.5314469933509827, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.11759641766548157, "rewards/wrapped_format_reward": 0.75, "step": 225 }, { "completion_length": 500.0, "epoch": 45.2, "grad_norm": 0.5524005889892578, "kl": 0.30952146649360657, "learning_rate": 4.974128467049177e-06, "loss": 0.0124, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 226 }, { "completion_length": 500.0, "epoch": 45.4, "grad_norm": 0.5645884871482849, "kl": 0.4939887821674347, "learning_rate": 4.9733399488374115e-06, "loss": 0.0198, "reward": 2.418989658355713, "reward_std": 0.14345024526119232, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.08101026713848114, "rewards/wrapped_format_reward": 0.5, "step": 227 }, { "completion_length": 500.0, "epoch": 45.6, "grad_norm": 0.9631263017654419, "kl": 0.6647568941116333, "learning_rate": 4.972539658404793e-06, "loss": 0.0266, "reward": -0.0228692889213562, "reward_std": 3.135000228881836, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.272869110107422, "rewards/wrapped_format_reward": 0.75, "step": 228 }, { "completion_length": 500.0, "epoch": 45.8, "grad_norm": 0.5868902802467346, "kl": 0.536701500415802, "learning_rate": 4.971727599560418e-06, "loss": 0.0215, "reward": 2.595135450363159, "reward_std": 0.5522119402885437, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.020135482773184776, "rewards/wrapped_format_reward": 0.625, "step": 229 }, { "completion_length": 500.0, "epoch": 46.0, "grad_norm": 0.6927148103713989, "kl": 0.5391973257064819, "learning_rate": 4.970903776169403e-06, "loss": 0.0216, "reward": 3.2273426055908203, "reward_std": 0.38745206594467163, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.7630569934844971, "rewards/wrapped_format_reward": 0.5, "step": 230 }, { "completion_length": 500.0, "epoch": 46.2, "grad_norm": 2.157358407974243, "kl": 0.5963761210441589, "learning_rate": 4.9700681921528495e-06, "loss": 0.0239, "reward": 3.3556950092315674, "reward_std": 0.5486971735954285, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7306950092315674, "rewards/wrapped_format_reward": 0.625, "step": 231 }, { "completion_length": 500.0, "epoch": 46.4, "grad_norm": 0.5409197211265564, "kl": 0.31054040789604187, "learning_rate": 4.9692208514878445e-06, "loss": 0.0124, "reward": -1.75, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 232 }, { "completion_length": 500.0, "epoch": 46.6, "grad_norm": 0.8271388411521912, "kl": 0.5030784606933594, "learning_rate": 4.968361758207428e-06, "loss": 0.0201, "reward": 2.2951016426086426, "reward_std": 0.6324443817138672, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.07989836484193802, "rewards/wrapped_format_reward": 0.375, "step": 233 }, { "completion_length": 500.0, "epoch": 46.8, "grad_norm": 0.9013113975524902, "kl": 0.527148425579071, "learning_rate": 4.9674909164005805e-06, "loss": 0.0211, "reward": -0.08311975002288818, "reward_std": 4.243640422821045, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5831197500228882, "rewards/wrapped_format_reward": 0.5, "step": 234 }, { "completion_length": 500.0, "epoch": 47.0, "grad_norm": 0.621760368347168, "kl": 0.5894174575805664, "learning_rate": 4.966608330212198e-06, "loss": 0.0236, "reward": 2.69521427154541, "reward_std": 0.2680894732475281, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.15200814604759216, "rewards/wrapped_format_reward": 0.875, "step": 235 }, { "completion_length": 500.0, "epoch": 47.2, "grad_norm": 0.6673128604888916, "kl": 0.42412999272346497, "learning_rate": 4.965714003843079e-06, "loss": 0.017, "reward": -2.0, "reward_std": 1.0801234245300293, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 236 }, { "completion_length": 500.0, "epoch": 47.4, "grad_norm": 0.6826753616333008, "kl": 0.48437440395355225, "learning_rate": 4.9648079415499e-06, "loss": 0.0194, "reward": 2.6671550273895264, "reward_std": 0.6421502828598022, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20784501731395721, "rewards/wrapped_format_reward": 0.875, "step": 237 }, { "completion_length": 500.0, "epoch": 47.6, "grad_norm": 0.7442097663879395, "kl": 0.5179538130760193, "learning_rate": 4.963890147645195e-06, "loss": 0.0207, "reward": 0.023519575595855713, "reward_std": 1.7913424968719482, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.351480484008789, "rewards/wrapped_format_reward": 0.375, "step": 238 }, { "completion_length": 500.0, "epoch": 47.8, "grad_norm": 0.9971833825111389, "kl": 0.2566893994808197, "learning_rate": 4.962960626497339e-06, "loss": 0.0103, "reward": 1.0741076469421387, "reward_std": 3.4465811252593994, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6758923530578613, "rewards/wrapped_format_reward": 0.25, "step": 239 }, { "completion_length": 500.0, "epoch": 48.0, "grad_norm": 0.776371955871582, "kl": 0.6667019724845886, "learning_rate": 4.962019382530521e-06, "loss": 0.0267, "reward": 0.7681245803833008, "reward_std": 3.63140606880188, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.6068754196166992, "rewards/wrapped_format_reward": 0.875, "step": 240 }, { "completion_length": 500.0, "epoch": 48.2, "grad_norm": 0.958461344242096, "kl": 0.6015651226043701, "learning_rate": 4.961066420224729e-06, "loss": 0.0241, "reward": 0.8900174498558044, "reward_std": 2.1547889709472656, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -1.568315863609314, "rewards/wrapped_format_reward": 0.5, "step": 241 }, { "completion_length": 500.0, "epoch": 48.4, "grad_norm": 0.8577614426612854, "kl": 0.7052382230758667, "learning_rate": 4.960101744115727e-06, "loss": 0.0282, "reward": 0.500007152557373, "reward_std": 3.6019463539123535, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.685606062412262, "rewards/wrapped_driving_reward": -1.6855988502502441, "rewards/wrapped_format_reward": 0.75, "step": 242 }, { "completion_length": 500.0, "epoch": 48.6, "grad_norm": 0.6088186502456665, "kl": 0.3410260081291199, "learning_rate": 4.959125358795031e-06, "loss": 0.0136, "reward": 1.2359226942062378, "reward_std": 3.157292127609253, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1390773057937622, "rewards/wrapped_format_reward": 0.875, "step": 243 }, { "completion_length": 500.0, "epoch": 48.8, "grad_norm": 0.6780346035957336, "kl": 0.47339513897895813, "learning_rate": 4.958137268909887e-06, "loss": 0.0189, "reward": 1.3727295398712158, "reward_std": 3.2822999954223633, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -0.8522703647613525, "rewards/wrapped_format_reward": 0.75, "step": 244 }, { "completion_length": 500.0, "epoch": 49.0, "grad_norm": 0.6219626069068909, "kl": 0.3212871849536896, "learning_rate": 4.957137479163253e-06, "loss": 0.0129, "reward": 0.08353948593139648, "reward_std": 2.884551525115967, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.2914605140686035, "rewards/wrapped_format_reward": 0.875, "step": 245 }, { "completion_length": 500.0, "epoch": 49.2, "grad_norm": 0.8742188811302185, "kl": 0.6009516716003418, "learning_rate": 4.956125994313775e-06, "loss": 0.024, "reward": 3.219036817550659, "reward_std": 0.6377858519554138, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8035714626312256, "rewards/wrapped_driving_reward": 0.5404652953147888, "rewards/wrapped_format_reward": 0.875, "step": 246 }, { "completion_length": 500.0, "epoch": 49.4, "grad_norm": 3.8272242546081543, "kl": 1.5439887046813965, "learning_rate": 4.95510281917576e-06, "loss": 0.0618, "reward": 3.679497241973877, "reward_std": 0.29719072580337524, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6794970631599426, "rewards/wrapped_format_reward": 1.0, "step": 247 }, { "completion_length": 500.0, "epoch": 49.6, "grad_norm": 0.5449193120002747, "kl": 0.3074452579021454, "learning_rate": 4.9540679586191605e-06, "loss": 0.0123, "reward": -0.8099073171615601, "reward_std": 2.768624782562256, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.9349074363708496, "rewards/wrapped_format_reward": 0.625, "step": 248 }, { "completion_length": 500.0, "epoch": 49.8, "grad_norm": 0.9541939496994019, "kl": 0.5022038221359253, "learning_rate": 4.953021417569545e-06, "loss": 0.0201, "reward": 0.9676476120948792, "reward_std": 3.3370866775512695, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1573524475097656, "rewards/wrapped_format_reward": 0.625, "step": 249 }, { "completion_length": 500.0, "epoch": 50.0, "grad_norm": 0.6913716197013855, "kl": 0.22377586364746094, "learning_rate": 4.9519632010080765e-06, "loss": 0.009, "reward": -0.7356908917427063, "reward_std": 3.197190761566162, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.4856908321380615, "rewards/wrapped_format_reward": 0.75, "step": 250 }, { "completion_length": 500.0, "epoch": 50.2, "grad_norm": 0.8929083347320557, "kl": 0.5696563720703125, "learning_rate": 4.950893313971492e-06, "loss": 0.0228, "reward": 1.0284161567687988, "reward_std": 2.463923454284668, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.7215839624404907, "rewards/wrapped_format_reward": 0.75, "step": 251 }, { "completion_length": 500.0, "epoch": 50.4, "grad_norm": 1.311846375465393, "kl": 0.7255131006240845, "learning_rate": 4.949811761552074e-06, "loss": 0.029, "reward": 1.2360342741012573, "reward_std": 3.5064666271209717, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6428571343421936, "rewards/wrapped_driving_reward": -0.656822919845581, "rewards/wrapped_format_reward": 0.5, "step": 252 }, { "completion_length": 500.0, "epoch": 50.6, "grad_norm": 0.4827212989330292, "kl": 0.33761507272720337, "learning_rate": 4.9487185488976284e-06, "loss": 0.0135, "reward": 1.2470874786376953, "reward_std": 3.3136324882507324, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8779124021530151, "rewards/wrapped_format_reward": 0.625, "step": 253 }, { "completion_length": 500.0, "epoch": 50.8, "grad_norm": 0.846889078617096, "kl": 0.7514812350273132, "learning_rate": 4.94761368121146e-06, "loss": 0.0301, "reward": 2.474397897720337, "reward_std": 0.5729619264602661, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -0.24435219168663025, "rewards/wrapped_format_reward": 0.75, "step": 254 }, { "completion_length": 467.0, "epoch": 51.0, "grad_norm": 0.6226330399513245, "kl": 0.9035637378692627, "learning_rate": 4.9464971637523465e-06, "loss": 0.0361, "reward": 2.976405620574951, "reward_std": 0.597745954990387, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.023594465106725693, "rewards/wrapped_format_reward": 1.0, "step": 255 }, { "completion_length": 500.0, "epoch": 51.2, "grad_norm": 0.6273528933525085, "kl": 0.46313872933387756, "learning_rate": 4.9453690018345144e-06, "loss": 0.0185, "reward": 1.4185447692871094, "reward_std": 3.323413848876953, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7064552307128906, "rewards/wrapped_format_reward": 0.625, "step": 256 }, { "completion_length": 500.0, "epoch": 51.4, "grad_norm": 1.4358348846435547, "kl": 0.3820998966693878, "learning_rate": 4.944229200827616e-06, "loss": 0.0153, "reward": 0.3355594873428345, "reward_std": 2.6716933250427246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.414440393447876, "rewards/wrapped_format_reward": 0.25, "step": 257 }, { "completion_length": 500.0, "epoch": 51.6, "grad_norm": 0.5715557932853699, "kl": 0.26808997988700867, "learning_rate": 4.943077766156698e-06, "loss": 0.0107, "reward": 1.8104004859924316, "reward_std": 3.5490846633911133, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -0.4168723225593567, "rewards/wrapped_format_reward": 0.75, "step": 258 }, { "completion_length": 500.0, "epoch": 51.8, "grad_norm": 0.9949742555618286, "kl": 0.6875662803649902, "learning_rate": 4.941914703302181e-06, "loss": 0.0275, "reward": 3.0740609169006348, "reward_std": 1.2247024774551392, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.5740607976913452, "rewards/wrapped_format_reward": 0.75, "step": 259 }, { "completion_length": 500.0, "epoch": 52.0, "grad_norm": 0.5395461916923523, "kl": 0.3153356909751892, "learning_rate": 4.9407400177998335e-06, "loss": 0.0126, "reward": 1.252676010131836, "reward_std": 3.17907452583313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.9348239898681641, "rewards/wrapped_format_reward": 0.75, "step": 260 }, { "completion_length": 500.0, "epoch": 52.2, "grad_norm": 0.5175924301147461, "kl": 0.41518455743789673, "learning_rate": 4.939553715240741e-06, "loss": 0.0166, "reward": 3.3429102897644043, "reward_std": 0.5592035055160522, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.46791017055511475, "rewards/wrapped_format_reward": 0.875, "step": 261 }, { "completion_length": 500.0, "epoch": 52.4, "grad_norm": 0.47460633516311646, "kl": 0.5516465306282043, "learning_rate": 4.938355801271282e-06, "loss": 0.0221, "reward": 1.5110602378845215, "reward_std": 1.9977182149887085, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -1.3014397621154785, "rewards/wrapped_format_reward": 0.875, "step": 262 }, { "completion_length": 500.0, "epoch": 52.6, "grad_norm": 0.8124420046806335, "kl": 0.5848581790924072, "learning_rate": 4.937146281593103e-06, "loss": 0.0234, "reward": 3.247490406036377, "reward_std": 0.6556951999664307, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6224905252456665, "rewards/wrapped_format_reward": 0.625, "step": 263 }, { "completion_length": 500.0, "epoch": 52.8, "grad_norm": 0.6221967339515686, "kl": 0.30070972442626953, "learning_rate": 4.935925161963089e-06, "loss": 0.012, "reward": 0.7041885852813721, "reward_std": 3.16145658493042, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.295811414718628, "rewards/wrapped_format_reward": 0.5, "step": 264 }, { "completion_length": 500.0, "epoch": 53.0, "grad_norm": 0.8025345206260681, "kl": 0.4004298448562622, "learning_rate": 4.9346924481933345e-06, "loss": 0.016, "reward": -2.3282265663146973, "reward_std": 1.6561260223388672, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.9532265663146973, "rewards/wrapped_format_reward": 0.625, "step": 265 }, { "completion_length": 500.0, "epoch": 53.2, "grad_norm": 2.1500437259674072, "kl": 0.9270884990692139, "learning_rate": 4.933448146151122e-06, "loss": 0.0371, "reward": 3.697523355484009, "reward_std": 0.2363002598285675, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8225233554840088, "rewards/wrapped_format_reward": 0.875, "step": 266 }, { "completion_length": 500.0, "epoch": 53.4, "grad_norm": 0.9306778311729431, "kl": 0.46557193994522095, "learning_rate": 4.932192261758885e-06, "loss": 0.0186, "reward": 3.163017749786377, "reward_std": 0.2954404056072235, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.3107450008392334, "rewards/wrapped_format_reward": 0.875, "step": 267 }, { "completion_length": 500.0, "epoch": 53.6, "grad_norm": 0.7212801575660706, "kl": 0.5435701608657837, "learning_rate": 4.930924800994192e-06, "loss": 0.0217, "reward": 1.4288103580474854, "reward_std": 3.6827523708343506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5711897015571594, "rewards/wrapped_format_reward": 0.5, "step": 268 }, { "completion_length": 500.0, "epoch": 53.8, "grad_norm": 0.8867128491401672, "kl": 0.7739095687866211, "learning_rate": 4.929645769889704e-06, "loss": 0.031, "reward": -2.1571238040924072, "reward_std": 1.6486133337020874, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.7821238040924072, "rewards/wrapped_format_reward": 0.625, "step": 269 }, { "completion_length": 500.0, "epoch": 54.0, "grad_norm": 0.510140061378479, "kl": 0.8436269164085388, "learning_rate": 4.928355174533153e-06, "loss": 0.0337, "reward": 2.837273120880127, "reward_std": 0.3524271845817566, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.03772694990038872, "rewards/wrapped_format_reward": 0.875, "step": 270 }, { "completion_length": 500.0, "epoch": 54.2, "grad_norm": 0.6447189450263977, "kl": 0.37782302498817444, "learning_rate": 4.927053021067321e-06, "loss": 0.0151, "reward": 2.576737642288208, "reward_std": 0.5483171939849854, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.326737642288208, "rewards/wrapped_format_reward": 0.25, "step": 271 }, { "completion_length": 500.0, "epoch": 54.4, "grad_norm": 0.670859158039093, "kl": 1.0185219049453735, "learning_rate": 4.925739315689991e-06, "loss": 0.0407, "reward": 2.652383327484131, "reward_std": 0.27769026160240173, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.1392831951379776, "rewards/wrapped_format_reward": 0.875, "step": 272 }, { "completion_length": 500.0, "epoch": 54.6, "grad_norm": 0.5182738304138184, "kl": 0.6035473942756653, "learning_rate": 4.924414064653938e-06, "loss": 0.0241, "reward": -1.0863802433013916, "reward_std": 2.398730754852295, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -3.188652992248535, "rewards/wrapped_format_reward": 0.625, "step": 273 }, { "completion_length": 500.0, "epoch": 54.8, "grad_norm": 0.506093442440033, "kl": 0.23574630916118622, "learning_rate": 4.923077274266886e-06, "loss": 0.0094, "reward": 1.0250887870788574, "reward_std": 2.6349871158599854, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8499112129211426, "rewards/wrapped_format_reward": 0.875, "step": 274 }, { "completion_length": 500.0, "epoch": 55.0, "grad_norm": 1.008044719696045, "kl": 1.0019625425338745, "learning_rate": 4.9217289508914836e-06, "loss": 0.0401, "reward": 3.194295883178711, "reward_std": 0.7734468579292297, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.44429582357406616, "rewards/wrapped_format_reward": 0.75, "step": 275 }, { "completion_length": 500.0, "epoch": 55.2, "grad_norm": 0.7316671013832092, "kl": 0.44905000925064087, "learning_rate": 4.92036910094527e-06, "loss": 0.018, "reward": 2.1943540573120117, "reward_std": 1.172672152519226, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.40564602613449097, "rewards/wrapped_format_reward": 0.625, "step": 276 }, { "completion_length": 500.0, "epoch": 55.4, "grad_norm": 0.5603216290473938, "kl": 0.6424278616905212, "learning_rate": 4.91899773090065e-06, "loss": 0.0257, "reward": 2.5801260471343994, "reward_std": 0.4576241075992584, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.16987384855747223, "rewards/wrapped_format_reward": 0.75, "step": 277 }, { "completion_length": 500.0, "epoch": 55.6, "grad_norm": 0.7360312938690186, "kl": 0.760132372379303, "learning_rate": 4.917614847284858e-06, "loss": 0.0304, "reward": 3.1857800483703613, "reward_std": 0.6251810193061829, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5607799887657166, "rewards/wrapped_format_reward": 0.625, "step": 278 }, { "completion_length": 500.0, "epoch": 55.8, "grad_norm": 0.6797391176223755, "kl": 0.7862927317619324, "learning_rate": 4.91622045667993e-06, "loss": 0.0315, "reward": 1.7073078155517578, "reward_std": 3.480921983718872, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.542692244052887, "rewards/wrapped_format_reward": 0.75, "step": 279 }, { "completion_length": 500.0, "epoch": 56.0, "grad_norm": 0.7321208119392395, "kl": 0.5703269243240356, "learning_rate": 4.914814565722671e-06, "loss": 0.0228, "reward": 2.8495311737060547, "reward_std": 0.2323673665523529, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.01619771495461464, "rewards/wrapped_format_reward": 0.875, "step": 280 }, { "completion_length": 500.0, "epoch": 56.2, "grad_norm": 0.5318320989608765, "kl": 0.695686936378479, "learning_rate": 4.913397181104623e-06, "loss": 0.0278, "reward": 1.8786531686782837, "reward_std": 3.592921257019043, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3713468611240387, "rewards/wrapped_format_reward": 0.75, "step": 281 }, { "completion_length": 500.0, "epoch": 56.4, "grad_norm": 1.4247713088989258, "kl": 0.6277374029159546, "learning_rate": 4.9119683095720325e-06, "loss": 0.0251, "reward": -1.9102458953857422, "reward_std": 3.205610513687134, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.035245895385742, "rewards/wrapped_format_reward": 0.625, "step": 282 }, { "completion_length": 500.0, "epoch": 56.6, "grad_norm": 0.5404212474822998, "kl": 0.5326585173606873, "learning_rate": 4.9105279579258234e-06, "loss": 0.0213, "reward": 2.9429216384887695, "reward_std": 0.2231481820344925, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.0906490758061409, "rewards/wrapped_format_reward": 0.875, "step": 283 }, { "completion_length": 500.0, "epoch": 56.8, "grad_norm": 0.649975061416626, "kl": 0.692694365978241, "learning_rate": 4.909076133021558e-06, "loss": 0.0277, "reward": 2.443101406097412, "reward_std": 0.7186054587364197, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06810133904218674, "rewards/wrapped_format_reward": 0.375, "step": 284 }, { "completion_length": 500.0, "epoch": 57.0, "grad_norm": 1.1204376220703125, "kl": 0.5154175162315369, "learning_rate": 4.907612841769407e-06, "loss": 0.0206, "reward": 3.386246681213379, "reward_std": 0.1496068835258484, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7612467408180237, "rewards/wrapped_format_reward": 0.625, "step": 285 }, { "completion_length": 500.0, "epoch": 57.2, "grad_norm": 0.734048068523407, "kl": 0.5711463093757629, "learning_rate": 4.906138091134118e-06, "loss": 0.0228, "reward": 1.2789300680160522, "reward_std": 3.544008731842041, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9710699319839478, "rewards/wrapped_format_reward": 0.75, "step": 286 }, { "completion_length": 500.0, "epoch": 57.4, "grad_norm": 0.5225769281387329, "kl": 0.27049189805984497, "learning_rate": 4.904651888134982e-06, "loss": 0.0108, "reward": 3.3136672973632812, "reward_std": 0.08986721932888031, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.3414452075958252, "rewards/wrapped_format_reward": 1.0, "step": 287 }, { "completion_length": 413.0, "epoch": 57.6, "grad_norm": 1.317987322807312, "kl": 0.8254587650299072, "learning_rate": 4.903154239845798e-06, "loss": 0.033, "reward": 3.501180648803711, "reward_std": 0.3513033092021942, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6261807084083557, "rewards/wrapped_format_reward": 0.875, "step": 288 }, { "completion_length": 500.0, "epoch": 57.8, "grad_norm": 0.6399166584014893, "kl": 0.6499161124229431, "learning_rate": 4.901645153394838e-06, "loss": 0.026, "reward": 2.196173667907715, "reward_std": 1.2169743776321411, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454382896423, "rewards/wrapped_driving_reward": -0.38337159156799316, "rewards/wrapped_format_reward": 0.625, "step": 289 }, { "completion_length": 465.0, "epoch": 58.0, "grad_norm": 0.57232266664505, "kl": 0.6679652333259583, "learning_rate": 4.900124635964823e-06, "loss": 0.0267, "reward": 2.7008635997772217, "reward_std": 0.3644496500492096, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17413626611232758, "rewards/wrapped_format_reward": 0.875, "step": 290 }, { "completion_length": 500.0, "epoch": 58.2, "grad_norm": 0.5505650639533997, "kl": 0.6699390411376953, "learning_rate": 4.898592694792871e-06, "loss": 0.0268, "reward": 3.24006724357605, "reward_std": 0.26251503825187683, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7400672435760498, "rewards/wrapped_format_reward": 0.5, "step": 291 }, { "completion_length": 500.0, "epoch": 58.4, "grad_norm": 0.6135743260383606, "kl": 0.41203054785728455, "learning_rate": 4.897049337170483e-06, "loss": 0.0165, "reward": 2.3971381187438965, "reward_std": 2.2839035987854004, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4778619110584259, "rewards/wrapped_format_reward": 0.875, "step": 292 }, { "completion_length": 500.0, "epoch": 58.6, "grad_norm": 0.5831702351570129, "kl": 0.4307749271392822, "learning_rate": 4.895494570443492e-06, "loss": 0.0172, "reward": -0.7622057199478149, "reward_std": 2.7372794151306152, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.0122056007385254, "rewards/wrapped_format_reward": 0.75, "step": 293 }, { "completion_length": 500.0, "epoch": 58.8, "grad_norm": 0.5363391637802124, "kl": 0.3049720525741577, "learning_rate": 4.8939284020120365e-06, "loss": 0.0122, "reward": 1.0504395961761475, "reward_std": 3.033752202987671, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -1.274560570716858, "rewards/wrapped_format_reward": 0.875, "step": 294 }, { "completion_length": 500.0, "epoch": 59.0, "grad_norm": 0.6540634632110596, "kl": 0.2309640794992447, "learning_rate": 4.8923508393305224e-06, "loss": 0.0092, "reward": 1.0688008069992065, "reward_std": 3.047051429748535, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9311991333961487, "rewards/wrapped_format_reward": 0.5, "step": 295 }, { "completion_length": 500.0, "epoch": 59.2, "grad_norm": 0.7878542542457581, "kl": 0.44138067960739136, "learning_rate": 4.890761889907589e-06, "loss": 0.0177, "reward": 1.3176345825195312, "reward_std": 2.8866841793060303, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8073654770851135, "rewards/wrapped_format_reward": 0.625, "step": 296 }, { "completion_length": 500.0, "epoch": 59.4, "grad_norm": 0.5553893446922302, "kl": 0.6941342949867249, "learning_rate": 4.8891615613060715e-06, "loss": 0.0278, "reward": 2.6625680923461914, "reward_std": 0.46423208713531494, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.037567950785160065, "rewards/wrapped_format_reward": 0.625, "step": 297 }, { "completion_length": 500.0, "epoch": 59.6, "grad_norm": 0.5995137095451355, "kl": 0.9473585486412048, "learning_rate": 4.887549861142967e-06, "loss": 0.0379, "reward": 1.7219200134277344, "reward_std": 2.298279047012329, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9030801057815552, "rewards/wrapped_format_reward": 0.625, "step": 298 }, { "completion_length": 500.0, "epoch": 59.8, "grad_norm": 0.58851557970047, "kl": 0.5266136527061462, "learning_rate": 4.885926797089396e-06, "loss": 0.0211, "reward": 2.1079962253570557, "reward_std": 0.7257985472679138, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.90625, "rewards/wrapped_driving_reward": -0.2982538044452667, "rewards/wrapped_format_reward": 0.5, "step": 299 }, { "completion_length": 500.0, "epoch": 60.0, "grad_norm": 0.5463822484016418, "kl": 0.4176102578639984, "learning_rate": 4.884292376870567e-06, "loss": 0.0167, "reward": 3.0649795532226562, "reward_std": 0.5604602694511414, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06497950851917267, "rewards/wrapped_format_reward": 1.0, "step": 300 }, { "completion_length": 500.0, "epoch": 60.2, "grad_norm": 0.6283360123634338, "kl": 0.7974268198013306, "learning_rate": 4.882646608265743e-06, "loss": 0.0319, "reward": 2.354006290435791, "reward_std": 2.2440249919891357, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6459937691688538, "rewards/wrapped_format_reward": 1.0, "step": 301 }, { "completion_length": 500.0, "epoch": 60.4, "grad_norm": 0.5363072156906128, "kl": 0.3808169662952423, "learning_rate": 4.880989499108196e-06, "loss": 0.0152, "reward": 2.4912989139556885, "reward_std": 0.2686402499675751, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2587011158466339, "rewards/wrapped_format_reward": 0.75, "step": 302 }, { "completion_length": 500.0, "epoch": 60.6, "grad_norm": 0.7250503897666931, "kl": 0.9471665620803833, "learning_rate": 4.8793210572851795e-06, "loss": 0.0379, "reward": 1.964458703994751, "reward_std": 2.0956764221191406, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6605411767959595, "rewards/wrapped_format_reward": 0.625, "step": 303 }, { "completion_length": 500.0, "epoch": 60.8, "grad_norm": 0.5111416578292847, "kl": 0.3088001608848572, "learning_rate": 4.8776412907378845e-06, "loss": 0.0124, "reward": -1.0382410287857056, "reward_std": 2.2208359241485596, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.038240909576416, "rewards/wrapped_format_reward": 0.5, "step": 304 }, { "completion_length": 500.0, "epoch": 61.0, "grad_norm": 0.5215743780136108, "kl": 0.24940745532512665, "learning_rate": 4.875950207461403e-06, "loss": 0.01, "reward": 1.1871674060821533, "reward_std": 1.8741310834884644, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1878325939178467, "rewards/wrapped_format_reward": 0.375, "step": 305 }, { "completion_length": 500.0, "epoch": 61.2, "grad_norm": 0.6458624005317688, "kl": 0.5968429446220398, "learning_rate": 4.874247815504693e-06, "loss": 0.0239, "reward": 1.3467873334884644, "reward_std": 3.565579414367676, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9032126665115356, "rewards/wrapped_format_reward": 0.75, "step": 306 }, { "completion_length": 500.0, "epoch": 61.4, "grad_norm": 0.5787416696548462, "kl": 0.3611050844192505, "learning_rate": 4.872534122970536e-06, "loss": 0.0144, "reward": -1.5641289949417114, "reward_std": 2.654411792755127, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.689128875732422, "rewards/wrapped_format_reward": 0.125, "step": 307 }, { "completion_length": 500.0, "epoch": 61.6, "grad_norm": 0.9745285511016846, "kl": 1.186964988708496, "learning_rate": 4.870809138015499e-06, "loss": 0.0475, "reward": 3.2458059787750244, "reward_std": 0.423534095287323, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4958060383796692, "rewards/wrapped_format_reward": 0.75, "step": 308 }, { "completion_length": 500.0, "epoch": 61.8, "grad_norm": 0.5002840757369995, "kl": 0.35287949442863464, "learning_rate": 4.8690728688499e-06, "loss": 0.0141, "reward": -0.37188810110092163, "reward_std": 2.867581367492676, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -2.821887969970703, "rewards/wrapped_format_reward": 1.0, "step": 309 }, { "completion_length": 500.0, "epoch": 62.0, "grad_norm": 0.5000089406967163, "kl": 0.49758777022361755, "learning_rate": 4.867325323737765e-06, "loss": 0.0199, "reward": 1.0681167840957642, "reward_std": 3.100673198699951, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7222222089767456, "rewards/wrapped_driving_reward": -1.2791054248809814, "rewards/wrapped_format_reward": 0.875, "step": 310 }, { "completion_length": 500.0, "epoch": 62.2, "grad_norm": 0.5534230470657349, "kl": 0.2521771788597107, "learning_rate": 4.865566510996787e-06, "loss": 0.0101, "reward": 1.340180516242981, "reward_std": 2.2682108879089355, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.2848193645477295, "rewards/wrapped_format_reward": 0.625, "step": 311 }, { "completion_length": 500.0, "epoch": 62.4, "grad_norm": 0.8099197149276733, "kl": 0.9075281023979187, "learning_rate": 4.863796438998293e-06, "loss": 0.0363, "reward": 1.2343413829803467, "reward_std": 3.5059289932250977, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.723991870880127, "rewards/wrapped_format_reward": 0.5, "step": 312 }, { "completion_length": 500.0, "epoch": 62.6, "grad_norm": 0.5338905453681946, "kl": 0.6918814778327942, "learning_rate": 4.862015116167195e-06, "loss": 0.0277, "reward": -0.8979493975639343, "reward_std": 2.7375407218933105, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.022949457168579, "rewards/wrapped_format_reward": 0.625, "step": 313 }, { "completion_length": 500.0, "epoch": 62.8, "grad_norm": 0.4916922450065613, "kl": 0.46378105878829956, "learning_rate": 4.860222550981961e-06, "loss": 0.0186, "reward": 3.5812926292419434, "reward_std": 0.48870983719825745, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.831292450428009, "rewards/wrapped_format_reward": 0.75, "step": 314 }, { "completion_length": 500.0, "epoch": 63.0, "grad_norm": 0.5365267395973206, "kl": 0.9468160271644592, "learning_rate": 4.858418751974564e-06, "loss": 0.0379, "reward": 2.7630491256713867, "reward_std": 0.2816019356250763, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.013049202039837837, "rewards/wrapped_format_reward": 0.75, "step": 315 }, { "completion_length": 500.0, "epoch": 63.2, "grad_norm": 0.6400690674781799, "kl": 0.4425306022167206, "learning_rate": 4.856603727730446e-06, "loss": 0.0177, "reward": 0.8091722130775452, "reward_std": 3.2286431789398193, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7361111044883728, "rewards/wrapped_driving_reward": -1.176938772201538, "rewards/wrapped_format_reward": 0.5, "step": 316 }, { "completion_length": 500.0, "epoch": 63.4, "grad_norm": 0.6683816909790039, "kl": 0.5423528552055359, "learning_rate": 4.854777486888481e-06, "loss": 0.0217, "reward": 1.423877239227295, "reward_std": 2.06416654586792, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9318181872367859, "rewards/wrapped_driving_reward": -1.0079410076141357, "rewards/wrapped_format_reward": 0.5, "step": 317 }, { "completion_length": 500.0, "epoch": 63.6, "grad_norm": 0.5277029871940613, "kl": 0.49146440625190735, "learning_rate": 4.852940038140927e-06, "loss": 0.0197, "reward": 3.4070889949798584, "reward_std": 0.4839623272418976, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.532089114189148, "rewards/wrapped_format_reward": 0.875, "step": 318 }, { "completion_length": 500.0, "epoch": 63.8, "grad_norm": 0.8004822134971619, "kl": 0.6069704294204712, "learning_rate": 4.8510913902333876e-06, "loss": 0.0243, "reward": 1.4200356006622314, "reward_std": 3.2922091484069824, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9549642205238342, "rewards/wrapped_format_reward": 0.875, "step": 319 }, { "completion_length": 500.0, "epoch": 64.0, "grad_norm": 0.5641809105873108, "kl": 0.47654300928115845, "learning_rate": 4.849231551964771e-06, "loss": 0.0191, "reward": 1.7679204940795898, "reward_std": 2.522942543029785, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.48207950592041016, "rewards/wrapped_format_reward": 0.25, "step": 320 }, { "completion_length": 500.0, "epoch": 64.2, "grad_norm": 0.6091551184654236, "kl": 0.25179192423820496, "learning_rate": 4.8473605321872484e-06, "loss": 0.0101, "reward": -0.5306634306907654, "reward_std": 2.778803825378418, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.65566349029541, "rewards/wrapped_format_reward": 0.625, "step": 321 }, { "completion_length": 500.0, "epoch": 64.4, "grad_norm": 0.5598704218864441, "kl": 0.5083995461463928, "learning_rate": 4.845478339806211e-06, "loss": 0.0203, "reward": 1.3865933418273926, "reward_std": 3.261355400085449, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6439394354820251, "rewards/wrapped_driving_reward": -0.3823460340499878, "rewards/wrapped_format_reward": 0.375, "step": 322 }, { "completion_length": 500.0, "epoch": 64.6, "grad_norm": 1.2819907665252686, "kl": 0.26251715421676636, "learning_rate": 4.843584983780225e-06, "loss": 0.0105, "reward": 2.78169846534729, "reward_std": 0.5642634034156799, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.53169846534729, "rewards/wrapped_format_reward": 0.25, "step": 323 }, { "completion_length": 500.0, "epoch": 64.8, "grad_norm": 0.5119650363922119, "kl": 0.44216257333755493, "learning_rate": 4.841680473120994e-06, "loss": 0.0177, "reward": 1.7560722827911377, "reward_std": 0.8860724568367004, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": -0.893927812576294, "rewards/wrapped_format_reward": 0.75, "step": 324 }, { "completion_length": 500.0, "epoch": 65.0, "grad_norm": 0.5220951437950134, "kl": 0.3728632926940918, "learning_rate": 4.839764816893315e-06, "loss": 0.0149, "reward": 0.11694353818893433, "reward_std": 3.4642200469970703, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": -1.7955565452575684, "rewards/wrapped_format_reward": 0.5, "step": 325 }, { "completion_length": 500.0, "epoch": 65.2, "grad_norm": 0.7010544538497925, "kl": 0.6249963641166687, "learning_rate": 4.83783802421503e-06, "loss": 0.025, "reward": 2.118408441543579, "reward_std": 0.14386098086833954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3815915584564209, "rewards/wrapped_format_reward": 0.5, "step": 326 }, { "completion_length": 500.0, "epoch": 65.4, "grad_norm": 0.6010672450065613, "kl": 0.8014960289001465, "learning_rate": 4.835900104256989e-06, "loss": 0.0321, "reward": 3.40175724029541, "reward_std": 0.4777882695198059, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7767573595046997, "rewards/wrapped_format_reward": 0.625, "step": 327 }, { "completion_length": 500.0, "epoch": 65.6, "grad_norm": 0.7275899052619934, "kl": 1.015032410621643, "learning_rate": 4.833951066243004e-06, "loss": 0.0406, "reward": 1.5494968891143799, "reward_std": 3.7214293479919434, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.5755031704902649, "rewards/wrapped_format_reward": 0.75, "step": 328 }, { "completion_length": 500.0, "epoch": 65.8, "grad_norm": 0.512077271938324, "kl": 0.4679301083087921, "learning_rate": 4.831990919449806e-06, "loss": 0.0187, "reward": 1.843889832496643, "reward_std": 1.9196797609329224, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.9894434213638306, "rewards/wrapped_format_reward": 0.875, "step": 329 }, { "completion_length": 500.0, "epoch": 66.0, "grad_norm": 0.5223225355148315, "kl": 0.5572952628135681, "learning_rate": 4.830019673206997e-06, "loss": 0.0223, "reward": 1.724566102027893, "reward_std": 0.6851178407669067, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7754338979721069, "rewards/wrapped_format_reward": 0.5, "step": 330 }, { "completion_length": 500.0, "epoch": 66.2, "grad_norm": 0.8261262774467468, "kl": 0.6189790964126587, "learning_rate": 4.828037336897009e-06, "loss": 0.0248, "reward": 2.4810099601745605, "reward_std": 0.7533643245697021, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9444444179534912, "rewards/wrapped_driving_reward": -0.21343453228473663, "rewards/wrapped_format_reward": 0.75, "step": 331 }, { "completion_length": 500.0, "epoch": 66.4, "grad_norm": 0.7530797123908997, "kl": 1.0583492517471313, "learning_rate": 4.826043919955062e-06, "loss": 0.0423, "reward": 0.9729395508766174, "reward_std": 1.709369421005249, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.5270603895187378, "rewards/wrapped_format_reward": 0.5, "step": 332 }, { "completion_length": 500.0, "epoch": 66.6, "grad_norm": 0.7703694105148315, "kl": 1.1116454601287842, "learning_rate": 4.824039431869112e-06, "loss": 0.0445, "reward": 2.3814101219177246, "reward_std": 0.2855786383152008, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.118589848279953, "rewards/wrapped_format_reward": 0.5, "step": 333 }, { "completion_length": 500.0, "epoch": 66.8, "grad_norm": 1.2055093050003052, "kl": 1.0002336502075195, "learning_rate": 4.822023882179811e-06, "loss": 0.04, "reward": 1.3947391510009766, "reward_std": 3.263190746307373, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9802609086036682, "rewards/wrapped_format_reward": 0.875, "step": 334 }, { "completion_length": 500.0, "epoch": 67.0, "grad_norm": 0.5130785703659058, "kl": 0.5356588959693909, "learning_rate": 4.8199972804804615e-06, "loss": 0.0214, "reward": 1.5739116668701172, "reward_std": 3.7419235706329346, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7142857313156128, "rewards/wrapped_driving_reward": -0.3903741240501404, "rewards/wrapped_format_reward": 0.5, "step": 335 }, { "completion_length": 500.0, "epoch": 67.2, "grad_norm": 0.5589498281478882, "kl": 0.5461040139198303, "learning_rate": 4.817959636416969e-06, "loss": 0.0218, "reward": 0.7681459784507751, "reward_std": 2.159348726272583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -1.931854009628296, "rewards/wrapped_format_reward": 0.75, "step": 336 }, { "completion_length": 500.0, "epoch": 67.4, "grad_norm": 0.5277290344238281, "kl": 0.3753708004951477, "learning_rate": 4.815910959687795e-06, "loss": 0.015, "reward": 2.5270836353302, "reward_std": 0.9810623526573181, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.22291645407676697, "rewards/wrapped_format_reward": 0.75, "step": 337 }, { "completion_length": 500.0, "epoch": 67.6, "grad_norm": 0.6664614081382751, "kl": 0.4376363158226013, "learning_rate": 4.8138512600439165e-06, "loss": 0.0175, "reward": 0.6591283679008484, "reward_std": 3.1547491550445557, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5908715724945068, "rewards/wrapped_format_reward": 0.75, "step": 338 }, { "completion_length": 500.0, "epoch": 67.8, "grad_norm": 0.6150086522102356, "kl": 1.0432140827178955, "learning_rate": 4.8117805472887706e-06, "loss": 0.0417, "reward": 1.9501566886901855, "reward_std": 3.63374924659729, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -0.4021160304546356, "rewards/wrapped_format_reward": 0.875, "step": 339 }, { "completion_length": 500.0, "epoch": 68.0, "grad_norm": 0.6247034072875977, "kl": 0.5003357529640198, "learning_rate": 4.809698831278217e-06, "loss": 0.02, "reward": 3.1344668865203857, "reward_std": 0.4356966018676758, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.5594670176506042, "rewards/wrapped_format_reward": 0.625, "step": 340 }, { "completion_length": 500.0, "epoch": 68.2, "grad_norm": 0.7365332245826721, "kl": 0.9077324867248535, "learning_rate": 4.807606121920486e-06, "loss": 0.0363, "reward": 2.7017669677734375, "reward_std": 0.26797592639923096, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1732332408428192, "rewards/wrapped_format_reward": 0.875, "step": 341 }, { "completion_length": 500.0, "epoch": 68.4, "grad_norm": 0.5844131112098694, "kl": 0.7788793444633484, "learning_rate": 4.80550242917613e-06, "loss": 0.0312, "reward": 1.429833173751831, "reward_std": 1.8360050916671753, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.930555522441864, "rewards/wrapped_driving_reward": -1.2507224082946777, "rewards/wrapped_format_reward": 0.75, "step": 342 }, { "completion_length": 500.0, "epoch": 68.6, "grad_norm": 0.5121277570724487, "kl": 0.41953617334365845, "learning_rate": 4.803387763057981e-06, "loss": 0.0168, "reward": 1.4760076999664307, "reward_std": 3.3271522521972656, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7739923000335693, "rewards/wrapped_format_reward": 0.75, "step": 343 }, { "completion_length": 500.0, "epoch": 68.8, "grad_norm": 0.6013743877410889, "kl": 0.8165791630744934, "learning_rate": 4.801262133631101e-06, "loss": 0.0327, "reward": 1.8635355234146118, "reward_std": 3.9091763496398926, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3864644765853882, "rewards/wrapped_format_reward": 0.75, "step": 344 }, { "completion_length": 500.0, "epoch": 69.0, "grad_norm": 0.6182828545570374, "kl": 1.0937128067016602, "learning_rate": 4.799125551012731e-06, "loss": 0.0437, "reward": 3.419711112976074, "reward_std": 0.48218590021133423, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.7322112321853638, "rewards/wrapped_format_reward": 0.75, "step": 345 }, { "completion_length": 500.0, "epoch": 69.2, "grad_norm": 0.6208542585372925, "kl": 0.6823076605796814, "learning_rate": 4.796978025372247e-06, "loss": 0.0273, "reward": 0.9823777079582214, "reward_std": 2.9922034740448, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -0.9926222562789917, "rewards/wrapped_format_reward": 0.5, "step": 346 }, { "completion_length": 500.0, "epoch": 69.4, "grad_norm": 0.46940603852272034, "kl": 0.6684026122093201, "learning_rate": 4.794819566931107e-06, "loss": 0.0267, "reward": 1.6367030143737793, "reward_std": 3.091221809387207, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4882969856262207, "rewards/wrapped_format_reward": 0.625, "step": 347 }, { "completion_length": 500.0, "epoch": 69.6, "grad_norm": 0.6682089567184448, "kl": 0.5266464352607727, "learning_rate": 4.79265018596281e-06, "loss": 0.0211, "reward": 0.8233842253684998, "reward_std": 2.3113174438476562, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.926615595817566, "rewards/wrapped_format_reward": 0.75, "step": 348 }, { "completion_length": 500.0, "epoch": 69.8, "grad_norm": 0.49833944439888, "kl": 0.22715015709400177, "learning_rate": 4.79046989279284e-06, "loss": 0.0091, "reward": 0.9500528573989868, "reward_std": 2.9911322593688965, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1749471426010132, "rewards/wrapped_format_reward": 0.625, "step": 349 }, { "completion_length": 500.0, "epoch": 70.0, "grad_norm": 0.5571979880332947, "kl": 0.39211493730545044, "learning_rate": 4.788278697798619e-06, "loss": 0.0157, "reward": 3.3137106895446777, "reward_std": 0.5589663982391357, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8137108087539673, "rewards/wrapped_format_reward": 0.5, "step": 350 }, { "completion_length": 500.0, "epoch": 70.2, "grad_norm": 0.6056340932846069, "kl": 0.5785823464393616, "learning_rate": 4.7860766114094555e-06, "loss": 0.0231, "reward": 2.404787063598633, "reward_std": 0.34057241678237915, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.22021275758743286, "rewards/wrapped_format_reward": 0.625, "step": 351 }, { "completion_length": 500.0, "epoch": 70.4, "grad_norm": 0.5014784932136536, "kl": 0.29856839776039124, "learning_rate": 4.783863644106502e-06, "loss": 0.0119, "reward": 0.22824877500534058, "reward_std": 1.6301518678665161, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": -2.546751022338867, "rewards/wrapped_format_reward": 0.875, "step": 352 }, { "completion_length": 500.0, "epoch": 70.6, "grad_norm": 0.4747620224952698, "kl": 0.9486736059188843, "learning_rate": 4.781639806422699e-06, "loss": 0.0379, "reward": 3.8079710006713867, "reward_std": 0.04277324676513672, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": 0.8288043737411499, "rewards/wrapped_format_reward": 1.0, "step": 353 }, { "completion_length": 500.0, "epoch": 70.8, "grad_norm": 0.6246031522750854, "kl": 0.6103500127792358, "learning_rate": 4.779405108942722e-06, "loss": 0.0244, "reward": 3.2111012935638428, "reward_std": 0.5528056621551514, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.46110111474990845, "rewards/wrapped_format_reward": 0.75, "step": 354 }, { "completion_length": 500.0, "epoch": 71.0, "grad_norm": 0.7725012898445129, "kl": 1.2677111625671387, "learning_rate": 4.77715956230294e-06, "loss": 0.0507, "reward": -0.5685252547264099, "reward_std": 1.8629494905471802, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -3.0685253143310547, "rewards/wrapped_format_reward": 0.5, "step": 355 }, { "completion_length": 500.0, "epoch": 71.2, "grad_norm": 0.8144864439964294, "kl": 0.9257476329803467, "learning_rate": 4.774903177191358e-06, "loss": 0.037, "reward": 1.7669270038604736, "reward_std": 3.5174825191497803, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.48307299613952637, "rewards/wrapped_format_reward": 0.75, "step": 356 }, { "completion_length": 500.0, "epoch": 71.4, "grad_norm": 0.6923106908798218, "kl": 0.52337646484375, "learning_rate": 4.77263596434757e-06, "loss": 0.0209, "reward": -1.5257434844970703, "reward_std": 3.4870338439941406, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4583333134651184, "rewards/wrapped_driving_reward": -2.859076738357544, "rewards/wrapped_format_reward": 0.375, "step": 357 }, { "completion_length": 500.0, "epoch": 71.6, "grad_norm": 0.5352271199226379, "kl": 0.8994762301445007, "learning_rate": 4.770357934562704e-06, "loss": 0.036, "reward": 2.3911008834838867, "reward_std": 0.5525059103965759, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.3588991165161133, "rewards/wrapped_format_reward": 0.875, "step": 358 }, { "completion_length": 500.0, "epoch": 71.8, "grad_norm": 0.5663076043128967, "kl": 0.7645798325538635, "learning_rate": 4.7680690986793734e-06, "loss": 0.0306, "reward": 1.1860283613204956, "reward_std": 3.1243858337402344, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8139715790748596, "rewards/wrapped_format_reward": 0.5, "step": 359 }, { "completion_length": 500.0, "epoch": 72.0, "grad_norm": 0.45141172409057617, "kl": 0.5153231620788574, "learning_rate": 4.765769467591626e-06, "loss": 0.0206, "reward": 2.5078225135803223, "reward_std": 0.24923977255821228, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3671773672103882, "rewards/wrapped_format_reward": 0.875, "step": 360 }, { "completion_length": 500.0, "epoch": 72.2, "grad_norm": 0.4486269950866699, "kl": 0.9089427590370178, "learning_rate": 4.7634590522448886e-06, "loss": 0.0364, "reward": 2.8786072731018066, "reward_std": 0.16827794909477234, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.121392622590065, "rewards/wrapped_format_reward": 1.0, "step": 361 }, { "completion_length": 500.0, "epoch": 72.4, "grad_norm": 0.5474759340286255, "kl": 0.8071030974388123, "learning_rate": 4.761137863635921e-06, "loss": 0.0323, "reward": 2.189277172088623, "reward_std": 0.3206322491168976, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.31072288751602173, "rewards/wrapped_format_reward": 0.5, "step": 362 }, { "completion_length": 466.0, "epoch": 72.6, "grad_norm": 1.6158084869384766, "kl": 0.6787058711051941, "learning_rate": 4.758805912812755e-06, "loss": 0.0271, "reward": 3.3123486042022705, "reward_std": 0.5808764100074768, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8123486042022705, "rewards/wrapped_format_reward": 0.5, "step": 363 }, { "completion_length": 500.0, "epoch": 72.8, "grad_norm": 0.5949323177337646, "kl": 0.9953030943870544, "learning_rate": 4.7564632108746524e-06, "loss": 0.0398, "reward": 1.4721925258636475, "reward_std": 2.548034906387329, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -0.7278074026107788, "rewards/wrapped_format_reward": 0.5, "step": 364 }, { "completion_length": 500.0, "epoch": 73.0, "grad_norm": 0.5318465232849121, "kl": 0.7464905977249146, "learning_rate": 4.75410976897204e-06, "loss": 0.0299, "reward": 0.9842851758003235, "reward_std": 2.294755697250366, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8907147645950317, "rewards/wrapped_format_reward": 0.875, "step": 365 }, { "completion_length": 500.0, "epoch": 73.2, "grad_norm": 0.6238420009613037, "kl": 0.9425249099731445, "learning_rate": 4.7517455983064694e-06, "loss": 0.0377, "reward": 3.152679204940796, "reward_std": 0.3899010717868805, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2776792049407959, "rewards/wrapped_format_reward": 0.875, "step": 366 }, { "completion_length": 500.0, "epoch": 73.4, "grad_norm": 0.5412909984588623, "kl": 0.31825780868530273, "learning_rate": 4.7493707101305545e-06, "loss": 0.0127, "reward": -0.11978721618652344, "reward_std": 1.6915035247802734, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.9947872161865234, "rewards/wrapped_format_reward": 0.875, "step": 367 }, { "completion_length": 500.0, "epoch": 73.6, "grad_norm": 1.3431739807128906, "kl": 0.5796495079994202, "learning_rate": 4.746985115747918e-06, "loss": 0.0232, "reward": 2.067376136779785, "reward_std": 2.382765769958496, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9861111044883728, "rewards/wrapped_driving_reward": -0.6687348484992981, "rewards/wrapped_format_reward": 0.75, "step": 368 }, { "completion_length": 467.0, "epoch": 73.8, "grad_norm": 0.5503766536712646, "kl": 0.6919428706169128, "learning_rate": 4.744588826513145e-06, "loss": 0.0277, "reward": 2.56295108795166, "reward_std": 0.2011384665966034, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.918749988079071, "rewards/wrapped_driving_reward": -0.35579875111579895, "rewards/wrapped_format_reward": 1.0, "step": 369 }, { "completion_length": 407.0, "epoch": 74.0, "grad_norm": 0.5191554427146912, "kl": 0.5616594552993774, "learning_rate": 4.742181853831721e-06, "loss": 0.0225, "reward": 3.3623037338256836, "reward_std": 0.5844976902008057, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.48730385303497314, "rewards/wrapped_format_reward": 1.0, "step": 370 }, { "completion_length": 500.0, "epoch": 74.2, "grad_norm": 0.6369850635528564, "kl": 0.351097047328949, "learning_rate": 4.739764209159984e-06, "loss": 0.014, "reward": 0.09611350297927856, "reward_std": 1.9310574531555176, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9147727489471436, "rewards/wrapped_driving_reward": -2.3186590671539307, "rewards/wrapped_format_reward": 0.5, "step": 371 }, { "completion_length": 500.0, "epoch": 74.4, "grad_norm": 0.592671275138855, "kl": 0.7280439734458923, "learning_rate": 4.737335904005063e-06, "loss": 0.0291, "reward": 2.9110307693481445, "reward_std": 0.6375144124031067, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8863636255264282, "rewards/wrapped_driving_reward": 0.3996671438217163, "rewards/wrapped_format_reward": 0.625, "step": 372 }, { "completion_length": 500.0, "epoch": 74.6, "grad_norm": 0.5246638059616089, "kl": 0.53719562292099, "learning_rate": 4.734896949924831e-06, "loss": 0.0215, "reward": -0.7122367024421692, "reward_std": 2.786146402359009, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.4622366428375244, "rewards/wrapped_format_reward": 0.25, "step": 373 }, { "completion_length": 500.0, "epoch": 74.8, "grad_norm": 0.47917625308036804, "kl": 0.8582153916358948, "learning_rate": 4.732447358527843e-06, "loss": 0.0343, "reward": 0.6898465156555176, "reward_std": 2.585700750350952, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -2.0244390964508057, "rewards/wrapped_format_reward": 0.75, "step": 374 }, { "completion_length": 500.0, "epoch": 75.0, "grad_norm": 2.939560890197754, "kl": 1.2595782279968262, "learning_rate": 4.729987141473286e-06, "loss": 0.0504, "reward": 1.0538822412490845, "reward_std": 3.369788885116577, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -1.1544511318206787, "rewards/wrapped_format_reward": 0.75, "step": 375 }, { "completion_length": 500.0, "epoch": 75.2, "grad_norm": 0.5623316168785095, "kl": 0.8516190648078918, "learning_rate": 4.72751631047092e-06, "loss": 0.0341, "reward": 1.0435024499893188, "reward_std": 1.7953479290008545, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8928571343421936, "rewards/wrapped_driving_reward": -1.3493547439575195, "rewards/wrapped_format_reward": 0.5, "step": 376 }, { "completion_length": 500.0, "epoch": 75.4, "grad_norm": 0.5572675466537476, "kl": 0.5570351481437683, "learning_rate": 4.725034877281025e-06, "loss": 0.0223, "reward": 2.0858306884765625, "reward_std": 2.7353217601776123, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.539169430732727, "rewards/wrapped_format_reward": 0.625, "step": 377 }, { "completion_length": 500.0, "epoch": 75.6, "grad_norm": 0.48211756348609924, "kl": 0.8747799396514893, "learning_rate": 4.7225428537143414e-06, "loss": 0.035, "reward": 2.519843578338623, "reward_std": 0.06863429397344589, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4801563024520874, "rewards/wrapped_format_reward": 1.0, "step": 378 }, { "completion_length": 500.0, "epoch": 75.8, "grad_norm": 0.5445191860198975, "kl": 0.5903018116950989, "learning_rate": 4.720040251632019e-06, "loss": 0.0236, "reward": 2.2911336421966553, "reward_std": 0.9245793223381042, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.33386632800102234, "rewards/wrapped_format_reward": 0.625, "step": 379 }, { "completion_length": 436.0, "epoch": 76.0, "grad_norm": 0.494449645280838, "kl": 1.1676191091537476, "learning_rate": 4.717527082945555e-06, "loss": 0.0467, "reward": 3.4632315635681152, "reward_std": 0.11494097858667374, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.5048981308937073, "rewards/wrapped_format_reward": 1.0, "step": 380 }, { "completion_length": 500.0, "epoch": 76.2, "grad_norm": 0.5274999141693115, "kl": 0.5592718124389648, "learning_rate": 4.715003359616741e-06, "loss": 0.0224, "reward": 1.0327037572860718, "reward_std": 2.441744804382324, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -1.931581974029541, "rewards/wrapped_format_reward": 1.0, "step": 381 }, { "completion_length": 500.0, "epoch": 76.4, "grad_norm": 0.4961002469062805, "kl": 0.4407399296760559, "learning_rate": 4.712469093657605e-06, "loss": 0.0176, "reward": 0.06372499465942383, "reward_std": 4.116382598876953, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6862750053405762, "rewards/wrapped_format_reward": 0.75, "step": 382 }, { "completion_length": 500.0, "epoch": 76.6, "grad_norm": 0.5338679552078247, "kl": 0.6065806746482849, "learning_rate": 4.709924297130354e-06, "loss": 0.0243, "reward": 0.48900270462036133, "reward_std": 2.7124648094177246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.6359972953796387, "rewards/wrapped_format_reward": 0.625, "step": 383 }, { "completion_length": 500.0, "epoch": 76.8, "grad_norm": 0.5038735866546631, "kl": 0.37462395429611206, "learning_rate": 4.707368982147318e-06, "loss": 0.015, "reward": 1.0094149112701416, "reward_std": 3.141808271408081, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6428571343421936, "rewards/wrapped_driving_reward": -0.758442223072052, "rewards/wrapped_format_reward": 0.375, "step": 384 }, { "completion_length": 500.0, "epoch": 77.0, "grad_norm": 0.5533128976821899, "kl": 0.5821331143379211, "learning_rate": 4.704803160870888e-06, "loss": 0.0233, "reward": 3.293281078338623, "reward_std": 0.5485936999320984, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6682810187339783, "rewards/wrapped_format_reward": 0.625, "step": 385 }, { "completion_length": 500.0, "epoch": 77.2, "grad_norm": 0.5457403659820557, "kl": 0.8627141118049622, "learning_rate": 4.702226845513465e-06, "loss": 0.0345, "reward": 2.2821011543273926, "reward_std": 0.8117403984069824, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9090908765792847, "rewards/wrapped_driving_reward": -0.12698988616466522, "rewards/wrapped_format_reward": 0.5, "step": 386 }, { "completion_length": 500.0, "epoch": 77.4, "grad_norm": 0.4861312210559845, "kl": 0.9649275541305542, "learning_rate": 4.699640048337394e-06, "loss": 0.0386, "reward": 3.2577872276306152, "reward_std": 0.1188662126660347, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.8202871084213257, "rewards/wrapped_format_reward": 0.5, "step": 387 }, { "completion_length": 500.0, "epoch": 77.6, "grad_norm": 0.5439221858978271, "kl": 1.217611312866211, "learning_rate": 4.697042781654913e-06, "loss": 0.0487, "reward": 2.2471086978912354, "reward_std": 2.214808225631714, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6278913021087646, "rewards/wrapped_format_reward": 0.875, "step": 388 }, { "completion_length": 500.0, "epoch": 77.8, "grad_norm": 0.6984148621559143, "kl": 0.6679953932762146, "learning_rate": 4.694435057828092e-06, "loss": 0.0267, "reward": 1.991325855255127, "reward_std": 0.5920568704605103, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7586740851402283, "rewards/wrapped_format_reward": 0.75, "step": 389 }, { "completion_length": 500.0, "epoch": 78.0, "grad_norm": 0.5098196268081665, "kl": 1.2996803522109985, "learning_rate": 4.69181688926877e-06, "loss": 0.052, "reward": 3.0302999019622803, "reward_std": 0.24492628872394562, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.28029996156692505, "rewards/wrapped_format_reward": 0.75, "step": 390 }, { "completion_length": 500.0, "epoch": 78.2, "grad_norm": 0.7863979935646057, "kl": 1.2847576141357422, "learning_rate": 4.6891882884384994e-06, "loss": 0.0514, "reward": 1.7419676780700684, "reward_std": 3.181126832962036, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.7163656949996948, "rewards/wrapped_format_reward": 1.0, "step": 391 }, { "completion_length": 500.0, "epoch": 78.4, "grad_norm": 0.8814217448234558, "kl": 1.0120067596435547, "learning_rate": 4.68654926784849e-06, "loss": 0.0405, "reward": -1.4409170150756836, "reward_std": 3.4238805770874023, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.9409170150756836, "rewards/wrapped_format_reward": 0.5, "step": 392 }, { "completion_length": 500.0, "epoch": 78.6, "grad_norm": 1.0240262746810913, "kl": 1.0795583724975586, "learning_rate": 4.683899840059543e-06, "loss": 0.0432, "reward": 1.3624104261398315, "reward_std": 2.9232280254364014, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0125895738601685, "rewards/wrapped_format_reward": 0.875, "step": 393 }, { "completion_length": 500.0, "epoch": 78.8, "grad_norm": 0.5109947919845581, "kl": 0.2543693482875824, "learning_rate": 4.681240017681994e-06, "loss": 0.0102, "reward": 1.1824758052825928, "reward_std": 3.17807674407959, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9425241351127625, "rewards/wrapped_format_reward": 0.625, "step": 394 }, { "completion_length": 500.0, "epoch": 79.0, "grad_norm": 0.7424824833869934, "kl": 0.7182059288024902, "learning_rate": 4.678569813375654e-06, "loss": 0.0287, "reward": 2.9500770568847656, "reward_std": 0.5127381682395935, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.35007697343826294, "rewards/wrapped_format_reward": 0.625, "step": 395 }, { "completion_length": 500.0, "epoch": 79.2, "grad_norm": 0.879162609577179, "kl": 0.8275773525238037, "learning_rate": 4.675889239849749e-06, "loss": 0.0331, "reward": 3.0128397941589355, "reward_std": 0.4457358717918396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3878398835659027, "rewards/wrapped_format_reward": 0.625, "step": 396 }, { "completion_length": 500.0, "epoch": 79.4, "grad_norm": 1.017330527305603, "kl": 0.7725871801376343, "learning_rate": 4.67319830986286e-06, "loss": 0.0309, "reward": 1.7406187057495117, "reward_std": 3.501706838607788, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.46771466732025146, "rewards/wrapped_format_reward": 0.75, "step": 397 }, { "completion_length": 474.0, "epoch": 79.6, "grad_norm": 0.6759405732154846, "kl": 0.613670289516449, "learning_rate": 4.670497036222856e-06, "loss": 0.0245, "reward": 3.172016143798828, "reward_std": 0.8060486316680908, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.3803495764732361, "rewards/wrapped_format_reward": 0.875, "step": 398 }, { "completion_length": 500.0, "epoch": 79.8, "grad_norm": 0.6927025318145752, "kl": 0.3530423045158386, "learning_rate": 4.667785431786843e-06, "loss": 0.0141, "reward": 2.0550060272216797, "reward_std": 0.7213863134384155, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8531249761581421, "rewards/wrapped_driving_reward": -0.29811891913414, "rewards/wrapped_format_reward": 0.5, "step": 399 }, { "completion_length": 500.0, "epoch": 80.0, "grad_norm": 0.6842610836029053, "kl": 0.3291209936141968, "learning_rate": 4.665063509461098e-06, "loss": 0.0132, "reward": -0.5214939117431641, "reward_std": 4.023183822631836, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.021493911743164, "rewards/wrapped_format_reward": 0.5, "step": 400 }, { "completion_length": 500.0, "epoch": 80.2, "grad_norm": 1.280340552330017, "kl": 0.9234899878501892, "learning_rate": 4.662331282201002e-06, "loss": 0.0369, "reward": 1.608371615409851, "reward_std": 3.448504686355591, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.7041283845901489, "rewards/wrapped_format_reward": 0.875, "step": 401 }, { "completion_length": 500.0, "epoch": 80.4, "grad_norm": 1.4907759428024292, "kl": 0.787459135055542, "learning_rate": 4.65958876301099e-06, "loss": 0.0315, "reward": 3.0962343215942383, "reward_std": 0.35777872800827026, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22123444080352783, "rewards/wrapped_format_reward": 0.875, "step": 402 }, { "completion_length": 500.0, "epoch": 80.6, "grad_norm": 0.49698302149772644, "kl": 0.24020951986312866, "learning_rate": 4.65683596494448e-06, "loss": 0.0096, "reward": 2.8298206329345703, "reward_std": 0.41156643629074097, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.925000011920929, "rewards/wrapped_driving_reward": -0.09517934173345566, "rewards/wrapped_format_reward": 1.0, "step": 403 }, { "completion_length": 500.0, "epoch": 80.8, "grad_norm": 1.796094298362732, "kl": 0.980272650718689, "learning_rate": 4.654072901103815e-06, "loss": 0.0392, "reward": 2.3932039737701416, "reward_std": 0.9440638422966003, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.356795996427536, "rewards/wrapped_format_reward": 0.75, "step": 404 }, { "completion_length": 500.0, "epoch": 81.0, "grad_norm": 1.1823846101760864, "kl": 0.7713034749031067, "learning_rate": 4.651299584640198e-06, "loss": 0.0309, "reward": 2.2802748680114746, "reward_std": 0.5997734665870667, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3447251319885254, "rewards/wrapped_format_reward": 0.625, "step": 405 }, { "completion_length": 449.0, "epoch": 81.2, "grad_norm": 0.6266138553619385, "kl": 0.7948797345161438, "learning_rate": 4.648516028753632e-06, "loss": 0.0318, "reward": 2.6703319549560547, "reward_std": 0.31966376304626465, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20466792583465576, "rewards/wrapped_format_reward": 0.875, "step": 406 }, { "completion_length": 500.0, "epoch": 81.4, "grad_norm": 0.7062759399414062, "kl": 0.5212233066558838, "learning_rate": 4.645722246692856e-06, "loss": 0.0208, "reward": 1.2558926343917847, "reward_std": 3.199486017227173, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.7128572463989258, "rewards/wrapped_format_reward": 0.5, "step": 407 }, { "completion_length": 480.0, "epoch": 81.6, "grad_norm": 0.5641493201255798, "kl": 0.8769745826721191, "learning_rate": 4.642918251755281e-06, "loss": 0.0351, "reward": 2.6268346309661865, "reward_std": 0.37705758213996887, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.0018346160650253296, "rewards/wrapped_format_reward": 0.625, "step": 408 }, { "completion_length": 428.0, "epoch": 81.8, "grad_norm": 0.6789681315422058, "kl": 0.580537736415863, "learning_rate": 4.6401040572869295e-06, "loss": 0.0232, "reward": 3.1963601112365723, "reward_std": 0.6234812140464783, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.8213601112365723, "rewards/wrapped_format_reward": 0.625, "step": 409 }, { "completion_length": 500.0, "epoch": 82.0, "grad_norm": 0.5842041373252869, "kl": 0.9948064088821411, "learning_rate": 4.637279676682367e-06, "loss": 0.0398, "reward": 2.9659104347229004, "reward_std": 0.20523911714553833, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.21591025590896606, "rewards/wrapped_format_reward": 0.75, "step": 410 }, { "completion_length": 500.0, "epoch": 82.2, "grad_norm": 0.7610025405883789, "kl": 0.992956280708313, "learning_rate": 4.634445123384644e-06, "loss": 0.0397, "reward": 1.2791435718536377, "reward_std": 3.190803050994873, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9708565473556519, "rewards/wrapped_format_reward": 0.75, "step": 411 }, { "completion_length": 500.0, "epoch": 82.4, "grad_norm": 0.5706319212913513, "kl": 1.0067870616912842, "learning_rate": 4.631600410885231e-06, "loss": 0.0403, "reward": 2.5278797149658203, "reward_std": 0.3967418074607849, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.19939307868480682, "rewards/wrapped_format_reward": 0.75, "step": 412 }, { "completion_length": 500.0, "epoch": 82.6, "grad_norm": 0.5617349147796631, "kl": 0.8165132999420166, "learning_rate": 4.6287455527239475e-06, "loss": 0.0327, "reward": 2.4629299640655518, "reward_std": 0.6600992679595947, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.27542999386787415, "rewards/wrapped_format_reward": 0.25, "step": 413 }, { "completion_length": 500.0, "epoch": 82.8, "grad_norm": 0.6648067831993103, "kl": 0.639345109462738, "learning_rate": 4.625880562488908e-06, "loss": 0.0256, "reward": 3.1534743309020996, "reward_std": 0.5813019871711731, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.5597245097160339, "rewards/wrapped_format_reward": 0.625, "step": 414 }, { "completion_length": 482.0, "epoch": 83.0, "grad_norm": 0.5819172263145447, "kl": 0.48156633973121643, "learning_rate": 4.623005453816447e-06, "loss": 0.0193, "reward": 0.7172784805297852, "reward_std": 2.3905746936798096, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9077215194702148, "rewards/wrapped_format_reward": 0.625, "step": 415 }, { "completion_length": 500.0, "epoch": 83.2, "grad_norm": 0.6890819668769836, "kl": 0.970129132270813, "learning_rate": 4.620120240391065e-06, "loss": 0.0388, "reward": 3.3380627632141113, "reward_std": 0.025144066661596298, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7130628824234009, "rewards/wrapped_format_reward": 0.625, "step": 416 }, { "completion_length": 500.0, "epoch": 83.4, "grad_norm": 0.5055440068244934, "kl": 1.0465319156646729, "learning_rate": 4.617224935945354e-06, "loss": 0.0419, "reward": 1.7734256982803345, "reward_std": 1.9043101072311401, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1015743017196655, "rewards/wrapped_format_reward": 0.875, "step": 417 }, { "completion_length": 500.0, "epoch": 83.6, "grad_norm": 0.7463017106056213, "kl": 0.7646268606185913, "learning_rate": 4.614319554259934e-06, "loss": 0.0306, "reward": 0.765255331993103, "reward_std": 3.2255241870880127, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": -1.166562795639038, "rewards/wrapped_format_reward": 0.5, "step": 418 }, { "completion_length": 494.0, "epoch": 83.8, "grad_norm": 0.613161027431488, "kl": 1.2434450387954712, "learning_rate": 4.611404109163392e-06, "loss": 0.0497, "reward": 3.17510986328125, "reward_std": 0.35877394676208496, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.34177637100219727, "rewards/wrapped_format_reward": 0.875, "step": 419 }, { "completion_length": 500.0, "epoch": 84.0, "grad_norm": 0.4756300151348114, "kl": 0.8438820838928223, "learning_rate": 4.608478614532215e-06, "loss": 0.0338, "reward": 2.6704771518707275, "reward_std": 1.0551282167434692, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20452289283275604, "rewards/wrapped_format_reward": 0.875, "step": 420 }, { "completion_length": 500.0, "epoch": 84.2, "grad_norm": 0.6265870332717896, "kl": 0.7220280170440674, "learning_rate": 4.605543084290716e-06, "loss": 0.0289, "reward": 3.330387592315674, "reward_std": 0.5945422053337097, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8303877115249634, "rewards/wrapped_format_reward": 0.5, "step": 421 }, { "completion_length": 500.0, "epoch": 84.4, "grad_norm": 0.4972628653049469, "kl": 0.4299587309360504, "learning_rate": 4.602597532410982e-06, "loss": 0.0172, "reward": 1.6803350448608398, "reward_std": 2.1513352394104004, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9446649551391602, "rewards/wrapped_format_reward": 0.625, "step": 422 }, { "completion_length": 500.0, "epoch": 84.6, "grad_norm": 1.2557274103164673, "kl": 1.1829395294189453, "learning_rate": 4.599641972912791e-06, "loss": 0.0473, "reward": 3.656310796737671, "reward_std": 0.22195641696453094, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7813107967376709, "rewards/wrapped_format_reward": 0.875, "step": 423 }, { "completion_length": 500.0, "epoch": 84.8, "grad_norm": 0.5427276492118835, "kl": 0.6175304055213928, "learning_rate": 4.596676419863561e-06, "loss": 0.0247, "reward": 2.078035593032837, "reward_std": 1.5830367803573608, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7969645261764526, "rewards/wrapped_format_reward": 0.875, "step": 424 }, { "completion_length": 500.0, "epoch": 85.0, "grad_norm": 0.6221134066581726, "kl": 0.36543771624565125, "learning_rate": 4.59370088737827e-06, "loss": 0.0146, "reward": 3.0459718704223633, "reward_std": 0.3525036573410034, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.19597166776657104, "rewards/wrapped_format_reward": 0.875, "step": 425 }, { "completion_length": 471.0, "epoch": 85.2, "grad_norm": 1.1358462572097778, "kl": 1.153946042060852, "learning_rate": 4.590715389619399e-06, "loss": 0.0462, "reward": 2.3951056003570557, "reward_std": 0.31768667697906494, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.22989457845687866, "rewards/wrapped_format_reward": 0.625, "step": 426 }, { "completion_length": 500.0, "epoch": 85.4, "grad_norm": 0.6305968761444092, "kl": 0.5702832341194153, "learning_rate": 4.587719940796858e-06, "loss": 0.0228, "reward": 1.2231757640838623, "reward_std": 1.9306583404541016, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.5268242359161377, "rewards/wrapped_format_reward": 0.75, "step": 427 }, { "completion_length": 500.0, "epoch": 85.6, "grad_norm": 0.8879334330558777, "kl": 0.9042296409606934, "learning_rate": 4.584714555167921e-06, "loss": 0.0362, "reward": 3.215460777282715, "reward_std": 0.508735716342926, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9444444179534912, "rewards/wrapped_driving_reward": 0.39601635932922363, "rewards/wrapped_format_reward": 0.875, "step": 428 }, { "completion_length": 500.0, "epoch": 85.8, "grad_norm": 0.4813222289085388, "kl": 0.408966988325119, "learning_rate": 4.581699247037157e-06, "loss": 0.0164, "reward": 2.7956860065460205, "reward_std": 0.7762725353240967, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": 0.11711461842060089, "rewards/wrapped_format_reward": 0.75, "step": 429 }, { "completion_length": 500.0, "epoch": 86.0, "grad_norm": 0.5622531771659851, "kl": 0.9757765531539917, "learning_rate": 4.578674030756364e-06, "loss": 0.039, "reward": 3.035037040710449, "reward_std": 0.4434798061847687, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.21003687381744385, "rewards/wrapped_format_reward": 0.875, "step": 430 }, { "completion_length": 488.0, "epoch": 86.2, "grad_norm": 0.499776154756546, "kl": 1.332625389099121, "learning_rate": 4.5756389207244965e-06, "loss": 0.0533, "reward": 3.2134695053100586, "reward_std": 0.827461302280426, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.5259695053100586, "rewards/wrapped_format_reward": 0.75, "step": 431 }, { "completion_length": 500.0, "epoch": 86.4, "grad_norm": 0.4789699912071228, "kl": 0.47637441754341125, "learning_rate": 4.572593931387604e-06, "loss": 0.0191, "reward": 2.457927703857422, "reward_std": 0.9853748083114624, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.3545722961425781, "rewards/wrapped_format_reward": 0.875, "step": 432 }, { "completion_length": 500.0, "epoch": 86.6, "grad_norm": 0.4391123354434967, "kl": 1.1674070358276367, "learning_rate": 4.569539077238756e-06, "loss": 0.0467, "reward": 2.6357340812683105, "reward_std": 0.510990560054779, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.13573402166366577, "rewards/wrapped_format_reward": 0.5, "step": 433 }, { "completion_length": 500.0, "epoch": 86.8, "grad_norm": 0.5584562420845032, "kl": 0.7644482851028442, "learning_rate": 4.566474372817971e-06, "loss": 0.0306, "reward": 2.366978645324707, "reward_std": 0.4779915511608124, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.13302119076251984, "rewards/wrapped_format_reward": 0.5, "step": 434 }, { "completion_length": 500.0, "epoch": 87.0, "grad_norm": 0.5053340792655945, "kl": 0.5127562284469604, "learning_rate": 4.5633998327121595e-06, "loss": 0.0205, "reward": 2.978019952774048, "reward_std": 0.3388763666152954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.02198006771504879, "rewards/wrapped_format_reward": 1.0, "step": 435 }, { "completion_length": 500.0, "epoch": 87.2, "grad_norm": 0.5881168246269226, "kl": 0.8654472231864929, "learning_rate": 4.560315471555039e-06, "loss": 0.0346, "reward": 2.695511817932129, "reward_std": 0.2650168240070343, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17948828637599945, "rewards/wrapped_format_reward": 0.875, "step": 436 }, { "completion_length": 500.0, "epoch": 87.4, "grad_norm": 0.5051465630531311, "kl": 0.7103445529937744, "learning_rate": 4.557221304027077e-06, "loss": 0.0284, "reward": 0.450472354888916, "reward_std": 2.2633330821990967, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.924527645111084, "rewards/wrapped_format_reward": 0.375, "step": 437 }, { "completion_length": 500.0, "epoch": 87.6, "grad_norm": 0.8089684247970581, "kl": 0.7311402559280396, "learning_rate": 4.55411734485541e-06, "loss": 0.0292, "reward": 3.2812561988830566, "reward_std": 0.32883548736572266, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4062563478946686, "rewards/wrapped_format_reward": 0.875, "step": 438 }, { "completion_length": 500.0, "epoch": 87.8, "grad_norm": 0.5592673420906067, "kl": 0.6468991041183472, "learning_rate": 4.551003608813784e-06, "loss": 0.0259, "reward": 1.8620704412460327, "reward_std": 0.6889193058013916, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.8629295229911804, "rewards/wrapped_format_reward": 0.75, "step": 439 }, { "completion_length": 500.0, "epoch": 88.0, "grad_norm": 0.5054028630256653, "kl": 0.36086270213127136, "learning_rate": 4.54788011072248e-06, "loss": 0.0144, "reward": 3.3388941287994385, "reward_std": 0.5939581394195557, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8388940691947937, "rewards/wrapped_format_reward": 0.5, "step": 440 }, { "completion_length": 500.0, "epoch": 88.2, "grad_norm": 9.32406997680664, "kl": 2.2713303565979004, "learning_rate": 4.544746865448239e-06, "loss": 0.0909, "reward": 1.8941614627838135, "reward_std": 2.3242433071136475, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7308385968208313, "rewards/wrapped_format_reward": 0.625, "step": 441 }, { "completion_length": 500.0, "epoch": 88.4, "grad_norm": 0.7286486625671387, "kl": 0.23764638602733612, "learning_rate": 4.541603887904198e-06, "loss": 0.0095, "reward": 1.595595121383667, "reward_std": 3.398191213607788, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": -0.43565481901168823, "rewards/wrapped_format_reward": 0.625, "step": 442 }, { "completion_length": 472.0, "epoch": 88.6, "grad_norm": 0.4791865944862366, "kl": 0.853591799736023, "learning_rate": 4.538451193049814e-06, "loss": 0.0341, "reward": 2.5504026412963867, "reward_std": 0.576043963432312, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4495972692966461, "rewards/wrapped_format_reward": 1.0, "step": 443 }, { "completion_length": 500.0, "epoch": 88.8, "grad_norm": 0.5535397529602051, "kl": 0.1802796721458435, "learning_rate": 4.535288795890799e-06, "loss": 0.0072, "reward": -0.1438617706298828, "reward_std": 3.8885178565979004, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -1.593861699104309, "rewards/wrapped_format_reward": 0.5, "step": 444 }, { "completion_length": 500.0, "epoch": 89.0, "grad_norm": 0.5480430126190186, "kl": 0.5194934010505676, "learning_rate": 4.532116711479039e-06, "loss": 0.0208, "reward": 2.7844161987304688, "reward_std": 0.34969019889831543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.0905836746096611, "rewards/wrapped_format_reward": 0.875, "step": 445 }, { "completion_length": 500.0, "epoch": 89.2, "grad_norm": 7.007938861846924, "kl": 1.402961015701294, "learning_rate": 4.528934954912531e-06, "loss": 0.0561, "reward": -0.7073632478713989, "reward_std": 2.505068778991699, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -3.040696620941162, "rewards/wrapped_format_reward": 0.875, "step": 446 }, { "completion_length": 500.0, "epoch": 89.4, "grad_norm": 0.558776319026947, "kl": 1.3707541227340698, "learning_rate": 4.525743541335309e-06, "loss": 0.0548, "reward": 2.7044730186462402, "reward_std": 0.5199458003044128, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2955269515514374, "rewards/wrapped_format_reward": 1.0, "step": 447 }, { "completion_length": 500.0, "epoch": 89.6, "grad_norm": 0.46237713098526, "kl": 0.986226499080658, "learning_rate": 4.522542485937369e-06, "loss": 0.0394, "reward": 3.5677292346954346, "reward_std": 0.30159273743629456, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.817729115486145, "rewards/wrapped_format_reward": 0.75, "step": 448 }, { "completion_length": 500.0, "epoch": 89.8, "grad_norm": 0.6039856672286987, "kl": 0.5541295409202576, "learning_rate": 4.519331803954599e-06, "loss": 0.0222, "reward": 2.4867310523986816, "reward_std": 0.18556839227676392, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9147727489471436, "rewards/wrapped_driving_reward": -0.17804156243801117, "rewards/wrapped_format_reward": 0.75, "step": 449 }, { "completion_length": 500.0, "epoch": 90.0, "grad_norm": 0.6416642665863037, "kl": 0.5888490676879883, "learning_rate": 4.516111510668707e-06, "loss": 0.0236, "reward": 3.0717105865478516, "reward_std": 0.9419719576835632, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5717105865478516, "rewards/wrapped_format_reward": 0.5, "step": 450 }, { "completion_length": 500.0, "epoch": 90.2, "grad_norm": 0.8580331802368164, "kl": 1.3547351360321045, "learning_rate": 4.512881621407146e-06, "loss": 0.0542, "reward": 1.4438624382019043, "reward_std": 3.3256571292877197, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9311375021934509, "rewards/wrapped_format_reward": 0.875, "step": 451 }, { "completion_length": 500.0, "epoch": 90.4, "grad_norm": 0.6190384030342102, "kl": 0.4166165888309479, "learning_rate": 4.509642151543043e-06, "loss": 0.0167, "reward": 1.4482831954956055, "reward_std": 3.3409366607666016, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9267167448997498, "rewards/wrapped_format_reward": 0.875, "step": 452 }, { "completion_length": 470.0, "epoch": 90.6, "grad_norm": 0.9551142454147339, "kl": 0.9795337915420532, "learning_rate": 4.506393116495128e-06, "loss": 0.0392, "reward": 3.8107566833496094, "reward_std": 0.042369965463876724, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.835756778717041, "rewards/wrapped_format_reward": 1.0, "step": 453 }, { "completion_length": 500.0, "epoch": 90.8, "grad_norm": 0.532974362373352, "kl": 0.43851980566978455, "learning_rate": 4.503134531727652e-06, "loss": 0.0175, "reward": 2.5775976181030273, "reward_std": 0.30674323439598083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17240235209465027, "rewards/wrapped_format_reward": 0.75, "step": 454 }, { "completion_length": 500.0, "epoch": 91.0, "grad_norm": 0.5811918377876282, "kl": 0.46373996138572693, "learning_rate": 4.499866412750324e-06, "loss": 0.0185, "reward": 3.5562117099761963, "reward_std": 0.12775082886219025, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5562118291854858, "rewards/wrapped_format_reward": 1.0, "step": 455 }, { "completion_length": 500.0, "epoch": 91.2, "grad_norm": 0.53252774477005, "kl": 0.653654158115387, "learning_rate": 4.496588775118232e-06, "loss": 0.0261, "reward": 1.345137119293213, "reward_std": 1.6006327867507935, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -1.2548627853393555, "rewards/wrapped_format_reward": 0.625, "step": 456 }, { "completion_length": 500.0, "epoch": 91.4, "grad_norm": 0.6192057728767395, "kl": 0.8161628842353821, "learning_rate": 4.493301634431768e-06, "loss": 0.0326, "reward": 3.238480567932129, "reward_std": 0.6025396585464478, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.513480544090271, "rewards/wrapped_format_reward": 0.75, "step": 457 }, { "completion_length": 500.0, "epoch": 91.6, "grad_norm": 0.5338960289955139, "kl": 0.4019594192504883, "learning_rate": 4.490005006336555e-06, "loss": 0.0161, "reward": 1.511120080947876, "reward_std": 3.7101433277130127, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6138800382614136, "rewards/wrapped_format_reward": 0.625, "step": 458 }, { "completion_length": 500.0, "epoch": 91.8, "grad_norm": 0.6177345514297485, "kl": 0.7023136615753174, "learning_rate": 4.486698906523375e-06, "loss": 0.0281, "reward": 0.6668235063552856, "reward_std": 2.7802770137786865, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7022727131843567, "rewards/wrapped_driving_reward": -1.1604492664337158, "rewards/wrapped_format_reward": 0.375, "step": 459 }, { "completion_length": 500.0, "epoch": 92.0, "grad_norm": 0.498141884803772, "kl": 0.7440797686576843, "learning_rate": 4.4833833507280884e-06, "loss": 0.0298, "reward": 1.770745038986206, "reward_std": 2.273341178894043, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.854254961013794, "rewards/wrapped_format_reward": 0.625, "step": 460 }, { "completion_length": 500.0, "epoch": 92.2, "grad_norm": 0.5320531725883484, "kl": 0.5814719796180725, "learning_rate": 4.4800583547315654e-06, "loss": 0.0233, "reward": -0.1779249906539917, "reward_std": 1.991480827331543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -3.0529251098632812, "rewards/wrapped_format_reward": 0.875, "step": 461 }, { "completion_length": 500.0, "epoch": 92.4, "grad_norm": 0.6644296646118164, "kl": 1.252918004989624, "learning_rate": 4.476723934359609e-06, "loss": 0.0501, "reward": 1.581032395362854, "reward_std": 3.7278060913085938, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6689676642417908, "rewards/wrapped_format_reward": 0.75, "step": 462 }, { "completion_length": 500.0, "epoch": 92.6, "grad_norm": 0.6294512748718262, "kl": 0.454416960477829, "learning_rate": 4.473380105482875e-06, "loss": 0.0182, "reward": 0.13677752017974854, "reward_std": 2.944185733795166, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.113222360610962, "rewards/wrapped_format_reward": 0.75, "step": 463 }, { "completion_length": 500.0, "epoch": 92.8, "grad_norm": 0.5402928590774536, "kl": 0.8170625567436218, "learning_rate": 4.470026884016805e-06, "loss": 0.0327, "reward": 3.565312385559082, "reward_std": 0.3042621612548828, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8153125047683716, "rewards/wrapped_format_reward": 0.75, "step": 464 }, { "completion_length": 500.0, "epoch": 93.0, "grad_norm": 0.533676266670227, "kl": 0.7118498682975769, "learning_rate": 4.466664285921543e-06, "loss": 0.0285, "reward": 2.770576000213623, "reward_std": 0.16069670021533966, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454978942871, "rewards/wrapped_driving_reward": -0.05896963179111481, "rewards/wrapped_format_reward": 0.875, "step": 465 }, { "completion_length": 500.0, "epoch": 93.2, "grad_norm": 0.5372627377510071, "kl": 1.061421275138855, "learning_rate": 4.463292327201862e-06, "loss": 0.0425, "reward": 0.8557369112968445, "reward_std": 2.914240598678589, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3942632675170898, "rewards/wrapped_format_reward": 0.75, "step": 466 }, { "completion_length": 500.0, "epoch": 93.4, "grad_norm": 0.6193510293960571, "kl": 1.1515281200408936, "learning_rate": 4.459911023907092e-06, "loss": 0.0461, "reward": 3.183587074279785, "reward_std": 0.39359328150749207, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9472222328186035, "rewards/wrapped_driving_reward": 0.4863646626472473, "rewards/wrapped_format_reward": 0.75, "step": 467 }, { "completion_length": 500.0, "epoch": 93.6, "grad_norm": 0.5153403878211975, "kl": 0.6276007890701294, "learning_rate": 4.456520392131035e-06, "loss": 0.0251, "reward": 1.2580312490463257, "reward_std": 2.2587177753448486, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.6169687509536743, "rewards/wrapped_format_reward": 0.875, "step": 468 }, { "completion_length": 500.0, "epoch": 93.8, "grad_norm": 0.8933354616165161, "kl": 0.5966981053352356, "learning_rate": 4.453120448011897e-06, "loss": 0.0239, "reward": 3.828160285949707, "reward_std": 0.019492290914058685, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.828160285949707, "rewards/wrapped_format_reward": 1.0, "step": 469 }, { "completion_length": 500.0, "epoch": 94.0, "grad_norm": 0.5882790684700012, "kl": 0.6073835492134094, "learning_rate": 4.4497112077322045e-06, "loss": 0.0243, "reward": 2.9608585834503174, "reward_std": 0.5855686664581299, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": 0.2316918969154358, "rewards/wrapped_format_reward": 0.75, "step": 470 }, { "completion_length": 500.0, "epoch": 94.2, "grad_norm": 0.5647352337837219, "kl": 0.695755660533905, "learning_rate": 4.446292687518734e-06, "loss": 0.0278, "reward": 1.5310817956924438, "reward_std": 3.7384822368621826, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6785714626312256, "rewards/wrapped_driving_reward": -0.39748966693878174, "rewards/wrapped_format_reward": 0.5, "step": 471 }, { "completion_length": 500.0, "epoch": 94.4, "grad_norm": 0.5346417427062988, "kl": 0.8996866345405579, "learning_rate": 4.442864903642428e-06, "loss": 0.036, "reward": 1.0604596138000488, "reward_std": 3.060889959335327, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.0645403861999512, "rewards/wrapped_format_reward": 0.75, "step": 472 }, { "completion_length": 500.0, "epoch": 94.6, "grad_norm": 0.5798156261444092, "kl": 0.6437966227531433, "learning_rate": 4.439427872418321e-06, "loss": 0.0258, "reward": 3.384183168411255, "reward_std": 0.42769670486450195, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5091832280158997, "rewards/wrapped_format_reward": 0.875, "step": 473 }, { "completion_length": 500.0, "epoch": 94.8, "grad_norm": 0.49952396750450134, "kl": 0.5218140482902527, "learning_rate": 4.435981610205464e-06, "loss": 0.0209, "reward": -0.285653293132782, "reward_std": 2.212059497833252, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -2.9939866065979004, "rewards/wrapped_format_reward": 0.75, "step": 474 }, { "completion_length": 500.0, "epoch": 95.0, "grad_norm": 1.0312459468841553, "kl": 0.6847164630889893, "learning_rate": 4.432526133406843e-06, "loss": 0.0274, "reward": 2.879366636276245, "reward_std": 0.39730778336524963, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2543666362762451, "rewards/wrapped_format_reward": 0.625, "step": 475 }, { "completion_length": 500.0, "epoch": 95.2, "grad_norm": 0.582595705986023, "kl": 0.599985659122467, "learning_rate": 4.4290614584693005e-06, "loss": 0.024, "reward": 0.7622314691543579, "reward_std": 2.525625228881836, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.237768530845642, "rewards/wrapped_format_reward": 0.5, "step": 476 }, { "completion_length": 500.0, "epoch": 95.4, "grad_norm": 0.5313723683357239, "kl": 0.868817925453186, "learning_rate": 4.425587601883461e-06, "loss": 0.0348, "reward": 2.211709976196289, "reward_std": 0.4423002600669861, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -0.3820401132106781, "rewards/wrapped_format_reward": 0.625, "step": 477 }, { "completion_length": 500.0, "epoch": 95.6, "grad_norm": 0.5549596548080444, "kl": 1.2767629623413086, "learning_rate": 4.422104580183649e-06, "loss": 0.0511, "reward": 3.195434331893921, "reward_std": 0.395979642868042, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.4454343914985657, "rewards/wrapped_format_reward": 0.875, "step": 478 }, { "completion_length": 500.0, "epoch": 95.8, "grad_norm": 0.4778159260749817, "kl": 0.23531889915466309, "learning_rate": 4.418612409947814e-06, "loss": 0.0094, "reward": 2.747316360473633, "reward_std": 0.6012102961540222, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.122316375374794, "rewards/wrapped_format_reward": 0.625, "step": 479 }, { "completion_length": 500.0, "epoch": 96.0, "grad_norm": 0.6010138392448425, "kl": 0.6477577090263367, "learning_rate": 4.415111107797445e-06, "loss": 0.0259, "reward": 1.7465873956680298, "reward_std": 3.5261001586914062, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3784126043319702, "rewards/wrapped_format_reward": 0.625, "step": 480 }, { "completion_length": 476.0, "epoch": 96.2, "grad_norm": 0.4925939440727234, "kl": 1.1028554439544678, "learning_rate": 4.4116006903975015e-06, "loss": 0.0441, "reward": 2.4060165882110596, "reward_std": 0.4904329478740692, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": -0.20335841178894043, "rewards/wrapped_format_reward": 0.625, "step": 481 }, { "completion_length": 467.0, "epoch": 96.4, "grad_norm": 0.5640705823898315, "kl": 1.066386103630066, "learning_rate": 4.408081174456322e-06, "loss": 0.0427, "reward": 3.762505292892456, "reward_std": 0.12346359342336655, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.825005292892456, "rewards/wrapped_format_reward": 1.0, "step": 482 }, { "completion_length": 500.0, "epoch": 96.6, "grad_norm": 0.5508888959884644, "kl": 0.3224889039993286, "learning_rate": 4.404552576725557e-06, "loss": 0.0129, "reward": 1.1607617139816284, "reward_std": 3.1784019470214844, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7142382860183716, "rewards/wrapped_format_reward": 0.375, "step": 483 }, { "completion_length": 500.0, "epoch": 96.8, "grad_norm": 0.5421119332313538, "kl": 0.2217361032962799, "learning_rate": 4.401014914000078e-06, "loss": 0.0089, "reward": 1.3363996744155884, "reward_std": 3.2391209602355957, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7886003255844116, "rewards/wrapped_format_reward": 0.625, "step": 484 }, { "completion_length": 403.0, "epoch": 97.0, "grad_norm": 0.5398365259170532, "kl": 1.3086752891540527, "learning_rate": 4.397468203117905e-06, "loss": 0.0523, "reward": 2.178347110748291, "reward_std": 2.1330792903900146, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.571652889251709, "rewards/wrapped_format_reward": 0.75, "step": 485 }, { "completion_length": 500.0, "epoch": 97.2, "grad_norm": 0.5164806246757507, "kl": 1.1992006301879883, "learning_rate": 4.393912460960125e-06, "loss": 0.048, "reward": -0.017039000988006592, "reward_std": 1.9406579732894897, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.869949460029602, "rewards/wrapped_driving_reward": -2.386988401412964, "rewards/wrapped_format_reward": 0.5, "step": 486 }, { "completion_length": 447.0, "epoch": 97.4, "grad_norm": 0.5225284695625305, "kl": 0.9045883417129517, "learning_rate": 4.3903477044508066e-06, "loss": 0.0362, "reward": 3.6854019165039062, "reward_std": 0.25948864221572876, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8104017972946167, "rewards/wrapped_format_reward": 0.875, "step": 487 }, { "completion_length": 500.0, "epoch": 97.6, "grad_norm": 0.5833526253700256, "kl": 0.5774132013320923, "learning_rate": 4.386773950556931e-06, "loss": 0.0231, "reward": 3.013421058654785, "reward_std": 0.4619700014591217, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.18008768558502197, "rewards/wrapped_format_reward": 0.875, "step": 488 }, { "completion_length": 500.0, "epoch": 97.8, "grad_norm": 0.4881477653980255, "kl": 0.38699668645858765, "learning_rate": 4.3831912162882946e-06, "loss": 0.0155, "reward": 0.5115635395050049, "reward_std": 1.8448858261108398, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8522727489471436, "rewards/wrapped_driving_reward": -1.9657092094421387, "rewards/wrapped_format_reward": 0.625, "step": 489 }, { "completion_length": 500.0, "epoch": 98.0, "grad_norm": 0.5169448852539062, "kl": 1.3162471055984497, "learning_rate": 4.379599518697444e-06, "loss": 0.0526, "reward": 3.255016803741455, "reward_std": 0.4777297377586365, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.5050168633460999, "rewards/wrapped_format_reward": 0.875, "step": 490 }, { "completion_length": 500.0, "epoch": 98.2, "grad_norm": 0.553719699382782, "kl": 0.4232349097728729, "learning_rate": 4.375998874879585e-06, "loss": 0.0169, "reward": -2.375, "reward_std": 1.6007810831069946, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 491 }, { "completion_length": 500.0, "epoch": 98.4, "grad_norm": 0.549466073513031, "kl": 0.9794742465019226, "learning_rate": 4.372389301972506e-06, "loss": 0.0392, "reward": 2.8539419174194336, "reward_std": 0.11372269690036774, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.14605820178985596, "rewards/wrapped_format_reward": 1.0, "step": 492 }, { "completion_length": 500.0, "epoch": 98.6, "grad_norm": 0.4947136342525482, "kl": 0.7255305051803589, "learning_rate": 4.368770817156493e-06, "loss": 0.029, "reward": 1.4876158237457275, "reward_std": 3.332230567932129, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3873841464519501, "rewards/wrapped_format_reward": 0.375, "step": 493 }, { "completion_length": 480.0, "epoch": 98.8, "grad_norm": 0.60732501745224, "kl": 0.8074931502342224, "learning_rate": 4.365143437654249e-06, "loss": 0.0323, "reward": 3.7022383213043213, "reward_std": 0.10065864771604538, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.7334883213043213, "rewards/wrapped_format_reward": 1.0, "step": 494 }, { "completion_length": 471.0, "epoch": 99.0, "grad_norm": 0.5607241988182068, "kl": 0.9837010502815247, "learning_rate": 4.3615071807308165e-06, "loss": 0.0393, "reward": 2.795694589614868, "reward_std": 0.325857937335968, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04569460451602936, "rewards/wrapped_format_reward": 0.75, "step": 495 }, { "completion_length": 349.0, "epoch": 99.2, "grad_norm": 0.5761314034461975, "kl": 0.4112725853919983, "learning_rate": 4.357862063693486e-06, "loss": 0.0165, "reward": 3.6457560062408447, "reward_std": 0.12520428001880646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": 0.6613809466362, "rewards/wrapped_format_reward": 1.0, "step": 496 }, { "completion_length": 500.0, "epoch": 99.4, "grad_norm": 0.4828655421733856, "kl": 0.8910616636276245, "learning_rate": 4.354208103891723e-06, "loss": 0.0356, "reward": 3.4025442600250244, "reward_std": 0.550991415977478, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.6837941408157349, "rewards/wrapped_format_reward": 0.75, "step": 497 }, { "completion_length": 500.0, "epoch": 99.6, "grad_norm": 3.4703691005706787, "kl": 0.9643664360046387, "learning_rate": 4.350545318717081e-06, "loss": 0.0386, "reward": 2.7853288650512695, "reward_std": 0.7604644894599915, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.03532897308468819, "rewards/wrapped_format_reward": 0.75, "step": 498 }, { "completion_length": 500.0, "epoch": 99.8, "grad_norm": 0.5264486074447632, "kl": 0.6139304041862488, "learning_rate": 4.3468737256031155e-06, "loss": 0.0246, "reward": 1.6845433712005615, "reward_std": 0.8301387429237366, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1904566287994385, "rewards/wrapped_format_reward": 0.875, "step": 499 }, { "completion_length": 419.0, "epoch": 100.0, "grad_norm": 0.5735057592391968, "kl": 0.41647979617118835, "learning_rate": 4.34319334202531e-06, "loss": 0.0167, "reward": 2.6967225074768066, "reward_std": 0.0777384340763092, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9437500238418579, "rewards/wrapped_driving_reward": -0.24702748656272888, "rewards/wrapped_format_reward": 1.0, "step": 500 }, { "completion_length": 500.0, "epoch": 100.2, "grad_norm": 0.7586433291435242, "kl": 0.7535019516944885, "learning_rate": 4.339504185500984e-06, "loss": 0.0301, "reward": 3.8102352619171143, "reward_std": 0.015372190624475479, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.810235321521759, "rewards/wrapped_format_reward": 1.0, "step": 501 }, { "completion_length": 466.0, "epoch": 100.4, "grad_norm": 0.6329609751701355, "kl": 1.2336125373840332, "learning_rate": 4.335806273589214e-06, "loss": 0.0493, "reward": 1.1259068250656128, "reward_std": 3.417491912841797, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1240931749343872, "rewards/wrapped_format_reward": 0.75, "step": 502 }, { "completion_length": 500.0, "epoch": 100.6, "grad_norm": 0.5284588932991028, "kl": 1.2016366720199585, "learning_rate": 4.332099623890749e-06, "loss": 0.0481, "reward": 2.9199860095977783, "reward_std": 0.41995736956596375, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04498597979545593, "rewards/wrapped_format_reward": 0.875, "step": 503 }, { "completion_length": 371.0, "epoch": 100.8, "grad_norm": 0.6299885511398315, "kl": 0.4117721617221832, "learning_rate": 4.328384254047927e-06, "loss": 0.0165, "reward": 3.296032428741455, "reward_std": 0.5489058494567871, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5460324287414551, "rewards/wrapped_format_reward": 0.75, "step": 504 }, { "completion_length": 487.0, "epoch": 101.0, "grad_norm": 0.5148439407348633, "kl": 0.7591438889503479, "learning_rate": 4.324660181744589e-06, "loss": 0.0304, "reward": 2.263432502746582, "reward_std": 1.198183536529541, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3615674376487732, "rewards/wrapped_format_reward": 0.625, "step": 505 }, { "completion_length": 500.0, "epoch": 101.2, "grad_norm": 0.5383084416389465, "kl": 0.8281757235527039, "learning_rate": 4.320927424706001e-06, "loss": 0.0331, "reward": 2.926753520965576, "reward_std": 0.5967720746994019, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.17675352096557617, "rewards/wrapped_format_reward": 0.75, "step": 506 }, { "completion_length": 500.0, "epoch": 101.4, "grad_norm": 0.5339255332946777, "kl": 0.8783241510391235, "learning_rate": 4.317186000698761e-06, "loss": 0.0351, "reward": 2.7582178115844727, "reward_std": 0.5562581419944763, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2582179605960846, "rewards/wrapped_format_reward": 0.5, "step": 507 }, { "completion_length": 500.0, "epoch": 101.6, "grad_norm": 0.5230175256729126, "kl": 0.7820099592208862, "learning_rate": 4.313435927530719e-06, "loss": 0.0313, "reward": 1.851464867591858, "reward_std": 0.8467382788658142, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6485350728034973, "rewards/wrapped_format_reward": 0.5, "step": 508 }, { "completion_length": 500.0, "epoch": 101.8, "grad_norm": 0.4553928077220917, "kl": 0.7339428067207336, "learning_rate": 4.309677223050895e-06, "loss": 0.0294, "reward": 1.0629948377609253, "reward_std": 3.375459671020508, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1870051622390747, "rewards/wrapped_format_reward": 0.75, "step": 509 }, { "completion_length": 500.0, "epoch": 102.0, "grad_norm": 6.9111127853393555, "kl": 2.688588857650757, "learning_rate": 4.305909905149389e-06, "loss": 0.1075, "reward": 1.5133618116378784, "reward_std": 3.677501916885376, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7366381287574768, "rewards/wrapped_format_reward": 0.75, "step": 510 }, { "completion_length": 500.0, "epoch": 102.2, "grad_norm": 0.6580697298049927, "kl": 1.4310871362686157, "learning_rate": 4.3021339917572975e-06, "loss": 0.0572, "reward": 0.9614809155464172, "reward_std": 3.113192558288574, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1635191440582275, "rewards/wrapped_format_reward": 0.625, "step": 511 }, { "completion_length": 500.0, "epoch": 102.4, "grad_norm": 0.5757476091384888, "kl": 1.0578932762145996, "learning_rate": 4.2983495008466285e-06, "loss": 0.0423, "reward": 1.5223838090896606, "reward_std": 3.3499815464019775, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8526161909103394, "rewards/wrapped_format_reward": 0.875, "step": 512 }, { "completion_length": 500.0, "epoch": 102.6, "grad_norm": 0.5535104870796204, "kl": 1.4569833278656006, "learning_rate": 4.294556450430216e-06, "loss": 0.0583, "reward": 2.8807883262634277, "reward_std": 0.7175414562225342, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.25578856468200684, "rewards/wrapped_format_reward": 0.625, "step": 513 }, { "completion_length": 386.0, "epoch": 102.8, "grad_norm": 0.538215696811676, "kl": 0.8643997311592102, "learning_rate": 4.290754858561636e-06, "loss": 0.0346, "reward": 3.3438968658447266, "reward_std": 0.5325138568878174, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9272727370262146, "rewards/wrapped_driving_reward": 0.4166242182254791, "rewards/wrapped_format_reward": 1.0, "step": 514 }, { "completion_length": 500.0, "epoch": 103.0, "grad_norm": 0.5776817798614502, "kl": 1.1718462705612183, "learning_rate": 4.2869447433351165e-06, "loss": 0.0469, "reward": 1.0269173383712769, "reward_std": 3.018467426300049, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -1.3253554105758667, "rewards/wrapped_format_reward": 0.875, "step": 515 }, { "completion_length": 500.0, "epoch": 103.2, "grad_norm": 0.5751096606254578, "kl": 1.1135950088500977, "learning_rate": 4.283126122885455e-06, "loss": 0.0445, "reward": 2.5814623832702637, "reward_std": 0.2912139296531677, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.16853773593902588, "rewards/wrapped_format_reward": 0.75, "step": 516 }, { "completion_length": 500.0, "epoch": 103.4, "grad_norm": 0.6097070574760437, "kl": 1.033371090888977, "learning_rate": 4.2792990153879286e-06, "loss": 0.0413, "reward": 3.570962905883789, "reward_std": 0.47861868143081665, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8209629058837891, "rewards/wrapped_format_reward": 0.75, "step": 517 }, { "completion_length": 500.0, "epoch": 103.6, "grad_norm": 0.5641130805015564, "kl": 0.7364106774330139, "learning_rate": 4.275463439058214e-06, "loss": 0.0295, "reward": 2.555635929107666, "reward_std": 1.0865808725357056, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.06936385482549667, "rewards/wrapped_format_reward": 0.625, "step": 518 }, { "completion_length": 500.0, "epoch": 103.8, "grad_norm": 0.8291902542114258, "kl": 1.3542972803115845, "learning_rate": 4.271619412152293e-06, "loss": 0.0542, "reward": 0.07205808162689209, "reward_std": 3.21856689453125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.0529417991638184, "rewards/wrapped_format_reward": 0.625, "step": 519 }, { "completion_length": 500.0, "epoch": 104.0, "grad_norm": 0.497583270072937, "kl": 1.1503536701202393, "learning_rate": 4.267766952966369e-06, "loss": 0.046, "reward": 3.3464627265930176, "reward_std": 0.23250795900821686, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4714627265930176, "rewards/wrapped_format_reward": 0.875, "step": 520 }, { "completion_length": 500.0, "epoch": 104.2, "grad_norm": 0.510185182094574, "kl": 0.9102869033813477, "learning_rate": 4.2639060798367835e-06, "loss": 0.0364, "reward": 3.5760674476623535, "reward_std": 0.17436249554157257, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.7323175668716431, "rewards/wrapped_format_reward": 0.875, "step": 521 }, { "completion_length": 500.0, "epoch": 104.4, "grad_norm": 0.6613531708717346, "kl": 0.6623646020889282, "learning_rate": 4.260036811139922e-06, "loss": 0.0265, "reward": 0.3890085220336914, "reward_std": 3.5544650554656982, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.8609914779663086, "rewards/wrapped_format_reward": 0.75, "step": 522 }, { "completion_length": 500.0, "epoch": 104.6, "grad_norm": 0.6660361886024475, "kl": 1.1192728281021118, "learning_rate": 4.25615916529213e-06, "loss": 0.0448, "reward": 3.0469722747802734, "reward_std": 0.6083049178123474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": 0.36840105056762695, "rewards/wrapped_format_reward": 0.75, "step": 523 }, { "completion_length": 500.0, "epoch": 104.8, "grad_norm": 0.5429725646972656, "kl": 1.0266575813293457, "learning_rate": 4.2522731607496275e-06, "loss": 0.0411, "reward": 2.9466092586517334, "reward_std": 0.3607306480407715, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.05339070409536362, "rewards/wrapped_format_reward": 1.0, "step": 524 }, { "completion_length": 500.0, "epoch": 105.0, "grad_norm": 0.5302778482437134, "kl": 1.0682640075683594, "learning_rate": 4.248378816008418e-06, "loss": 0.0427, "reward": 3.572202205657959, "reward_std": 0.28779879212379456, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8222021460533142, "rewards/wrapped_format_reward": 0.75, "step": 525 }, { "completion_length": 500.0, "epoch": 105.2, "grad_norm": 0.6403073072433472, "kl": 1.3778810501098633, "learning_rate": 4.244476149604201e-06, "loss": 0.0551, "reward": 1.7839674949645996, "reward_std": 3.200443744659424, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7160324454307556, "rewards/wrapped_format_reward": 1.0, "step": 526 }, { "completion_length": 500.0, "epoch": 105.4, "grad_norm": 0.48363983631134033, "kl": 0.6617292761802673, "learning_rate": 4.2405651801122835e-06, "loss": 0.0265, "reward": 0.5352749824523926, "reward_std": 2.7295193672180176, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5897250175476074, "rewards/wrapped_format_reward": 0.625, "step": 527 }, { "completion_length": 454.0, "epoch": 105.6, "grad_norm": 0.49991506338119507, "kl": 1.0598442554473877, "learning_rate": 4.236645926147493e-06, "loss": 0.0424, "reward": 3.2137718200683594, "reward_std": 0.2735251486301422, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.935606062412262, "rewards/wrapped_driving_reward": 0.40316566824913025, "rewards/wrapped_format_reward": 0.875, "step": 528 }, { "completion_length": 427.0, "epoch": 105.8, "grad_norm": 0.5063905119895935, "kl": 0.9304898977279663, "learning_rate": 4.2327184063640905e-06, "loss": 0.0372, "reward": 2.7854347229003906, "reward_std": 0.10556995123624802, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -0.16456523537635803, "rewards/wrapped_format_reward": 1.0, "step": 529 }, { "completion_length": 500.0, "epoch": 106.0, "grad_norm": 0.5251207947731018, "kl": 0.7667698264122009, "learning_rate": 4.228782639455674e-06, "loss": 0.0307, "reward": 3.4631171226501465, "reward_std": 0.4566783905029297, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7131170630455017, "rewards/wrapped_format_reward": 0.75, "step": 530 }, { "completion_length": 500.0, "epoch": 106.2, "grad_norm": 0.7626741528511047, "kl": 0.8085213303565979, "learning_rate": 4.224838644155099e-06, "loss": 0.0323, "reward": 2.4170961380004883, "reward_std": 0.5227943062782288, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20790360867977142, "rewards/wrapped_format_reward": 0.625, "step": 531 }, { "completion_length": 423.0, "epoch": 106.4, "grad_norm": 0.5680715441703796, "kl": 0.6939449310302734, "learning_rate": 4.220886439234385e-06, "loss": 0.0278, "reward": 2.497769355773926, "reward_std": 0.5466521978378296, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1272306740283966, "rewards/wrapped_format_reward": 0.625, "step": 532 }, { "completion_length": 500.0, "epoch": 106.6, "grad_norm": 0.6183933615684509, "kl": 1.1602180004119873, "learning_rate": 4.216926043504626e-06, "loss": 0.0464, "reward": 2.2846357822418213, "reward_std": 2.229444980621338, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.46536412835121155, "rewards/wrapped_format_reward": 0.75, "step": 533 }, { "completion_length": 500.0, "epoch": 106.8, "grad_norm": 0.6668643355369568, "kl": 0.7370520234107971, "learning_rate": 4.212957475815898e-06, "loss": 0.0295, "reward": 1.1688251495361328, "reward_std": 3.1426844596862793, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9561749696731567, "rewards/wrapped_format_reward": 0.625, "step": 534 }, { "completion_length": 500.0, "epoch": 107.0, "grad_norm": 0.537616491317749, "kl": 1.0950063467025757, "learning_rate": 4.2089807550571786e-06, "loss": 0.0438, "reward": 3.4438555240631104, "reward_std": 0.7739672660827637, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9464285373687744, "rewards/wrapped_driving_reward": 0.7474268674850464, "rewards/wrapped_format_reward": 0.75, "step": 535 }, { "completion_length": 500.0, "epoch": 107.2, "grad_norm": 0.625481367111206, "kl": 1.0519458055496216, "learning_rate": 4.204995900156247e-06, "loss": 0.0421, "reward": 2.311418294906616, "reward_std": 2.2375292778015137, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5635817646980286, "rewards/wrapped_format_reward": 0.875, "step": 536 }, { "completion_length": 465.0, "epoch": 107.4, "grad_norm": 0.46361467242240906, "kl": 0.9292150735855103, "learning_rate": 4.2010029300795986e-06, "loss": 0.0372, "reward": -1.5, "reward_std": 1.0, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 537 }, { "completion_length": 500.0, "epoch": 107.6, "grad_norm": 0.5417262315750122, "kl": 1.2354282140731812, "learning_rate": 4.197001863832355e-06, "loss": 0.0494, "reward": 2.9962191581726074, "reward_std": 0.42118921875953674, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.24621909856796265, "rewards/wrapped_format_reward": 0.75, "step": 538 }, { "completion_length": 403.0, "epoch": 107.8, "grad_norm": 0.5251120924949646, "kl": 0.6911302804946899, "learning_rate": 4.192992720458172e-06, "loss": 0.0276, "reward": 3.7265474796295166, "reward_std": 0.1257975995540619, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.7577975392341614, "rewards/wrapped_format_reward": 1.0, "step": 539 }, { "completion_length": 484.0, "epoch": 108.0, "grad_norm": 0.4822171628475189, "kl": 0.48192501068115234, "learning_rate": 4.188975519039151e-06, "loss": 0.0193, "reward": 2.7973673343658447, "reward_std": 0.40795889496803284, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.29736727476119995, "rewards/wrapped_format_reward": 0.5, "step": 540 }, { "completion_length": 500.0, "epoch": 108.2, "grad_norm": 0.5852164626121521, "kl": 1.5952774286270142, "learning_rate": 4.184950278695745e-06, "loss": 0.0638, "reward": 1.0873538255691528, "reward_std": 3.0677762031555176, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1626461744308472, "rewards/wrapped_format_reward": 0.75, "step": 541 }, { "completion_length": 500.0, "epoch": 108.4, "grad_norm": 0.6261100769042969, "kl": 0.29827645421028137, "learning_rate": 4.18091701858667e-06, "loss": 0.0119, "reward": 1.9164016246795654, "reward_std": 1.2969717979431152, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7085983753204346, "rewards/wrapped_format_reward": 0.625, "step": 542 }, { "completion_length": 472.0, "epoch": 108.6, "grad_norm": 0.6102639436721802, "kl": 1.2650554180145264, "learning_rate": 4.1768757579088145e-06, "loss": 0.0506, "reward": 3.090078353881836, "reward_std": 0.4966849684715271, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3400783836841583, "rewards/wrapped_format_reward": 0.75, "step": 543 }, { "completion_length": 500.0, "epoch": 108.8, "grad_norm": 0.5978904366493225, "kl": 0.4474976360797882, "learning_rate": 4.172826515897146e-06, "loss": 0.0179, "reward": -1.7957313060760498, "reward_std": 1.3004844188690186, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -3.8980040550231934, "rewards/wrapped_format_reward": 0.625, "step": 544 }, { "completion_length": 500.0, "epoch": 109.0, "grad_norm": 0.5209200382232666, "kl": 1.0046333074569702, "learning_rate": 4.168769311824619e-06, "loss": 0.0402, "reward": 3.4396934509277344, "reward_std": 0.48305127024650574, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": 0.8303184509277344, "rewards/wrapped_format_reward": 0.625, "step": 545 }, { "completion_length": 500.0, "epoch": 109.2, "grad_norm": 0.6050389409065247, "kl": 1.4569886922836304, "learning_rate": 4.164704165002086e-06, "loss": 0.0583, "reward": 3.0283496379852295, "reward_std": 0.3845747411251068, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": 0.16897478699684143, "rewards/wrapped_format_reward": 0.875, "step": 546 }, { "completion_length": 500.0, "epoch": 109.4, "grad_norm": 0.6651176810264587, "kl": 0.5778868794441223, "learning_rate": 4.160631094778205e-06, "loss": 0.0231, "reward": 2.923994541168213, "reward_std": 0.7468093037605286, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.17399446666240692, "rewards/wrapped_format_reward": 0.75, "step": 547 }, { "completion_length": 479.0, "epoch": 109.6, "grad_norm": 0.5163626670837402, "kl": 0.47668424248695374, "learning_rate": 4.1565501205393445e-06, "loss": 0.0191, "reward": 2.7185633182525635, "reward_std": 0.14974838495254517, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2814367413520813, "rewards/wrapped_format_reward": 1.0, "step": 548 }, { "completion_length": 500.0, "epoch": 109.8, "grad_norm": 0.5523006916046143, "kl": 1.2691748142242432, "learning_rate": 4.152461261709494e-06, "loss": 0.0508, "reward": 3.828211784362793, "reward_std": 0.023511776700615883, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8282119035720825, "rewards/wrapped_format_reward": 1.0, "step": 549 }, { "completion_length": 500.0, "epoch": 110.0, "grad_norm": 0.5335929989814758, "kl": 0.9684991240501404, "learning_rate": 4.1483645377501726e-06, "loss": 0.0387, "reward": 3.222689151763916, "reward_std": 0.20909461379051208, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.34768906235694885, "rewards/wrapped_format_reward": 0.875, "step": 550 }, { "completion_length": 460.0, "epoch": 110.2, "grad_norm": 0.5376462936401367, "kl": 0.7713929414749146, "learning_rate": 4.144259968160332e-06, "loss": 0.0309, "reward": 3.4893195629119873, "reward_std": 0.6945598721504211, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7393194437026978, "rewards/wrapped_format_reward": 0.75, "step": 551 }, { "completion_length": 464.0, "epoch": 110.4, "grad_norm": 0.5207159519195557, "kl": 1.09281587600708, "learning_rate": 4.140147572476269e-06, "loss": 0.0437, "reward": 2.8704469203948975, "reward_std": 0.04285159707069397, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.10177526623010635, "rewards/wrapped_format_reward": 1.0, "step": 552 }, { "completion_length": 500.0, "epoch": 110.6, "grad_norm": 0.5194886326789856, "kl": 0.6254560351371765, "learning_rate": 4.136027370271526e-06, "loss": 0.025, "reward": 2.7038888931274414, "reward_std": 0.6243050694465637, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.046111032366752625, "rewards/wrapped_format_reward": 0.75, "step": 553 }, { "completion_length": 378.0, "epoch": 110.8, "grad_norm": 0.5150373578071594, "kl": 1.3050411939620972, "learning_rate": 4.1318993811568065e-06, "loss": 0.0522, "reward": 2.7843098640441895, "reward_std": 0.2614060640335083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -0.1799759566783905, "rewards/wrapped_format_reward": 1.0, "step": 554 }, { "completion_length": 500.0, "epoch": 111.0, "grad_norm": 0.4927043616771698, "kl": 1.0330181121826172, "learning_rate": 4.127763624779873e-06, "loss": 0.0413, "reward": 2.8825972080230713, "reward_std": 0.6047881245613098, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2575971484184265, "rewards/wrapped_format_reward": 0.625, "step": 555 }, { "completion_length": 500.0, "epoch": 111.2, "grad_norm": 0.6897583603858948, "kl": 1.2359685897827148, "learning_rate": 4.123620120825459e-06, "loss": 0.0494, "reward": 3.281771183013916, "reward_std": 0.47440090775489807, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5317711234092712, "rewards/wrapped_format_reward": 0.75, "step": 556 }, { "completion_length": 495.0, "epoch": 111.4, "grad_norm": 0.5989323854446411, "kl": 1.2655354738235474, "learning_rate": 4.119468889015175e-06, "loss": 0.0506, "reward": 2.762580394744873, "reward_std": 0.01977536454796791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.21241959929466248, "rewards/wrapped_format_reward": 1.0, "step": 557 }, { "completion_length": 500.0, "epoch": 111.6, "grad_norm": 0.5576749444007874, "kl": 0.46419841051101685, "learning_rate": 4.11530994910741e-06, "loss": 0.0186, "reward": 0.5449908375740051, "reward_std": 3.1576852798461914, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5800092220306396, "rewards/wrapped_format_reward": 0.625, "step": 558 }, { "completion_length": 424.0, "epoch": 111.8, "grad_norm": 0.5356802940368652, "kl": 0.6746591329574585, "learning_rate": 4.111143320897244e-06, "loss": 0.027, "reward": 2.571215867996216, "reward_std": 0.6723935008049011, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4287840723991394, "rewards/wrapped_format_reward": 1.0, "step": 559 }, { "completion_length": 390.0, "epoch": 112.0, "grad_norm": 0.6406697630882263, "kl": 0.9746482968330383, "learning_rate": 4.106969024216348e-06, "loss": 0.039, "reward": 2.444140672683716, "reward_std": 2.3113973140716553, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9886363744735718, "rewards/wrapped_driving_reward": -0.5444957613945007, "rewards/wrapped_format_reward": 1.0, "step": 560 }, { "completion_length": 500.0, "epoch": 112.2, "grad_norm": 0.48831650614738464, "kl": 1.0716817378997803, "learning_rate": 4.102787078932896e-06, "loss": 0.0429, "reward": 0.15067720413208008, "reward_std": 3.1778695583343506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7142857313156128, "rewards/wrapped_driving_reward": -1.9386086463928223, "rewards/wrapped_format_reward": 0.625, "step": 561 }, { "completion_length": 500.0, "epoch": 112.4, "grad_norm": 0.7172489166259766, "kl": 1.6123234033584595, "learning_rate": 4.098597504951462e-06, "loss": 0.0645, "reward": 1.052835464477539, "reward_std": 3.0422089099884033, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -1.1471645832061768, "rewards/wrapped_format_reward": 0.75, "step": 562 }, { "completion_length": 500.0, "epoch": 112.6, "grad_norm": 0.4700511693954468, "kl": 1.5025134086608887, "learning_rate": 4.094400322212933e-06, "loss": 0.0601, "reward": 3.1847565174102783, "reward_std": 0.271543949842453, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18475648760795593, "rewards/wrapped_format_reward": 1.0, "step": 563 }, { "completion_length": 500.0, "epoch": 112.8, "grad_norm": 0.5466091632843018, "kl": 1.3995592594146729, "learning_rate": 4.09019555069441e-06, "loss": 0.056, "reward": 3.2351319789886475, "reward_std": 0.10810267180204391, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3601318895816803, "rewards/wrapped_format_reward": 0.875, "step": 564 }, { "completion_length": 500.0, "epoch": 113.0, "grad_norm": 0.47437676787376404, "kl": 0.7883629202842712, "learning_rate": 4.085983210409114e-06, "loss": 0.0315, "reward": 1.85590398311615, "reward_std": 3.577000379562378, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.734375, "rewards/wrapped_driving_reward": -0.3784710764884949, "rewards/wrapped_format_reward": 0.75, "step": 565 }, { "completion_length": 500.0, "epoch": 113.2, "grad_norm": 0.5034269690513611, "kl": 1.2129865884780884, "learning_rate": 4.081763321406291e-06, "loss": 0.0485, "reward": 2.7711429595947266, "reward_std": 0.5451120734214783, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.22885683178901672, "rewards/wrapped_format_reward": 1.0, "step": 566 }, { "completion_length": 500.0, "epoch": 113.4, "grad_norm": 0.469226598739624, "kl": 0.9821814298629761, "learning_rate": 4.077535903771115e-06, "loss": 0.0393, "reward": 3.4845423698425293, "reward_std": 0.1735163778066635, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.5202568173408508, "rewards/wrapped_format_reward": 1.0, "step": 567 }, { "completion_length": 500.0, "epoch": 113.6, "grad_norm": 0.5355891585350037, "kl": 1.099183440208435, "learning_rate": 4.073300977624594e-06, "loss": 0.044, "reward": 2.550929307937622, "reward_std": 0.5364440679550171, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.17634353041648865, "rewards/wrapped_format_reward": 0.75, "step": 568 }, { "completion_length": 500.0, "epoch": 113.8, "grad_norm": 1.7050570249557495, "kl": 1.0493282079696655, "learning_rate": 4.069058563123476e-06, "loss": 0.042, "reward": -1.254978895187378, "reward_std": 0.2830427885055542, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -3.754978895187378, "rewards/wrapped_format_reward": 0.5, "step": 569 }, { "completion_length": 500.0, "epoch": 114.0, "grad_norm": 0.5383164882659912, "kl": 0.8460338115692139, "learning_rate": 4.064808680460149e-06, "loss": 0.0338, "reward": 3.1393980979919434, "reward_std": 0.4082465171813965, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.935606062412262, "rewards/wrapped_driving_reward": 0.45379188656806946, "rewards/wrapped_format_reward": 0.75, "step": 570 }, { "completion_length": 500.0, "epoch": 114.2, "grad_norm": 0.48489129543304443, "kl": 0.6688995361328125, "learning_rate": 4.060551349862545e-06, "loss": 0.0268, "reward": 2.5152158737182617, "reward_std": 0.3381154239177704, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.35978400707244873, "rewards/wrapped_format_reward": 0.875, "step": 571 }, { "completion_length": 382.0, "epoch": 114.4, "grad_norm": 0.6301425099372864, "kl": 0.767602801322937, "learning_rate": 4.056286591594049e-06, "loss": 0.0307, "reward": 2.103844165802002, "reward_std": 2.120149612426758, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7711557745933533, "rewards/wrapped_format_reward": 0.875, "step": 572 }, { "completion_length": 500.0, "epoch": 114.6, "grad_norm": 0.5594694018363953, "kl": 0.8042862415313721, "learning_rate": 4.052014425953399e-06, "loss": 0.0322, "reward": 2.261784553527832, "reward_std": 0.5970240235328674, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2382153868675232, "rewards/wrapped_format_reward": 0.5, "step": 573 }, { "completion_length": 500.0, "epoch": 114.8, "grad_norm": 0.5271262526512146, "kl": 0.7885665893554688, "learning_rate": 4.047734873274586e-06, "loss": 0.0315, "reward": 3.6224513053894043, "reward_std": 0.416412889957428, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.8307844400405884, "rewards/wrapped_format_reward": 0.875, "step": 574 }, { "completion_length": 500.0, "epoch": 115.0, "grad_norm": 0.6031838655471802, "kl": 0.8005996346473694, "learning_rate": 4.043447953926763e-06, "loss": 0.032, "reward": 1.5088732242584229, "reward_std": 3.370800018310547, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -0.8433995246887207, "rewards/wrapped_format_reward": 0.875, "step": 575 }, { "completion_length": 500.0, "epoch": 115.2, "grad_norm": 0.5976753234863281, "kl": 1.0529546737670898, "learning_rate": 4.039153688314146e-06, "loss": 0.0421, "reward": 0.6955030560493469, "reward_std": 3.1456363201141357, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": -1.3878302574157715, "rewards/wrapped_format_reward": 0.75, "step": 576 }, { "completion_length": 500.0, "epoch": 115.4, "grad_norm": 0.5516257882118225, "kl": 1.106783390045166, "learning_rate": 4.034852096875917e-06, "loss": 0.0443, "reward": 3.089322566986084, "reward_std": 0.561561107635498, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.589322566986084, "rewards/wrapped_format_reward": 0.5, "step": 577 }, { "completion_length": 500.0, "epoch": 115.6, "grad_norm": 0.5982745289802551, "kl": 0.8650901913642883, "learning_rate": 4.0305432000861236e-06, "loss": 0.0346, "reward": -0.42720913887023926, "reward_std": 2.9172680377960205, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.6772091388702393, "rewards/wrapped_format_reward": 0.75, "step": 578 }, { "completion_length": 500.0, "epoch": 115.8, "grad_norm": 0.5181532502174377, "kl": 0.4505057632923126, "learning_rate": 4.026227018453587e-06, "loss": 0.018, "reward": 0.1567600965499878, "reward_std": 4.2224836349487305, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5932399034500122, "rewards/wrapped_format_reward": 0.75, "step": 579 }, { "completion_length": 477.0, "epoch": 116.0, "grad_norm": 0.5441889762878418, "kl": 1.122840404510498, "learning_rate": 4.021903572521802e-06, "loss": 0.0449, "reward": 2.3626151084899902, "reward_std": 1.115975022315979, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6373847723007202, "rewards/wrapped_format_reward": 1.0, "step": 580 }, { "completion_length": 500.0, "epoch": 116.2, "grad_norm": 0.6262593269348145, "kl": 1.4455444812774658, "learning_rate": 4.0175728828688355e-06, "loss": 0.0578, "reward": 3.0197038650512695, "reward_std": 0.16474168002605438, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.019704071804881096, "rewards/wrapped_format_reward": 1.0, "step": 581 }, { "completion_length": 455.0, "epoch": 116.4, "grad_norm": 0.7343289256095886, "kl": 1.0988672971725464, "learning_rate": 4.013234970107236e-06, "loss": 0.044, "reward": 3.284791946411133, "reward_std": 0.6016852259635925, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.5625696182250977, "rewards/wrapped_format_reward": 0.75, "step": 582 }, { "completion_length": 500.0, "epoch": 116.6, "grad_norm": 0.7269496917724609, "kl": 1.15138578414917, "learning_rate": 4.0088898548839285e-06, "loss": 0.0461, "reward": 1.8786382675170898, "reward_std": 1.9626902341842651, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8713617324829102, "rewards/wrapped_format_reward": 0.75, "step": 583 }, { "completion_length": 457.0, "epoch": 116.8, "grad_norm": 0.5865412354469299, "kl": 1.0894410610198975, "learning_rate": 4.0045375578801216e-06, "loss": 0.0436, "reward": 1.158536434173584, "reward_std": 2.4944934844970703, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.841463565826416, "rewards/wrapped_format_reward": 1.0, "step": 584 }, { "completion_length": 493.0, "epoch": 117.0, "grad_norm": 0.5936154127120972, "kl": 1.2170442342758179, "learning_rate": 4.000178099811203e-06, "loss": 0.0487, "reward": 3.4582886695861816, "reward_std": 0.7560064792633057, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4582887887954712, "rewards/wrapped_format_reward": 1.0, "step": 585 }, { "completion_length": 395.0, "epoch": 117.2, "grad_norm": 0.6214985251426697, "kl": 1.1934869289398193, "learning_rate": 3.995811501426648e-06, "loss": 0.0477, "reward": 1.7916817665100098, "reward_std": 1.8864822387695312, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.2083182334899902, "rewards/wrapped_format_reward": 1.0, "step": 586 }, { "completion_length": 422.0, "epoch": 117.4, "grad_norm": 0.5442876815795898, "kl": 0.6532779932022095, "learning_rate": 3.991437783509916e-06, "loss": 0.0261, "reward": 2.1177978515625, "reward_std": 3.411893129348755, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3822019100189209, "rewards/wrapped_format_reward": 1.0, "step": 587 }, { "completion_length": 500.0, "epoch": 117.6, "grad_norm": 0.5017068982124329, "kl": 0.4669530391693115, "learning_rate": 3.987056966878354e-06, "loss": 0.0187, "reward": 2.701904773712158, "reward_std": 0.4848705530166626, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.20190489292144775, "rewards/wrapped_format_reward": 0.5, "step": 588 }, { "completion_length": 451.0, "epoch": 117.8, "grad_norm": 0.6502745151519775, "kl": 0.9627188444137573, "learning_rate": 3.982669072383093e-06, "loss": 0.0385, "reward": 3.4546749591827393, "reward_std": 0.22837932407855988, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.4796748757362366, "rewards/wrapped_format_reward": 1.0, "step": 589 }, { "completion_length": 500.0, "epoch": 118.0, "grad_norm": 0.5828717947006226, "kl": 1.0402276515960693, "learning_rate": 3.978274120908957e-06, "loss": 0.0416, "reward": 1.3493000268936157, "reward_std": 3.240011215209961, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0256999731063843, "rewards/wrapped_format_reward": 0.875, "step": 590 }, { "completion_length": 441.0, "epoch": 118.2, "grad_norm": 0.4777461588382721, "kl": 0.8824390769004822, "learning_rate": 3.973872133374354e-06, "loss": 0.0353, "reward": 2.3634040355682373, "reward_std": 0.32662835717201233, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.3865959048271179, "rewards/wrapped_format_reward": 0.875, "step": 591 }, { "completion_length": 500.0, "epoch": 118.4, "grad_norm": 0.4983525276184082, "kl": 1.1703006029129028, "learning_rate": 3.969463130731183e-06, "loss": 0.0468, "reward": 3.0047945976257324, "reward_std": 0.6505519151687622, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.25479456782341003, "rewards/wrapped_format_reward": 0.75, "step": 592 }, { "completion_length": 500.0, "epoch": 118.6, "grad_norm": 0.6996035575866699, "kl": 0.7430159449577332, "learning_rate": 3.965047133964735e-06, "loss": 0.0297, "reward": 1.286207914352417, "reward_std": 3.1933374404907227, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.963792085647583, "rewards/wrapped_format_reward": 0.75, "step": 593 }, { "completion_length": 500.0, "epoch": 118.8, "grad_norm": 0.5467674136161804, "kl": 0.8126164674758911, "learning_rate": 3.960624164093587e-06, "loss": 0.0325, "reward": 2.703213691711426, "reward_std": 0.6341179013252258, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9444444179534912, "rewards/wrapped_driving_reward": 0.13376936316490173, "rewards/wrapped_format_reward": 0.625, "step": 594 }, { "completion_length": 500.0, "epoch": 119.0, "grad_norm": 0.531501054763794, "kl": 1.0887751579284668, "learning_rate": 3.956194242169506e-06, "loss": 0.0436, "reward": 3.7124338150024414, "reward_std": 0.24506977200508118, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8374338150024414, "rewards/wrapped_format_reward": 0.875, "step": 595 }, { "completion_length": 406.0, "epoch": 119.2, "grad_norm": 0.592689037322998, "kl": 0.8516889214515686, "learning_rate": 3.951757389277349e-06, "loss": 0.0341, "reward": 2.9845433235168457, "reward_std": 0.557920515537262, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.10954323410987854, "rewards/wrapped_format_reward": 0.875, "step": 596 }, { "completion_length": 500.0, "epoch": 119.4, "grad_norm": 0.6375570893287659, "kl": 0.32796257734298706, "learning_rate": 3.947313626534965e-06, "loss": 0.0131, "reward": 1.316743016242981, "reward_std": 3.562058448791504, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.703125, "rewards/wrapped_driving_reward": -0.761381983757019, "rewards/wrapped_format_reward": 0.625, "step": 597 }, { "completion_length": 500.0, "epoch": 119.6, "grad_norm": 0.5976330637931824, "kl": 0.8567104339599609, "learning_rate": 3.942862975093085e-06, "loss": 0.0343, "reward": 3.3494763374328613, "reward_std": 0.23357340693473816, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.34947648644447327, "rewards/wrapped_format_reward": 1.0, "step": 598 }, { "completion_length": 500.0, "epoch": 119.8, "grad_norm": 0.5101990103721619, "kl": 1.0826239585876465, "learning_rate": 3.938405456135231e-06, "loss": 0.0433, "reward": 3.693582057952881, "reward_std": 0.24203148484230042, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8185819983482361, "rewards/wrapped_format_reward": 0.875, "step": 599 }, { "completion_length": 500.0, "epoch": 120.0, "grad_norm": 0.5968356132507324, "kl": 0.4376532733440399, "learning_rate": 3.933941090877615e-06, "loss": 0.0175, "reward": -0.602975606918335, "reward_std": 3.3535501956939697, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.227975368499756, "rewards/wrapped_format_reward": 0.625, "step": 600 }, { "completion_length": 500.0, "epoch": 120.2, "grad_norm": 0.5347983837127686, "kl": 0.8922858834266663, "learning_rate": 3.929469900569031e-06, "loss": 0.0357, "reward": 1.4616684913635254, "reward_std": 2.0025837421417236, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6633315086364746, "rewards/wrapped_format_reward": 0.125, "step": 601 }, { "completion_length": 500.0, "epoch": 120.4, "grad_norm": 0.49703311920166016, "kl": 1.0437543392181396, "learning_rate": 3.924991906490758e-06, "loss": 0.0418, "reward": -0.2095600962638855, "reward_std": 1.9399936199188232, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.8345601558685303, "rewards/wrapped_format_reward": 0.625, "step": 602 }, { "completion_length": 498.0, "epoch": 120.6, "grad_norm": 0.4421629011631012, "kl": 0.9316573739051819, "learning_rate": 3.92050712995646e-06, "loss": 0.0373, "reward": 3.238769054412842, "reward_std": 0.17726895213127136, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.238769069314003, "rewards/wrapped_format_reward": 1.0, "step": 603 }, { "completion_length": 362.0, "epoch": 120.8, "grad_norm": 0.6305619478225708, "kl": 0.8384993672370911, "learning_rate": 3.916015592312083e-06, "loss": 0.0335, "reward": 3.710329532623291, "reward_std": 0.2542549967765808, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8353294134140015, "rewards/wrapped_format_reward": 0.875, "step": 604 }, { "completion_length": 410.0, "epoch": 121.0, "grad_norm": 0.5139946341514587, "kl": 1.0644137859344482, "learning_rate": 3.911517314935752e-06, "loss": 0.0426, "reward": 3.2626075744628906, "reward_std": 0.07001760601997375, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2626074254512787, "rewards/wrapped_format_reward": 1.0, "step": 605 }, { "completion_length": 500.0, "epoch": 121.2, "grad_norm": 0.7288184762001038, "kl": 1.8501490354537964, "learning_rate": 3.907012319237672e-06, "loss": 0.074, "reward": 1.4808318614959717, "reward_std": 3.336132526397705, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -0.7464408874511719, "rewards/wrapped_format_reward": 0.75, "step": 606 }, { "completion_length": 427.0, "epoch": 121.4, "grad_norm": 0.5688821077346802, "kl": 1.0949598550796509, "learning_rate": 3.902500626660025e-06, "loss": 0.0438, "reward": -0.19872227311134338, "reward_std": 2.0224311351776123, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8500000238418579, "rewards/wrapped_driving_reward": -3.048722267150879, "rewards/wrapped_format_reward": 1.0, "step": 607 }, { "completion_length": 500.0, "epoch": 121.6, "grad_norm": 0.5117161273956299, "kl": 0.8083535432815552, "learning_rate": 3.897982258676867e-06, "loss": 0.0323, "reward": 2.9899044036865234, "reward_std": 0.9233556389808655, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7291666865348816, "rewards/wrapped_driving_reward": 0.7607378959655762, "rewards/wrapped_format_reward": 0.5, "step": 608 }, { "completion_length": 461.0, "epoch": 121.8, "grad_norm": 0.41981130838394165, "kl": 1.5435020923614502, "learning_rate": 3.8934572367940285e-06, "loss": 0.0617, "reward": 2.585515260696411, "reward_std": 0.15323100984096527, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.41448473930358887, "rewards/wrapped_format_reward": 1.0, "step": 609 }, { "completion_length": 456.0, "epoch": 122.0, "grad_norm": 0.5537935495376587, "kl": 0.9971626996994019, "learning_rate": 3.888925582549006e-06, "loss": 0.0399, "reward": 2.5353288650512695, "reward_std": 2.358036756515503, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.4368933141231537, "rewards/wrapped_format_reward": 1.0, "step": 610 }, { "completion_length": 500.0, "epoch": 122.2, "grad_norm": 0.6289500594139099, "kl": 1.1899405717849731, "learning_rate": 3.8843873175108685e-06, "loss": 0.0476, "reward": 3.100329875946045, "reward_std": 0.530876874923706, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.38157975673675537, "rewards/wrapped_format_reward": 0.75, "step": 611 }, { "completion_length": 500.0, "epoch": 122.4, "grad_norm": 0.533795952796936, "kl": 1.278198003768921, "learning_rate": 3.879842463280146e-06, "loss": 0.0511, "reward": 2.8987560272216797, "reward_std": 0.5386033058166504, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.148756206035614, "rewards/wrapped_format_reward": 0.75, "step": 612 }, { "completion_length": 360.0, "epoch": 122.6, "grad_norm": 0.5524008274078369, "kl": 0.7699089050292969, "learning_rate": 3.875291041488734e-06, "loss": 0.0308, "reward": 3.1413626670837402, "reward_std": 0.4477062523365021, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9230769276618958, "rewards/wrapped_driving_reward": 0.7182857990264893, "rewards/wrapped_format_reward": 0.5, "step": 613 }, { "completion_length": 500.0, "epoch": 122.8, "grad_norm": 0.4463861882686615, "kl": 1.3604273796081543, "learning_rate": 3.870733073799785e-06, "loss": 0.0544, "reward": 2.821460247039795, "reward_std": 0.21004045009613037, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.11603959649801254, "rewards/wrapped_format_reward": 1.0, "step": 614 }, { "completion_length": 500.0, "epoch": 123.0, "grad_norm": 0.5090853571891785, "kl": 0.8209176063537598, "learning_rate": 3.866168581907609e-06, "loss": 0.0328, "reward": 3.033195734024048, "reward_std": 0.2945842444896698, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1581955850124359, "rewards/wrapped_format_reward": 0.875, "step": 615 }, { "completion_length": 500.0, "epoch": 123.2, "grad_norm": 0.5075961947441101, "kl": 0.6219139695167542, "learning_rate": 3.861597587537568e-06, "loss": 0.0249, "reward": -0.6715507507324219, "reward_std": 2.605349063873291, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.046550750732422, "rewards/wrapped_format_reward": 0.875, "step": 616 }, { "completion_length": 500.0, "epoch": 123.4, "grad_norm": 0.6667484045028687, "kl": 1.1634588241577148, "learning_rate": 3.8570201124459745e-06, "loss": 0.0465, "reward": 1.6204893589019775, "reward_std": 3.7763609886169434, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.37951064109802246, "rewards/wrapped_format_reward": 0.5, "step": 617 }, { "completion_length": 500.0, "epoch": 123.6, "grad_norm": 0.6611601114273071, "kl": 0.5242982506752014, "learning_rate": 3.8524361784199855e-06, "loss": 0.021, "reward": 2.819260358810425, "reward_std": 0.5874485373497009, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3192603886127472, "rewards/wrapped_format_reward": 0.5, "step": 618 }, { "completion_length": 500.0, "epoch": 123.8, "grad_norm": 1.5914947986602783, "kl": 1.2179936170578003, "learning_rate": 3.847845807277501e-06, "loss": 0.0487, "reward": 1.4335602521896362, "reward_std": 2.353290319442749, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.0664397478103638, "rewards/wrapped_format_reward": 0.5, "step": 619 }, { "completion_length": 462.0, "epoch": 124.0, "grad_norm": 0.6235033273696899, "kl": 0.9362177848815918, "learning_rate": 3.8432490208670605e-06, "loss": 0.0374, "reward": 2.7716119289398193, "reward_std": 0.6671396493911743, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.14505480229854584, "rewards/wrapped_format_reward": 1.0, "step": 620 }, { "completion_length": 347.0, "epoch": 124.2, "grad_norm": 0.824704110622406, "kl": 0.8133590221405029, "learning_rate": 3.838645841067735e-06, "loss": 0.0325, "reward": 1.1893500089645386, "reward_std": 2.809638023376465, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": -1.1439833641052246, "rewards/wrapped_format_reward": 1.0, "step": 621 }, { "completion_length": 386.0, "epoch": 124.4, "grad_norm": 0.7511218190193176, "kl": 1.0320838689804077, "learning_rate": 3.83403628978903e-06, "loss": 0.0413, "reward": 1.5405443906784058, "reward_std": 3.0421595573425293, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9594556093215942, "rewards/wrapped_format_reward": 1.0, "step": 622 }, { "completion_length": 500.0, "epoch": 124.6, "grad_norm": 0.7474787831306458, "kl": 2.0438644886016846, "learning_rate": 3.829420388970772e-06, "loss": 0.0818, "reward": 3.1411290168762207, "reward_std": 0.30362197756767273, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3911292254924774, "rewards/wrapped_format_reward": 0.75, "step": 623 }, { "completion_length": 489.0, "epoch": 124.8, "grad_norm": 0.5865500569343567, "kl": 1.1531330347061157, "learning_rate": 3.824798160583012e-06, "loss": 0.0461, "reward": 1.3933429718017578, "reward_std": 1.692426323890686, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.3566570281982422, "rewards/wrapped_format_reward": 0.75, "step": 624 }, { "completion_length": 497.0, "epoch": 125.0, "grad_norm": 1.8625023365020752, "kl": 1.2559576034545898, "learning_rate": 3.82016962662592e-06, "loss": 0.0502, "reward": 3.264035224914551, "reward_std": 0.47378936409950256, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.26403525471687317, "rewards/wrapped_format_reward": 1.0, "step": 625 }, { "completion_length": 424.0, "epoch": 125.2, "grad_norm": 0.5372361540794373, "kl": 1.0564872026443481, "learning_rate": 3.815534809129674e-06, "loss": 0.0423, "reward": 3.686570167541504, "reward_std": 0.2549721598625183, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8115702271461487, "rewards/wrapped_format_reward": 0.875, "step": 626 }, { "completion_length": 500.0, "epoch": 125.4, "grad_norm": 0.6623659133911133, "kl": 1.3536056280136108, "learning_rate": 3.8108937301543613e-06, "loss": 0.0541, "reward": 1.6809741258621216, "reward_std": 3.1360626220703125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6940257549285889, "rewards/wrapped_format_reward": 0.875, "step": 627 }, { "completion_length": 500.0, "epoch": 125.6, "grad_norm": 0.7702702879905701, "kl": 0.5843633413314819, "learning_rate": 3.806246411789872e-06, "loss": 0.0234, "reward": 0.16281354427337646, "reward_std": 2.0939905643463135, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.337186336517334, "rewards/wrapped_format_reward": 0.5, "step": 628 }, { "completion_length": 451.0, "epoch": 125.8, "grad_norm": 0.567046046257019, "kl": 0.6312512755393982, "learning_rate": 3.8015928761557937e-06, "loss": 0.0253, "reward": 0.7392481565475464, "reward_std": 2.621385335922241, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8857518434524536, "rewards/wrapped_format_reward": 0.625, "step": 629 }, { "completion_length": 500.0, "epoch": 126.0, "grad_norm": 0.5259938836097717, "kl": 1.050511121749878, "learning_rate": 3.796933145401304e-06, "loss": 0.042, "reward": 1.5790135860443115, "reward_std": 3.3867099285125732, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7959863543510437, "rewards/wrapped_format_reward": 0.875, "step": 630 }, { "completion_length": 500.0, "epoch": 126.2, "grad_norm": 0.5427968502044678, "kl": 1.2725714445114136, "learning_rate": 3.7922672417050687e-06, "loss": 0.0509, "reward": 2.798665761947632, "reward_std": 0.6321271657943726, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.2986658215522766, "rewards/wrapped_format_reward": 0.625, "step": 631 }, { "completion_length": 437.0, "epoch": 126.4, "grad_norm": 0.5535632371902466, "kl": 1.0805319547653198, "learning_rate": 3.787595187275136e-06, "loss": 0.0432, "reward": 3.469416379928589, "reward_std": 0.41116681694984436, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.6569163799285889, "rewards/wrapped_format_reward": 0.875, "step": 632 }, { "completion_length": 500.0, "epoch": 126.6, "grad_norm": 1.1283907890319824, "kl": 1.4552005529403687, "learning_rate": 3.782917004348826e-06, "loss": 0.0582, "reward": 2.5764386653900146, "reward_std": 0.25892573595046997, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": -0.19856137037277222, "rewards/wrapped_format_reward": 0.875, "step": 633 }, { "completion_length": 500.0, "epoch": 126.8, "grad_norm": 0.6342353224754333, "kl": 1.0458495616912842, "learning_rate": 3.77823271519263e-06, "loss": 0.0418, "reward": 1.4458222389221191, "reward_std": 2.164912223815918, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1791776418685913, "rewards/wrapped_format_reward": 0.625, "step": 634 }, { "completion_length": 500.0, "epoch": 127.0, "grad_norm": 0.5013647079467773, "kl": 1.0987770557403564, "learning_rate": 3.773542342102105e-06, "loss": 0.044, "reward": 2.5572562217712402, "reward_std": 0.5937471985816956, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.09892302751541138, "rewards/wrapped_format_reward": 0.5, "step": 635 }, { "completion_length": 500.0, "epoch": 127.2, "grad_norm": 0.4656875729560852, "kl": 1.0471619367599487, "learning_rate": 3.768845907401761e-06, "loss": 0.0419, "reward": 2.2354865074157715, "reward_std": 0.9342080950737, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6395134329795837, "rewards/wrapped_format_reward": 0.875, "step": 636 }, { "completion_length": 500.0, "epoch": 127.4, "grad_norm": 0.575062096118927, "kl": 1.259049654006958, "learning_rate": 3.764143433444962e-06, "loss": 0.0504, "reward": 2.576545000076294, "reward_std": 0.6139914989471436, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.014045089483261108, "rewards/wrapped_format_reward": 0.625, "step": 637 }, { "completion_length": 500.0, "epoch": 127.6, "grad_norm": 0.525324821472168, "kl": 0.8727381229400635, "learning_rate": 3.759434942613816e-06, "loss": 0.0349, "reward": 1.5306200981140137, "reward_std": 3.3567087650299072, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7193797826766968, "rewards/wrapped_format_reward": 0.75, "step": 638 }, { "completion_length": 500.0, "epoch": 127.8, "grad_norm": 0.4832088053226471, "kl": 0.7729612588882446, "learning_rate": 3.75472045731907e-06, "loss": 0.0309, "reward": 2.4255733489990234, "reward_std": 0.7332895398139954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.32442671060562134, "rewards/wrapped_format_reward": 0.75, "step": 639 }, { "completion_length": 500.0, "epoch": 128.0, "grad_norm": 0.5773348808288574, "kl": 1.278677225112915, "learning_rate": 3.7500000000000005e-06, "loss": 0.0511, "reward": 1.4694275856018066, "reward_std": 2.9984025955200195, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6555723547935486, "rewards/wrapped_format_reward": 0.625, "step": 640 }, { "completion_length": 500.0, "epoch": 128.2, "grad_norm": 0.6425707936286926, "kl": 0.8203465342521667, "learning_rate": 3.7452735931243108e-06, "loss": 0.0328, "reward": 2.112128734588623, "reward_std": 0.4703961908817291, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5128712058067322, "rewards/wrapped_format_reward": 0.625, "step": 641 }, { "completion_length": 500.0, "epoch": 128.4, "grad_norm": 0.4806380867958069, "kl": 1.1360549926757812, "learning_rate": 3.7405412591880213e-06, "loss": 0.0454, "reward": 0.18416738510131836, "reward_std": 3.472820520401001, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.6908326148986816, "rewards/wrapped_format_reward": 0.375, "step": 642 }, { "completion_length": 500.0, "epoch": 128.6, "grad_norm": 0.5253452062606812, "kl": 0.8624851107597351, "learning_rate": 3.735803020715362e-06, "loss": 0.0345, "reward": 1.8811970949172974, "reward_std": 3.594886302947998, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.36880284547805786, "rewards/wrapped_format_reward": 0.75, "step": 643 }, { "completion_length": 364.0, "epoch": 128.8, "grad_norm": 0.5939153432846069, "kl": 0.9492148160934448, "learning_rate": 3.7310589002586683e-06, "loss": 0.038, "reward": 0.8848087787628174, "reward_std": 2.2207157611846924, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9901912212371826, "rewards/wrapped_format_reward": 0.875, "step": 644 }, { "completion_length": 500.0, "epoch": 129.0, "grad_norm": 0.484019011259079, "kl": 1.1680047512054443, "learning_rate": 3.7263089203982698e-06, "loss": 0.0467, "reward": 1.8454086780548096, "reward_std": 1.9383881092071533, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7795912027359009, "rewards/wrapped_format_reward": 0.625, "step": 645 }, { "completion_length": 494.0, "epoch": 129.2, "grad_norm": 0.46752461791038513, "kl": 0.7998576164245605, "learning_rate": 3.721553103742388e-06, "loss": 0.032, "reward": 2.1568634510040283, "reward_std": 2.207801103591919, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -0.44730305671691895, "rewards/wrapped_format_reward": 0.625, "step": 646 }, { "completion_length": 500.0, "epoch": 129.4, "grad_norm": 0.5095931887626648, "kl": 1.2253186702728271, "learning_rate": 3.7167914729270205e-06, "loss": 0.049, "reward": 3.187159299850464, "reward_std": 0.5010700225830078, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9886363744735718, "rewards/wrapped_driving_reward": 0.19852286577224731, "rewards/wrapped_format_reward": 1.0, "step": 647 }, { "completion_length": 500.0, "epoch": 129.6, "grad_norm": 0.5528401732444763, "kl": 1.5397626161575317, "learning_rate": 3.7120240506158433e-06, "loss": 0.0616, "reward": 3.4808921813964844, "reward_std": 0.575390100479126, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6058919429779053, "rewards/wrapped_format_reward": 0.875, "step": 648 }, { "completion_length": 487.0, "epoch": 129.8, "grad_norm": 0.5192949771881104, "kl": 1.3175593614578247, "learning_rate": 3.7072508595000935e-06, "loss": 0.0527, "reward": 3.1154708862304688, "reward_std": 0.7258886098861694, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.24047090113162994, "rewards/wrapped_format_reward": 0.875, "step": 649 }, { "completion_length": 500.0, "epoch": 130.0, "grad_norm": 0.5336673855781555, "kl": 1.6302653551101685, "learning_rate": 3.7024719222984696e-06, "loss": 0.0652, "reward": 1.334566593170166, "reward_std": 3.2289183139801025, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.040433406829834, "rewards/wrapped_format_reward": 0.875, "step": 650 }, { "completion_length": 486.0, "epoch": 130.2, "grad_norm": 0.7520384192466736, "kl": 1.874834656715393, "learning_rate": 3.6976872617570163e-06, "loss": 0.075, "reward": 3.4488072395324707, "reward_std": 0.258309543132782, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.6095216274261475, "rewards/wrapped_format_reward": 0.875, "step": 651 }, { "completion_length": 500.0, "epoch": 130.4, "grad_norm": 0.7811522483825684, "kl": 0.6162184476852417, "learning_rate": 3.6928969006490212e-06, "loss": 0.0246, "reward": -2.375, "reward_std": 1.6007810831069946, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 652 }, { "completion_length": 500.0, "epoch": 130.6, "grad_norm": 0.4578838050365448, "kl": 1.285402536392212, "learning_rate": 3.6881008617749042e-06, "loss": 0.0514, "reward": 0.9785336256027222, "reward_std": 2.6833934783935547, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1464663743972778, "rewards/wrapped_format_reward": 0.625, "step": 653 }, { "completion_length": 500.0, "epoch": 130.8, "grad_norm": 0.48609647154808044, "kl": 1.111726999282837, "learning_rate": 3.6832991679621087e-06, "loss": 0.0445, "reward": 2.5674822330474854, "reward_std": 0.3593307435512543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.18251775205135345, "rewards/wrapped_format_reward": 0.75, "step": 654 }, { "completion_length": 455.0, "epoch": 131.0, "grad_norm": 0.5047434568405151, "kl": 1.223645806312561, "learning_rate": 3.6784918420649952e-06, "loss": 0.0489, "reward": 3.3820629119873047, "reward_std": 0.4785292148590088, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.41777700185775757, "rewards/wrapped_format_reward": 1.0, "step": 655 }, { "completion_length": 433.0, "epoch": 131.2, "grad_norm": 0.5695823431015015, "kl": 1.3213516473770142, "learning_rate": 3.6736789069647273e-06, "loss": 0.0529, "reward": 3.4408211708068848, "reward_std": 0.48655256628990173, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454382896423, "rewards/wrapped_driving_reward": 0.4862755835056305, "rewards/wrapped_format_reward": 1.0, "step": 656 }, { "completion_length": 500.0, "epoch": 131.4, "grad_norm": 0.5650737881660461, "kl": 0.753746509552002, "learning_rate": 3.6688603855691713e-06, "loss": 0.0301, "reward": 0.73109370470047, "reward_std": 2.6236155033111572, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.018906354904175, "rewards/wrapped_format_reward": 0.75, "step": 657 }, { "completion_length": 417.0, "epoch": 131.6, "grad_norm": 0.5839191675186157, "kl": 1.2604615688323975, "learning_rate": 3.664036300812779e-06, "loss": 0.0504, "reward": 3.5584030151367188, "reward_std": 0.49135151505470276, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.8334031701087952, "rewards/wrapped_format_reward": 0.75, "step": 658 }, { "completion_length": 459.0, "epoch": 131.8, "grad_norm": 0.5347810983657837, "kl": 1.0865789651870728, "learning_rate": 3.6592066756564825e-06, "loss": 0.0435, "reward": 2.519155263900757, "reward_std": 0.6673591732978821, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.10584461688995361, "rewards/wrapped_format_reward": 0.625, "step": 659 }, { "completion_length": 442.0, "epoch": 132.0, "grad_norm": 0.52985680103302, "kl": 0.3356545865535736, "learning_rate": 3.654371533087586e-06, "loss": 0.0134, "reward": 2.64139986038208, "reward_std": 0.9725221395492554, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.016399994492530823, "rewards/wrapped_format_reward": 0.625, "step": 660 }, { "completion_length": 500.0, "epoch": 132.2, "grad_norm": 0.48469388484954834, "kl": 1.271719217300415, "learning_rate": 3.64953089611965e-06, "loss": 0.0509, "reward": 2.7419373989105225, "reward_std": 0.08097031712532043, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -0.22234830260276794, "rewards/wrapped_format_reward": 1.0, "step": 661 }, { "completion_length": 500.0, "epoch": 132.4, "grad_norm": 0.87370365858078, "kl": 1.4457768201828003, "learning_rate": 3.6446847877923917e-06, "loss": 0.0578, "reward": 2.3081374168395996, "reward_std": 2.2228269577026367, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.44186270236968994, "rewards/wrapped_format_reward": 0.75, "step": 662 }, { "completion_length": 410.0, "epoch": 132.6, "grad_norm": 0.5732893943786621, "kl": 1.3890821933746338, "learning_rate": 3.639833231171569e-06, "loss": 0.0556, "reward": 3.4063034057617188, "reward_std": 0.5635956525802612, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.55403071641922, "rewards/wrapped_format_reward": 0.875, "step": 663 }, { "completion_length": 500.0, "epoch": 132.8, "grad_norm": 0.5441069602966309, "kl": 0.8945682048797607, "learning_rate": 3.634976249348867e-06, "loss": 0.0358, "reward": 3.1547274589538574, "reward_std": 0.16029714047908783, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.279727578163147, "rewards/wrapped_format_reward": 0.875, "step": 664 }, { "completion_length": 500.0, "epoch": 133.0, "grad_norm": 0.5390021800994873, "kl": 0.566189169883728, "learning_rate": 3.6301138654418e-06, "loss": 0.0226, "reward": 1.5111743211746216, "reward_std": 3.3476204872131348, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8638256788253784, "rewards/wrapped_format_reward": 0.875, "step": 665 }, { "completion_length": 500.0, "epoch": 133.2, "grad_norm": 0.49394407868385315, "kl": 0.46724024415016174, "learning_rate": 3.625246102593588e-06, "loss": 0.0187, "reward": 1.7283051013946533, "reward_std": 3.516357660293579, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3966948688030243, "rewards/wrapped_format_reward": 0.625, "step": 666 }, { "completion_length": 481.0, "epoch": 133.4, "grad_norm": 0.5267409682273865, "kl": 0.5396498441696167, "learning_rate": 3.6203729839730567e-06, "loss": 0.0216, "reward": 1.739487886428833, "reward_std": 1.8327277898788452, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -1.1105120182037354, "rewards/wrapped_format_reward": 0.875, "step": 667 }, { "completion_length": 500.0, "epoch": 133.6, "grad_norm": 0.5509145855903625, "kl": 0.605006992816925, "learning_rate": 3.6154945327745223e-06, "loss": 0.0242, "reward": -0.9433751106262207, "reward_std": 2.5427985191345215, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.1933751106262207, "rewards/wrapped_format_reward": 0.75, "step": 668 }, { "completion_length": 500.0, "epoch": 133.8, "grad_norm": 0.601017415523529, "kl": 0.7718148231506348, "learning_rate": 3.610610772217682e-06, "loss": 0.0309, "reward": 2.710988998413086, "reward_std": 0.43899405002593994, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.28901103138923645, "rewards/wrapped_format_reward": 1.0, "step": 669 }, { "completion_length": 483.0, "epoch": 134.0, "grad_norm": 0.5350854992866516, "kl": 0.619436502456665, "learning_rate": 3.6057217255475034e-06, "loss": 0.0248, "reward": 3.0488271713256836, "reward_std": 0.5188868641853333, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4238271713256836, "rewards/wrapped_format_reward": 0.625, "step": 670 }, { "completion_length": 500.0, "epoch": 134.2, "grad_norm": 0.4458288848400116, "kl": 0.7321600317955017, "learning_rate": 3.600827416034115e-06, "loss": 0.0293, "reward": -0.6971749067306519, "reward_std": 2.9557926654815674, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.45454543828964233, "rewards/wrapped_driving_reward": -2.5267202854156494, "rewards/wrapped_format_reward": 0.875, "step": 671 }, { "completion_length": 500.0, "epoch": 134.4, "grad_norm": 0.5819605588912964, "kl": 1.0092272758483887, "learning_rate": 3.595927866972694e-06, "loss": 0.0404, "reward": 3.0731568336486816, "reward_std": 0.34458592534065247, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1981567144393921, "rewards/wrapped_format_reward": 0.875, "step": 672 }, { "completion_length": 471.0, "epoch": 134.6, "grad_norm": 0.4470769464969635, "kl": 1.3916444778442383, "learning_rate": 3.591023101683355e-06, "loss": 0.0557, "reward": 2.623095750808716, "reward_std": 0.15428505837917328, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.37690430879592896, "rewards/wrapped_format_reward": 1.0, "step": 673 }, { "completion_length": 471.0, "epoch": 134.8, "grad_norm": 0.8402154445648193, "kl": 0.894091010093689, "learning_rate": 3.586113143511043e-06, "loss": 0.0358, "reward": 3.297544479370117, "reward_std": 0.5369318723678589, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.610044538974762, "rewards/wrapped_format_reward": 0.75, "step": 674 }, { "completion_length": 500.0, "epoch": 135.0, "grad_norm": 0.6311519742012024, "kl": 0.7051270008087158, "learning_rate": 3.5811980158254156e-06, "loss": 0.0282, "reward": 1.474664330482483, "reward_std": 3.348062515258789, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9003356099128723, "rewards/wrapped_format_reward": 0.875, "step": 675 }, { "completion_length": 500.0, "epoch": 135.2, "grad_norm": 0.6317039728164673, "kl": 0.52884840965271, "learning_rate": 3.5762777420207382e-06, "loss": 0.0212, "reward": -0.5374802947044373, "reward_std": 3.7230663299560547, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.162480354309082, "rewards/wrapped_format_reward": 0.625, "step": 676 }, { "completion_length": 500.0, "epoch": 135.4, "grad_norm": 0.7059259414672852, "kl": 1.1256921291351318, "learning_rate": 3.5713523455157686e-06, "loss": 0.045, "reward": 1.6178922653198242, "reward_std": 3.7729244232177734, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.38210779428482056, "rewards/wrapped_format_reward": 0.5, "step": 677 }, { "completion_length": 469.0, "epoch": 135.6, "grad_norm": 0.699884831905365, "kl": 0.8976281881332397, "learning_rate": 3.566421849753646e-06, "loss": 0.0359, "reward": 2.338595151901245, "reward_std": 0.5884451270103455, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.2239048033952713, "rewards/wrapped_format_reward": 0.625, "step": 678 }, { "completion_length": 493.0, "epoch": 135.8, "grad_norm": 0.8474484086036682, "kl": 1.239915370941162, "learning_rate": 3.5614862782017833e-06, "loss": 0.0496, "reward": 3.6089816093444824, "reward_std": 0.34150734543800354, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6089816093444824, "rewards/wrapped_format_reward": 1.0, "step": 679 }, { "completion_length": 500.0, "epoch": 136.0, "grad_norm": 0.7421549558639526, "kl": 1.3694592714309692, "learning_rate": 3.556545654351749e-06, "loss": 0.0548, "reward": 0.45889168977737427, "reward_std": 2.6780941486358643, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -1.3952749967575073, "rewards/wrapped_format_reward": 0.5, "step": 680 }, { "completion_length": 495.0, "epoch": 136.2, "grad_norm": 0.5081685781478882, "kl": 1.3575365543365479, "learning_rate": 3.551600001719161e-06, "loss": 0.0543, "reward": 3.8419742584228516, "reward_std": 0.012562450021505356, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.841974139213562, "rewards/wrapped_format_reward": 1.0, "step": 681 }, { "completion_length": 500.0, "epoch": 136.4, "grad_norm": 0.7341115474700928, "kl": 1.159175992012024, "learning_rate": 3.5466493438435707e-06, "loss": 0.0464, "reward": 1.3301422595977783, "reward_std": 3.229234457015991, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7948578000068665, "rewards/wrapped_format_reward": 0.625, "step": 682 }, { "completion_length": 500.0, "epoch": 136.6, "grad_norm": 0.7937794327735901, "kl": 1.867870807647705, "learning_rate": 3.541693704288355e-06, "loss": 0.0747, "reward": 3.3098440170288086, "reward_std": 0.45399734377861023, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5598439574241638, "rewards/wrapped_format_reward": 0.75, "step": 683 }, { "completion_length": 440.0, "epoch": 136.8, "grad_norm": 0.540158748626709, "kl": 0.5177903175354004, "learning_rate": 3.536733106640598e-06, "loss": 0.0207, "reward": 1.2544230222702026, "reward_std": 2.3613858222961426, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.7455769777297974, "rewards/wrapped_format_reward": 1.0, "step": 684 }, { "completion_length": 500.0, "epoch": 137.0, "grad_norm": 0.45787709951400757, "kl": 0.9340965747833252, "learning_rate": 3.531767574510987e-06, "loss": 0.0374, "reward": 2.6123316287994385, "reward_std": 0.17315728962421417, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.35989055037498474, "rewards/wrapped_format_reward": 1.0, "step": 685 }, { "completion_length": 500.0, "epoch": 137.2, "grad_norm": 0.5473198294639587, "kl": 1.2692763805389404, "learning_rate": 3.5267971315336936e-06, "loss": 0.0508, "reward": 1.327620029449463, "reward_std": 3.2223923206329346, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.047379970550537, "rewards/wrapped_format_reward": 0.875, "step": 686 }, { "completion_length": 449.0, "epoch": 137.4, "grad_norm": 0.7179501056671143, "kl": 0.834479570388794, "learning_rate": 3.5218218013662626e-06, "loss": 0.0334, "reward": 2.7406983375549316, "reward_std": 0.6632347702980042, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.02641259878873825, "rewards/wrapped_format_reward": 0.75, "step": 687 }, { "completion_length": 500.0, "epoch": 137.6, "grad_norm": 0.5952706933021545, "kl": 0.749407172203064, "learning_rate": 3.516841607689501e-06, "loss": 0.03, "reward": 3.403985023498535, "reward_std": 0.48997849225997925, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.403984934091568, "rewards/wrapped_format_reward": 1.0, "step": 688 }, { "completion_length": 500.0, "epoch": 137.8, "grad_norm": 0.48037606477737427, "kl": 0.7486713528633118, "learning_rate": 3.511856574207364e-06, "loss": 0.0299, "reward": 3.3666038513183594, "reward_std": 0.11598622053861618, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.36660394072532654, "rewards/wrapped_format_reward": 1.0, "step": 689 }, { "completion_length": 500.0, "epoch": 138.0, "grad_norm": 0.6659343242645264, "kl": 0.98530113697052, "learning_rate": 3.5068667246468437e-06, "loss": 0.0394, "reward": 1.5401697158813477, "reward_std": 3.7069664001464844, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7045454382896423, "rewards/wrapped_driving_reward": -0.6643756628036499, "rewards/wrapped_format_reward": 0.75, "step": 690 }, { "completion_length": 500.0, "epoch": 138.2, "grad_norm": 0.5222293734550476, "kl": 0.5954757928848267, "learning_rate": 3.5018720827578523e-06, "loss": 0.0238, "reward": 2.6262009143829346, "reward_std": 0.13065750896930695, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.12379903346300125, "rewards/wrapped_format_reward": 0.75, "step": 691 }, { "completion_length": 500.0, "epoch": 138.4, "grad_norm": 0.641049325466156, "kl": 1.211357593536377, "learning_rate": 3.496872672313116e-06, "loss": 0.0485, "reward": 1.5172151327133179, "reward_std": 3.099297523498535, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7327848672866821, "rewards/wrapped_format_reward": 0.75, "step": 692 }, { "completion_length": 500.0, "epoch": 138.6, "grad_norm": 0.5524839162826538, "kl": 1.0793451070785522, "learning_rate": 3.491868517108053e-06, "loss": 0.0432, "reward": 2.820830821990967, "reward_std": 0.561494767665863, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17916926741600037, "rewards/wrapped_format_reward": 1.0, "step": 693 }, { "completion_length": 500.0, "epoch": 138.8, "grad_norm": 0.41166210174560547, "kl": 1.6346538066864014, "learning_rate": 3.486859640960668e-06, "loss": 0.0654, "reward": 2.7948412895202637, "reward_std": 0.020073924213647842, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20515868067741394, "rewards/wrapped_format_reward": 1.0, "step": 694 }, { "completion_length": 500.0, "epoch": 139.0, "grad_norm": 0.5615822076797485, "kl": 0.5488652586936951, "learning_rate": 3.481846067711436e-06, "loss": 0.022, "reward": 3.8224549293518066, "reward_std": 0.021474715322256088, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8224549293518066, "rewards/wrapped_format_reward": 1.0, "step": 695 }, { "completion_length": 500.0, "epoch": 139.2, "grad_norm": 0.5782674551010132, "kl": 1.0001182556152344, "learning_rate": 3.476827821223184e-06, "loss": 0.04, "reward": 1.3818203210830688, "reward_std": 3.267575263977051, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9931796789169312, "rewards/wrapped_format_reward": 0.875, "step": 696 }, { "completion_length": 500.0, "epoch": 139.4, "grad_norm": 0.5270988941192627, "kl": 0.6028380393981934, "learning_rate": 3.4718049253809894e-06, "loss": 0.0241, "reward": 3.23805570602417, "reward_std": 0.320840448141098, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.40472227334976196, "rewards/wrapped_format_reward": 0.875, "step": 697 }, { "completion_length": 500.0, "epoch": 139.6, "grad_norm": 0.4955165982246399, "kl": 0.2747107148170471, "learning_rate": 3.466777404092052e-06, "loss": 0.011, "reward": 1.2135114669799805, "reward_std": 3.2385220527648926, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.03648841381073, "rewards/wrapped_format_reward": 0.75, "step": 698 }, { "completion_length": 435.0, "epoch": 139.8, "grad_norm": 0.580830991268158, "kl": 0.769486665725708, "learning_rate": 3.4617452812855908e-06, "loss": 0.0308, "reward": 3.6568925380706787, "reward_std": 0.10732667148113251, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.824999988079071, "rewards/wrapped_driving_reward": 0.8318926095962524, "rewards/wrapped_format_reward": 1.0, "step": 699 }, { "completion_length": 494.0, "epoch": 140.0, "grad_norm": 0.5295502543449402, "kl": 0.8366852402687073, "learning_rate": 3.4567085809127247e-06, "loss": 0.0335, "reward": 3.1698110103607178, "reward_std": 0.26023775339126587, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.16981105506420135, "rewards/wrapped_format_reward": 1.0, "step": 700 }, { "completion_length": 500.0, "epoch": 140.2, "grad_norm": 0.7031523585319519, "kl": 1.2178704738616943, "learning_rate": 3.4516673269463617e-06, "loss": 0.0487, "reward": 3.5846829414367676, "reward_std": 0.5065321922302246, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8346830010414124, "rewards/wrapped_format_reward": 0.75, "step": 701 }, { "completion_length": 500.0, "epoch": 140.4, "grad_norm": 0.5828245282173157, "kl": 1.1058602333068848, "learning_rate": 3.4466215433810827e-06, "loss": 0.0442, "reward": 1.7846969366073608, "reward_std": 3.545959949493408, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5903029441833496, "rewards/wrapped_format_reward": 0.875, "step": 702 }, { "completion_length": 437.0, "epoch": 140.6, "grad_norm": 0.5404889583587646, "kl": 1.335483193397522, "learning_rate": 3.441571254233027e-06, "loss": 0.0534, "reward": 2.553800582885742, "reward_std": 0.27431902289390564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1961991786956787, "rewards/wrapped_format_reward": 0.75, "step": 703 }, { "completion_length": 500.0, "epoch": 140.8, "grad_norm": 0.4963621497154236, "kl": 0.47123226523399353, "learning_rate": 3.436516483539781e-06, "loss": 0.0188, "reward": 2.3996798992156982, "reward_std": 1.5315988063812256, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.5378199815750122, "rewards/wrapped_format_reward": 1.0, "step": 704 }, { "completion_length": 500.0, "epoch": 141.0, "grad_norm": 0.5385916233062744, "kl": 1.076686978340149, "learning_rate": 3.4314572553602577e-06, "loss": 0.0431, "reward": -1.0651588439941406, "reward_std": 2.3442935943603516, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.0651588439941406, "rewards/wrapped_format_reward": 0.5, "step": 705 }, { "completion_length": 500.0, "epoch": 141.2, "grad_norm": 0.6693921685218811, "kl": 1.052538275718689, "learning_rate": 3.426393593774591e-06, "loss": 0.0421, "reward": 2.958894968032837, "reward_std": 0.5083202123641968, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.36167269945144653, "rewards/wrapped_format_reward": 0.625, "step": 706 }, { "completion_length": 500.0, "epoch": 141.4, "grad_norm": 0.5258262157440186, "kl": 0.5887873768806458, "learning_rate": 3.421325522884013e-06, "loss": 0.0236, "reward": 1.8505744934082031, "reward_std": 3.574678421020508, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.39942556619644165, "rewards/wrapped_format_reward": 0.75, "step": 707 }, { "completion_length": 500.0, "epoch": 141.6, "grad_norm": 0.5373359322547913, "kl": 0.80438631772995, "learning_rate": 3.4162530668107435e-06, "loss": 0.0322, "reward": 3.0027952194213867, "reward_std": 0.3596634864807129, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.25279513001441956, "rewards/wrapped_format_reward": 0.75, "step": 708 }, { "completion_length": 500.0, "epoch": 141.8, "grad_norm": 0.8107948303222656, "kl": 0.7092552185058594, "learning_rate": 3.4111762496978753e-06, "loss": 0.0284, "reward": 0.7482374310493469, "reward_std": 2.8443443775177, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2517626285552979, "rewards/wrapped_format_reward": 0.5, "step": 709 }, { "completion_length": 484.0, "epoch": 142.0, "grad_norm": 0.5415573120117188, "kl": 0.9022457599639893, "learning_rate": 3.406095095709254e-06, "loss": 0.0361, "reward": 1.4290342330932617, "reward_std": 1.759615182876587, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -1.5482385158538818, "rewards/wrapped_format_reward": 1.0, "step": 710 }, { "completion_length": 474.0, "epoch": 142.2, "grad_norm": 0.4888540804386139, "kl": 0.6180402636528015, "learning_rate": 3.401009629029375e-06, "loss": 0.0247, "reward": 2.0353569984436035, "reward_std": 2.0499258041381836, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.9368650317192078, "rewards/wrapped_format_reward": 1.0, "step": 711 }, { "completion_length": 467.0, "epoch": 142.4, "grad_norm": 0.5631850957870483, "kl": 1.0096300840377808, "learning_rate": 3.39591987386325e-06, "loss": 0.0404, "reward": 1.2916650772094727, "reward_std": 2.871446371078491, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0833349227905273, "rewards/wrapped_format_reward": 0.875, "step": 712 }, { "completion_length": 500.0, "epoch": 142.6, "grad_norm": 0.6677610278129578, "kl": 0.7574449777603149, "learning_rate": 3.3908258544363145e-06, "loss": 0.0303, "reward": -0.266355037689209, "reward_std": 3.46443247795105, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4583333134651184, "rewards/wrapped_driving_reward": -1.9746882915496826, "rewards/wrapped_format_reward": 0.75, "step": 713 }, { "completion_length": 500.0, "epoch": 142.8, "grad_norm": 0.4841836988925934, "kl": 0.5822250247001648, "learning_rate": 3.3857275949942896e-06, "loss": 0.0233, "reward": 3.0125911235809326, "reward_std": 0.5904353260993958, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.13759109377861023, "rewards/wrapped_format_reward": 0.875, "step": 714 }, { "completion_length": 500.0, "epoch": 143.0, "grad_norm": 0.6958256363868713, "kl": 0.4327092468738556, "learning_rate": 3.3806251198030843e-06, "loss": 0.0173, "reward": 1.4330600500106812, "reward_std": 2.3058290481567383, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454978942871, "rewards/wrapped_driving_reward": -1.0214855670928955, "rewards/wrapped_format_reward": 0.5, "step": 715 }, { "completion_length": 500.0, "epoch": 143.2, "grad_norm": 0.6068946719169617, "kl": 0.7822107076644897, "learning_rate": 3.375518453148669e-06, "loss": 0.0313, "reward": 1.1330852508544922, "reward_std": 2.4693946838378906, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8669147491455078, "rewards/wrapped_format_reward": 1.0, "step": 716 }, { "completion_length": 500.0, "epoch": 143.4, "grad_norm": 0.5714890956878662, "kl": 0.5352396965026855, "learning_rate": 3.370407619336966e-06, "loss": 0.0214, "reward": 2.959530830383301, "reward_std": 0.21941396594047546, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.859375, "rewards/wrapped_driving_reward": 0.2251558005809784, "rewards/wrapped_format_reward": 0.875, "step": 717 }, { "completion_length": 500.0, "epoch": 143.6, "grad_norm": 0.7257641553878784, "kl": 0.8822948336601257, "learning_rate": 3.3652926426937327e-06, "loss": 0.0353, "reward": 3.610146999359131, "reward_std": 0.20129412412643433, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6101469397544861, "rewards/wrapped_format_reward": 1.0, "step": 718 }, { "completion_length": 500.0, "epoch": 143.8, "grad_norm": 0.5029004812240601, "kl": 1.3567814826965332, "learning_rate": 3.360173547564442e-06, "loss": 0.0543, "reward": 1.8939110040664673, "reward_std": 1.931662917137146, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1060889959335327, "rewards/wrapped_format_reward": 1.0, "step": 719 }, { "completion_length": 481.0, "epoch": 144.0, "grad_norm": 0.4105921685695648, "kl": 1.52871572971344, "learning_rate": 3.3550503583141726e-06, "loss": 0.0611, "reward": 3.451643943786621, "reward_std": 0.4683004915714264, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4516439437866211, "rewards/wrapped_format_reward": 1.0, "step": 720 }, { "completion_length": 500.0, "epoch": 144.2, "grad_norm": 0.7768301367759705, "kl": 1.18900465965271, "learning_rate": 3.3499230993274857e-06, "loss": 0.0476, "reward": 0.9932074546813965, "reward_std": 3.367805242538452, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.006792664527893, "rewards/wrapped_format_reward": 0.5, "step": 721 }, { "completion_length": 490.0, "epoch": 144.4, "grad_norm": 0.6140910983085632, "kl": 0.9986215829849243, "learning_rate": 3.344791795008318e-06, "loss": 0.0399, "reward": 3.1455395221710205, "reward_std": 0.6940726637840271, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.39553946256637573, "rewards/wrapped_format_reward": 0.75, "step": 722 }, { "completion_length": 468.0, "epoch": 144.6, "grad_norm": 0.7327135801315308, "kl": 1.4013735055923462, "learning_rate": 3.339656469779856e-06, "loss": 0.0561, "reward": 2.5551810264587402, "reward_std": 0.48002633452415466, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.19481904804706573, "rewards/wrapped_format_reward": 0.75, "step": 723 }, { "completion_length": 428.0, "epoch": 144.8, "grad_norm": 0.5205874443054199, "kl": 1.3122892379760742, "learning_rate": 3.3345171480844275e-06, "loss": 0.0525, "reward": 3.3511462211608887, "reward_std": 0.05403972789645195, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.35114631056785583, "rewards/wrapped_format_reward": 1.0, "step": 724 }, { "completion_length": 430.0, "epoch": 145.0, "grad_norm": 0.5604906678199768, "kl": 0.6418074369430542, "learning_rate": 3.3293738543833807e-06, "loss": 0.0257, "reward": 3.1239919662475586, "reward_std": 0.22990824282169342, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9090908765792847, "rewards/wrapped_driving_reward": 0.33990123867988586, "rewards/wrapped_format_reward": 0.875, "step": 725 }, { "completion_length": 362.0, "epoch": 145.2, "grad_norm": 0.6044005155563354, "kl": 1.2028814554214478, "learning_rate": 3.3242266131569685e-06, "loss": 0.0481, "reward": 3.8261451721191406, "reward_std": 0.014617701061069965, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8261451721191406, "rewards/wrapped_format_reward": 1.0, "step": 726 }, { "completion_length": 500.0, "epoch": 145.4, "grad_norm": 0.6275110244750977, "kl": 0.6762914657592773, "learning_rate": 3.3190754489042343e-06, "loss": 0.0271, "reward": 2.7210917472839355, "reward_std": 0.9790669679641724, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.11881905049085617, "rewards/wrapped_format_reward": 0.625, "step": 727 }, { "completion_length": 500.0, "epoch": 145.6, "grad_norm": 0.5946706533432007, "kl": 0.5865952372550964, "learning_rate": 3.313920386142892e-06, "loss": 0.0235, "reward": 2.5557713508605957, "reward_std": 1.0863637924194336, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3192285895347595, "rewards/wrapped_format_reward": 0.875, "step": 728 }, { "completion_length": 492.0, "epoch": 145.8, "grad_norm": 0.5380379557609558, "kl": 1.6467604637145996, "learning_rate": 3.308761449409213e-06, "loss": 0.0659, "reward": 1.8841032981872559, "reward_std": 1.9241235256195068, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1158965826034546, "rewards/wrapped_format_reward": 1.0, "step": 729 }, { "completion_length": 500.0, "epoch": 146.0, "grad_norm": 0.6382113099098206, "kl": 0.7868921160697937, "learning_rate": 3.303598663257904e-06, "loss": 0.0315, "reward": 1.3918712139129639, "reward_std": 3.2618727684020996, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.9414620995521545, "rewards/wrapped_format_reward": 0.875, "step": 730 }, { "completion_length": 500.0, "epoch": 146.2, "grad_norm": 0.5164018273353577, "kl": 1.0673645734786987, "learning_rate": 3.298432052261998e-06, "loss": 0.0427, "reward": 2.141380548477173, "reward_std": 0.5501049160957336, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9146825075149536, "rewards/wrapped_driving_reward": -0.39830195903778076, "rewards/wrapped_format_reward": 0.625, "step": 731 }, { "completion_length": 500.0, "epoch": 146.4, "grad_norm": 0.493426114320755, "kl": 0.8695380687713623, "learning_rate": 3.293261641012731e-06, "loss": 0.0348, "reward": 3.68026065826416, "reward_std": 0.3121219873428345, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6802605986595154, "rewards/wrapped_format_reward": 1.0, "step": 732 }, { "completion_length": 500.0, "epoch": 146.6, "grad_norm": 0.5762398838996887, "kl": 1.1291520595550537, "learning_rate": 3.288087454119425e-06, "loss": 0.0452, "reward": 2.828002691268921, "reward_std": 0.4208530783653259, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.23871691524982452, "rewards/wrapped_format_reward": 0.625, "step": 733 }, { "completion_length": 500.0, "epoch": 146.8, "grad_norm": 0.4853045642375946, "kl": 0.49225303530693054, "learning_rate": 3.282909516209374e-06, "loss": 0.0197, "reward": 2.2937583923339844, "reward_std": 1.2878106832504272, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5812417268753052, "rewards/wrapped_format_reward": 0.875, "step": 734 }, { "completion_length": 361.0, "epoch": 147.0, "grad_norm": 0.5176259279251099, "kl": 0.6392626762390137, "learning_rate": 3.277727851927727e-06, "loss": 0.0256, "reward": 2.8032870292663574, "reward_std": 0.37582576274871826, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.07171304523944855, "rewards/wrapped_format_reward": 0.875, "step": 735 }, { "completion_length": 500.0, "epoch": 147.2, "grad_norm": 0.5171045660972595, "kl": 0.9497402310371399, "learning_rate": 3.272542485937369e-06, "loss": 0.038, "reward": 1.833735704421997, "reward_std": 3.556070566177368, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5412643551826477, "rewards/wrapped_format_reward": 0.875, "step": 736 }, { "completion_length": 500.0, "epoch": 147.4, "grad_norm": 0.5060833096504211, "kl": 1.1487382650375366, "learning_rate": 3.2673534429188005e-06, "loss": 0.0459, "reward": 1.8757826089859009, "reward_std": 0.8785591125488281, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.849217414855957, "rewards/wrapped_format_reward": 0.75, "step": 737 }, { "completion_length": 401.0, "epoch": 147.6, "grad_norm": 0.5445796251296997, "kl": 1.0297980308532715, "learning_rate": 3.2621607475700272e-06, "loss": 0.0412, "reward": 3.600897789001465, "reward_std": 0.257000595331192, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7258977890014648, "rewards/wrapped_format_reward": 0.875, "step": 738 }, { "completion_length": 500.0, "epoch": 147.8, "grad_norm": 0.640143096446991, "kl": 0.9937734603881836, "learning_rate": 3.256964424606437e-06, "loss": 0.0398, "reward": 1.05731201171875, "reward_std": 3.0485732555389404, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -1.25518798828125, "rewards/wrapped_format_reward": 0.875, "step": 739 }, { "completion_length": 500.0, "epoch": 148.0, "grad_norm": 0.5605321526527405, "kl": 0.5616591572761536, "learning_rate": 3.2517644987606827e-06, "loss": 0.0225, "reward": 0.9253636598587036, "reward_std": 2.987279176712036, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.074636459350586, "rewards/wrapped_format_reward": 0.5, "step": 740 }, { "completion_length": 500.0, "epoch": 148.2, "grad_norm": 0.5340696573257446, "kl": 0.4105786383152008, "learning_rate": 3.2465609947825692e-06, "loss": 0.0164, "reward": 2.3854103088378906, "reward_std": 0.5500273108482361, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.031256303191185, "rewards/wrapped_format_reward": 0.5, "step": 741 }, { "completion_length": 500.0, "epoch": 148.4, "grad_norm": 0.5821884870529175, "kl": 1.4779444932937622, "learning_rate": 3.2413539374389275e-06, "loss": 0.0591, "reward": 0.9070387482643127, "reward_std": 2.9707443714141846, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -1.155461311340332, "rewards/wrapped_format_reward": 0.625, "step": 742 }, { "completion_length": 500.0, "epoch": 148.6, "grad_norm": 0.5381807684898376, "kl": 1.0403640270233154, "learning_rate": 3.2361433515135053e-06, "loss": 0.0416, "reward": 1.9871805906295776, "reward_std": 3.6582367420196533, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7361111044883728, "rewards/wrapped_driving_reward": -0.373930424451828, "rewards/wrapped_format_reward": 0.875, "step": 743 }, { "completion_length": 500.0, "epoch": 148.8, "grad_norm": 0.5100287795066833, "kl": 1.2638732194900513, "learning_rate": 3.230929261806842e-06, "loss": 0.0506, "reward": 3.228283405303955, "reward_std": 0.640960693359375, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4782832860946655, "rewards/wrapped_format_reward": 0.75, "step": 744 }, { "completion_length": 500.0, "epoch": 149.0, "grad_norm": 0.501332700252533, "kl": 1.4255468845367432, "learning_rate": 3.225711693136156e-06, "loss": 0.057, "reward": 1.001197338104248, "reward_std": 2.017551898956299, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8738027811050415, "rewards/wrapped_format_reward": 0.875, "step": 745 }, { "completion_length": 500.0, "epoch": 149.2, "grad_norm": 0.5337965488433838, "kl": 1.2356843948364258, "learning_rate": 3.2204906703352236e-06, "loss": 0.0494, "reward": 2.2676119804382324, "reward_std": 0.3297145366668701, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.48238787055015564, "rewards/wrapped_format_reward": 0.75, "step": 746 }, { "completion_length": 388.0, "epoch": 149.4, "grad_norm": 0.5865975022315979, "kl": 1.1721208095550537, "learning_rate": 3.215266218254261e-06, "loss": 0.0469, "reward": 3.2063422203063965, "reward_std": 0.2641158699989319, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.20634236931800842, "rewards/wrapped_format_reward": 1.0, "step": 747 }, { "completion_length": 500.0, "epoch": 149.6, "grad_norm": 0.569024920463562, "kl": 0.868983805179596, "learning_rate": 3.2100383617598075e-06, "loss": 0.0348, "reward": 2.9906487464904785, "reward_std": 0.3553977310657501, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.763375997543335, "rewards/wrapped_format_reward": 0.25, "step": 748 }, { "completion_length": 500.0, "epoch": 149.8, "grad_norm": 0.5583156943321228, "kl": 1.0145978927612305, "learning_rate": 3.2048071257346043e-06, "loss": 0.0406, "reward": 3.5098564624786377, "reward_std": 0.30953365564346313, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6348564624786377, "rewards/wrapped_format_reward": 0.875, "step": 749 }, { "completion_length": 383.0, "epoch": 150.0, "grad_norm": 0.792302668094635, "kl": 0.8888383507728577, "learning_rate": 3.199572535077481e-06, "loss": 0.0356, "reward": 0.08345681428909302, "reward_std": 2.1669135093688965, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.9165432453155518, "rewards/wrapped_format_reward": 1.0, "step": 750 }, { "completion_length": 500.0, "epoch": 150.2, "grad_norm": 0.5216514468193054, "kl": 0.9542403817176819, "learning_rate": 3.194334614703231e-06, "loss": 0.0382, "reward": -0.3409658670425415, "reward_std": 3.7293381690979004, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8409658670425415, "rewards/wrapped_format_reward": 0.5, "step": 751 }, { "completion_length": 500.0, "epoch": 150.4, "grad_norm": 0.4775712788105011, "kl": 1.2860143184661865, "learning_rate": 3.189093389542498e-06, "loss": 0.0514, "reward": 1.3992867469787598, "reward_std": 1.6630216836929321, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.2257131338119507, "rewards/wrapped_format_reward": 0.625, "step": 752 }, { "completion_length": 500.0, "epoch": 150.6, "grad_norm": 0.49834057688713074, "kl": 0.9300306439399719, "learning_rate": 3.183848884541656e-06, "loss": 0.0372, "reward": 2.7901477813720703, "reward_std": 1.1417899131774902, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.29014769196510315, "rewards/wrapped_format_reward": 0.5, "step": 753 }, { "completion_length": 404.0, "epoch": 150.8, "grad_norm": 0.5298438668251038, "kl": 1.067299485206604, "learning_rate": 3.1786011246626858e-06, "loss": 0.0427, "reward": 2.8012261390686035, "reward_std": 0.03125178441405296, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1987738460302353, "rewards/wrapped_format_reward": 1.0, "step": 754 }, { "completion_length": 483.0, "epoch": 151.0, "grad_norm": 0.5239464640617371, "kl": 1.0353827476501465, "learning_rate": 3.173350134883066e-06, "loss": 0.0414, "reward": 2.897402763366699, "reward_std": 0.7444739937782288, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": -0.002597332000732422, "rewards/wrapped_format_reward": 1.0, "step": 755 }, { "completion_length": 500.0, "epoch": 151.2, "grad_norm": 0.5170385241508484, "kl": 0.5912831425666809, "learning_rate": 3.1680959401956425e-06, "loss": 0.0237, "reward": 2.6039931774139404, "reward_std": 0.9651803970336914, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.39600682258605957, "rewards/wrapped_format_reward": 1.0, "step": 756 }, { "completion_length": 470.0, "epoch": 151.4, "grad_norm": 0.5630961656570435, "kl": 1.3199890851974487, "learning_rate": 3.1628385656085204e-06, "loss": 0.0528, "reward": 0.8596276640892029, "reward_std": 1.6715515851974487, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.0153722763061523, "rewards/wrapped_format_reward": 0.875, "step": 757 }, { "completion_length": 500.0, "epoch": 151.6, "grad_norm": 0.5590241551399231, "kl": 0.7221583724021912, "learning_rate": 3.157578036144937e-06, "loss": 0.0289, "reward": 2.7205729484558105, "reward_std": 0.37680330872535706, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1544271856546402, "rewards/wrapped_format_reward": 0.875, "step": 758 }, { "completion_length": 500.0, "epoch": 151.8, "grad_norm": 0.6123512387275696, "kl": 0.9747918844223022, "learning_rate": 3.1523143768431475e-06, "loss": 0.039, "reward": 1.3794039487838745, "reward_std": 3.6140682697296143, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7395833134651184, "rewards/wrapped_driving_reward": -0.48517945408821106, "rewards/wrapped_format_reward": 0.375, "step": 759 }, { "completion_length": 500.0, "epoch": 152.0, "grad_norm": 0.5005034804344177, "kl": 0.639513373374939, "learning_rate": 3.147047612756302e-06, "loss": 0.0256, "reward": 2.4191792011260986, "reward_std": 1.3698595762252808, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20582084357738495, "rewards/wrapped_format_reward": 0.625, "step": 760 }, { "completion_length": 500.0, "epoch": 152.2, "grad_norm": 0.4641207158565521, "kl": 1.0183018445968628, "learning_rate": 3.1417777689523297e-06, "loss": 0.0407, "reward": 1.737586259841919, "reward_std": 3.4952027797698975, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6374138593673706, "rewards/wrapped_format_reward": 0.875, "step": 761 }, { "completion_length": 500.0, "epoch": 152.4, "grad_norm": 0.5382841229438782, "kl": 0.6911734342575073, "learning_rate": 3.136504870513819e-06, "loss": 0.0276, "reward": 3.6939151287078857, "reward_std": 0.23970358073711395, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8189151287078857, "rewards/wrapped_format_reward": 0.875, "step": 762 }, { "completion_length": 500.0, "epoch": 152.6, "grad_norm": 0.5685204863548279, "kl": 1.0369704961776733, "learning_rate": 3.131228942537895e-06, "loss": 0.0415, "reward": 3.4881484508514404, "reward_std": 0.1292411834001541, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.5298152565956116, "rewards/wrapped_format_reward": 1.0, "step": 763 }, { "completion_length": 500.0, "epoch": 152.8, "grad_norm": 0.4972493648529053, "kl": 1.0108205080032349, "learning_rate": 3.125950010136104e-06, "loss": 0.0404, "reward": 2.9704365730285645, "reward_std": 0.15474218130111694, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.05377001687884331, "rewards/wrapped_format_reward": 1.0, "step": 764 }, { "completion_length": 500.0, "epoch": 153.0, "grad_norm": 0.5436493158340454, "kl": 0.8178736567497253, "learning_rate": 3.120668098434291e-06, "loss": 0.0327, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 765 }, { "completion_length": 500.0, "epoch": 153.2, "grad_norm": 1.6990164518356323, "kl": 1.2837238311767578, "learning_rate": 3.115383232572483e-06, "loss": 0.0513, "reward": 0.06262272596359253, "reward_std": 1.4747098684310913, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.8123772144317627, "rewards/wrapped_format_reward": 0.875, "step": 766 }, { "completion_length": 500.0, "epoch": 153.4, "grad_norm": 0.6486381888389587, "kl": 0.36142438650131226, "learning_rate": 3.1100954377047665e-06, "loss": 0.0145, "reward": 1.3286774158477783, "reward_std": 3.5647871494293213, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9213225245475769, "rewards/wrapped_format_reward": 0.75, "step": 767 }, { "completion_length": 500.0, "epoch": 153.6, "grad_norm": 0.6793121695518494, "kl": 1.1117873191833496, "learning_rate": 3.1048047389991693e-06, "loss": 0.0445, "reward": 3.3907761573791504, "reward_std": 0.20889942348003387, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5157762765884399, "rewards/wrapped_format_reward": 0.875, "step": 768 }, { "completion_length": 500.0, "epoch": 153.8, "grad_norm": 0.4357717037200928, "kl": 0.67461097240448, "learning_rate": 3.0995111616375417e-06, "loss": 0.027, "reward": 1.8588078022003174, "reward_std": 3.5830955505371094, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5161921381950378, "rewards/wrapped_format_reward": 0.875, "step": 769 }, { "completion_length": 500.0, "epoch": 154.0, "grad_norm": 0.48567458987236023, "kl": 0.8509070873260498, "learning_rate": 3.094214730815433e-06, "loss": 0.034, "reward": 2.213784694671631, "reward_std": 0.4306184649467468, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.22371524572372437, "rewards/wrapped_format_reward": 0.5, "step": 770 }, { "completion_length": 500.0, "epoch": 154.2, "grad_norm": 0.826822817325592, "kl": 0.3968334496021271, "learning_rate": 3.088915471741976e-06, "loss": 0.0159, "reward": 2.260512351989746, "reward_std": 0.8328320384025574, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8500000238418579, "rewards/wrapped_driving_reward": -0.21448758244514465, "rewards/wrapped_format_reward": 0.625, "step": 771 }, { "completion_length": 461.0, "epoch": 154.4, "grad_norm": 0.5886190533638, "kl": 1.100509524345398, "learning_rate": 3.0836134096397642e-06, "loss": 0.044, "reward": 3.2229135036468506, "reward_std": 0.6925670504570007, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3479134142398834, "rewards/wrapped_format_reward": 0.875, "step": 772 }, { "completion_length": 500.0, "epoch": 154.6, "grad_norm": 0.5568466782569885, "kl": 0.4733794629573822, "learning_rate": 3.0783085697447324e-06, "loss": 0.0189, "reward": 1.964868187904358, "reward_std": 1.187056303024292, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9101318120956421, "rewards/wrapped_format_reward": 0.875, "step": 773 }, { "completion_length": 500.0, "epoch": 154.8, "grad_norm": 1.7119587659835815, "kl": 1.4648423194885254, "learning_rate": 3.073000977306036e-06, "loss": 0.0586, "reward": 2.845578908920288, "reward_std": 0.34594619274139404, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.02942105382680893, "rewards/wrapped_format_reward": 0.875, "step": 774 }, { "completion_length": 500.0, "epoch": 155.0, "grad_norm": 0.5909410715103149, "kl": 0.5023986101150513, "learning_rate": 3.0676906575859335e-06, "loss": 0.0201, "reward": 2.984121799468994, "reward_std": 0.6469627022743225, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9027777314186096, "rewards/wrapped_driving_reward": 0.45634397864341736, "rewards/wrapped_format_reward": 0.625, "step": 775 }, { "completion_length": 500.0, "epoch": 155.2, "grad_norm": 0.46868669986724854, "kl": 1.4069966077804565, "learning_rate": 3.062377635859663e-06, "loss": 0.0563, "reward": 2.647207736968994, "reward_std": 0.2306966334581375, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.20001450181007385, "rewards/wrapped_format_reward": 0.875, "step": 776 }, { "completion_length": 471.0, "epoch": 155.4, "grad_norm": 0.4950220286846161, "kl": 1.027194857597351, "learning_rate": 3.0570619374153234e-06, "loss": 0.0411, "reward": 3.526076316833496, "reward_std": 0.2638986110687256, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.8117905259132385, "rewards/wrapped_format_reward": 0.75, "step": 777 }, { "completion_length": 500.0, "epoch": 155.6, "grad_norm": 0.4979739487171173, "kl": 0.7722088694572449, "learning_rate": 3.051743587553754e-06, "loss": 0.0309, "reward": 3.0607147216796875, "reward_std": 0.5555264949798584, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18571479618549347, "rewards/wrapped_format_reward": 0.875, "step": 778 }, { "completion_length": 500.0, "epoch": 155.8, "grad_norm": 0.5324958562850952, "kl": 0.9706301689147949, "learning_rate": 3.0464226115884115e-06, "loss": 0.0388, "reward": 1.3502414226531982, "reward_std": 2.969165563583374, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7747586369514465, "rewards/wrapped_format_reward": 0.625, "step": 779 }, { "completion_length": 500.0, "epoch": 156.0, "grad_norm": 0.5909656286239624, "kl": 0.8194612860679626, "learning_rate": 3.0410990348452572e-06, "loss": 0.0328, "reward": 1.87015700340271, "reward_std": 1.9332139492034912, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1298428773880005, "rewards/wrapped_format_reward": 1.0, "step": 780 }, { "completion_length": 500.0, "epoch": 156.2, "grad_norm": 0.664778470993042, "kl": 0.9126893281936646, "learning_rate": 3.035772882662627e-06, "loss": 0.0365, "reward": 3.700016975402832, "reward_std": 0.2690635919570923, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8250167965888977, "rewards/wrapped_format_reward": 0.875, "step": 781 }, { "completion_length": 500.0, "epoch": 156.4, "grad_norm": 0.4898008704185486, "kl": 0.34648996591567993, "learning_rate": 3.030444180391116e-06, "loss": 0.0139, "reward": 1.926034688949585, "reward_std": 0.2995733916759491, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.1762380450963974, "rewards/wrapped_format_reward": 0.125, "step": 782 }, { "completion_length": 500.0, "epoch": 156.6, "grad_norm": 0.7782924175262451, "kl": 1.022233247756958, "learning_rate": 3.0251129533934565e-06, "loss": 0.0409, "reward": 3.3980164527893066, "reward_std": 0.24236531555652618, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.4337307810783386, "rewards/wrapped_format_reward": 1.0, "step": 783 }, { "completion_length": 500.0, "epoch": 156.8, "grad_norm": 0.6505765914916992, "kl": 0.8364863991737366, "learning_rate": 3.019779227044398e-06, "loss": 0.0335, "reward": 3.134411573410034, "reward_std": 0.42541173100471497, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.29066160321235657, "rewards/wrapped_format_reward": 0.875, "step": 784 }, { "completion_length": 500.0, "epoch": 157.0, "grad_norm": 0.508212685585022, "kl": 0.38002079725265503, "learning_rate": 3.0144430267305874e-06, "loss": 0.0152, "reward": 3.031036376953125, "reward_std": 0.23074941337108612, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.0310362558811903, "rewards/wrapped_format_reward": 1.0, "step": 785 }, { "completion_length": 500.0, "epoch": 157.2, "grad_norm": 0.5105311274528503, "kl": 0.2681836783885956, "learning_rate": 3.0091043778504438e-06, "loss": 0.0107, "reward": -2.249131917953491, "reward_std": 3.5017361640930176, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.999131917953491, "rewards/wrapped_format_reward": 0.25, "step": 786 }, { "completion_length": 500.0, "epoch": 157.4, "grad_norm": 0.5429096221923828, "kl": 0.4873754680156708, "learning_rate": 3.0037633058140433e-06, "loss": 0.0195, "reward": 3.7257652282714844, "reward_std": 0.21732982993125916, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7257651090621948, "rewards/wrapped_format_reward": 1.0, "step": 787 }, { "completion_length": 500.0, "epoch": 157.6, "grad_norm": 0.6699551343917847, "kl": 0.7164745926856995, "learning_rate": 2.998419836042993e-06, "loss": 0.0287, "reward": 3.4064254760742188, "reward_std": 0.2806980609893799, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5314255952835083, "rewards/wrapped_format_reward": 0.875, "step": 788 }, { "completion_length": 500.0, "epoch": 157.8, "grad_norm": 0.4117250442504883, "kl": 1.1326197385787964, "learning_rate": 2.993073993970316e-06, "loss": 0.0453, "reward": 2.703610897064209, "reward_std": 0.4096671938896179, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.29638922214508057, "rewards/wrapped_format_reward": 1.0, "step": 789 }, { "completion_length": 355.0, "epoch": 158.0, "grad_norm": 0.5928639769554138, "kl": 0.5766419768333435, "learning_rate": 2.9877258050403214e-06, "loss": 0.0231, "reward": 3.0424065589904785, "reward_std": 0.2919481694698334, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04240674898028374, "rewards/wrapped_format_reward": 1.0, "step": 790 }, { "completion_length": 500.0, "epoch": 158.2, "grad_norm": 0.5171490907669067, "kl": 0.8181880712509155, "learning_rate": 2.9823752947084926e-06, "loss": 0.0327, "reward": 3.0578365325927734, "reward_std": 0.42794013023376465, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.307836651802063, "rewards/wrapped_format_reward": 0.75, "step": 791 }, { "completion_length": 500.0, "epoch": 158.4, "grad_norm": 0.5789440870285034, "kl": 0.6994017958641052, "learning_rate": 2.9770224884413625e-06, "loss": 0.028, "reward": 2.8126344680786133, "reward_std": 0.8600202202796936, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18763434886932373, "rewards/wrapped_format_reward": 0.625, "step": 792 }, { "completion_length": 383.0, "epoch": 158.6, "grad_norm": 0.7086771130561829, "kl": 1.1586806774139404, "learning_rate": 2.9716674117163886e-06, "loss": 0.0463, "reward": 1.134717583656311, "reward_std": 2.779279947280884, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.240282416343689, "rewards/wrapped_format_reward": 0.875, "step": 793 }, { "completion_length": 499.0, "epoch": 158.8, "grad_norm": 0.4350188970565796, "kl": 0.5939379334449768, "learning_rate": 2.966310090021837e-06, "loss": 0.0237, "reward": 3.8331589698791504, "reward_std": 0.011624080128967762, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8331590890884399, "rewards/wrapped_format_reward": 1.0, "step": 794 }, { "completion_length": 420.0, "epoch": 159.0, "grad_norm": 0.6273412704467773, "kl": 0.8126957416534424, "learning_rate": 2.9609505488566585e-06, "loss": 0.0325, "reward": 1.8683500289916992, "reward_std": 2.0532751083374023, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7833333611488342, "rewards/wrapped_driving_reward": -0.7899832129478455, "rewards/wrapped_format_reward": 0.875, "step": 795 }, { "completion_length": 500.0, "epoch": 159.2, "grad_norm": 0.7259314060211182, "kl": 0.9824731945991516, "learning_rate": 2.9555888137303695e-06, "loss": 0.0393, "reward": -1.75, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 796 }, { "completion_length": 500.0, "epoch": 159.4, "grad_norm": 0.9288682341575623, "kl": 1.0356067419052124, "learning_rate": 2.9502249101629248e-06, "loss": 0.0414, "reward": 3.432770013809204, "reward_std": 0.463666170835495, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.8304973840713501, "rewards/wrapped_format_reward": 0.625, "step": 797 }, { "completion_length": 500.0, "epoch": 159.6, "grad_norm": 0.6679523587226868, "kl": 0.6253860592842102, "learning_rate": 2.944858863684605e-06, "loss": 0.025, "reward": 2.203218936920166, "reward_std": 2.1417882442474365, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.953125, "rewards/wrapped_driving_reward": -0.4999062418937683, "rewards/wrapped_format_reward": 0.75, "step": 798 }, { "completion_length": 500.0, "epoch": 159.8, "grad_norm": 0.7524790167808533, "kl": 0.8385742902755737, "learning_rate": 2.939490699835887e-06, "loss": 0.0335, "reward": 1.3603956699371338, "reward_std": 3.2636396884918213, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8896043300628662, "rewards/wrapped_format_reward": 0.75, "step": 799 }, { "completion_length": 500.0, "epoch": 160.0, "grad_norm": 0.6979427337646484, "kl": 0.27322471141815186, "learning_rate": 2.9341204441673267e-06, "loss": 0.0109, "reward": -0.4729641079902649, "reward_std": 3.7896523475646973, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.9729641675949097, "rewards/wrapped_format_reward": 0.5, "step": 800 }, { "completion_length": 500.0, "epoch": 160.2, "grad_norm": 0.5591041445732117, "kl": 1.02774178981781, "learning_rate": 2.9287481222394358e-06, "loss": 0.0411, "reward": 3.2085986137390137, "reward_std": 0.36301255226135254, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5835985541343689, "rewards/wrapped_format_reward": 0.625, "step": 801 }, { "completion_length": 500.0, "epoch": 160.4, "grad_norm": 0.5321090817451477, "kl": 0.6709323525428772, "learning_rate": 2.9233737596225616e-06, "loss": 0.0268, "reward": 0.336200475692749, "reward_std": 3.2529170513153076, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6722221970558167, "rewards/wrapped_driving_reward": -1.5860216617584229, "rewards/wrapped_format_reward": 0.5, "step": 802 }, { "completion_length": 500.0, "epoch": 160.6, "grad_norm": 0.7834795713424683, "kl": 1.0378621816635132, "learning_rate": 2.9179973818967643e-06, "loss": 0.0415, "reward": 1.797407627105713, "reward_std": 3.538046360015869, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.45259231328964233, "rewards/wrapped_format_reward": 0.75, "step": 803 }, { "completion_length": 400.0, "epoch": 160.8, "grad_norm": 0.530414342880249, "kl": 1.1836286783218384, "learning_rate": 2.912619014651694e-06, "loss": 0.0473, "reward": 2.7897591590881348, "reward_std": 0.024412136524915695, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21024088561534882, "rewards/wrapped_format_reward": 1.0, "step": 804 }, { "completion_length": 500.0, "epoch": 161.0, "grad_norm": 0.7451685667037964, "kl": 1.1952420473098755, "learning_rate": 2.9072386834864723e-06, "loss": 0.0478, "reward": 2.9604384899139404, "reward_std": 0.519312858581543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.12710511684417725, "rewards/wrapped_format_reward": 0.875, "step": 805 }, { "completion_length": 500.0, "epoch": 161.2, "grad_norm": 0.5481700301170349, "kl": 1.0363727807998657, "learning_rate": 2.9018564140095657e-06, "loss": 0.0415, "reward": 1.8664171695709229, "reward_std": 3.5856990814208984, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.3835829496383667, "rewards/wrapped_format_reward": 0.875, "step": 806 }, { "completion_length": 500.0, "epoch": 161.4, "grad_norm": 0.4871310889720917, "kl": 0.572235107421875, "learning_rate": 2.896472231838668e-06, "loss": 0.0229, "reward": 2.6908867359161377, "reward_std": 0.48215678334236145, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.19088676571846008, "rewards/wrapped_format_reward": 0.5, "step": 807 }, { "completion_length": 500.0, "epoch": 161.6, "grad_norm": 0.5225286483764648, "kl": 1.6882096529006958, "learning_rate": 2.8910861626005774e-06, "loss": 0.0675, "reward": 2.6616103649139404, "reward_std": 0.28830450773239136, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.1883895993232727, "rewards/wrapped_format_reward": 0.875, "step": 808 }, { "completion_length": 500.0, "epoch": 161.8, "grad_norm": 0.5506367683410645, "kl": 0.40021976828575134, "learning_rate": 2.8856982319310724e-06, "loss": 0.016, "reward": 0.8944512009620667, "reward_std": 2.975062131881714, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1055488586425781, "rewards/wrapped_format_reward": 0.5, "step": 809 }, { "completion_length": 431.0, "epoch": 162.0, "grad_norm": 0.6307826042175293, "kl": 0.8738049268722534, "learning_rate": 2.880308465474792e-06, "loss": 0.035, "reward": 3.2625396251678467, "reward_std": 0.23850522935390472, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3875396251678467, "rewards/wrapped_format_reward": 0.875, "step": 810 }, { "completion_length": 500.0, "epoch": 162.2, "grad_norm": 0.461091548204422, "kl": 1.3140265941619873, "learning_rate": 2.8749168888851126e-06, "loss": 0.0526, "reward": 2.7970871925354004, "reward_std": 0.45916786789894104, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.21375392377376556, "rewards/wrapped_format_reward": 0.625, "step": 811 }, { "completion_length": 399.0, "epoch": 162.4, "grad_norm": 0.5810427069664001, "kl": 1.0196524858474731, "learning_rate": 2.8695235278240272e-06, "loss": 0.0408, "reward": 3.457486629486084, "reward_std": 0.3558793365955353, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.45748674869537354, "rewards/wrapped_format_reward": 1.0, "step": 812 }, { "completion_length": 500.0, "epoch": 162.6, "grad_norm": 0.46939176321029663, "kl": 0.9351792931556702, "learning_rate": 2.8641284079620203e-06, "loss": 0.0374, "reward": 0.05920994281768799, "reward_std": 2.118419885635376, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.9407901763916016, "rewards/wrapped_format_reward": 1.0, "step": 813 }, { "completion_length": 500.0, "epoch": 162.8, "grad_norm": 0.528785228729248, "kl": 0.5687186121940613, "learning_rate": 2.858731554977948e-06, "loss": 0.0227, "reward": -0.43504947423934937, "reward_std": 3.5419013500213623, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.060049295425415, "rewards/wrapped_format_reward": 0.625, "step": 814 }, { "completion_length": 500.0, "epoch": 163.0, "grad_norm": 0.48642876744270325, "kl": 0.55952388048172, "learning_rate": 2.8533329945589192e-06, "loss": 0.0224, "reward": 0.7796438336372375, "reward_std": 3.644176483154297, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5953562259674072, "rewards/wrapped_format_reward": 0.875, "step": 815 }, { "completion_length": 500.0, "epoch": 163.2, "grad_norm": 0.5921107530593872, "kl": 0.8645963072776794, "learning_rate": 2.847932752400164e-06, "loss": 0.0346, "reward": 3.478057861328125, "reward_std": 0.34204110503196716, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6030576825141907, "rewards/wrapped_format_reward": 0.875, "step": 816 }, { "completion_length": 500.0, "epoch": 163.4, "grad_norm": 0.5188066959381104, "kl": 0.912796676158905, "learning_rate": 2.8425308542049208e-06, "loss": 0.0365, "reward": 2.615131139755249, "reward_std": 0.12927274405956268, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.3570910692214966, "rewards/wrapped_format_reward": 1.0, "step": 817 }, { "completion_length": 500.0, "epoch": 163.6, "grad_norm": 0.4986792206764221, "kl": 0.6091123223304749, "learning_rate": 2.837127325684308e-06, "loss": 0.0244, "reward": 0.9812687039375305, "reward_std": 3.3806614875793457, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2687312364578247, "rewards/wrapped_format_reward": 0.75, "step": 818 }, { "completion_length": 500.0, "epoch": 163.8, "grad_norm": 0.6001780033111572, "kl": 0.6903250217437744, "learning_rate": 2.8317221925572058e-06, "loss": 0.0276, "reward": 2.1244254112243652, "reward_std": 0.6480088233947754, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2505744695663452, "rewards/wrapped_format_reward": 0.375, "step": 819 }, { "completion_length": 500.0, "epoch": 164.0, "grad_norm": 0.4543159604072571, "kl": 0.9673792123794556, "learning_rate": 2.82631548055013e-06, "loss": 0.0387, "reward": 3.105719804763794, "reward_std": 0.18213030695915222, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8333333134651184, "rewards/wrapped_driving_reward": 0.5223863124847412, "rewards/wrapped_format_reward": 0.75, "step": 820 }, { "completion_length": 500.0, "epoch": 164.2, "grad_norm": 0.7704506516456604, "kl": 1.6314855813980103, "learning_rate": 2.820907215397111e-06, "loss": 0.0653, "reward": 2.759028673171997, "reward_std": 0.06708598881959915, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.21824395656585693, "rewards/wrapped_format_reward": 1.0, "step": 821 }, { "completion_length": 500.0, "epoch": 164.4, "grad_norm": 0.5132427215576172, "kl": 0.21008968353271484, "learning_rate": 2.815497422839575e-06, "loss": 0.0084, "reward": 2.4701147079467773, "reward_std": 0.5563299655914307, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.37988513708114624, "rewards/wrapped_format_reward": 0.875, "step": 822 }, { "completion_length": 492.0, "epoch": 164.6, "grad_norm": 0.5533198118209839, "kl": 1.597398281097412, "learning_rate": 2.8100861286262137e-06, "loss": 0.0639, "reward": 3.173506021499634, "reward_std": 0.10093068331480026, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9821428656578064, "rewards/wrapped_driving_reward": 0.3163631558418274, "rewards/wrapped_format_reward": 0.875, "step": 823 }, { "completion_length": 487.0, "epoch": 164.8, "grad_norm": 0.5174404978752136, "kl": 1.3452297449111938, "learning_rate": 2.804673358512869e-06, "loss": 0.0538, "reward": 3.791170120239258, "reward_std": 0.12576456367969513, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7911700010299683, "rewards/wrapped_format_reward": 1.0, "step": 824 }, { "completion_length": 500.0, "epoch": 165.0, "grad_norm": 0.5471038818359375, "kl": 1.292214274406433, "learning_rate": 2.7992591382624064e-06, "loss": 0.0517, "reward": 3.421455144882202, "reward_std": 0.5310376286506653, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6714551448822021, "rewards/wrapped_format_reward": 0.75, "step": 825 }, { "completion_length": 408.0, "epoch": 165.2, "grad_norm": 0.6716032028198242, "kl": 0.8129616975784302, "learning_rate": 2.7938434936445946e-06, "loss": 0.0325, "reward": 3.068939685821533, "reward_std": 0.7359381914138794, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5689395070075989, "rewards/wrapped_format_reward": 0.5, "step": 826 }, { "completion_length": 368.0, "epoch": 165.4, "grad_norm": 0.7269822359085083, "kl": 0.660163402557373, "learning_rate": 2.78842645043598e-06, "loss": 0.0264, "reward": 3.6374306678771973, "reward_std": 0.3847697377204895, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6374306678771973, "rewards/wrapped_format_reward": 1.0, "step": 827 }, { "completion_length": 433.0, "epoch": 165.6, "grad_norm": 0.49772003293037415, "kl": 0.4905422031879425, "learning_rate": 2.7830080344197675e-06, "loss": 0.0196, "reward": 2.9236674308776855, "reward_std": 0.4421462118625641, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.19639474153518677, "rewards/wrapped_format_reward": 0.75, "step": 828 }, { "completion_length": 500.0, "epoch": 165.8, "grad_norm": 0.5490583777427673, "kl": 0.39918583631515503, "learning_rate": 2.7775882713856946e-06, "loss": 0.016, "reward": 2.375999927520752, "reward_std": 0.4072107970714569, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.22627297043800354, "rewards/wrapped_format_reward": 0.625, "step": 829 }, { "completion_length": 500.0, "epoch": 166.0, "grad_norm": 0.5898881554603577, "kl": 1.5381332635879517, "learning_rate": 2.7721671871299115e-06, "loss": 0.0615, "reward": 0.9043681621551514, "reward_std": 3.2865047454833984, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -1.158131718635559, "rewards/wrapped_format_reward": 0.625, "step": 830 }, { "completion_length": 441.0, "epoch": 166.2, "grad_norm": 0.5331966280937195, "kl": 0.8137264251708984, "learning_rate": 2.766744807454857e-06, "loss": 0.0325, "reward": 3.826152801513672, "reward_std": 0.014650849625468254, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8261529803276062, "rewards/wrapped_format_reward": 1.0, "step": 831 }, { "completion_length": 397.0, "epoch": 166.4, "grad_norm": 0.4468265771865845, "kl": 0.6693469882011414, "learning_rate": 2.761321158169134e-06, "loss": 0.0268, "reward": 1.760857343673706, "reward_std": 1.9348771572113037, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.239142656326294, "rewards/wrapped_format_reward": 1.0, "step": 832 }, { "completion_length": 365.0, "epoch": 166.6, "grad_norm": 0.6002892255783081, "kl": 0.7958039045333862, "learning_rate": 2.75589626508739e-06, "loss": 0.0318, "reward": 3.139150619506836, "reward_std": 0.13503800332546234, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.2224840521812439, "rewards/wrapped_format_reward": 1.0, "step": 833 }, { "completion_length": 500.0, "epoch": 166.8, "grad_norm": 0.6386081576347351, "kl": 0.9411477446556091, "learning_rate": 2.750470154030191e-06, "loss": 0.0376, "reward": 3.0650391578674316, "reward_std": 0.6393033862113953, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5650390386581421, "rewards/wrapped_format_reward": 0.5, "step": 834 }, { "completion_length": 500.0, "epoch": 167.0, "grad_norm": 0.3837875723838806, "kl": 1.657114028930664, "learning_rate": 2.7450428508239024e-06, "loss": 0.0663, "reward": 2.6719822883605957, "reward_std": 0.24073880910873413, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20301775634288788, "rewards/wrapped_format_reward": 0.875, "step": 835 }, { "completion_length": 500.0, "epoch": 167.2, "grad_norm": 0.506295919418335, "kl": 0.7120727896690369, "learning_rate": 2.7396143813005603e-06, "loss": 0.0285, "reward": 1.3677772283554077, "reward_std": 3.2470555305480957, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -0.7322226762771606, "rewards/wrapped_format_reward": 0.625, "step": 836 }, { "completion_length": 500.0, "epoch": 167.4, "grad_norm": 0.4985957741737366, "kl": 0.9464759230613708, "learning_rate": 2.734184771297756e-06, "loss": 0.0379, "reward": 2.5576024055480957, "reward_std": 0.2574041485786438, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.06739747524261475, "rewards/wrapped_format_reward": 0.625, "step": 837 }, { "completion_length": 439.0, "epoch": 167.6, "grad_norm": 0.5637804865837097, "kl": 0.8566745519638062, "learning_rate": 2.7287540466585067e-06, "loss": 0.0343, "reward": 2.9292807579040527, "reward_std": 0.7874415516853333, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.0707191526889801, "rewards/wrapped_format_reward": 1.0, "step": 838 }, { "completion_length": 409.0, "epoch": 167.8, "grad_norm": 0.5939984321594238, "kl": 0.6225539445877075, "learning_rate": 2.7233222332311344e-06, "loss": 0.0249, "reward": 3.39684796333313, "reward_std": 0.42441999912261963, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.8218478560447693, "rewards/wrapped_format_reward": 0.625, "step": 839 }, { "completion_length": 500.0, "epoch": 168.0, "grad_norm": 0.6109358668327332, "kl": 0.8564927577972412, "learning_rate": 2.717889356869146e-06, "loss": 0.0343, "reward": 3.2804532051086426, "reward_std": 0.4187309145927429, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.3082309365272522, "rewards/wrapped_format_reward": 1.0, "step": 840 }, { "completion_length": 500.0, "epoch": 168.2, "grad_norm": 0.5494788289070129, "kl": 1.4602153301239014, "learning_rate": 2.7124554434311047e-06, "loss": 0.0584, "reward": 1.7961541414260864, "reward_std": 2.5778276920318604, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.662179172039032, "rewards/wrapped_format_reward": 0.5, "step": 841 }, { "completion_length": 500.0, "epoch": 168.4, "grad_norm": 0.4992530345916748, "kl": 1.0213444232940674, "learning_rate": 2.707020518780511e-06, "loss": 0.0409, "reward": 2.7865848541259766, "reward_std": 0.029527578502893448, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21341529488563538, "rewards/wrapped_format_reward": 1.0, "step": 842 }, { "completion_length": 458.0, "epoch": 168.6, "grad_norm": 0.5227845311164856, "kl": 0.5304321646690369, "learning_rate": 2.7015846087856796e-06, "loss": 0.0212, "reward": 1.1565890312194824, "reward_std": 2.35237979888916, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -1.7809109687805176, "rewards/wrapped_format_reward": 1.0, "step": 843 }, { "completion_length": 500.0, "epoch": 168.8, "grad_norm": 0.5163724422454834, "kl": 1.1655185222625732, "learning_rate": 2.696147739319613e-06, "loss": 0.0466, "reward": 1.2925224304199219, "reward_std": 3.271446704864502, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9574775099754333, "rewards/wrapped_format_reward": 0.75, "step": 844 }, { "completion_length": 391.0, "epoch": 169.0, "grad_norm": 0.5346372127532959, "kl": 0.9912291765213013, "learning_rate": 2.6907099362598815e-06, "loss": 0.0396, "reward": 3.68274188041687, "reward_std": 0.24550725519657135, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.835519552230835, "rewards/wrapped_format_reward": 0.875, "step": 845 }, { "completion_length": 500.0, "epoch": 169.2, "grad_norm": 0.4762868285179138, "kl": 0.9365013837814331, "learning_rate": 2.6852712254884988e-06, "loss": 0.0375, "reward": 3.4418392181396484, "reward_std": 0.48263806104660034, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9821428656578064, "rewards/wrapped_driving_reward": 0.8346964716911316, "rewards/wrapped_format_reward": 0.625, "step": 846 }, { "completion_length": 500.0, "epoch": 169.4, "grad_norm": 0.5167540311813354, "kl": 0.4593685567378998, "learning_rate": 2.6798316328917988e-06, "loss": 0.0184, "reward": 2.625422477722168, "reward_std": 0.3436520993709564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.12457744032144547, "rewards/wrapped_format_reward": 0.75, "step": 847 }, { "completion_length": 500.0, "epoch": 169.6, "grad_norm": 0.5815231204032898, "kl": 1.4257601499557495, "learning_rate": 2.6743911843603134e-06, "loss": 0.057, "reward": 2.8921163082122803, "reward_std": 0.4228496849536896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.16711634397506714, "rewards/wrapped_format_reward": 0.75, "step": 848 }, { "completion_length": 500.0, "epoch": 169.8, "grad_norm": 0.6285993456840515, "kl": 1.027273178100586, "learning_rate": 2.6689499057886483e-06, "loss": 0.0411, "reward": 1.422483205795288, "reward_std": 3.62673020362854, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7025167942047119, "rewards/wrapped_format_reward": 0.625, "step": 849 }, { "completion_length": 500.0, "epoch": 170.0, "grad_norm": 0.5311692953109741, "kl": 0.724616527557373, "learning_rate": 2.663507823075358e-06, "loss": 0.029, "reward": 0.08719003200531006, "reward_std": 2.17438006401062, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.9128098487854004, "rewards/wrapped_format_reward": 1.0, "step": 850 }, { "completion_length": 500.0, "epoch": 170.2, "grad_norm": 0.6036850810050964, "kl": 0.88832688331604, "learning_rate": 2.6580649621228267e-06, "loss": 0.0355, "reward": 0.6683868169784546, "reward_std": 3.1441006660461426, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -1.2482798099517822, "rewards/wrapped_format_reward": 0.5, "step": 851 }, { "completion_length": 500.0, "epoch": 170.4, "grad_norm": 0.5897907614707947, "kl": 1.2000190019607544, "learning_rate": 2.6526213488371427e-06, "loss": 0.048, "reward": 2.9573283195495605, "reward_std": 0.03875737264752388, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08232828229665756, "rewards/wrapped_format_reward": 0.875, "step": 852 }, { "completion_length": 500.0, "epoch": 170.6, "grad_norm": 0.5143535137176514, "kl": 0.4949548840522766, "learning_rate": 2.6471770091279725e-06, "loss": 0.0198, "reward": 2.131885528564453, "reward_std": 0.730937123298645, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4931145906448364, "rewards/wrapped_format_reward": 0.625, "step": 853 }, { "completion_length": 393.0, "epoch": 170.8, "grad_norm": 0.5587767958641052, "kl": 1.4226547479629517, "learning_rate": 2.641731968908444e-06, "loss": 0.0569, "reward": 2.96309232711792, "reward_std": 0.2758857309818268, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.15059247612953186, "rewards/wrapped_format_reward": 0.875, "step": 854 }, { "completion_length": 500.0, "epoch": 171.0, "grad_norm": 0.49916645884513855, "kl": 0.8778588771820068, "learning_rate": 2.6362862540950163e-06, "loss": 0.0351, "reward": 3.481083631515503, "reward_std": 0.4503914415836334, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7310836315155029, "rewards/wrapped_format_reward": 0.75, "step": 855 }, { "completion_length": 500.0, "epoch": 171.2, "grad_norm": 0.5146768689155579, "kl": 0.9753352403640747, "learning_rate": 2.6308398906073603e-06, "loss": 0.039, "reward": 3.0682339668273926, "reward_std": 0.6612880825996399, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.31823405623435974, "rewards/wrapped_format_reward": 0.75, "step": 856 }, { "completion_length": 417.0, "epoch": 171.4, "grad_norm": 1.3383334875106812, "kl": 0.49587488174438477, "learning_rate": 2.6253929043682336e-06, "loss": 0.0198, "reward": 3.442657470703125, "reward_std": 0.5057659149169922, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.817657470703125, "rewards/wrapped_format_reward": 0.625, "step": 857 }, { "completion_length": 500.0, "epoch": 171.6, "grad_norm": 1.6063464879989624, "kl": 1.655110239982605, "learning_rate": 2.61994532130336e-06, "loss": 0.0662, "reward": 2.950542449951172, "reward_std": 0.4103209972381592, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.10679234564304352, "rewards/wrapped_format_reward": 0.875, "step": 858 }, { "completion_length": 500.0, "epoch": 171.8, "grad_norm": 0.5413594245910645, "kl": 0.9937121272087097, "learning_rate": 2.6144971673413023e-06, "loss": 0.0397, "reward": 2.7468223571777344, "reward_std": 0.6185001134872437, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.003177560865879059, "rewards/wrapped_format_reward": 0.75, "step": 859 }, { "completion_length": 405.0, "epoch": 172.0, "grad_norm": 0.5254268646240234, "kl": 0.7705658674240112, "learning_rate": 2.6090484684133406e-06, "loss": 0.0308, "reward": 1.0588258504867554, "reward_std": 2.706279993057251, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -1.4184470176696777, "rewards/wrapped_format_reward": 1.0, "step": 860 }, { "completion_length": 449.0, "epoch": 172.2, "grad_norm": 0.5308093428611755, "kl": 0.827573299407959, "learning_rate": 2.603599250453349e-06, "loss": 0.0331, "reward": 3.1276378631591797, "reward_std": 0.10545176267623901, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": 0.32406648993492126, "rewards/wrapped_format_reward": 0.875, "step": 861 }, { "completion_length": 452.0, "epoch": 172.4, "grad_norm": 0.6962711215019226, "kl": 0.7359398603439331, "learning_rate": 2.5981495393976718e-06, "loss": 0.0294, "reward": 2.441457748413086, "reward_std": 2.2943224906921387, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5585423111915588, "rewards/wrapped_format_reward": 1.0, "step": 862 }, { "completion_length": 500.0, "epoch": 172.6, "grad_norm": 0.6115143895149231, "kl": 1.3224482536315918, "learning_rate": 2.592699361185002e-06, "loss": 0.0529, "reward": 1.3186888694763184, "reward_std": 3.2175989151000977, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0563111305236816, "rewards/wrapped_format_reward": 0.875, "step": 863 }, { "completion_length": 421.0, "epoch": 172.8, "grad_norm": 0.648125410079956, "kl": 1.027504563331604, "learning_rate": 2.587248741756253e-06, "loss": 0.0411, "reward": -0.10881543159484863, "reward_std": 1.7823691368103027, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -3.1088154315948486, "rewards/wrapped_format_reward": 1.0, "step": 864 }, { "completion_length": 500.0, "epoch": 173.0, "grad_norm": 0.57094806432724, "kl": 1.3319047689437866, "learning_rate": 2.5817977070544408e-06, "loss": 0.0533, "reward": 3.846771717071533, "reward_std": 0.01095542497932911, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8467714786529541, "rewards/wrapped_format_reward": 1.0, "step": 865 }, { "completion_length": 329.0, "epoch": 173.2, "grad_norm": 0.808718740940094, "kl": 0.6123311519622803, "learning_rate": 2.5763462830245573e-06, "loss": 0.0245, "reward": 2.751035690307617, "reward_std": 0.5787281394004822, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.12396407127380371, "rewards/wrapped_format_reward": 0.875, "step": 866 }, { "completion_length": 385.0, "epoch": 173.4, "grad_norm": 0.6459723711013794, "kl": 0.8475527763366699, "learning_rate": 2.570894495613446e-06, "loss": 0.0339, "reward": 3.8003950119018555, "reward_std": 0.08562152832746506, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8003950119018555, "rewards/wrapped_format_reward": 1.0, "step": 867 }, { "completion_length": 500.0, "epoch": 173.6, "grad_norm": 0.600118100643158, "kl": 1.1404991149902344, "learning_rate": 2.5654423707696834e-06, "loss": 0.0456, "reward": 1.9638034105300903, "reward_std": 0.40437883138656616, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": -0.43619662523269653, "rewards/wrapped_format_reward": 0.5, "step": 868 }, { "completion_length": 491.0, "epoch": 173.8, "grad_norm": 0.4509470462799072, "kl": 0.9321443438529968, "learning_rate": 2.5599899344434478e-06, "loss": 0.0373, "reward": 3.2264249324798584, "reward_std": 0.32198405265808105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.3097583055496216, "rewards/wrapped_format_reward": 1.0, "step": 869 }, { "completion_length": 399.0, "epoch": 174.0, "grad_norm": 0.578890323638916, "kl": 1.2454216480255127, "learning_rate": 2.554537212586403e-06, "loss": 0.0498, "reward": 3.1222589015960693, "reward_std": 0.37537509202957153, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.37225890159606934, "rewards/wrapped_format_reward": 0.75, "step": 870 }, { "completion_length": 500.0, "epoch": 174.2, "grad_norm": 0.4751075804233551, "kl": 1.4109488725662231, "learning_rate": 2.5490842311515706e-06, "loss": 0.0564, "reward": 0.9386399388313293, "reward_std": 2.473573684692383, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9363600015640259, "rewards/wrapped_format_reward": 0.875, "step": 871 }, { "completion_length": 474.0, "epoch": 174.4, "grad_norm": 0.5804294943809509, "kl": 0.7709574103355408, "learning_rate": 2.543631016093209e-06, "loss": 0.0308, "reward": 3.2371604442596436, "reward_std": 0.5636550188064575, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4871605336666107, "rewards/wrapped_format_reward": 0.75, "step": 872 }, { "completion_length": 469.0, "epoch": 174.6, "grad_norm": 0.460746705532074, "kl": 1.7947441339492798, "learning_rate": 2.5381775933666865e-06, "loss": 0.0718, "reward": 2.5671794414520264, "reward_std": 0.5235273241996765, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1828204095363617, "rewards/wrapped_format_reward": 0.75, "step": 873 }, { "completion_length": 500.0, "epoch": 174.8, "grad_norm": 0.5847729444503784, "kl": 0.7998570799827576, "learning_rate": 2.5327239889283613e-06, "loss": 0.032, "reward": 0.6826847195625305, "reward_std": 3.0680971145629883, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.6923152208328247, "rewards/wrapped_format_reward": 0.875, "step": 874 }, { "completion_length": 418.0, "epoch": 175.0, "grad_norm": 0.5057737231254578, "kl": 0.4298799932003021, "learning_rate": 2.527270228735456e-06, "loss": 0.0172, "reward": 3.286527156829834, "reward_std": 0.6175230741500854, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5365269780158997, "rewards/wrapped_format_reward": 0.75, "step": 875 }, { "completion_length": 500.0, "epoch": 175.2, "grad_norm": 0.5232069492340088, "kl": 0.9115723371505737, "learning_rate": 2.521816338745935e-06, "loss": 0.0365, "reward": 1.682668924331665, "reward_std": 3.1222028732299805, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.786081075668335, "rewards/wrapped_format_reward": 1.0, "step": 876 }, { "completion_length": 500.0, "epoch": 175.4, "grad_norm": 0.4667075574398041, "kl": 1.3090119361877441, "learning_rate": 2.5163623449183797e-06, "loss": 0.0524, "reward": 3.2737345695495605, "reward_std": 0.5244059562683105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5237348079681396, "rewards/wrapped_format_reward": 0.75, "step": 877 }, { "completion_length": 500.0, "epoch": 175.6, "grad_norm": 0.4833585023880005, "kl": 1.0329177379608154, "learning_rate": 2.510908273211867e-06, "loss": 0.0413, "reward": 1.6408182382583618, "reward_std": 1.9210997819900513, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -1.0779317617416382, "rewards/wrapped_format_reward": 0.75, "step": 878 }, { "completion_length": 500.0, "epoch": 175.8, "grad_norm": 0.5182239413261414, "kl": 0.934988796710968, "learning_rate": 2.5054541495858427e-06, "loss": 0.0374, "reward": 0.9360474944114685, "reward_std": 1.3490967750549316, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9564394354820251, "rewards/wrapped_driving_reward": -1.7703919410705566, "rewards/wrapped_format_reward": 0.75, "step": 879 }, { "completion_length": 475.0, "epoch": 176.0, "grad_norm": 0.5969507694244385, "kl": 1.24098801612854, "learning_rate": 2.5e-06, "loss": 0.0496, "reward": 1.8240338563919067, "reward_std": 3.2637932300567627, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4259660840034485, "rewards/wrapped_format_reward": 0.75, "step": 880 }, { "completion_length": 500.0, "epoch": 176.2, "grad_norm": 0.537401020526886, "kl": 0.6495235562324524, "learning_rate": 2.494545850414158e-06, "loss": 0.026, "reward": 1.929699420928955, "reward_std": 1.9812421798706055, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6953005790710449, "rewards/wrapped_format_reward": 0.625, "step": 881 }, { "completion_length": 386.0, "epoch": 176.4, "grad_norm": 0.5999033451080322, "kl": 1.48947274684906, "learning_rate": 2.489091726788134e-06, "loss": 0.0596, "reward": 1.8825178146362305, "reward_std": 1.9242823123931885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1174821853637695, "rewards/wrapped_format_reward": 1.0, "step": 882 }, { "completion_length": 428.0, "epoch": 176.6, "grad_norm": 0.5353307723999023, "kl": 0.5975685715675354, "learning_rate": 2.4836376550816207e-06, "loss": 0.0239, "reward": 0.9004546403884888, "reward_std": 2.195589065551758, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -2.074545383453369, "rewards/wrapped_format_reward": 1.0, "step": 883 }, { "completion_length": 500.0, "epoch": 176.8, "grad_norm": 0.6042259335517883, "kl": 1.6270625591278076, "learning_rate": 2.4781836612540656e-06, "loss": 0.0651, "reward": 1.2540318965911865, "reward_std": 3.1778993606567383, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.9959681034088135, "rewards/wrapped_format_reward": 0.875, "step": 884 }, { "completion_length": 429.0, "epoch": 177.0, "grad_norm": 0.5502638816833496, "kl": 1.7028800249099731, "learning_rate": 2.4727297712645446e-06, "loss": 0.0681, "reward": 3.253913640975952, "reward_std": 0.4737393260002136, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.37891364097595215, "rewards/wrapped_format_reward": 0.875, "step": 885 }, { "completion_length": 419.0, "epoch": 177.2, "grad_norm": 0.5808356404304504, "kl": 0.5641009211540222, "learning_rate": 2.4672760110716395e-06, "loss": 0.0226, "reward": 2.7367911338806152, "reward_std": 0.09191377460956573, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -0.21320880949497223, "rewards/wrapped_format_reward": 1.0, "step": 886 }, { "completion_length": 436.0, "epoch": 177.4, "grad_norm": 0.5215104222297668, "kl": 1.0268008708953857, "learning_rate": 2.4618224066333143e-06, "loss": 0.0411, "reward": 3.6914260387420654, "reward_std": 0.2449469417333603, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8164260387420654, "rewards/wrapped_format_reward": 0.875, "step": 887 }, { "completion_length": 500.0, "epoch": 177.6, "grad_norm": 0.5653296113014221, "kl": 1.398972511291504, "learning_rate": 2.4563689839067913e-06, "loss": 0.056, "reward": 2.985867738723755, "reward_std": 0.318721204996109, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.23586782813072205, "rewards/wrapped_format_reward": 0.75, "step": 888 }, { "completion_length": 500.0, "epoch": 177.8, "grad_norm": 0.6126723289489746, "kl": 1.4683130979537964, "learning_rate": 2.45091576884843e-06, "loss": 0.0587, "reward": 3.5237417221069336, "reward_std": 0.20529741048812866, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9318181872367859, "rewards/wrapped_driving_reward": 0.7169235348701477, "rewards/wrapped_format_reward": 0.875, "step": 889 }, { "completion_length": 500.0, "epoch": 178.0, "grad_norm": 0.6861754655838013, "kl": 0.6537904739379883, "learning_rate": 2.4454627874135976e-06, "loss": 0.0262, "reward": -2.0917086601257324, "reward_std": 1.6465706825256348, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.8417086601257324, "rewards/wrapped_format_reward": 0.75, "step": 890 }, { "completion_length": 500.0, "epoch": 178.2, "grad_norm": 0.5680333971977234, "kl": 1.307004451751709, "learning_rate": 2.4400100655565535e-06, "loss": 0.0523, "reward": 1.553303599357605, "reward_std": 3.3759968280792236, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.821696400642395, "rewards/wrapped_format_reward": 0.875, "step": 891 }, { "completion_length": 500.0, "epoch": 178.4, "grad_norm": 0.5444093346595764, "kl": 0.9408724308013916, "learning_rate": 2.434557629230318e-06, "loss": 0.0376, "reward": 2.4606845378875732, "reward_std": 0.2496892362833023, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.16431555151939392, "rewards/wrapped_format_reward": 0.625, "step": 892 }, { "completion_length": 332.0, "epoch": 178.6, "grad_norm": 1.0570348501205444, "kl": 1.3375152349472046, "learning_rate": 2.4291055043865547e-06, "loss": 0.0535, "reward": 3.1871886253356934, "reward_std": 0.2547098398208618, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18718869984149933, "rewards/wrapped_format_reward": 1.0, "step": 893 }, { "completion_length": 500.0, "epoch": 178.8, "grad_norm": 0.6612321734428406, "kl": 0.9976045489311218, "learning_rate": 2.423653716975444e-06, "loss": 0.0399, "reward": -1.1944348812103271, "reward_std": 2.345881700515747, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.069434881210327, "rewards/wrapped_format_reward": 0.375, "step": 894 }, { "completion_length": 500.0, "epoch": 179.0, "grad_norm": 0.5748969316482544, "kl": 1.190211296081543, "learning_rate": 2.41820229294556e-06, "loss": 0.0476, "reward": 3.7105839252471924, "reward_std": 0.25292056798934937, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.8355838656425476, "rewards/wrapped_format_reward": 1.0, "step": 895 }, { "completion_length": 500.0, "epoch": 179.2, "grad_norm": 0.44747933745384216, "kl": 1.1943285465240479, "learning_rate": 2.4127512582437486e-06, "loss": 0.0478, "reward": 3.0833992958068848, "reward_std": 0.8473286628723145, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.45839935541152954, "rewards/wrapped_format_reward": 0.625, "step": 896 }, { "completion_length": 485.0, "epoch": 179.4, "grad_norm": 0.48663514852523804, "kl": 1.2799426317214966, "learning_rate": 2.4073006388149992e-06, "loss": 0.0512, "reward": 3.4635372161865234, "reward_std": 0.1930120885372162, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.4992515742778778, "rewards/wrapped_format_reward": 1.0, "step": 897 }, { "completion_length": 500.0, "epoch": 179.6, "grad_norm": 0.5305557250976562, "kl": 1.188493013381958, "learning_rate": 2.4018504606023295e-06, "loss": 0.0475, "reward": -1.625, "reward_std": 1.25, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 898 }, { "completion_length": 343.0, "epoch": 179.8, "grad_norm": 0.5853403210639954, "kl": 0.7435815334320068, "learning_rate": 2.3964007495466523e-06, "loss": 0.0297, "reward": 2.5528595447540283, "reward_std": 0.24696195125579834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4471404552459717, "rewards/wrapped_format_reward": 1.0, "step": 899 }, { "completion_length": 500.0, "epoch": 180.0, "grad_norm": 0.5050544738769531, "kl": 1.337159276008606, "learning_rate": 2.3909515315866606e-06, "loss": 0.0535, "reward": 2.4426145553588867, "reward_std": 0.22859933972358704, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -0.3823854923248291, "rewards/wrapped_format_reward": 0.875, "step": 900 }, { "completion_length": 500.0, "epoch": 180.2, "grad_norm": 0.5133768320083618, "kl": 0.31792768836021423, "learning_rate": 2.385502832658699e-06, "loss": 0.0127, "reward": 3.5273141860961914, "reward_std": 0.3408697247505188, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7773142457008362, "rewards/wrapped_format_reward": 0.75, "step": 901 }, { "completion_length": 500.0, "epoch": 180.4, "grad_norm": 0.7845637202262878, "kl": 0.5812042355537415, "learning_rate": 2.380054678696641e-06, "loss": 0.0232, "reward": 3.3489980697631836, "reward_std": 0.296768456697464, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4739980399608612, "rewards/wrapped_format_reward": 0.875, "step": 902 }, { "completion_length": 464.0, "epoch": 180.6, "grad_norm": 0.05685199797153473, "kl": 0.641217827796936, "learning_rate": 2.3746070956317664e-06, "loss": 0.0256, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 903 }, { "completion_length": 500.0, "epoch": 180.8, "grad_norm": 0.611503005027771, "kl": 1.1354436874389648, "learning_rate": 2.3691601093926406e-06, "loss": 0.0454, "reward": 1.4279756546020508, "reward_std": 3.2888336181640625, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9470243453979492, "rewards/wrapped_format_reward": 0.875, "step": 904 }, { "completion_length": 410.0, "epoch": 181.0, "grad_norm": 0.5386159420013428, "kl": 0.9454559683799744, "learning_rate": 2.363713745904984e-06, "loss": 0.0378, "reward": 2.3741612434387207, "reward_std": 0.2962803542613983, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.3758388161659241, "rewards/wrapped_format_reward": 0.875, "step": 905 }, { "completion_length": 500.0, "epoch": 181.2, "grad_norm": 0.5813198089599609, "kl": 0.8604588508605957, "learning_rate": 2.358268031091556e-06, "loss": 0.0344, "reward": 3.2131857872009277, "reward_std": 0.4338037073612213, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4631856083869934, "rewards/wrapped_format_reward": 0.75, "step": 906 }, { "completion_length": 500.0, "epoch": 181.4, "grad_norm": 0.565249502658844, "kl": 1.1992684602737427, "learning_rate": 2.3528229908720275e-06, "loss": 0.048, "reward": 2.822211980819702, "reward_std": 0.4615172743797302, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -0.12778803706169128, "rewards/wrapped_format_reward": 1.0, "step": 907 }, { "completion_length": 500.0, "epoch": 181.6, "grad_norm": 0.4963197112083435, "kl": 1.3221276998519897, "learning_rate": 2.3473786511628577e-06, "loss": 0.0529, "reward": 2.4108781814575195, "reward_std": 0.7984676361083984, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21412202715873718, "rewards/wrapped_format_reward": 0.625, "step": 908 }, { "completion_length": 500.0, "epoch": 181.8, "grad_norm": 0.5683193802833557, "kl": 1.0979485511779785, "learning_rate": 2.3419350378771737e-06, "loss": 0.0439, "reward": 3.236751079559326, "reward_std": 0.17679031193256378, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.23675096035003662, "rewards/wrapped_format_reward": 1.0, "step": 909 }, { "completion_length": 488.0, "epoch": 182.0, "grad_norm": 0.6277251243591309, "kl": 1.2162151336669922, "learning_rate": 2.3364921769246423e-06, "loss": 0.0486, "reward": 3.5269453525543213, "reward_std": 0.3045935332775116, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8678977489471436, "rewards/wrapped_driving_reward": 0.784047544002533, "rewards/wrapped_format_reward": 0.875, "step": 910 }, { "completion_length": 467.0, "epoch": 182.2, "grad_norm": 0.8996431827545166, "kl": 1.1654489040374756, "learning_rate": 2.3310500942113525e-06, "loss": 0.0466, "reward": 2.1091933250427246, "reward_std": 3.406883955001831, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7222222089767456, "rewards/wrapped_driving_reward": -0.36302876472473145, "rewards/wrapped_format_reward": 1.0, "step": 911 }, { "completion_length": 500.0, "epoch": 182.4, "grad_norm": 0.458644837141037, "kl": 1.0396286249160767, "learning_rate": 2.325608815639687e-06, "loss": 0.0416, "reward": 1.601841926574707, "reward_std": 1.9085595607757568, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9431818127632141, "rewards/wrapped_driving_reward": -1.0913399457931519, "rewards/wrapped_format_reward": 0.75, "step": 912 }, { "completion_length": 448.0, "epoch": 182.6, "grad_norm": 0.4919649064540863, "kl": 0.8752197027206421, "learning_rate": 2.3201683671082016e-06, "loss": 0.035, "reward": 0.7716760635375977, "reward_std": 2.0968692302703857, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.2283239364624023, "rewards/wrapped_format_reward": 1.0, "step": 913 }, { "completion_length": 500.0, "epoch": 182.8, "grad_norm": 0.6369486451148987, "kl": 1.6084184646606445, "learning_rate": 2.314728774511502e-06, "loss": 0.0643, "reward": 3.28770112991333, "reward_std": 0.470788836479187, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5377011299133301, "rewards/wrapped_format_reward": 0.75, "step": 914 }, { "completion_length": 500.0, "epoch": 183.0, "grad_norm": 0.6774345636367798, "kl": 0.25260379910469055, "learning_rate": 2.3092900637401193e-06, "loss": 0.0101, "reward": 0.5662202835083008, "reward_std": 2.430043935775757, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.058779716491699, "rewards/wrapped_format_reward": 0.625, "step": 915 }, { "completion_length": 500.0, "epoch": 183.2, "grad_norm": 0.56723952293396, "kl": 0.6391198039054871, "learning_rate": 2.3038522606803882e-06, "loss": 0.0256, "reward": 1.4860867261886597, "reward_std": 2.0119857788085938, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -1.2014132738113403, "rewards/wrapped_format_reward": 0.75, "step": 916 }, { "completion_length": 500.0, "epoch": 183.4, "grad_norm": 0.5726743340492249, "kl": 0.4170107841491699, "learning_rate": 2.298415391214321e-06, "loss": 0.0167, "reward": 2.559278964996338, "reward_std": 0.6913965940475464, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.19072100520133972, "rewards/wrapped_format_reward": 0.75, "step": 917 }, { "completion_length": 500.0, "epoch": 183.6, "grad_norm": 0.563683032989502, "kl": 0.6422572135925293, "learning_rate": 2.29297948121949e-06, "loss": 0.0257, "reward": 3.118239641189575, "reward_std": 0.38170966506004333, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9142857193946838, "rewards/wrapped_driving_reward": 0.5789539217948914, "rewards/wrapped_format_reward": 0.625, "step": 918 }, { "completion_length": 446.0, "epoch": 183.8, "grad_norm": 0.49984797835350037, "kl": 0.9756030440330505, "learning_rate": 2.287544556568896e-06, "loss": 0.039, "reward": 3.495962381362915, "reward_std": 0.41120827198028564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.620962381362915, "rewards/wrapped_format_reward": 0.875, "step": 919 }, { "completion_length": 437.0, "epoch": 184.0, "grad_norm": 0.5067118406295776, "kl": 1.6291453838348389, "learning_rate": 2.2821106431308546e-06, "loss": 0.0652, "reward": 2.8344063758850098, "reward_std": 0.3185327351093292, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1655937284231186, "rewards/wrapped_format_reward": 1.0, "step": 920 }, { "completion_length": 492.0, "epoch": 184.2, "grad_norm": 0.5153319239616394, "kl": 0.6589206457138062, "learning_rate": 2.276677766768866e-06, "loss": 0.0264, "reward": 2.9638633728027344, "reward_std": 0.556221067905426, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.03613676130771637, "rewards/wrapped_format_reward": 1.0, "step": 921 }, { "completion_length": 500.0, "epoch": 184.4, "grad_norm": 0.49304455518722534, "kl": 0.7782519459724426, "learning_rate": 2.271245953341494e-06, "loss": 0.0311, "reward": -0.006941735744476318, "reward_std": 1.9861165285110474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -3.006941795349121, "rewards/wrapped_format_reward": 1.0, "step": 922 }, { "completion_length": 500.0, "epoch": 184.6, "grad_norm": 0.4311465919017792, "kl": 1.5542595386505127, "learning_rate": 2.265815228702245e-06, "loss": 0.0622, "reward": 2.506084442138672, "reward_std": 2.3388657569885254, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.49391549825668335, "rewards/wrapped_format_reward": 1.0, "step": 923 }, { "completion_length": 456.0, "epoch": 184.8, "grad_norm": 0.5129895806312561, "kl": 1.0902389287948608, "learning_rate": 2.26038561869944e-06, "loss": 0.0436, "reward": 3.8182454109191895, "reward_std": 0.009648384526371956, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8182453513145447, "rewards/wrapped_format_reward": 1.0, "step": 924 }, { "completion_length": 500.0, "epoch": 185.0, "grad_norm": 0.4159168004989624, "kl": 1.704729437828064, "learning_rate": 2.2549571491760985e-06, "loss": 0.0682, "reward": 0.9104196429252625, "reward_std": 2.661571502685547, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3395801782608032, "rewards/wrapped_format_reward": 0.75, "step": 925 }, { "completion_length": 500.0, "epoch": 185.2, "grad_norm": 0.49064022302627563, "kl": 1.2203823328018188, "learning_rate": 2.24952984596981e-06, "loss": 0.0488, "reward": 2.0226497650146484, "reward_std": 3.357693910598755, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3523501753807068, "rewards/wrapped_format_reward": 0.875, "step": 926 }, { "completion_length": 500.0, "epoch": 185.4, "grad_norm": 0.4919830858707428, "kl": 0.5248955488204956, "learning_rate": 2.2441037349126107e-06, "loss": 0.021, "reward": 0.7122844457626343, "reward_std": 2.855849266052246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.4127154350280762, "rewards/wrapped_format_reward": 0.625, "step": 927 }, { "completion_length": 500.0, "epoch": 185.6, "grad_norm": 0.49590083956718445, "kl": 1.2909214496612549, "learning_rate": 2.238678841830867e-06, "loss": 0.0516, "reward": 2.668900489807129, "reward_std": 0.48628270626068115, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.12276604771614075, "rewards/wrapped_format_reward": 0.875, "step": 928 }, { "completion_length": 500.0, "epoch": 185.8, "grad_norm": 0.601020872592926, "kl": 0.44025442004203796, "learning_rate": 2.2332551925451436e-06, "loss": 0.0176, "reward": 1.2071342468261719, "reward_std": 2.818470001220703, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.734375, "rewards/wrapped_driving_reward": -1.1522407531738281, "rewards/wrapped_format_reward": 0.875, "step": 929 }, { "completion_length": 415.0, "epoch": 186.0, "grad_norm": 0.6828198432922363, "kl": 0.971450924873352, "learning_rate": 2.2278328128700893e-06, "loss": 0.0389, "reward": 3.055501699447632, "reward_std": 0.4391587972640991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.180501788854599, "rewards/wrapped_format_reward": 0.875, "step": 930 }, { "completion_length": 500.0, "epoch": 186.2, "grad_norm": 0.5190205574035645, "kl": 0.7996862530708313, "learning_rate": 2.2224117286143063e-06, "loss": 0.032, "reward": 0.7720038890838623, "reward_std": 2.37174391746521, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9779961109161377, "rewards/wrapped_format_reward": 0.75, "step": 931 }, { "completion_length": 407.0, "epoch": 186.4, "grad_norm": 1.0009135007858276, "kl": 1.1303850412368774, "learning_rate": 2.2169919655802338e-06, "loss": 0.0452, "reward": 3.2563962936401367, "reward_std": 0.2707050144672394, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.5063963532447815, "rewards/wrapped_format_reward": 0.875, "step": 932 }, { "completion_length": 500.0, "epoch": 186.6, "grad_norm": 0.5876384973526001, "kl": 1.2698439359664917, "learning_rate": 2.2115735495640212e-06, "loss": 0.0508, "reward": 3.4274063110351562, "reward_std": 0.5421848297119141, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.552406370639801, "rewards/wrapped_format_reward": 0.875, "step": 933 }, { "completion_length": 500.0, "epoch": 186.8, "grad_norm": 0.433315247297287, "kl": 1.0054932832717896, "learning_rate": 2.2061565063554063e-06, "loss": 0.0402, "reward": 1.0803213119506836, "reward_std": 3.504443883895874, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.169678807258606, "rewards/wrapped_format_reward": 0.75, "step": 934 }, { "completion_length": 496.0, "epoch": 187.0, "grad_norm": 0.5682697892189026, "kl": 1.0767149925231934, "learning_rate": 2.2007408617375944e-06, "loss": 0.0431, "reward": 1.842423439025879, "reward_std": 3.241006374359131, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6575766801834106, "rewards/wrapped_format_reward": 1.0, "step": 935 }, { "completion_length": 463.0, "epoch": 187.2, "grad_norm": 0.5375118255615234, "kl": 1.372418999671936, "learning_rate": 2.195326641487132e-06, "loss": 0.0549, "reward": 3.8171489238739014, "reward_std": 0.022597435861825943, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9886363744735718, "rewards/wrapped_driving_reward": 0.8285125494003296, "rewards/wrapped_format_reward": 1.0, "step": 936 }, { "completion_length": 500.0, "epoch": 187.4, "grad_norm": 0.7156015634536743, "kl": 1.1508220434188843, "learning_rate": 2.1899138713737876e-06, "loss": 0.046, "reward": 1.4790055751800537, "reward_std": 3.656684637069702, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7709944844245911, "rewards/wrapped_format_reward": 0.75, "step": 937 }, { "completion_length": 480.0, "epoch": 187.6, "grad_norm": 0.498759925365448, "kl": 1.381080150604248, "learning_rate": 2.1845025771604263e-06, "loss": 0.0552, "reward": 2.6462209224700928, "reward_std": 0.09563220292329788, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.3121124505996704, "rewards/wrapped_format_reward": 1.0, "step": 938 }, { "completion_length": 500.0, "epoch": 187.8, "grad_norm": 0.4997135102748871, "kl": 1.664068579673767, "learning_rate": 2.1790927846028894e-06, "loss": 0.0666, "reward": 3.414687156677246, "reward_std": 0.5244473218917847, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.7271872162818909, "rewards/wrapped_format_reward": 0.75, "step": 939 }, { "completion_length": 410.0, "epoch": 188.0, "grad_norm": 0.5040246248245239, "kl": 0.9109085202217102, "learning_rate": 2.173684519449872e-06, "loss": 0.0364, "reward": 1.0221105813980103, "reward_std": 1.975825309753418, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8528895378112793, "rewards/wrapped_format_reward": 0.875, "step": 940 }, { "completion_length": 496.0, "epoch": 188.2, "grad_norm": 0.478110134601593, "kl": 0.8791089057922363, "learning_rate": 2.1682778074427955e-06, "loss": 0.0352, "reward": 2.912130832672119, "reward_std": 0.4089687168598175, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.08786918222904205, "rewards/wrapped_format_reward": 1.0, "step": 941 }, { "completion_length": 486.0, "epoch": 188.4, "grad_norm": 0.5197477340698242, "kl": 0.9789789915084839, "learning_rate": 2.1628726743156933e-06, "loss": 0.0392, "reward": 3.500166893005371, "reward_std": 0.3842155933380127, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5001667737960815, "rewards/wrapped_format_reward": 1.0, "step": 942 }, { "completion_length": 464.0, "epoch": 188.6, "grad_norm": 0.47153791785240173, "kl": 0.7001530528068542, "learning_rate": 2.1574691457950805e-06, "loss": 0.028, "reward": 0.8245824575424194, "reward_std": 2.4226925373077393, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.05041766166687, "rewards/wrapped_format_reward": 0.875, "step": 943 }, { "completion_length": 338.0, "epoch": 188.8, "grad_norm": 0.910094678401947, "kl": 1.3480033874511719, "learning_rate": 2.1520672475998374e-06, "loss": 0.0539, "reward": 3.2739739418029785, "reward_std": 0.31017762422561646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.27397388219833374, "rewards/wrapped_format_reward": 1.0, "step": 944 }, { "completion_length": 500.0, "epoch": 189.0, "grad_norm": 0.49829399585723877, "kl": 1.7790124416351318, "learning_rate": 2.146667005441082e-06, "loss": 0.0712, "reward": 2.5157852172851562, "reward_std": 0.26337930560112, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.35921457409858704, "rewards/wrapped_format_reward": 0.875, "step": 945 }, { "completion_length": 459.0, "epoch": 189.2, "grad_norm": 0.5464431047439575, "kl": 0.9494650959968567, "learning_rate": 2.1412684450220524e-06, "loss": 0.038, "reward": 3.2252697944641113, "reward_std": 0.448541522026062, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.39193639159202576, "rewards/wrapped_format_reward": 0.875, "step": 946 }, { "completion_length": 500.0, "epoch": 189.4, "grad_norm": 0.49399006366729736, "kl": 1.0156264305114746, "learning_rate": 2.1358715920379814e-06, "loss": 0.0406, "reward": 2.7399253845214844, "reward_std": 0.6259793639183044, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.010074526071548462, "rewards/wrapped_format_reward": 0.75, "step": 947 }, { "completion_length": 500.0, "epoch": 189.6, "grad_norm": 0.5094689130783081, "kl": 1.19485604763031, "learning_rate": 2.1304764721759736e-06, "loss": 0.0478, "reward": 1.7101205587387085, "reward_std": 3.811704158782959, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5398794412612915, "rewards/wrapped_format_reward": 0.75, "step": 948 }, { "completion_length": 500.0, "epoch": 189.8, "grad_norm": 0.5310368537902832, "kl": 0.5689219832420349, "learning_rate": 2.1250831111148873e-06, "loss": 0.0228, "reward": -0.20518773794174194, "reward_std": 4.116705417633057, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8301877975463867, "rewards/wrapped_format_reward": 0.625, "step": 949 }, { "completion_length": 459.0, "epoch": 190.0, "grad_norm": 0.46584364771842957, "kl": 1.1638586521148682, "learning_rate": 2.1196915345252085e-06, "loss": 0.0466, "reward": 2.8259994983673096, "reward_std": 0.048405181616544724, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17400041222572327, "rewards/wrapped_format_reward": 1.0, "step": 950 }, { "completion_length": 500.0, "epoch": 190.2, "grad_norm": 0.561079740524292, "kl": 1.2665772438049316, "learning_rate": 2.114301768068928e-06, "loss": 0.0507, "reward": 0.7488883137702942, "reward_std": 3.192291021347046, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1261117458343506, "rewards/wrapped_format_reward": 0.375, "step": 951 }, { "completion_length": 489.0, "epoch": 190.4, "grad_norm": 0.5475406646728516, "kl": 0.964567244052887, "learning_rate": 2.1089138373994226e-06, "loss": 0.0386, "reward": 3.7516729831695557, "reward_std": 0.17056289315223694, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.8350062370300293, "rewards/wrapped_format_reward": 1.0, "step": 952 }, { "completion_length": 500.0, "epoch": 190.6, "grad_norm": 0.44733574986457825, "kl": 0.8066545128822327, "learning_rate": 2.1035277681613325e-06, "loss": 0.0323, "reward": -0.9628837704658508, "reward_std": 2.6176187992095947, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.087883949279785, "rewards/wrapped_format_reward": 0.625, "step": 953 }, { "completion_length": 500.0, "epoch": 190.8, "grad_norm": 0.6097102165222168, "kl": 0.7529609203338623, "learning_rate": 2.0981435859904347e-06, "loss": 0.0301, "reward": 2.6820497512817383, "reward_std": 0.7259109020233154, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18204964697360992, "rewards/wrapped_format_reward": 0.5, "step": 954 }, { "completion_length": 500.0, "epoch": 191.0, "grad_norm": 0.48875048756599426, "kl": 0.9087252616882324, "learning_rate": 2.0927613165135285e-06, "loss": 0.0363, "reward": 2.9861044883728027, "reward_std": 0.2680176794528961, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.11110442876815796, "rewards/wrapped_format_reward": 0.875, "step": 955 }, { "completion_length": 500.0, "epoch": 191.2, "grad_norm": 0.6656526327133179, "kl": 1.1753857135772705, "learning_rate": 2.087380985348306e-06, "loss": 0.047, "reward": 2.8272321224212646, "reward_std": 0.5713879466056824, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.04776783287525177, "rewards/wrapped_format_reward": 0.875, "step": 956 }, { "completion_length": 480.0, "epoch": 191.4, "grad_norm": 0.5438945293426514, "kl": 0.9849552512168884, "learning_rate": 2.0820026181032356e-06, "loss": 0.0394, "reward": 3.3579394817352295, "reward_std": 0.3115105628967285, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.35793960094451904, "rewards/wrapped_format_reward": 1.0, "step": 957 }, { "completion_length": 500.0, "epoch": 191.6, "grad_norm": 0.4994749128818512, "kl": 1.893036961555481, "learning_rate": 2.0766262403774388e-06, "loss": 0.0757, "reward": 3.585599899291992, "reward_std": 0.2987547218799591, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.835599958896637, "rewards/wrapped_format_reward": 0.75, "step": 958 }, { "completion_length": 500.0, "epoch": 191.8, "grad_norm": 0.44899803400039673, "kl": 1.1986360549926758, "learning_rate": 2.0712518777605646e-06, "loss": 0.0479, "reward": 2.7396349906921387, "reward_std": 0.25676774978637695, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2603647708892822, "rewards/wrapped_format_reward": 1.0, "step": 959 }, { "completion_length": 500.0, "epoch": 192.0, "grad_norm": 0.7886030673980713, "kl": 1.005682349205017, "learning_rate": 2.0658795558326745e-06, "loss": 0.0402, "reward": 2.3123788833618164, "reward_std": 0.9716589450836182, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.18762129545211792, "rewards/wrapped_format_reward": 0.5, "step": 960 }, { "completion_length": 500.0, "epoch": 192.2, "grad_norm": 0.5696879029273987, "kl": 1.0842654705047607, "learning_rate": 2.0605093001641138e-06, "loss": 0.0434, "reward": -0.4991178512573242, "reward_std": 3.465120553970337, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.249117851257324, "rewards/wrapped_format_reward": 0.75, "step": 961 }, { "completion_length": 483.0, "epoch": 192.4, "grad_norm": 0.6350774168968201, "kl": 1.3506046533584595, "learning_rate": 2.0551411363153963e-06, "loss": 0.054, "reward": 3.539217948913574, "reward_std": 0.3723929822444916, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5392177700996399, "rewards/wrapped_format_reward": 1.0, "step": 962 }, { "completion_length": 500.0, "epoch": 192.6, "grad_norm": 0.49636659026145935, "kl": 1.1797510385513306, "learning_rate": 2.0497750898370757e-06, "loss": 0.0472, "reward": 2.146454095840454, "reward_std": 0.8986145257949829, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.5618792176246643, "rewards/wrapped_format_reward": 0.75, "step": 963 }, { "completion_length": 418.0, "epoch": 192.8, "grad_norm": 0.7261518836021423, "kl": 0.6447563171386719, "learning_rate": 2.0444111862696313e-06, "loss": 0.0258, "reward": 3.6410467624664307, "reward_std": 0.2871745228767395, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7660467624664307, "rewards/wrapped_format_reward": 0.875, "step": 964 }, { "completion_length": 500.0, "epoch": 193.0, "grad_norm": 0.6479581594467163, "kl": 0.9154292941093445, "learning_rate": 2.039049451143342e-06, "loss": 0.0366, "reward": 2.706965923309326, "reward_std": 0.5069039463996887, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454382896423, "rewards/wrapped_driving_reward": -0.12257963418960571, "rewards/wrapped_format_reward": 0.875, "step": 965 }, { "completion_length": 500.0, "epoch": 193.2, "grad_norm": 0.5651826858520508, "kl": 1.1386218070983887, "learning_rate": 2.0336899099781636e-06, "loss": 0.0455, "reward": 2.9072165489196777, "reward_std": 0.7617118954658508, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.15721642971038818, "rewards/wrapped_format_reward": 0.75, "step": 966 }, { "completion_length": 500.0, "epoch": 193.4, "grad_norm": 0.4863123893737793, "kl": 0.2626260221004486, "learning_rate": 2.0283325882836126e-06, "loss": 0.0105, "reward": 1.4328651428222656, "reward_std": 3.309224843978882, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9421347975730896, "rewards/wrapped_format_reward": 0.875, "step": 967 }, { "completion_length": 500.0, "epoch": 193.6, "grad_norm": 0.4705544114112854, "kl": 0.5639627575874329, "learning_rate": 2.022977511558638e-06, "loss": 0.0226, "reward": 3.455763816833496, "reward_std": 0.25063416361808777, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.8307638764381409, "rewards/wrapped_format_reward": 0.75, "step": 968 }, { "completion_length": 500.0, "epoch": 193.8, "grad_norm": 0.460371196269989, "kl": 1.470854640007019, "learning_rate": 2.0176247052915078e-06, "loss": 0.0588, "reward": 2.5401976108551025, "reward_std": 0.22898149490356445, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8854166865348816, "rewards/wrapped_driving_reward": -0.2202192097902298, "rewards/wrapped_format_reward": 0.875, "step": 969 }, { "completion_length": 500.0, "epoch": 194.0, "grad_norm": 0.49431416392326355, "kl": 1.0209243297576904, "learning_rate": 2.01227419495968e-06, "loss": 0.0408, "reward": 1.3964576721191406, "reward_std": 3.288207530975342, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6726190447807312, "rewards/wrapped_driving_reward": -0.9011614322662354, "rewards/wrapped_format_reward": 0.875, "step": 970 }, { "completion_length": 500.0, "epoch": 194.2, "grad_norm": 0.7239106893539429, "kl": 0.4642423689365387, "learning_rate": 2.0069260060296854e-06, "loss": 0.0186, "reward": 3.4259514808654785, "reward_std": 0.2672251760959625, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9318181872367859, "rewards/wrapped_driving_reward": 0.49413320422172546, "rewards/wrapped_format_reward": 1.0, "step": 971 }, { "completion_length": 446.0, "epoch": 194.4, "grad_norm": 0.5294839143753052, "kl": 0.9924250245094299, "learning_rate": 2.0015801639570076e-06, "loss": 0.0397, "reward": 1.7371912002563477, "reward_std": 1.832466959953308, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9107142686843872, "rewards/wrapped_driving_reward": -1.04852294921875, "rewards/wrapped_format_reward": 0.875, "step": 972 }, { "completion_length": 500.0, "epoch": 194.6, "grad_norm": 0.5141122341156006, "kl": 1.1143356561660767, "learning_rate": 1.996236694185957e-06, "loss": 0.0446, "reward": 1.409009575843811, "reward_std": 3.6155614852905273, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7291666865348816, "rewards/wrapped_driving_reward": -0.8201570510864258, "rewards/wrapped_format_reward": 0.75, "step": 973 }, { "completion_length": 500.0, "epoch": 194.8, "grad_norm": 0.5137813687324524, "kl": 0.6774077415466309, "learning_rate": 1.9908956221495567e-06, "loss": 0.0271, "reward": 3.294393301010132, "reward_std": 0.5990864634513855, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.3256433606147766, "rewards/wrapped_format_reward": 1.0, "step": 974 }, { "completion_length": 500.0, "epoch": 195.0, "grad_norm": 0.6752596497535706, "kl": 0.8337000012397766, "learning_rate": 1.985556973269413e-06, "loss": 0.0333, "reward": 1.436514139175415, "reward_std": 3.2987518310546875, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9384859800338745, "rewards/wrapped_format_reward": 0.875, "step": 975 }, { "completion_length": 500.0, "epoch": 195.2, "grad_norm": 0.6689338088035583, "kl": 0.6801450848579407, "learning_rate": 1.9802207729556023e-06, "loss": 0.0272, "reward": 3.2011795043945312, "reward_std": 0.6917397975921631, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5761793851852417, "rewards/wrapped_format_reward": 0.625, "step": 976 }, { "completion_length": 500.0, "epoch": 195.4, "grad_norm": 0.560636579990387, "kl": 0.8436287045478821, "learning_rate": 1.9748870466065444e-06, "loss": 0.0337, "reward": -0.6823706030845642, "reward_std": 2.8480265140533447, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.932370662689209, "rewards/wrapped_format_reward": 0.75, "step": 977 }, { "completion_length": 406.0, "epoch": 195.6, "grad_norm": 0.5505325794219971, "kl": 0.6946606040000916, "learning_rate": 1.9695558196088846e-06, "loss": 0.0278, "reward": 3.748781204223633, "reward_std": 0.14656925201416016, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7487812042236328, "rewards/wrapped_format_reward": 1.0, "step": 978 }, { "completion_length": 494.0, "epoch": 195.8, "grad_norm": 0.6658393144607544, "kl": 1.4692611694335938, "learning_rate": 1.964227117337374e-06, "loss": 0.0588, "reward": 2.6252613067626953, "reward_std": 0.14504392445087433, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -0.35390543937683105, "rewards/wrapped_format_reward": 1.0, "step": 979 }, { "completion_length": 500.0, "epoch": 196.0, "grad_norm": 0.5573218464851379, "kl": 0.9373491406440735, "learning_rate": 1.958900965154743e-06, "loss": 0.0375, "reward": 3.241145133972168, "reward_std": 0.23681922256946564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.29114508628845215, "rewards/wrapped_format_reward": 1.0, "step": 980 }, { "completion_length": 500.0, "epoch": 196.2, "grad_norm": 0.5613752603530884, "kl": 0.5691253542900085, "learning_rate": 1.9535773884115894e-06, "loss": 0.0228, "reward": 2.539503335952759, "reward_std": 0.3668363094329834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.335496723651886, "rewards/wrapped_format_reward": 0.875, "step": 981 }, { "completion_length": 500.0, "epoch": 196.4, "grad_norm": 0.5520111918449402, "kl": 0.8869532942771912, "learning_rate": 1.9482564124462478e-06, "loss": 0.0355, "reward": 1.5741268396377563, "reward_std": 3.410243511199951, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4258730709552765, "rewards/wrapped_format_reward": 0.5, "step": 982 }, { "completion_length": 500.0, "epoch": 196.6, "grad_norm": 0.4561191201210022, "kl": 1.3822728395462036, "learning_rate": 1.942938062584678e-06, "loss": 0.0553, "reward": 2.657071113586426, "reward_std": 0.630805253982544, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3429288864135742, "rewards/wrapped_format_reward": 1.0, "step": 983 }, { "completion_length": 500.0, "epoch": 196.8, "grad_norm": 102.46700286865234, "kl": 10.124485969543457, "learning_rate": 1.937622364140338e-06, "loss": 0.405, "reward": 1.304389238357544, "reward_std": 3.5367956161499023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.945610761642456, "rewards/wrapped_format_reward": 0.75, "step": 984 }, { "completion_length": 468.0, "epoch": 197.0, "grad_norm": 0.5332741141319275, "kl": 1.2333430051803589, "learning_rate": 1.9323093424140673e-06, "loss": 0.0493, "reward": 3.2868287563323975, "reward_std": 0.4407948851585388, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4118286073207855, "rewards/wrapped_format_reward": 0.875, "step": 985 }, { "completion_length": 500.0, "epoch": 197.2, "grad_norm": 2.2904465198516846, "kl": 0.9502900242805481, "learning_rate": 1.926999022693965e-06, "loss": 0.038, "reward": 2.5531535148620605, "reward_std": 1.0269813537597656, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.053153496235609055, "rewards/wrapped_format_reward": 0.5, "step": 986 }, { "completion_length": 500.0, "epoch": 197.4, "grad_norm": 0.4350486099720001, "kl": 1.0802429914474487, "learning_rate": 1.9216914302552693e-06, "loss": 0.0432, "reward": 3.8217079639434814, "reward_std": 0.014752035960555077, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8217079043388367, "rewards/wrapped_format_reward": 1.0, "step": 987 }, { "completion_length": 500.0, "epoch": 197.6, "grad_norm": 0.5089243054389954, "kl": 0.6424485445022583, "learning_rate": 1.9163865903602374e-06, "loss": 0.0257, "reward": 3.3335821628570557, "reward_std": 0.20193266868591309, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.33358216285705566, "rewards/wrapped_format_reward": 1.0, "step": 988 }, { "completion_length": 500.0, "epoch": 197.8, "grad_norm": 0.5370872616767883, "kl": 0.7112429738044739, "learning_rate": 1.9110845282580253e-06, "loss": 0.0285, "reward": 2.584986925125122, "reward_std": 0.441521018743515, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2900131940841675, "rewards/wrapped_format_reward": 0.875, "step": 989 }, { "completion_length": 448.0, "epoch": 198.0, "grad_norm": 0.4503290355205536, "kl": 0.8598915934562683, "learning_rate": 1.9057852691845677e-06, "loss": 0.0344, "reward": 3.0328993797302246, "reward_std": 0.5645407438278198, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.032899484038352966, "rewards/wrapped_format_reward": 1.0, "step": 990 }, { "completion_length": 500.0, "epoch": 198.2, "grad_norm": 0.4580807089805603, "kl": 1.1359678506851196, "learning_rate": 1.9004888383624596e-06, "loss": 0.0454, "reward": 3.2506661415100098, "reward_std": 0.22670303285121918, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2506660521030426, "rewards/wrapped_format_reward": 1.0, "step": 991 }, { "completion_length": 500.0, "epoch": 198.4, "grad_norm": 0.44507095217704773, "kl": 1.269830346107483, "learning_rate": 1.895195261000831e-06, "loss": 0.0508, "reward": -0.32085761427879333, "reward_std": 2.576094150543213, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -2.445857524871826, "rewards/wrapped_format_reward": 0.75, "step": 992 }, { "completion_length": 500.0, "epoch": 198.6, "grad_norm": 0.519321084022522, "kl": 1.010072946548462, "learning_rate": 1.8899045622952337e-06, "loss": 0.0404, "reward": 2.7637908458709717, "reward_std": 0.07630608975887299, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.1945425271987915, "rewards/wrapped_format_reward": 1.0, "step": 993 }, { "completion_length": 500.0, "epoch": 198.8, "grad_norm": 0.5224224925041199, "kl": 1.6516927480697632, "learning_rate": 1.8846167674275175e-06, "loss": 0.0661, "reward": 3.530534267425537, "reward_std": 0.47409287095069885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.5930341482162476, "rewards/wrapped_format_reward": 1.0, "step": 994 }, { "completion_length": 500.0, "epoch": 199.0, "grad_norm": 0.6636751890182495, "kl": 0.4684174656867981, "learning_rate": 1.8793319015657091e-06, "loss": 0.0187, "reward": 1.255099892616272, "reward_std": 3.1992766857147217, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.3699001669883728, "rewards/wrapped_format_reward": 0.25, "step": 995 }, { "completion_length": 489.0, "epoch": 199.2, "grad_norm": 0.4272195100784302, "kl": 1.0444061756134033, "learning_rate": 1.874049989863896e-06, "loss": 0.0418, "reward": 3.1618120670318604, "reward_std": 0.15273608267307281, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2868121862411499, "rewards/wrapped_format_reward": 0.875, "step": 996 }, { "completion_length": 415.0, "epoch": 199.4, "grad_norm": 0.4787036180496216, "kl": 1.132507085800171, "learning_rate": 1.8687710574621051e-06, "loss": 0.0453, "reward": 3.367159843444824, "reward_std": 0.13511835038661957, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.36715978384017944, "rewards/wrapped_format_reward": 1.0, "step": 997 }, { "completion_length": 391.0, "epoch": 199.6, "grad_norm": 0.556138277053833, "kl": 1.152779221534729, "learning_rate": 1.8634951294861809e-06, "loss": 0.0461, "reward": 3.3572402000427246, "reward_std": 0.3249020576477051, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.3989068865776062, "rewards/wrapped_format_reward": 1.0, "step": 998 }, { "completion_length": 500.0, "epoch": 199.8, "grad_norm": 0.4999628961086273, "kl": 0.8470302820205688, "learning_rate": 1.8582222310476699e-06, "loss": 0.0339, "reward": 1.7823580503463745, "reward_std": 3.5314066410064697, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.4259752333164215, "rewards/wrapped_format_reward": 0.875, "step": 999 }, { "completion_length": 500.0, "epoch": 200.0, "grad_norm": 0.5336737632751465, "kl": 0.8840068578720093, "learning_rate": 1.852952387243698e-06, "loss": 0.0354, "reward": 0.5844348669052124, "reward_std": 2.797942638397217, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1655651330947876, "rewards/wrapped_format_reward": 0.375, "step": 1000 } ], "logging_steps": 1, "max_steps": 1600, "num_input_tokens_seen": 0, "num_train_epochs": 320, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }