{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 750.0, "epoch": 0.04, "grad_norm": 12918.7890625, "kl": 1354.3233642578125, "learning_rate": 3.1250000000000005e-08, "loss": 54.1729, "reward": 0.7556291222572327, "reward_std": 3.186340093612671, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9943709373474121, "rewards/wrapped_format_reward": 0.25, "step": 1 }, { "completion_length": 750.0, "epoch": 0.08, "grad_norm": 118175080.0, "kl": 11214800.0, "learning_rate": 6.250000000000001e-08, "loss": 448592.0, "reward": -0.4338679313659668, "reward_std": 4.129403591156006, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6838679313659668, "rewards/wrapped_format_reward": 0.25, "step": 2 }, { "completion_length": 750.0, "epoch": 0.12, "grad_norm": 31.031461715698242, "kl": 11.660053253173828, "learning_rate": 9.375e-08, "loss": 0.4664, "reward": 2.544208526611328, "reward_std": 0.4753165543079376, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.16920866072177887, "rewards/wrapped_format_reward": 0.375, "step": 3 }, { "completion_length": 750.0, "epoch": 0.16, "grad_norm": 44.48072814941406, "kl": 10.989585876464844, "learning_rate": 1.2500000000000002e-07, "loss": 0.4396, "reward": 0.6104838252067566, "reward_std": 2.7562637329101562, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1395162343978882, "rewards/wrapped_format_reward": 0.25, "step": 4 }, { "completion_length": 750.0, "epoch": 0.2, "grad_norm": 177.81173706054688, "kl": 29.71923065185547, "learning_rate": 1.5625e-07, "loss": 1.1888, "reward": 0.7251285314559937, "reward_std": 3.179880380630493, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0248714685440063, "rewards/wrapped_format_reward": 0.25, "step": 5 }, { "completion_length": 750.0, "epoch": 0.24, "grad_norm": 538.7103881835938, "kl": 77.67715454101562, "learning_rate": 1.875e-07, "loss": 3.1071, "reward": 1.9332659244537354, "reward_std": 3.2915334701538086, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.44173407554626465, "rewards/wrapped_format_reward": 0.875, "step": 6 }, { "completion_length": 750.0, "epoch": 0.28, "grad_norm": 1767973.125, "kl": 117501.96875, "learning_rate": 2.1875e-07, "loss": 4700.0791, "reward": -2.279214859008789, "reward_std": 3.441570281982422, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.779214859008789, "rewards/wrapped_format_reward": 0.0, "step": 7 }, { "completion_length": 750.0, "epoch": 0.32, "grad_norm": 43.647186279296875, "kl": 15.320382118225098, "learning_rate": 2.5000000000000004e-07, "loss": 0.6128, "reward": 0.5996664762496948, "reward_std": 3.1008288860321045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.9003335237503052, "rewards/wrapped_format_reward": 0.125, "step": 8 }, { "completion_length": 750.0, "epoch": 0.36, "grad_norm": 913985.875, "kl": 96429.6015625, "learning_rate": 2.8125e-07, "loss": 3857.1833, "reward": -0.5126274824142456, "reward_std": 3.7559752464294434, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.137627601623535, "rewards/wrapped_format_reward": 0.625, "step": 9 }, { "completion_length": 750.0, "epoch": 0.4, "grad_norm": 15.540242195129395, "kl": 5.14943265914917, "learning_rate": 3.125e-07, "loss": 0.206, "reward": 0.7949561476707458, "reward_std": 3.2785708904266357, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8300438523292542, "rewards/wrapped_format_reward": 0.125, "step": 10 }, { "completion_length": 750.0, "epoch": 0.44, "grad_norm": 30.965370178222656, "kl": 9.962629318237305, "learning_rate": 3.4375000000000004e-07, "loss": 0.3985, "reward": -1.625, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 11 }, { "completion_length": 750.0, "epoch": 0.48, "grad_norm": 66.62539672851562, "kl": 12.977354049682617, "learning_rate": 3.75e-07, "loss": 0.5191, "reward": -2.25, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 12 }, { "completion_length": 750.0, "epoch": 0.52, "grad_norm": 5716130.0, "kl": 649889.6875, "learning_rate": 4.0625000000000003e-07, "loss": 25995.5879, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 13 }, { "completion_length": 750.0, "epoch": 0.56, "grad_norm": 87.81452941894531, "kl": 14.151000022888184, "learning_rate": 4.375e-07, "loss": 0.566, "reward": -2.25, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 14 }, { "completion_length": 750.0, "epoch": 0.6, "grad_norm": 573.2177124023438, "kl": 77.2968521118164, "learning_rate": 4.6875000000000006e-07, "loss": 3.0919, "reward": -0.5207003355026245, "reward_std": 4.02908182144165, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.7707003355026245, "rewards/wrapped_format_reward": 0.25, "step": 15 }, { "completion_length": 750.0, "epoch": 0.64, "grad_norm": 691142.75, "kl": 47404.98828125, "learning_rate": 5.000000000000001e-07, "loss": 1896.199, "reward": -0.6887123584747314, "reward_std": 3.8238091468811035, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0637123584747314, "rewards/wrapped_format_reward": 0.375, "step": 16 }, { "completion_length": 750.0, "epoch": 0.68, "grad_norm": 54.24480056762695, "kl": 13.760282516479492, "learning_rate": 5.3125e-07, "loss": 0.5504, "reward": -0.7901134490966797, "reward_std": 3.464918375015259, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0401134490966797, "rewards/wrapped_format_reward": 0.25, "step": 17 }, { "completion_length": 750.0, "epoch": 0.72, "grad_norm": 2934.07861328125, "kl": 389.90374755859375, "learning_rate": 5.625e-07, "loss": 15.5961, "reward": 0.821560800075531, "reward_std": 3.2411201000213623, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6785714626312256, "rewards/wrapped_driving_reward": -0.9820106625556946, "rewards/wrapped_format_reward": 0.375, "step": 18 }, { "completion_length": 750.0, "epoch": 0.76, "grad_norm": 6.749892234802246, "kl": 2.5047595500946045, "learning_rate": 5.9375e-07, "loss": 0.1002, "reward": 2.3981642723083496, "reward_std": 0.7526259422302246, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14816424250602722, "rewards/wrapped_format_reward": 0.25, "step": 19 }, { "completion_length": 750.0, "epoch": 0.8, "grad_norm": 2.598149061203003, "kl": 1.0905311107635498, "learning_rate": 6.25e-07, "loss": 0.0436, "reward": 1.3226027488708496, "reward_std": 2.890576124191284, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5523972511291504, "rewards/wrapped_format_reward": 0.375, "step": 20 }, { "completion_length": 750.0, "epoch": 0.84, "grad_norm": 19.09586524963379, "kl": 5.524669170379639, "learning_rate": 6.562500000000001e-07, "loss": 0.221, "reward": 2.1500957012176514, "reward_std": 0.18781162798404694, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.09990427643060684, "rewards/wrapped_format_reward": 0.25, "step": 21 }, { "completion_length": 750.0, "epoch": 0.88, "grad_norm": 20067694.0, "kl": 1812346.0, "learning_rate": 6.875000000000001e-07, "loss": 72493.8359, "reward": -0.6700150966644287, "reward_std": 3.845890998840332, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.7950150966644287, "rewards/wrapped_format_reward": 0.125, "step": 22 }, { "completion_length": 750.0, "epoch": 0.92, "grad_norm": 23.432483673095703, "kl": 6.9398393630981445, "learning_rate": 7.1875e-07, "loss": 0.2776, "reward": -0.5996897220611572, "reward_std": 1.5026532411575317, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.974689483642578, "rewards/wrapped_format_reward": 0.375, "step": 23 }, { "completion_length": 750.0, "epoch": 0.96, "grad_norm": 13.615424156188965, "kl": 2.8062376976013184, "learning_rate": 7.5e-07, "loss": 0.1122, "reward": -1.6607142686843872, "reward_std": 0.47155481576919556, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 24 }, { "completion_length": 750.0, "epoch": 1.0, "grad_norm": 630.5245361328125, "kl": 71.51762390136719, "learning_rate": 7.8125e-07, "loss": 2.8607, "reward": -0.30632930994033813, "reward_std": 2.6143534183502197, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.1813292503356934, "rewards/wrapped_format_reward": 0.375, "step": 25 }, { "completion_length": 750.0, "epoch": 1.04, "grad_norm": 5.207806587219238, "kl": 2.9511070251464844, "learning_rate": 8.125000000000001e-07, "loss": 0.118, "reward": 3.2842254638671875, "reward_std": 0.35285714268684387, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7842254638671875, "rewards/wrapped_format_reward": 0.5, "step": 26 }, { "completion_length": 750.0, "epoch": 1.08, "grad_norm": 30.472753524780273, "kl": 9.41655445098877, "learning_rate": 8.437500000000001e-07, "loss": 0.3767, "reward": 0.07714283466339111, "reward_std": 2.906501293182373, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.9228571653366089, "rewards/wrapped_format_reward": 0.5, "step": 27 }, { "completion_length": 750.0, "epoch": 1.12, "grad_norm": 16.97286033630371, "kl": 7.253666400909424, "learning_rate": 8.75e-07, "loss": 0.2901, "reward": -0.07512685656547546, "reward_std": 2.7640981674194336, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.8251267671585083, "rewards/wrapped_format_reward": 0.25, "step": 28 }, { "completion_length": 750.0, "epoch": 1.16, "grad_norm": 1228.613037109375, "kl": 160.93626403808594, "learning_rate": 9.0625e-07, "loss": 6.4375, "reward": -1.879897117614746, "reward_std": 2.8305463790893555, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.254897117614746, "rewards/wrapped_format_reward": 0.375, "step": 29 }, { "completion_length": 750.0, "epoch": 1.2, "grad_norm": 633888.125, "kl": 70588.515625, "learning_rate": 9.375000000000001e-07, "loss": 2823.5408, "reward": -2.25, "reward_std": 1.2583057880401611, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 30 }, { "completion_length": 750.0, "epoch": 1.24, "grad_norm": 62.07278060913086, "kl": 7.735680103302002, "learning_rate": 9.6875e-07, "loss": 0.3094, "reward": 0.7910268306732178, "reward_std": 3.1987111568450928, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9589731693267822, "rewards/wrapped_format_reward": 0.25, "step": 31 }, { "completion_length": 750.0, "epoch": 1.28, "grad_norm": 10.78812026977539, "kl": 3.6586508750915527, "learning_rate": 1.0000000000000002e-06, "loss": 0.1463, "reward": 3.0689785480499268, "reward_std": 0.5278481245040894, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6939785480499268, "rewards/wrapped_format_reward": 0.375, "step": 32 }, { "completion_length": 750.0, "epoch": 1.32, "grad_norm": 16.368228912353516, "kl": 5.255997180938721, "learning_rate": 1.03125e-06, "loss": 0.2102, "reward": 1.9649852514266968, "reward_std": 1.2069640159606934, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.41001471877098083, "rewards/wrapped_format_reward": 0.375, "step": 33 }, { "completion_length": 750.0, "epoch": 1.3599999999999999, "grad_norm": 4.65078067779541, "kl": 2.813537120819092, "learning_rate": 1.0625e-06, "loss": 0.1125, "reward": 1.057879090309143, "reward_std": 3.4074575901031494, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9421209096908569, "rewards/wrapped_format_reward": 0.5, "step": 34 }, { "completion_length": 750.0, "epoch": 1.4, "grad_norm": 8.35409164428711, "kl": 4.364602565765381, "learning_rate": 1.0937500000000001e-06, "loss": 0.1746, "reward": 0.4969135522842407, "reward_std": 3.1264455318450928, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2530864477157593, "rewards/wrapped_format_reward": 0.25, "step": 35 }, { "completion_length": 750.0, "epoch": 1.44, "grad_norm": 22052300.0, "kl": 1883969.0, "learning_rate": 1.125e-06, "loss": 75358.7578, "reward": 3.022550106048584, "reward_std": 0.5745974183082581, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6475501656532288, "rewards/wrapped_format_reward": 0.375, "step": 36 }, { "completion_length": 750.0, "epoch": 1.48, "grad_norm": 3.6894285678863525, "kl": 2.3495442867279053, "learning_rate": 1.1562500000000002e-06, "loss": 0.094, "reward": 2.525552749633789, "reward_std": 0.5908641219139099, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.025552626699209213, "rewards/wrapped_format_reward": 0.5, "step": 37 }, { "completion_length": 750.0, "epoch": 1.52, "grad_norm": 35878.66796875, "kl": 4133.42822265625, "learning_rate": 1.1875e-06, "loss": 165.3371, "reward": 2.06065034866333, "reward_std": 0.20606659352779388, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.02268284745514393, "rewards/wrapped_format_reward": 0.125, "step": 38 }, { "completion_length": 750.0, "epoch": 1.56, "grad_norm": 24.238332748413086, "kl": 6.0253143310546875, "learning_rate": 1.21875e-06, "loss": 0.241, "reward": 0.5050567388534546, "reward_std": 2.368363380432129, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.119943380355835, "rewards/wrapped_format_reward": 0.625, "step": 39 }, { "completion_length": 750.0, "epoch": 1.6, "grad_norm": 9.88853931427002, "kl": 2.069253921508789, "learning_rate": 1.25e-06, "loss": 0.0828, "reward": -0.38425058126449585, "reward_std": 4.177649021148682, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.634250521659851, "rewards/wrapped_format_reward": 0.25, "step": 40 }, { "completion_length": 750.0, "epoch": 1.6400000000000001, "grad_norm": 245.3301239013672, "kl": 16.598329544067383, "learning_rate": 1.28125e-06, "loss": 0.6639, "reward": 0.8135783672332764, "reward_std": 3.220566987991333, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9364216327667236, "rewards/wrapped_format_reward": 0.25, "step": 41 }, { "completion_length": 750.0, "epoch": 1.6800000000000002, "grad_norm": 7.694745063781738, "kl": 3.324167490005493, "learning_rate": 1.3125000000000001e-06, "loss": 0.133, "reward": 0.5644169449806213, "reward_std": 3.078721523284912, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0605831146240234, "rewards/wrapped_format_reward": 0.125, "step": 42 }, { "completion_length": 750.0, "epoch": 1.72, "grad_norm": 427072.96875, "kl": 22608.654296875, "learning_rate": 1.34375e-06, "loss": 904.3459, "reward": -2.875, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 43 }, { "completion_length": 750.0, "epoch": 1.76, "grad_norm": 169.96109008789062, "kl": 13.643832206726074, "learning_rate": 1.3750000000000002e-06, "loss": 0.5458, "reward": -2.75, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 44 }, { "completion_length": 750.0, "epoch": 1.8, "grad_norm": 1.771145224571228, "kl": 1.1954182386398315, "learning_rate": 1.40625e-06, "loss": 0.0478, "reward": -1.5, "reward_std": 0.40824830532073975, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 45 }, { "completion_length": 750.0, "epoch": 1.8399999999999999, "grad_norm": 92.88534545898438, "kl": 12.012389183044434, "learning_rate": 1.4375e-06, "loss": 0.4805, "reward": 1.650305986404419, "reward_std": 2.439289093017578, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8496940732002258, "rewards/wrapped_format_reward": 0.5, "step": 46 }, { "completion_length": 750.0, "epoch": 1.88, "grad_norm": 82.52506256103516, "kl": 9.6003999710083, "learning_rate": 1.4687500000000001e-06, "loss": 0.384, "reward": 2.39274001121521, "reward_std": 0.08233900368213654, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9615384340286255, "rewards/wrapped_driving_reward": 0.05620140582323074, "rewards/wrapped_format_reward": 0.375, "step": 47 }, { "completion_length": 750.0, "epoch": 1.92, "grad_norm": 163.56568908691406, "kl": 20.40851593017578, "learning_rate": 1.5e-06, "loss": 0.8163, "reward": 2.7317166328430176, "reward_std": 0.4628515839576721, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.606716513633728, "rewards/wrapped_format_reward": 0.125, "step": 48 }, { "completion_length": 750.0, "epoch": 1.96, "grad_norm": 7.335759162902832, "kl": 2.264324426651001, "learning_rate": 1.5312500000000002e-06, "loss": 0.0906, "reward": -1.90625, "reward_std": 0.1875, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 49 }, { "completion_length": 750.0, "epoch": 2.0, "grad_norm": 157.92581176757812, "kl": 25.004880905151367, "learning_rate": 1.5625e-06, "loss": 1.0002, "reward": -2.5, "reward_std": 1.7320507764816284, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 50 }, { "completion_length": 750.0, "epoch": 2.04, "grad_norm": 173265.328125, "kl": 19213.73828125, "learning_rate": 1.59375e-06, "loss": 768.5495, "reward": -2.8977272510528564, "reward_std": 1.295454502105713, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.47727274894714355, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 51 }, { "completion_length": 750.0, "epoch": 2.08, "grad_norm": 94274.2890625, "kl": 11336.91015625, "learning_rate": 1.6250000000000001e-06, "loss": 453.4765, "reward": -0.661945104598999, "reward_std": 3.859673023223877, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6619449853897095, "rewards/wrapped_format_reward": 0.0, "step": 52 }, { "completion_length": 750.0, "epoch": 2.12, "grad_norm": 8.44528579711914, "kl": 3.5165162086486816, "learning_rate": 1.65625e-06, "loss": 0.1407, "reward": 0.029820501804351807, "reward_std": 1.8348188400268555, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.345179557800293, "rewards/wrapped_format_reward": 0.375, "step": 53 }, { "completion_length": 750.0, "epoch": 2.16, "grad_norm": 847.0797729492188, "kl": 106.91917419433594, "learning_rate": 1.6875000000000001e-06, "loss": 4.2768, "reward": 1.4347769021987915, "reward_std": 3.6399362087249756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4402230978012085, "rewards/wrapped_format_reward": 0.375, "step": 54 }, { "completion_length": 750.0, "epoch": 2.2, "grad_norm": 2.3560614585876465, "kl": 1.385076642036438, "learning_rate": 1.71875e-06, "loss": 0.0554, "reward": 2.0996251106262207, "reward_std": 1.1530507802963257, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6503750085830688, "rewards/wrapped_format_reward": 0.75, "step": 55 }, { "completion_length": 750.0, "epoch": 2.24, "grad_norm": 21.23666000366211, "kl": 5.022896766662598, "learning_rate": 1.75e-06, "loss": 0.2009, "reward": 0.9647395610809326, "reward_std": 3.323491334915161, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.5352604389190674, "rewards/wrapped_format_reward": 0.0, "step": 56 }, { "completion_length": 750.0, "epoch": 2.2800000000000002, "grad_norm": 5.362183570861816, "kl": 1.007752776145935, "learning_rate": 1.78125e-06, "loss": 0.0403, "reward": 2.879687786102295, "reward_std": 0.23178231716156006, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6296878457069397, "rewards/wrapped_format_reward": 0.25, "step": 57 }, { "completion_length": 750.0, "epoch": 2.32, "grad_norm": 18.63233184814453, "kl": 5.4822258949279785, "learning_rate": 1.8125e-06, "loss": 0.2193, "reward": 2.6016368865966797, "reward_std": 0.5958145260810852, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22663694620132446, "rewards/wrapped_format_reward": 0.375, "step": 58 }, { "completion_length": 750.0, "epoch": 2.36, "grad_norm": 7.565363883972168, "kl": 1.224977970123291, "learning_rate": 1.8437500000000003e-06, "loss": 0.049, "reward": -1.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 59 }, { "completion_length": 750.0, "epoch": 2.4, "grad_norm": 1.9011263847351074, "kl": 1.0183587074279785, "learning_rate": 1.8750000000000003e-06, "loss": 0.0407, "reward": 2.4753334522247314, "reward_std": 0.7654703855514526, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.024666596204042435, "rewards/wrapped_format_reward": 0.5, "step": 60 }, { "completion_length": 750.0, "epoch": 2.44, "grad_norm": 10.706725120544434, "kl": 0.6977672576904297, "learning_rate": 1.90625e-06, "loss": 0.0279, "reward": -1.615384578704834, "reward_std": 0.46895742416381836, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.884615421295166, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 61 }, { "completion_length": 750.0, "epoch": 2.48, "grad_norm": 4.232525825500488, "kl": 1.5266469717025757, "learning_rate": 1.9375e-06, "loss": 0.0611, "reward": 1.7749261856079102, "reward_std": 0.21998588740825653, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4750739336013794, "rewards/wrapped_format_reward": 0.25, "step": 62 }, { "completion_length": 750.0, "epoch": 2.52, "grad_norm": 1.8051197528839111, "kl": 0.7413498163223267, "learning_rate": 1.96875e-06, "loss": 0.0297, "reward": 2.4623327255249023, "reward_std": 0.44108253717422485, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.037667326629161835, "rewards/wrapped_format_reward": 0.5, "step": 63 }, { "completion_length": 750.0, "epoch": 2.56, "grad_norm": 1.0843530893325806, "kl": 0.8077827095985413, "learning_rate": 2.0000000000000003e-06, "loss": 0.0323, "reward": 0.8493964672088623, "reward_std": 1.4425324201583862, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.4006034135818481, "rewards/wrapped_format_reward": 0.25, "step": 64 }, { "completion_length": 750.0, "epoch": 2.6, "grad_norm": 79322.578125, "kl": 9403.984375, "learning_rate": 2.0312500000000002e-06, "loss": 376.1593, "reward": -0.5744374394416809, "reward_std": 3.084221124649048, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.5744376182556152, "rewards/wrapped_format_reward": 0.5, "step": 65 }, { "completion_length": 750.0, "epoch": 2.64, "grad_norm": 1.5267881155014038, "kl": 1.258867621421814, "learning_rate": 2.0625e-06, "loss": 0.0504, "reward": 2.676821231842041, "reward_std": 0.3912288248538971, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.05182119458913803, "rewards/wrapped_format_reward": 0.625, "step": 66 }, { "completion_length": 750.0, "epoch": 2.68, "grad_norm": 108.15802764892578, "kl": 15.135726928710938, "learning_rate": 2.09375e-06, "loss": 0.6054, "reward": 1.7942759990692139, "reward_std": 0.9946174025535583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7057239413261414, "rewards/wrapped_format_reward": 0.5, "step": 67 }, { "completion_length": 750.0, "epoch": 2.7199999999999998, "grad_norm": 3.034290075302124, "kl": 1.1447491645812988, "learning_rate": 2.125e-06, "loss": 0.0458, "reward": 0.49094319343566895, "reward_std": 3.0496573448181152, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.259056806564331, "rewards/wrapped_format_reward": 0.25, "step": 68 }, { "completion_length": 750.0, "epoch": 2.76, "grad_norm": 2.874622344970703, "kl": 1.35330331325531, "learning_rate": 2.1562500000000003e-06, "loss": 0.0541, "reward": 2.652649402618408, "reward_std": 0.8381170034408569, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.09735051542520523, "rewards/wrapped_format_reward": 0.75, "step": 69 }, { "completion_length": 750.0, "epoch": 2.8, "grad_norm": 10.109249114990234, "kl": 1.9703751802444458, "learning_rate": 2.1875000000000002e-06, "loss": 0.0788, "reward": -1.7727272510528564, "reward_std": 0.4545454680919647, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 70 }, { "completion_length": 750.0, "epoch": 2.84, "grad_norm": 3.9002420902252197, "kl": 0.9838883280754089, "learning_rate": 2.21875e-06, "loss": 0.0394, "reward": 3.0588574409484863, "reward_std": 0.5791709423065186, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8088575601577759, "rewards/wrapped_format_reward": 0.25, "step": 71 }, { "completion_length": 750.0, "epoch": 2.88, "grad_norm": 13.448805809020996, "kl": 1.7225451469421387, "learning_rate": 2.25e-06, "loss": 0.0689, "reward": 2.681211471557617, "reward_std": 0.5453749895095825, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.05621166527271271, "rewards/wrapped_format_reward": 0.625, "step": 72 }, { "completion_length": 750.0, "epoch": 2.92, "grad_norm": 1.240159273147583, "kl": 0.9635999202728271, "learning_rate": 2.28125e-06, "loss": 0.0385, "reward": 1.7876918315887451, "reward_std": 0.17503832280635834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21230819821357727, "rewards/wrapped_format_reward": 0.0, "step": 73 }, { "completion_length": 750.0, "epoch": 2.96, "grad_norm": 1.296797513961792, "kl": 0.6391518115997314, "learning_rate": 2.3125000000000003e-06, "loss": 0.0256, "reward": 2.1097092628479004, "reward_std": 0.30647313594818115, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.015290826559066772, "rewards/wrapped_format_reward": 0.125, "step": 74 }, { "completion_length": 750.0, "epoch": 3.0, "grad_norm": 7.442214488983154, "kl": 0.5369942784309387, "learning_rate": 2.3437500000000002e-06, "loss": 0.0215, "reward": -1.5277777910232544, "reward_std": 0.4120110273361206, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 75 }, { "completion_length": 750.0, "epoch": 3.04, "grad_norm": 1.290964961051941, "kl": 0.6742348670959473, "learning_rate": 2.375e-06, "loss": 0.027, "reward": 2.5172605514526367, "reward_std": 0.3828223943710327, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.31726059317588806, "rewards/wrapped_format_reward": 0.25, "step": 76 }, { "completion_length": 750.0, "epoch": 3.08, "grad_norm": 2.531158208847046, "kl": 0.9481576681137085, "learning_rate": 2.40625e-06, "loss": 0.0379, "reward": 1.286086916923523, "reward_std": 3.2022881507873535, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.963913083076477, "rewards/wrapped_format_reward": 0.75, "step": 77 }, { "completion_length": 750.0, "epoch": 3.12, "grad_norm": 7.368044376373291, "kl": 1.763913631439209, "learning_rate": 2.4375e-06, "loss": 0.0706, "reward": 2.8683407306671143, "reward_std": 0.7174854278564453, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6183407306671143, "rewards/wrapped_format_reward": 0.25, "step": 78 }, { "completion_length": 750.0, "epoch": 3.16, "grad_norm": 1.6558523178100586, "kl": 0.8889983892440796, "learning_rate": 2.4687500000000003e-06, "loss": 0.0356, "reward": -0.9643077850341797, "reward_std": 3.584562301635742, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.2143075466156006, "rewards/wrapped_format_reward": 0.25, "step": 79 }, { "completion_length": 750.0, "epoch": 3.2, "grad_norm": 0.8536248207092285, "kl": 0.5134472250938416, "learning_rate": 2.5e-06, "loss": 0.0205, "reward": 0.7003893256187439, "reward_std": 3.1346724033355713, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9246107935905457, "rewards/wrapped_format_reward": 0.125, "step": 80 }, { "completion_length": 750.0, "epoch": 3.24, "grad_norm": 3.913572311401367, "kl": 1.072222352027893, "learning_rate": 2.53125e-06, "loss": 0.0429, "reward": -2.125, "reward_std": 0.9464846849441528, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 81 }, { "completion_length": 750.0, "epoch": 3.2800000000000002, "grad_norm": 0.7095546722412109, "kl": 0.43932077288627625, "learning_rate": 2.5625e-06, "loss": 0.0176, "reward": 0.578816831111908, "reward_std": 2.7208759784698486, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0461831092834473, "rewards/wrapped_format_reward": 0.125, "step": 82 }, { "completion_length": 750.0, "epoch": 3.32, "grad_norm": 1.8258914947509766, "kl": 0.8880151510238647, "learning_rate": 2.5937500000000004e-06, "loss": 0.0355, "reward": -0.019976496696472168, "reward_std": 1.7887709140777588, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -2.3671987056732178, "rewards/wrapped_format_reward": 0.375, "step": 83 }, { "completion_length": 750.0, "epoch": 3.36, "grad_norm": 21.77155303955078, "kl": 4.538379669189453, "learning_rate": 2.6250000000000003e-06, "loss": 0.1815, "reward": 2.3662824630737305, "reward_std": 0.3309035003185272, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3837175965309143, "rewards/wrapped_format_reward": 0.75, "step": 84 }, { "completion_length": 750.0, "epoch": 3.4, "grad_norm": 3.119682788848877, "kl": 0.6306832432746887, "learning_rate": 2.65625e-06, "loss": 0.0252, "reward": -1.765625, "reward_std": 0.2718330919742584, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 85 }, { "completion_length": 750.0, "epoch": 3.44, "grad_norm": 0.8083566427230835, "kl": 0.6314530372619629, "learning_rate": 2.6875e-06, "loss": 0.0253, "reward": 1.092934012413025, "reward_std": 3.537501335144043, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9070659875869751, "rewards/wrapped_format_reward": 0.5, "step": 86 }, { "completion_length": 750.0, "epoch": 3.48, "grad_norm": 3.2034406661987305, "kl": 0.4187563955783844, "learning_rate": 2.71875e-06, "loss": 0.0168, "reward": -0.22412437200546265, "reward_std": 4.363674640655518, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5991244316101074, "rewards/wrapped_format_reward": 0.375, "step": 87 }, { "completion_length": 750.0, "epoch": 3.52, "grad_norm": 5.19920015335083, "kl": 0.4368354082107544, "learning_rate": 2.7500000000000004e-06, "loss": 0.0175, "reward": 3.156670570373535, "reward_std": 0.563427209854126, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7816706299781799, "rewards/wrapped_format_reward": 0.375, "step": 88 }, { "completion_length": 750.0, "epoch": 3.56, "grad_norm": 1.6918214559555054, "kl": 0.6822372674942017, "learning_rate": 2.7812500000000003e-06, "loss": 0.0273, "reward": 2.3220458030700684, "reward_std": 0.5045586824417114, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.07204583287239075, "rewards/wrapped_format_reward": 0.25, "step": 89 }, { "completion_length": 750.0, "epoch": 3.6, "grad_norm": 1.532529592514038, "kl": 0.5102221965789795, "learning_rate": 2.8125e-06, "loss": 0.0204, "reward": 2.4476795196533203, "reward_std": 0.35076332092285156, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.05232050269842148, "rewards/wrapped_format_reward": 0.5, "step": 90 }, { "completion_length": 750.0, "epoch": 3.64, "grad_norm": 2.078274726867676, "kl": 0.726905107498169, "learning_rate": 2.84375e-06, "loss": 0.0291, "reward": 2.7883927822113037, "reward_std": 0.3999682664871216, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1633928269147873, "rewards/wrapped_format_reward": 0.625, "step": 91 }, { "completion_length": 750.0, "epoch": 3.68, "grad_norm": 8.694371223449707, "kl": 2.171297788619995, "learning_rate": 2.875e-06, "loss": 0.0869, "reward": 1.4226815700531006, "reward_std": 2.959470272064209, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9523183703422546, "rewards/wrapped_format_reward": 0.875, "step": 92 }, { "completion_length": 750.0, "epoch": 3.7199999999999998, "grad_norm": 4.719028472900391, "kl": 0.4048087000846863, "learning_rate": 2.9062500000000003e-06, "loss": 0.0162, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 93 }, { "completion_length": 750.0, "epoch": 3.76, "grad_norm": 2.958584785461426, "kl": 0.5307955741882324, "learning_rate": 2.9375000000000003e-06, "loss": 0.0212, "reward": 3.054999828338623, "reward_std": 0.5884072184562683, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8049997687339783, "rewards/wrapped_format_reward": 0.25, "step": 94 }, { "completion_length": 750.0, "epoch": 3.8, "grad_norm": 3.486060380935669, "kl": 1.3434487581253052, "learning_rate": 2.96875e-06, "loss": 0.0537, "reward": 1.0327720642089844, "reward_std": 3.362086057662964, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0922279357910156, "rewards/wrapped_format_reward": 0.625, "step": 95 }, { "completion_length": 750.0, "epoch": 3.84, "grad_norm": 0.23967012763023376, "kl": 0.4081713855266571, "learning_rate": 3e-06, "loss": 0.0163, "reward": -1.5, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 96 }, { "completion_length": 750.0, "epoch": 3.88, "grad_norm": 1.5661548376083374, "kl": 0.6307891011238098, "learning_rate": 3.03125e-06, "loss": 0.0252, "reward": 2.490572929382324, "reward_std": 0.5211818218231201, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.009427059441804886, "rewards/wrapped_format_reward": 0.5, "step": 97 }, { "completion_length": 750.0, "epoch": 3.92, "grad_norm": 1.4608389139175415, "kl": 0.41656869649887085, "learning_rate": 3.0625000000000003e-06, "loss": 0.0167, "reward": -1.5714285373687744, "reward_std": 0.5313312411308289, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 98 }, { "completion_length": 750.0, "epoch": 3.96, "grad_norm": 2.444063663482666, "kl": 0.5416926741600037, "learning_rate": 3.0937500000000002e-06, "loss": 0.0217, "reward": -1.5722651481628418, "reward_std": 1.5172808170318604, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.572265148162842, "rewards/wrapped_format_reward": 0.5, "step": 99 }, { "completion_length": 750.0, "epoch": 4.0, "grad_norm": 16.588134765625, "kl": 4.053507328033447, "learning_rate": 3.125e-06, "loss": 0.1621, "reward": 0.8218609094619751, "reward_std": 2.972221612930298, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.053139090538025, "rewards/wrapped_format_reward": 0.375, "step": 100 }, { "completion_length": 750.0, "epoch": 4.04, "grad_norm": 2.6997458934783936, "kl": 0.8300076127052307, "learning_rate": 3.15625e-06, "loss": 0.0332, "reward": -0.017622053623199463, "reward_std": 3.1953601837158203, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7142857313156128, "rewards/wrapped_driving_reward": -1.731907844543457, "rewards/wrapped_format_reward": 0.25, "step": 101 }, { "completion_length": 750.0, "epoch": 4.08, "grad_norm": 4.315985202789307, "kl": 0.4477883577346802, "learning_rate": 3.1875e-06, "loss": 0.0179, "reward": -0.8165792226791382, "reward_std": 3.698456287384033, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -2.0353293418884277, "rewards/wrapped_format_reward": 0.25, "step": 102 }, { "completion_length": 750.0, "epoch": 4.12, "grad_norm": 4.780234336853027, "kl": 0.9834432601928711, "learning_rate": 3.2187500000000003e-06, "loss": 0.0393, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 103 }, { "completion_length": 750.0, "epoch": 4.16, "grad_norm": 1.8854734897613525, "kl": 1.0403401851654053, "learning_rate": 3.2500000000000002e-06, "loss": 0.0416, "reward": 2.849097728729248, "reward_std": 0.5133286118507385, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09909792244434357, "rewards/wrapped_format_reward": 0.75, "step": 104 }, { "completion_length": 750.0, "epoch": 4.2, "grad_norm": 407.2416076660156, "kl": 102.19829559326172, "learning_rate": 3.28125e-06, "loss": 4.0879, "reward": 0.3436872959136963, "reward_std": 2.937178373336792, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -1.3813128471374512, "rewards/wrapped_format_reward": 0.25, "step": 105 }, { "completion_length": 750.0, "epoch": 4.24, "grad_norm": 1.6854506731033325, "kl": 0.731993556022644, "learning_rate": 3.3125e-06, "loss": 0.0293, "reward": 3.0254149436950684, "reward_std": 0.7993280291557312, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4004148244857788, "rewards/wrapped_format_reward": 0.625, "step": 106 }, { "completion_length": 750.0, "epoch": 4.28, "grad_norm": 0.8972920179367065, "kl": 0.429858922958374, "learning_rate": 3.34375e-06, "loss": 0.0172, "reward": 2.303318500518799, "reward_std": 0.5618811249732971, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.32168152928352356, "rewards/wrapped_format_reward": 0.625, "step": 107 }, { "completion_length": 750.0, "epoch": 4.32, "grad_norm": 1.9023702144622803, "kl": 0.5097759962081909, "learning_rate": 3.3750000000000003e-06, "loss": 0.0204, "reward": 2.4162795543670654, "reward_std": 0.23349861800670624, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.29127955436706543, "rewards/wrapped_format_reward": 0.125, "step": 108 }, { "completion_length": 750.0, "epoch": 4.36, "grad_norm": 0.93095463514328, "kl": 0.4000062942504883, "learning_rate": 3.40625e-06, "loss": 0.016, "reward": 2.886277198791504, "reward_std": 0.5716841220855713, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.26127734780311584, "rewards/wrapped_format_reward": 0.625, "step": 109 }, { "completion_length": 750.0, "epoch": 4.4, "grad_norm": 0.5152439475059509, "kl": 0.3538358807563782, "learning_rate": 3.4375e-06, "loss": 0.0142, "reward": -1.625, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 110 }, { "completion_length": 750.0, "epoch": 4.44, "grad_norm": 0.6482228636741638, "kl": 0.4172901213169098, "learning_rate": 3.46875e-06, "loss": 0.0167, "reward": 2.2451469898223877, "reward_std": 0.4397418797016144, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.12014690786600113, "rewards/wrapped_format_reward": 0.125, "step": 111 }, { "completion_length": 750.0, "epoch": 4.48, "grad_norm": 0.7293546199798584, "kl": 0.37956511974334717, "learning_rate": 3.5e-06, "loss": 0.0152, "reward": -2.038461685180664, "reward_std": 0.07692313194274902, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9615384340286255, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 112 }, { "completion_length": 750.0, "epoch": 4.52, "grad_norm": 0.9292676448822021, "kl": 0.5957732200622559, "learning_rate": 3.5312500000000007e-06, "loss": 0.0238, "reward": 2.8711514472961426, "reward_std": 0.12351223826408386, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7461515069007874, "rewards/wrapped_format_reward": 0.125, "step": 113 }, { "completion_length": 750.0, "epoch": 4.5600000000000005, "grad_norm": 0.8307209014892578, "kl": 0.5839378833770752, "learning_rate": 3.5625e-06, "loss": 0.0234, "reward": 2.626485824584961, "reward_std": 0.4099518358707428, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3764858841896057, "rewards/wrapped_format_reward": 0.25, "step": 114 }, { "completion_length": 750.0, "epoch": 4.6, "grad_norm": 1.3391035795211792, "kl": 0.6039181351661682, "learning_rate": 3.59375e-06, "loss": 0.0242, "reward": 2.645888566970825, "reward_std": 0.7008417248725891, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.32088854908943176, "rewards/wrapped_format_reward": 0.375, "step": 115 }, { "completion_length": 750.0, "epoch": 4.64, "grad_norm": 0.5842669606208801, "kl": 0.39022237062454224, "learning_rate": 3.625e-06, "loss": 0.0156, "reward": 2.825923442840576, "reward_std": 0.4296809136867523, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.20092333853244781, "rewards/wrapped_format_reward": 0.625, "step": 116 }, { "completion_length": 750.0, "epoch": 4.68, "grad_norm": 0.7245670557022095, "kl": 0.3917812705039978, "learning_rate": 3.65625e-06, "loss": 0.0157, "reward": 1.6012243032455444, "reward_std": 1.6945689916610718, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7737756371498108, "rewards/wrapped_format_reward": 0.375, "step": 117 }, { "completion_length": 750.0, "epoch": 4.72, "grad_norm": 0.947012722492218, "kl": 0.4678252935409546, "learning_rate": 3.6875000000000007e-06, "loss": 0.0187, "reward": 0.9042448997497559, "reward_std": 2.274247169494629, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -1.6707550287246704, "rewards/wrapped_format_reward": 0.625, "step": 118 }, { "completion_length": 750.0, "epoch": 4.76, "grad_norm": 0.9988442659378052, "kl": 0.6183064579963684, "learning_rate": 3.7187500000000006e-06, "loss": 0.0247, "reward": -2.375, "reward_std": 1.108677864074707, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 119 }, { "completion_length": 750.0, "epoch": 4.8, "grad_norm": 0.7130206823348999, "kl": 0.5347681641578674, "learning_rate": 3.7500000000000005e-06, "loss": 0.0214, "reward": -1.4166667461395264, "reward_std": 0.5527708530426025, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 120 }, { "completion_length": 750.0, "epoch": 4.84, "grad_norm": 0.627507746219635, "kl": 0.4992324709892273, "learning_rate": 3.78125e-06, "loss": 0.02, "reward": 1.178471326828003, "reward_std": 3.4837844371795654, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -0.7715286612510681, "rewards/wrapped_format_reward": 0.5, "step": 121 }, { "completion_length": 750.0, "epoch": 4.88, "grad_norm": 0.7256002426147461, "kl": 0.393168568611145, "learning_rate": 3.8125e-06, "loss": 0.0157, "reward": 2.3499269485473633, "reward_std": 0.28019580245018005, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2750731110572815, "rewards/wrapped_format_reward": 0.625, "step": 122 }, { "completion_length": 750.0, "epoch": 4.92, "grad_norm": 0.9897744059562683, "kl": 0.4026646018028259, "learning_rate": 3.84375e-06, "loss": 0.0161, "reward": 0.7121872305870056, "reward_std": 2.5061769485473633, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.0378127098083496, "rewards/wrapped_format_reward": 0.375, "step": 123 }, { "completion_length": 750.0, "epoch": 4.96, "grad_norm": 1.374245047569275, "kl": 0.4849807322025299, "learning_rate": 3.875e-06, "loss": 0.0194, "reward": 0.6754664182662964, "reward_std": 1.7975075244903564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8245335817337036, "rewards/wrapped_format_reward": 0.5, "step": 124 }, { "completion_length": 750.0, "epoch": 5.0, "grad_norm": 152.49087524414062, "kl": 41.7037239074707, "learning_rate": 3.90625e-06, "loss": 1.6681, "reward": 2.801071882247925, "reward_std": 0.3472815752029419, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.17607180774211884, "rewards/wrapped_format_reward": 0.625, "step": 125 }, { "completion_length": 750.0, "epoch": 5.04, "grad_norm": 0.5340915322303772, "kl": 0.3046044409275055, "learning_rate": 3.9375e-06, "loss": 0.0122, "reward": -2.5, "reward_std": 1.2247449159622192, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 126 }, { "completion_length": 750.0, "epoch": 5.08, "grad_norm": 2.454094648361206, "kl": 1.0489752292633057, "learning_rate": 3.96875e-06, "loss": 0.042, "reward": -1.5, "reward_std": 0.40824830532073975, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 127 }, { "completion_length": 750.0, "epoch": 5.12, "grad_norm": 0.9271315932273865, "kl": 0.5982043743133545, "learning_rate": 4.000000000000001e-06, "loss": 0.0239, "reward": 2.9926440715789795, "reward_std": 0.4446185231208801, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6176440715789795, "rewards/wrapped_format_reward": 0.375, "step": 128 }, { "completion_length": 750.0, "epoch": 5.16, "grad_norm": 1.4375823736190796, "kl": 0.532067596912384, "learning_rate": 4.031250000000001e-06, "loss": 0.0213, "reward": -1.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 129 }, { "completion_length": 750.0, "epoch": 5.2, "grad_norm": 1.463436245918274, "kl": 0.30711984634399414, "learning_rate": 4.0625000000000005e-06, "loss": 0.0123, "reward": 0.6086172461509705, "reward_std": 2.8491289615631104, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6833333373069763, "rewards/wrapped_driving_reward": -1.4497160911560059, "rewards/wrapped_format_reward": 0.625, "step": 130 }, { "completion_length": 750.0, "epoch": 5.24, "grad_norm": 0.5281797647476196, "kl": 0.3673165738582611, "learning_rate": 4.09375e-06, "loss": 0.0147, "reward": 2.542213201522827, "reward_std": 0.4680797755718231, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.08278678357601166, "rewards/wrapped_format_reward": 0.625, "step": 131 }, { "completion_length": 750.0, "epoch": 5.28, "grad_norm": 1.2764793634414673, "kl": 0.626862645149231, "learning_rate": 4.125e-06, "loss": 0.0251, "reward": 2.830139636993408, "reward_std": 0.4498087763786316, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8541666865348816, "rewards/wrapped_driving_reward": 0.6009730100631714, "rewards/wrapped_format_reward": 0.375, "step": 132 }, { "completion_length": 750.0, "epoch": 5.32, "grad_norm": 0.768198549747467, "kl": 0.6894717812538147, "learning_rate": 4.15625e-06, "loss": 0.0276, "reward": 2.4615352153778076, "reward_std": 0.5109394192695618, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.03846481069922447, "rewards/wrapped_format_reward": 0.5, "step": 133 }, { "completion_length": 750.0, "epoch": 5.36, "grad_norm": 1.2825806140899658, "kl": 0.46785154938697815, "learning_rate": 4.1875e-06, "loss": 0.0187, "reward": 3.512366771697998, "reward_std": 0.23564468324184418, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7623668909072876, "rewards/wrapped_format_reward": 0.75, "step": 134 }, { "completion_length": 750.0, "epoch": 5.4, "grad_norm": 1.3568897247314453, "kl": 0.6507589817047119, "learning_rate": 4.21875e-06, "loss": 0.026, "reward": 0.9110469222068787, "reward_std": 2.9727671146392822, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9639530181884766, "rewards/wrapped_format_reward": 0.375, "step": 135 }, { "completion_length": 750.0, "epoch": 5.44, "grad_norm": 0.7310919761657715, "kl": 0.5235786437988281, "learning_rate": 4.25e-06, "loss": 0.0209, "reward": 0.7459380030632019, "reward_std": 3.165867805480957, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0040620565414429, "rewards/wrapped_format_reward": 0.25, "step": 136 }, { "completion_length": 750.0, "epoch": 5.48, "grad_norm": 1.3897533416748047, "kl": 0.5510402917861938, "learning_rate": 4.28125e-06, "loss": 0.022, "reward": 3.019265651702881, "reward_std": 0.16442914307117462, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.019265584647655487, "rewards/wrapped_format_reward": 1.0, "step": 137 }, { "completion_length": 750.0, "epoch": 5.52, "grad_norm": 1.4078574180603027, "kl": 0.49462607502937317, "learning_rate": 4.312500000000001e-06, "loss": 0.0198, "reward": 1.5707786083221436, "reward_std": 2.394756317138672, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9292213916778564, "rewards/wrapped_format_reward": 0.5, "step": 138 }, { "completion_length": 750.0, "epoch": 5.5600000000000005, "grad_norm": 1.664668083190918, "kl": 0.5930428504943848, "learning_rate": 4.3437500000000006e-06, "loss": 0.0237, "reward": 0.5512915849685669, "reward_std": 2.111541986465454, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -1.923708438873291, "rewards/wrapped_format_reward": 0.5, "step": 139 }, { "completion_length": 750.0, "epoch": 5.6, "grad_norm": 6.902396202087402, "kl": 1.005578637123108, "learning_rate": 4.3750000000000005e-06, "loss": 0.0402, "reward": 0.832996129989624, "reward_std": 0.8886765837669373, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -1.4920037984848022, "rewards/wrapped_format_reward": 0.375, "step": 140 }, { "completion_length": 750.0, "epoch": 5.64, "grad_norm": 1.0129903554916382, "kl": 0.672616183757782, "learning_rate": 4.40625e-06, "loss": 0.0269, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 141 }, { "completion_length": 750.0, "epoch": 5.68, "grad_norm": 0.5481407642364502, "kl": 0.46889528632164, "learning_rate": 4.4375e-06, "loss": 0.0188, "reward": 1.875490665435791, "reward_std": 0.4471076428890228, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3745094835758209, "rewards/wrapped_format_reward": 0.25, "step": 142 }, { "completion_length": 750.0, "epoch": 5.72, "grad_norm": 0.8321424126625061, "kl": 0.7620508074760437, "learning_rate": 4.46875e-06, "loss": 0.0305, "reward": -1.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 143 }, { "completion_length": 750.0, "epoch": 5.76, "grad_norm": 1.097217321395874, "kl": 0.5194978713989258, "learning_rate": 4.5e-06, "loss": 0.0208, "reward": 2.9378552436828613, "reward_std": 0.6964905261993408, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3128551244735718, "rewards/wrapped_format_reward": 0.625, "step": 144 }, { "completion_length": 750.0, "epoch": 5.8, "grad_norm": 1.5057528018951416, "kl": 0.6985796689987183, "learning_rate": 4.53125e-06, "loss": 0.0279, "reward": 0.8791663646697998, "reward_std": 3.2669684886932373, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1208335161209106, "rewards/wrapped_format_reward": 0.5, "step": 145 }, { "completion_length": 750.0, "epoch": 5.84, "grad_norm": 0.9157974123954773, "kl": 0.6999198794364929, "learning_rate": 4.5625e-06, "loss": 0.028, "reward": 2.6941776275634766, "reward_std": 0.41056010127067566, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1941775530576706, "rewards/wrapped_format_reward": 0.5, "step": 146 }, { "completion_length": 738.0, "epoch": 5.88, "grad_norm": 1.194362998008728, "kl": 0.8087308406829834, "learning_rate": 4.59375e-06, "loss": 0.0323, "reward": 1.8218350410461426, "reward_std": 1.240020751953125, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5531650185585022, "rewards/wrapped_format_reward": 0.375, "step": 147 }, { "completion_length": 750.0, "epoch": 5.92, "grad_norm": 1.5525860786437988, "kl": 0.5995261073112488, "learning_rate": 4.625000000000001e-06, "loss": 0.024, "reward": 2.7626960277557373, "reward_std": 0.8373459577560425, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3876959979534149, "rewards/wrapped_format_reward": 0.375, "step": 148 }, { "completion_length": 750.0, "epoch": 5.96, "grad_norm": 0.7849404811859131, "kl": 0.5838685035705566, "learning_rate": 4.6562500000000005e-06, "loss": 0.0234, "reward": 2.0091466903686523, "reward_std": 0.5293838977813721, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.987500011920929, "rewards/wrapped_driving_reward": -0.35335326194763184, "rewards/wrapped_format_reward": 0.375, "step": 149 }, { "completion_length": 750.0, "epoch": 6.0, "grad_norm": 8.725290298461914, "kl": 0.5788177847862244, "learning_rate": 4.6875000000000004e-06, "loss": 0.0232, "reward": 2.588832378387451, "reward_std": 0.47200268507003784, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08883260190486908, "rewards/wrapped_format_reward": 0.5, "step": 150 }, { "completion_length": 750.0, "epoch": 6.04, "grad_norm": 1.5502936840057373, "kl": 0.7367026805877686, "learning_rate": 4.71875e-06, "loss": 0.0295, "reward": 3.0267982482910156, "reward_std": 0.3287774324417114, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5267983675003052, "rewards/wrapped_format_reward": 0.5, "step": 151 }, { "completion_length": 750.0, "epoch": 6.08, "grad_norm": 1.2154241800308228, "kl": 0.6294659972190857, "learning_rate": 4.75e-06, "loss": 0.0252, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 152 }, { "completion_length": 750.0, "epoch": 6.12, "grad_norm": 0.6927015781402588, "kl": 0.466768741607666, "learning_rate": 4.781250000000001e-06, "loss": 0.0187, "reward": 1.2741777896881104, "reward_std": 3.52226185798645, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9758223295211792, "rewards/wrapped_format_reward": 0.75, "step": 153 }, { "completion_length": 750.0, "epoch": 6.16, "grad_norm": 0.49416881799697876, "kl": 0.3236115872859955, "learning_rate": 4.8125e-06, "loss": 0.0129, "reward": 1.7459195852279663, "reward_std": 1.0475239753723145, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6290804147720337, "rewards/wrapped_format_reward": 0.375, "step": 154 }, { "completion_length": 750.0, "epoch": 6.2, "grad_norm": 0.6880607008934021, "kl": 0.4876292645931244, "learning_rate": 4.84375e-06, "loss": 0.0195, "reward": 1.5318889617919922, "reward_std": 1.4943149089813232, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.7597777247428894, "rewards/wrapped_format_reward": 0.375, "step": 155 }, { "completion_length": 750.0, "epoch": 6.24, "grad_norm": 0.46220606565475464, "kl": 0.39146143198013306, "learning_rate": 4.875e-06, "loss": 0.0157, "reward": -1.8253967761993408, "reward_std": 0.5452560186386108, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6746032238006592, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 156 }, { "completion_length": 750.0, "epoch": 6.28, "grad_norm": 0.62086421251297, "kl": 0.4868091642856598, "learning_rate": 4.90625e-06, "loss": 0.0195, "reward": 1.3802235126495361, "reward_std": 3.6146950721740723, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6197764873504639, "rewards/wrapped_format_reward": 0.5, "step": 157 }, { "completion_length": 750.0, "epoch": 6.32, "grad_norm": 0.7884992957115173, "kl": 0.8528112173080444, "learning_rate": 4.937500000000001e-06, "loss": 0.0341, "reward": 0.6898324489593506, "reward_std": 1.9514682292938232, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9351675510406494, "rewards/wrapped_format_reward": 0.625, "step": 158 }, { "completion_length": 750.0, "epoch": 6.36, "grad_norm": 0.5661314129829407, "kl": 0.37956494092941284, "learning_rate": 4.9687500000000005e-06, "loss": 0.0152, "reward": 1.9870556592941284, "reward_std": 0.8114857077598572, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.78125, "rewards/wrapped_driving_reward": -0.16919440031051636, "rewards/wrapped_format_reward": 0.375, "step": 159 }, { "completion_length": 750.0, "epoch": 6.4, "grad_norm": 0.4809100925922394, "kl": 0.3137115240097046, "learning_rate": 5e-06, "loss": 0.0125, "reward": 1.137058973312378, "reward_std": 3.0915822982788086, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8629410266876221, "rewards/wrapped_format_reward": 0.5, "step": 160 }, { "completion_length": 750.0, "epoch": 6.44, "grad_norm": 0.7742670774459839, "kl": 0.7155295610427856, "learning_rate": 4.99999405044338e-06, "loss": 0.0286, "reward": -0.08523339033126831, "reward_std": 3.0506365299224854, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.835233449935913, "rewards/wrapped_format_reward": 0.25, "step": 161 }, { "completion_length": 750.0, "epoch": 6.48, "grad_norm": 0.510188639163971, "kl": 0.45238742232322693, "learning_rate": 4.999976201801837e-06, "loss": 0.0181, "reward": 0.43522799015045166, "reward_std": 3.8238766193389893, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5647720098495483, "rewards/wrapped_format_reward": 0.5, "step": 162 }, { "completion_length": 750.0, "epoch": 6.52, "grad_norm": 0.57041335105896, "kl": 0.6957306861877441, "learning_rate": 4.999946454160323e-06, "loss": 0.0278, "reward": 2.5085318088531494, "reward_std": 0.6753469705581665, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.13353177905082703, "rewards/wrapped_format_reward": 0.375, "step": 163 }, { "completion_length": 750.0, "epoch": 6.5600000000000005, "grad_norm": 1.1680339574813843, "kl": 0.583303689956665, "learning_rate": 4.9999048076604286e-06, "loss": 0.0233, "reward": 2.320394992828369, "reward_std": 0.9881588220596313, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": -0.2889798581600189, "rewards/wrapped_format_reward": 0.625, "step": 164 }, { "completion_length": 750.0, "epoch": 6.6, "grad_norm": 0.4880649447441101, "kl": 0.5123260617256165, "learning_rate": 4.999851262500375e-06, "loss": 0.0205, "reward": 2.9611833095550537, "reward_std": 0.3708806037902832, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08618323504924774, "rewards/wrapped_format_reward": 0.875, "step": 165 }, { "completion_length": 750.0, "epoch": 6.64, "grad_norm": 0.7171524167060852, "kl": 0.6323699951171875, "learning_rate": 4.999785818935018e-06, "loss": 0.0253, "reward": 2.383418083190918, "reward_std": 1.0746413469314575, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.11658180505037308, "rewards/wrapped_format_reward": 0.5, "step": 166 }, { "completion_length": 750.0, "epoch": 6.68, "grad_norm": 0.8785050511360168, "kl": 0.6350330114364624, "learning_rate": 4.999708477275846e-06, "loss": 0.0254, "reward": -1.5, "reward_std": 0.5773502588272095, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 167 }, { "completion_length": 750.0, "epoch": 6.72, "grad_norm": 0.7559798359870911, "kl": 0.8585944771766663, "learning_rate": 4.9996192378909785e-06, "loss": 0.0343, "reward": 1.9524728059768677, "reward_std": 1.5683073997497559, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7975271940231323, "rewards/wrapped_format_reward": 0.75, "step": 168 }, { "completion_length": 750.0, "epoch": 6.76, "grad_norm": 1.71358060836792, "kl": 0.8341420292854309, "learning_rate": 4.999518101205162e-06, "loss": 0.0334, "reward": 1.887375831604004, "reward_std": 1.938693881034851, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7376242876052856, "rewards/wrapped_format_reward": 0.625, "step": 169 }, { "completion_length": 750.0, "epoch": 6.8, "grad_norm": 0.574404776096344, "kl": 0.5479518175125122, "learning_rate": 4.999405067699773e-06, "loss": 0.0219, "reward": 3.3870460987091064, "reward_std": 0.16866222023963928, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6370459198951721, "rewards/wrapped_format_reward": 0.75, "step": 170 }, { "completion_length": 750.0, "epoch": 6.84, "grad_norm": 0.5955837965011597, "kl": 0.3409351110458374, "learning_rate": 4.99928013791281e-06, "loss": 0.0136, "reward": 2.3762004375457764, "reward_std": 0.7474427819252014, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1237996518611908, "rewards/wrapped_format_reward": 0.5, "step": 171 }, { "completion_length": 750.0, "epoch": 6.88, "grad_norm": 0.6897561550140381, "kl": 0.621300458908081, "learning_rate": 4.999143312438893e-06, "loss": 0.0249, "reward": 1.5901850461959839, "reward_std": 0.3631085157394409, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9886363744735718, "rewards/wrapped_driving_reward": -1.148451328277588, "rewards/wrapped_format_reward": 0.75, "step": 172 }, { "completion_length": 750.0, "epoch": 6.92, "grad_norm": 0.5545635223388672, "kl": 0.6974574327468872, "learning_rate": 4.998994591929266e-06, "loss": 0.0279, "reward": -0.6720701456069946, "reward_std": 2.0031087398529053, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -3.047070026397705, "rewards/wrapped_format_reward": 0.375, "step": 173 }, { "completion_length": 750.0, "epoch": 6.96, "grad_norm": 0.5723159313201904, "kl": 0.5589190721511841, "learning_rate": 4.998833977091783e-06, "loss": 0.0224, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 174 }, { "completion_length": 750.0, "epoch": 7.0, "grad_norm": 0.6870840787887573, "kl": 0.7596628665924072, "learning_rate": 4.998661468690914e-06, "loss": 0.0304, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 175 }, { "completion_length": 750.0, "epoch": 7.04, "grad_norm": 0.5063377022743225, "kl": 0.5469480156898499, "learning_rate": 4.99847706754774e-06, "loss": 0.0219, "reward": 1.822561264038086, "reward_std": 0.155478835105896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6774387359619141, "rewards/wrapped_format_reward": 0.5, "step": 176 }, { "completion_length": 739.0, "epoch": 7.08, "grad_norm": 0.5202552080154419, "kl": 0.8028308749198914, "learning_rate": 4.998280774539943e-06, "loss": 0.0321, "reward": 3.8296477794647217, "reward_std": 0.14273938536643982, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.8296477198600769, "rewards/wrapped_format_reward": 1.0, "step": 177 }, { "completion_length": 750.0, "epoch": 7.12, "grad_norm": 0.5187572240829468, "kl": 0.35041287541389465, "learning_rate": 4.998072590601808e-06, "loss": 0.014, "reward": -1.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 178 }, { "completion_length": 750.0, "epoch": 7.16, "grad_norm": 0.6030856370925903, "kl": 0.27407315373420715, "learning_rate": 4.9978525167242176e-06, "loss": 0.011, "reward": 2.659794330596924, "reward_std": 0.6332101225852966, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2847943902015686, "rewards/wrapped_format_reward": 0.375, "step": 179 }, { "completion_length": 750.0, "epoch": 7.2, "grad_norm": 0.9357779622077942, "kl": 0.9146249890327454, "learning_rate": 4.997620553954645e-06, "loss": 0.0366, "reward": 2.8247017860412598, "reward_std": 0.35369452834129333, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -0.01904815435409546, "rewards/wrapped_format_reward": 0.875, "step": 180 }, { "completion_length": 741.0, "epoch": 7.24, "grad_norm": 0.8797070980072021, "kl": 0.39436712861061096, "learning_rate": 4.997376703397151e-06, "loss": 0.0158, "reward": 2.7287731170654297, "reward_std": 0.3972662091255188, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22877322137355804, "rewards/wrapped_format_reward": 0.5, "step": 181 }, { "completion_length": 750.0, "epoch": 7.28, "grad_norm": 0.4403473436832428, "kl": 0.4500048756599426, "learning_rate": 4.9971209662123774e-06, "loss": 0.018, "reward": 2.8664050102233887, "reward_std": 0.34985023736953735, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.16640505194664001, "rewards/wrapped_format_reward": 0.75, "step": 182 }, { "completion_length": 750.0, "epoch": 7.32, "grad_norm": 1.0706084966659546, "kl": 1.0191062688827515, "learning_rate": 4.996853343617542e-06, "loss": 0.0408, "reward": 3.047149658203125, "reward_std": 0.3775829076766968, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1721496433019638, "rewards/wrapped_format_reward": 0.875, "step": 183 }, { "completion_length": 750.0, "epoch": 7.36, "grad_norm": 0.5287938714027405, "kl": 0.6224220395088196, "learning_rate": 4.9965738368864345e-06, "loss": 0.0249, "reward": 0.23351562023162842, "reward_std": 2.851604700088501, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5164843797683716, "rewards/wrapped_format_reward": 0.25, "step": 184 }, { "completion_length": 750.0, "epoch": 7.4, "grad_norm": 0.672430157661438, "kl": 0.666401207447052, "learning_rate": 4.996282447349408e-06, "loss": 0.0267, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 185 }, { "completion_length": 750.0, "epoch": 7.44, "grad_norm": 0.630480170249939, "kl": 0.44004517793655396, "learning_rate": 4.995979176393372e-06, "loss": 0.0176, "reward": -1.5, "reward_std": 0.5773502588272095, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 186 }, { "completion_length": 750.0, "epoch": 7.48, "grad_norm": 0.7747740149497986, "kl": 0.5829688310623169, "learning_rate": 4.99566402546179e-06, "loss": 0.0233, "reward": 2.4287631511688232, "reward_std": 0.47438302636146545, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.07123684883117676, "rewards/wrapped_format_reward": 0.5, "step": 187 }, { "completion_length": 750.0, "epoch": 7.52, "grad_norm": 1.1100307703018188, "kl": 0.8917567133903503, "learning_rate": 4.995336996054668e-06, "loss": 0.0357, "reward": 2.5867562294006348, "reward_std": 0.7861148118972778, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08675637096166611, "rewards/wrapped_format_reward": 0.5, "step": 188 }, { "completion_length": 750.0, "epoch": 7.5600000000000005, "grad_norm": 1.0037741661071777, "kl": 1.1412770748138428, "learning_rate": 4.99499808972855e-06, "loss": 0.0457, "reward": 2.342970609664917, "reward_std": 0.25529083609580994, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09297055006027222, "rewards/wrapped_format_reward": 0.25, "step": 189 }, { "completion_length": 750.0, "epoch": 7.6, "grad_norm": 0.5152557492256165, "kl": 0.46443066000938416, "learning_rate": 4.994647308096509e-06, "loss": 0.0186, "reward": 0.16651475429534912, "reward_std": 2.873490333557129, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.5834852457046509, "rewards/wrapped_format_reward": 0.25, "step": 190 }, { "completion_length": 750.0, "epoch": 7.64, "grad_norm": 0.9836487174034119, "kl": 0.8981544971466064, "learning_rate": 4.994284652828143e-06, "loss": 0.0359, "reward": -1.587499976158142, "reward_std": 0.480234295129776, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9125000238418579, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 191 }, { "completion_length": 750.0, "epoch": 7.68, "grad_norm": 0.5183222889900208, "kl": 0.600283145904541, "learning_rate": 4.993910125649561e-06, "loss": 0.024, "reward": 2.7084898948669434, "reward_std": 1.0434271097183228, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.04151032119989395, "rewards/wrapped_format_reward": 0.75, "step": 192 }, { "completion_length": 750.0, "epoch": 7.72, "grad_norm": 0.5068610310554504, "kl": 0.7425740361213684, "learning_rate": 4.99352372834338e-06, "loss": 0.0297, "reward": 1.2116459608078003, "reward_std": 3.281907558441162, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6785714626312256, "rewards/wrapped_driving_reward": -0.7169255018234253, "rewards/wrapped_format_reward": 0.5, "step": 193 }, { "completion_length": 750.0, "epoch": 7.76, "grad_norm": 0.7546151876449585, "kl": 0.5269922614097595, "learning_rate": 4.993125462748714e-06, "loss": 0.0211, "reward": 2.08638334274292, "reward_std": 0.4966030418872833, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08638344705104828, "rewards/wrapped_format_reward": 0.0, "step": 194 }, { "completion_length": 750.0, "epoch": 7.8, "grad_norm": 0.6085935831069946, "kl": 0.7513608932495117, "learning_rate": 4.992715330761167e-06, "loss": 0.0301, "reward": 0.6888164281845093, "reward_std": 2.792956829071045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.936183512210846, "rewards/wrapped_format_reward": 0.125, "step": 195 }, { "completion_length": 750.0, "epoch": 7.84, "grad_norm": 0.6035264730453491, "kl": 0.49863916635513306, "learning_rate": 4.992293334332821e-06, "loss": 0.0199, "reward": 2.2989866733551025, "reward_std": 0.7346133589744568, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.3260132670402527, "rewards/wrapped_format_reward": 0.75, "step": 196 }, { "completion_length": 750.0, "epoch": 7.88, "grad_norm": 0.6059936285018921, "kl": 0.5347932577133179, "learning_rate": 4.9918594754722286e-06, "loss": 0.0214, "reward": 0.0036406517028808594, "reward_std": 3.585599660873413, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.9963593482971191, "rewards/wrapped_format_reward": 0.5, "step": 197 }, { "completion_length": 750.0, "epoch": 7.92, "grad_norm": 0.6184878945350647, "kl": 0.45021745562553406, "learning_rate": 4.991413756244404e-06, "loss": 0.018, "reward": 3.160332679748535, "reward_std": 0.3758719265460968, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5353326201438904, "rewards/wrapped_format_reward": 0.625, "step": 198 }, { "completion_length": 750.0, "epoch": 7.96, "grad_norm": 0.5479530096054077, "kl": 0.5874747633934021, "learning_rate": 4.990956178770814e-06, "loss": 0.0235, "reward": 2.7277915477752686, "reward_std": 0.21160702407360077, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22779148817062378, "rewards/wrapped_format_reward": 0.5, "step": 199 }, { "completion_length": 552.0, "epoch": 8.0, "grad_norm": 0.8959174156188965, "kl": 0.7743228077888489, "learning_rate": 4.990486745229364e-06, "loss": 0.031, "reward": -1.75, "reward_std": 0.5692750215530396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 200 }, { "completion_length": 750.0, "epoch": 8.04, "grad_norm": 0.5334154367446899, "kl": 0.5155819654464722, "learning_rate": 4.990005457854392e-06, "loss": 0.0206, "reward": 2.358083963394165, "reward_std": 0.6079920530319214, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.05858280509710312, "rewards/wrapped_format_reward": 0.5, "step": 201 }, { "completion_length": 750.0, "epoch": 8.08, "grad_norm": 0.8115833401679993, "kl": 0.7826488614082336, "learning_rate": 4.989512318936654e-06, "loss": 0.0313, "reward": 2.9173591136932373, "reward_std": 0.7723499536514282, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.31508636474609375, "rewards/wrapped_format_reward": 0.625, "step": 202 }, { "completion_length": 750.0, "epoch": 8.12, "grad_norm": 1.8089442253112793, "kl": 0.6300475597381592, "learning_rate": 4.989007330823319e-06, "loss": 0.0252, "reward": 2.464756488800049, "reward_std": 0.43305322527885437, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2147563397884369, "rewards/wrapped_format_reward": 0.25, "step": 203 }, { "completion_length": 750.0, "epoch": 8.16, "grad_norm": 19.12211799621582, "kl": 5.240139484405518, "learning_rate": 4.988490495917948e-06, "loss": 0.2096, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 204 }, { "completion_length": 750.0, "epoch": 8.2, "grad_norm": 0.7113471627235413, "kl": 0.6434310674667358, "learning_rate": 4.987961816680493e-06, "loss": 0.0257, "reward": 0.7629947662353516, "reward_std": 3.2651803493499756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.237005352973938, "rewards/wrapped_format_reward": 0.5, "step": 205 }, { "completion_length": 726.0, "epoch": 8.24, "grad_norm": 0.5587561726570129, "kl": 0.759925365447998, "learning_rate": 4.987421295627279e-06, "loss": 0.0304, "reward": 1.319366455078125, "reward_std": 2.2250618934631348, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8056334257125854, "rewards/wrapped_format_reward": 0.125, "step": 206 }, { "completion_length": 750.0, "epoch": 8.28, "grad_norm": 0.5961654186248779, "kl": 0.5305419564247131, "learning_rate": 4.986868935330998e-06, "loss": 0.0212, "reward": 2.6727981567382812, "reward_std": 0.3446647524833679, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.047798238694667816, "rewards/wrapped_format_reward": 0.625, "step": 207 }, { "completion_length": 750.0, "epoch": 8.32, "grad_norm": 0.7032870650291443, "kl": 0.6918764114379883, "learning_rate": 4.986304738420684e-06, "loss": 0.0277, "reward": -0.13961869478225708, "reward_std": 3.194305896759033, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6937500238418579, "rewards/wrapped_driving_reward": -2.0833687782287598, "rewards/wrapped_format_reward": 0.5, "step": 208 }, { "completion_length": 750.0, "epoch": 8.36, "grad_norm": 1.351664662361145, "kl": 0.44911444187164307, "learning_rate": 4.985728707581717e-06, "loss": 0.018, "reward": 2.386528968811035, "reward_std": 0.66311115026474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8333333730697632, "rewards/wrapped_driving_reward": 0.178195521235466, "rewards/wrapped_format_reward": 0.375, "step": 209 }, { "completion_length": 750.0, "epoch": 8.4, "grad_norm": 0.8452834486961365, "kl": 0.8271775841712952, "learning_rate": 4.985140845555799e-06, "loss": 0.0331, "reward": 1.1259584426879883, "reward_std": 2.119089365005493, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8740414381027222, "rewards/wrapped_format_reward": 1.0, "step": 210 }, { "completion_length": 750.0, "epoch": 8.44, "grad_norm": 1.1207107305526733, "kl": 0.8840889930725098, "learning_rate": 4.984541155140945e-06, "loss": 0.0354, "reward": 2.5146608352661133, "reward_std": 0.5953065156936646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.31466078758239746, "rewards/wrapped_format_reward": 0.25, "step": 211 }, { "completion_length": 750.0, "epoch": 8.48, "grad_norm": 0.7914042472839355, "kl": 0.9506034851074219, "learning_rate": 4.9839296391914696e-06, "loss": 0.038, "reward": 3.054720401763916, "reward_std": 0.5219823122024536, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1797204613685608, "rewards/wrapped_format_reward": 0.875, "step": 212 }, { "completion_length": 750.0, "epoch": 8.52, "grad_norm": 0.7439658045768738, "kl": 0.5630563497543335, "learning_rate": 4.98330630061797e-06, "loss": 0.0225, "reward": -0.11316037178039551, "reward_std": 3.441740036010742, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.7381603717803955, "rewards/wrapped_format_reward": 0.125, "step": 213 }, { "completion_length": 750.0, "epoch": 8.56, "grad_norm": 1.1477609872817993, "kl": 1.0046734809875488, "learning_rate": 4.982671142387316e-06, "loss": 0.0402, "reward": 3.0662035942077637, "reward_std": 0.0674816370010376, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5662035346031189, "rewards/wrapped_format_reward": 0.5, "step": 214 }, { "completion_length": 750.0, "epoch": 8.6, "grad_norm": 0.46614599227905273, "kl": 0.7210499048233032, "learning_rate": 4.982024167522638e-06, "loss": 0.0288, "reward": 2.4149999618530273, "reward_std": 0.46334606409072876, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04000008851289749, "rewards/wrapped_format_reward": 0.375, "step": 215 }, { "completion_length": 636.0, "epoch": 8.64, "grad_norm": 0.550269603729248, "kl": 0.9145827293395996, "learning_rate": 4.981365379103306e-06, "loss": 0.0366, "reward": 3.2154905796051025, "reward_std": 0.5181944370269775, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7154906988143921, "rewards/wrapped_format_reward": 0.5, "step": 216 }, { "completion_length": 750.0, "epoch": 8.68, "grad_norm": 0.44090768694877625, "kl": 0.9645712971687317, "learning_rate": 4.980694780264918e-06, "loss": 0.0386, "reward": -1.274999976158142, "reward_std": 0.4856266975402832, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 217 }, { "completion_length": 750.0, "epoch": 8.72, "grad_norm": 1.1925650835037231, "kl": 0.8605116605758667, "learning_rate": 4.980012374199288e-06, "loss": 0.0344, "reward": 2.6870789527893066, "reward_std": 0.7157343626022339, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.0629209354519844, "rewards/wrapped_format_reward": 0.75, "step": 218 }, { "completion_length": 750.0, "epoch": 8.76, "grad_norm": 0.6395624876022339, "kl": 0.8992162942886353, "learning_rate": 4.979318164154426e-06, "loss": 0.036, "reward": -1.28125, "reward_std": 0.32874444127082825, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 219 }, { "completion_length": 750.0, "epoch": 8.8, "grad_norm": 0.5243760943412781, "kl": 0.7105860710144043, "learning_rate": 4.978612153434527e-06, "loss": 0.0284, "reward": 3.133758783340454, "reward_std": 0.4695283770561218, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.25875866413116455, "rewards/wrapped_format_reward": 0.875, "step": 220 }, { "completion_length": 750.0, "epoch": 8.84, "grad_norm": 0.5799823999404907, "kl": 0.8737132549285889, "learning_rate": 4.97789434539995e-06, "loss": 0.0349, "reward": 1.885632038116455, "reward_std": 1.8393102884292603, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8643680214881897, "rewards/wrapped_format_reward": 0.75, "step": 221 }, { "completion_length": 750.0, "epoch": 8.88, "grad_norm": 0.46507924795150757, "kl": 0.9933305382728577, "learning_rate": 4.977164743467206e-06, "loss": 0.0397, "reward": -2.049999952316284, "reward_std": 0.8225975036621094, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 222 }, { "completion_length": 750.0, "epoch": 8.92, "grad_norm": 0.557388961315155, "kl": 0.6470035314559937, "learning_rate": 4.976423351108943e-06, "loss": 0.0259, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 223 }, { "completion_length": 750.0, "epoch": 8.96, "grad_norm": 0.7340157628059387, "kl": 0.6879977583885193, "learning_rate": 4.975670171853926e-06, "loss": 0.0275, "reward": 2.4769201278686523, "reward_std": 0.25078660249710083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.10192020237445831, "rewards/wrapped_format_reward": 0.5, "step": 224 }, { "completion_length": 750.0, "epoch": 9.0, "grad_norm": 1.4428342580795288, "kl": 1.1961815357208252, "learning_rate": 4.97490520928702e-06, "loss": 0.0478, "reward": 3.0022120475769043, "reward_std": 0.3715779185295105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6272119879722595, "rewards/wrapped_format_reward": 0.375, "step": 225 }, { "completion_length": 750.0, "epoch": 9.04, "grad_norm": 0.5274134874343872, "kl": 0.8709424734115601, "learning_rate": 4.974128467049177e-06, "loss": 0.0348, "reward": 1.142529845237732, "reward_std": 3.107614040374756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9824702143669128, "rewards/wrapped_format_reward": 0.625, "step": 226 }, { "completion_length": 750.0, "epoch": 9.08, "grad_norm": 0.7395073771476746, "kl": 0.2381790727376938, "learning_rate": 4.9733399488374115e-06, "loss": 0.0095, "reward": 3.141404628753662, "reward_std": 0.23124907910823822, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14140476286411285, "rewards/wrapped_format_reward": 1.0, "step": 227 }, { "completion_length": 750.0, "epoch": 9.12, "grad_norm": 0.5715224742889404, "kl": 0.47389835119247437, "learning_rate": 4.972539658404793e-06, "loss": 0.019, "reward": 2.177046775817871, "reward_std": 0.8025394678115845, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.32295334339141846, "rewards/wrapped_format_reward": 0.625, "step": 228 }, { "completion_length": 750.0, "epoch": 9.16, "grad_norm": 0.614587128162384, "kl": 0.877197802066803, "learning_rate": 4.971727599560418e-06, "loss": 0.0351, "reward": 2.604313373565674, "reward_std": 0.43783459067344666, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.020686477422714233, "rewards/wrapped_format_reward": 0.625, "step": 229 }, { "completion_length": 750.0, "epoch": 9.2, "grad_norm": 0.49547889828681946, "kl": 0.8945198655128479, "learning_rate": 4.970903776169403e-06, "loss": 0.0358, "reward": 2.9887380599975586, "reward_std": 0.49843111634254456, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.3887380361557007, "rewards/wrapped_format_reward": 0.625, "step": 230 }, { "completion_length": 750.0, "epoch": 9.24, "grad_norm": 0.6391835808753967, "kl": 0.8877568244934082, "learning_rate": 4.9700681921528495e-06, "loss": 0.0355, "reward": 2.256884813308716, "reward_std": 0.810869038105011, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": -0.04668661952018738, "rewards/wrapped_format_reward": 0.375, "step": 231 }, { "completion_length": 750.0, "epoch": 9.28, "grad_norm": 0.7567249536514282, "kl": 0.9957519769668579, "learning_rate": 4.9692208514878445e-06, "loss": 0.0398, "reward": 2.5888562202453613, "reward_std": 0.6119778156280518, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9565972089767456, "rewards/wrapped_driving_reward": 0.13225889205932617, "rewards/wrapped_format_reward": 0.5, "step": 232 }, { "completion_length": 750.0, "epoch": 9.32, "grad_norm": 0.63723224401474, "kl": 0.8475660681724548, "learning_rate": 4.968361758207428e-06, "loss": 0.0339, "reward": -1.8181817531585693, "reward_std": 0.23764224350452423, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9318181872367859, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 233 }, { "completion_length": 750.0, "epoch": 9.36, "grad_norm": 0.48718228936195374, "kl": 1.1619231700897217, "learning_rate": 4.9674909164005805e-06, "loss": 0.0465, "reward": 3.505831718444824, "reward_std": 0.3402004837989807, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6308315992355347, "rewards/wrapped_format_reward": 0.875, "step": 234 }, { "completion_length": 676.0, "epoch": 9.4, "grad_norm": 0.8502306342124939, "kl": 1.0721936225891113, "learning_rate": 4.966608330212198e-06, "loss": 0.0429, "reward": 3.320247173309326, "reward_std": 0.5307442545890808, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5702470541000366, "rewards/wrapped_format_reward": 0.75, "step": 235 }, { "completion_length": 635.0, "epoch": 9.44, "grad_norm": 0.6630327105522156, "kl": 0.8887981176376343, "learning_rate": 4.965714003843079e-06, "loss": 0.0356, "reward": 2.481696844100952, "reward_std": 0.16960932314395905, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.018303271383047104, "rewards/wrapped_format_reward": 0.5, "step": 236 }, { "completion_length": 750.0, "epoch": 9.48, "grad_norm": 0.45822253823280334, "kl": 0.851169764995575, "learning_rate": 4.9648079415499e-06, "loss": 0.034, "reward": -1.8214285373687744, "reward_std": 0.5639389753341675, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 237 }, { "completion_length": 750.0, "epoch": 9.52, "grad_norm": 0.41027647256851196, "kl": 1.0325894355773926, "learning_rate": 4.963890147645195e-06, "loss": 0.0413, "reward": 2.844029426574707, "reward_std": 0.30431851744651794, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.030970722436904907, "rewards/wrapped_format_reward": 0.875, "step": 238 }, { "completion_length": 750.0, "epoch": 9.56, "grad_norm": 3.95577073097229, "kl": 1.8168144226074219, "learning_rate": 4.962960626497339e-06, "loss": 0.0727, "reward": 2.185295343399048, "reward_std": 0.4056129455566406, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.31470465660095215, "rewards/wrapped_format_reward": 0.5, "step": 239 }, { "completion_length": 750.0, "epoch": 9.6, "grad_norm": 0.84366774559021, "kl": 1.09842050075531, "learning_rate": 4.962019382530521e-06, "loss": 0.0439, "reward": 3.1190567016601562, "reward_std": 0.16080652177333832, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1190568059682846, "rewards/wrapped_format_reward": 1.0, "step": 240 }, { "completion_length": 750.0, "epoch": 9.64, "grad_norm": 0.40419045090675354, "kl": 1.4857383966445923, "learning_rate": 4.961066420224729e-06, "loss": 0.0594, "reward": 2.7421183586120605, "reward_std": 0.3939764201641083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.007881544530391693, "rewards/wrapped_format_reward": 0.75, "step": 241 }, { "completion_length": 703.0, "epoch": 9.68, "grad_norm": 0.5581960678100586, "kl": 0.8319287896156311, "learning_rate": 4.960101744115727e-06, "loss": 0.0333, "reward": 1.3577722311019897, "reward_std": 0.8074946999549866, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8922277688980103, "rewards/wrapped_format_reward": 0.25, "step": 242 }, { "completion_length": 750.0, "epoch": 9.72, "grad_norm": 1.4876646995544434, "kl": 1.1022382974624634, "learning_rate": 4.959125358795031e-06, "loss": 0.0441, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 243 }, { "completion_length": 733.0, "epoch": 9.76, "grad_norm": 0.7890307903289795, "kl": 1.5635696649551392, "learning_rate": 4.958137268909887e-06, "loss": 0.0625, "reward": 1.0345841646194458, "reward_std": 2.7603647708892822, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3404158353805542, "rewards/wrapped_format_reward": 0.875, "step": 244 }, { "completion_length": 750.0, "epoch": 9.8, "grad_norm": 0.44861987233161926, "kl": 0.25233525037765503, "learning_rate": 4.957137479163253e-06, "loss": 0.0101, "reward": -1.7541667222976685, "reward_std": 0.3909568190574646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8708333373069763, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 245 }, { "completion_length": 533.0, "epoch": 9.84, "grad_norm": 0.5020561218261719, "kl": 0.9620947241783142, "learning_rate": 4.956125994313775e-06, "loss": 0.0385, "reward": 3.3699028491973877, "reward_std": 0.5193167924880981, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4949028491973877, "rewards/wrapped_format_reward": 0.875, "step": 246 }, { "completion_length": 750.0, "epoch": 9.88, "grad_norm": 0.7062340974807739, "kl": 0.9898033738136292, "learning_rate": 4.95510281917576e-06, "loss": 0.0396, "reward": -1.875, "reward_std": 1.1814539432525635, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 247 }, { "completion_length": 750.0, "epoch": 9.92, "grad_norm": 0.44811582565307617, "kl": 0.43252551555633545, "learning_rate": 4.9540679586191605e-06, "loss": 0.0173, "reward": 2.317924976348877, "reward_std": 0.17351354658603668, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06792493164539337, "rewards/wrapped_format_reward": 0.25, "step": 248 }, { "completion_length": 750.0, "epoch": 9.96, "grad_norm": 0.47783583402633667, "kl": 0.9962712526321411, "learning_rate": 4.953021417569545e-06, "loss": 0.0399, "reward": 3.022937059402466, "reward_std": 0.4499557316303253, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14793699979782104, "rewards/wrapped_format_reward": 0.875, "step": 249 }, { "completion_length": 750.0, "epoch": 10.0, "grad_norm": 0.5202720761299133, "kl": 0.5016875863075256, "learning_rate": 4.9519632010080765e-06, "loss": 0.0201, "reward": 1.3368468284606934, "reward_std": 3.5631000995635986, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6631531715393066, "rewards/wrapped_format_reward": 0.5, "step": 250 }, { "completion_length": 750.0, "epoch": 10.04, "grad_norm": 0.889390766620636, "kl": 1.2343968152999878, "learning_rate": 4.950893313971492e-06, "loss": 0.0494, "reward": 3.506786346435547, "reward_std": 0.3962436020374298, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9444444179534912, "rewards/wrapped_driving_reward": 0.8123420476913452, "rewards/wrapped_format_reward": 0.75, "step": 251 }, { "completion_length": 750.0, "epoch": 10.08, "grad_norm": 0.5827829241752625, "kl": 0.948403000831604, "learning_rate": 4.949811761552074e-06, "loss": 0.0379, "reward": 2.5721993446350098, "reward_std": 0.5560285449028015, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.011133967898786068, "rewards/wrapped_format_reward": 0.625, "step": 252 }, { "completion_length": 750.0, "epoch": 10.12, "grad_norm": 0.5650044679641724, "kl": 0.9299434423446655, "learning_rate": 4.9487185488976284e-06, "loss": 0.0372, "reward": -1.716269850730896, "reward_std": 0.5084477663040161, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.908730149269104, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 253 }, { "completion_length": 599.0, "epoch": 10.16, "grad_norm": 0.4880934953689575, "kl": 0.7951986789703369, "learning_rate": 4.94761368121146e-06, "loss": 0.0318, "reward": 2.573094367980957, "reward_std": 0.27557268738746643, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1769055724143982, "rewards/wrapped_format_reward": 0.75, "step": 254 }, { "completion_length": 750.0, "epoch": 10.2, "grad_norm": 0.8892874121665955, "kl": 0.7362837195396423, "learning_rate": 4.9464971637523465e-06, "loss": 0.0295, "reward": -1.28125, "reward_std": 0.4827762544155121, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 255 }, { "completion_length": 750.0, "epoch": 10.24, "grad_norm": 0.8154737949371338, "kl": 0.9433515667915344, "learning_rate": 4.9453690018345144e-06, "loss": 0.0377, "reward": 1.883481502532959, "reward_std": 0.9224264025688171, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4915185868740082, "rewards/wrapped_format_reward": 0.375, "step": 256 }, { "completion_length": 750.0, "epoch": 10.28, "grad_norm": 0.587221086025238, "kl": 0.7820435166358948, "learning_rate": 4.944229200827616e-06, "loss": 0.0313, "reward": -1.1607142686843872, "reward_std": 0.23600271344184875, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 257 }, { "completion_length": 750.0, "epoch": 10.32, "grad_norm": 0.7322145700454712, "kl": 0.9088730812072754, "learning_rate": 4.943077766156698e-06, "loss": 0.0364, "reward": 0.9441255927085876, "reward_std": 1.5783616304397583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -1.910041093826294, "rewards/wrapped_format_reward": 0.875, "step": 258 }, { "completion_length": 750.0, "epoch": 10.36, "grad_norm": 0.7966383099555969, "kl": 1.125408411026001, "learning_rate": 4.941914703302181e-06, "loss": 0.045, "reward": 2.580202102661133, "reward_std": 0.40770646929740906, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08020199090242386, "rewards/wrapped_format_reward": 0.5, "step": 259 }, { "completion_length": 695.0, "epoch": 10.4, "grad_norm": 0.4875122308731079, "kl": 0.8961836695671082, "learning_rate": 4.9407400177998335e-06, "loss": 0.0358, "reward": 2.2389979362487793, "reward_std": 0.7594300508499146, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7610019445419312, "rewards/wrapped_format_reward": 1.0, "step": 260 }, { "completion_length": 612.0, "epoch": 10.44, "grad_norm": 0.8443101048469543, "kl": 0.8345216512680054, "learning_rate": 4.939553715240741e-06, "loss": 0.0334, "reward": 2.9486937522888184, "reward_std": 0.7755388617515564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.05130642652511597, "rewards/wrapped_format_reward": 1.0, "step": 261 }, { "completion_length": 750.0, "epoch": 10.48, "grad_norm": 0.4315735697746277, "kl": 0.5944791436195374, "learning_rate": 4.938355801271282e-06, "loss": 0.0238, "reward": -0.26047587394714355, "reward_std": 3.4582109451293945, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.6354758739471436, "rewards/wrapped_format_reward": 0.375, "step": 262 }, { "completion_length": 750.0, "epoch": 10.52, "grad_norm": 0.4449390172958374, "kl": 1.0638983249664307, "learning_rate": 4.937146281593103e-06, "loss": 0.0426, "reward": 3.349001407623291, "reward_std": 0.18792293965816498, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7240012884140015, "rewards/wrapped_format_reward": 0.625, "step": 263 }, { "completion_length": 750.0, "epoch": 10.56, "grad_norm": 0.5087334513664246, "kl": 0.9471940994262695, "learning_rate": 4.935925161963089e-06, "loss": 0.0379, "reward": -1.625, "reward_std": 1.25, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 264 }, { "completion_length": 732.0, "epoch": 10.6, "grad_norm": 0.5004269480705261, "kl": 0.9943680167198181, "learning_rate": 4.9346924481933345e-06, "loss": 0.0398, "reward": 3.4356508255004883, "reward_std": 0.5672562122344971, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.7273174524307251, "rewards/wrapped_format_reward": 0.75, "step": 265 }, { "completion_length": 750.0, "epoch": 10.64, "grad_norm": 0.39916032552719116, "kl": 1.0476347208023071, "learning_rate": 4.933448146151122e-06, "loss": 0.0419, "reward": 2.414046049118042, "reward_std": 0.3546769917011261, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8392857313156128, "rewards/wrapped_driving_reward": 0.07476034015417099, "rewards/wrapped_format_reward": 0.5, "step": 266 }, { "completion_length": 735.0, "epoch": 10.68, "grad_norm": 0.4085545241832733, "kl": 0.9289141297340393, "learning_rate": 4.932192261758885e-06, "loss": 0.0372, "reward": -1.5, "reward_std": 0.5773502588272095, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 267 }, { "completion_length": 750.0, "epoch": 10.72, "grad_norm": 0.7282282114028931, "kl": 0.5237314701080322, "learning_rate": 4.930924800994192e-06, "loss": 0.0209, "reward": 1.163808822631836, "reward_std": 3.132412910461426, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9611911177635193, "rewards/wrapped_format_reward": 0.625, "step": 268 }, { "completion_length": 750.0, "epoch": 10.76, "grad_norm": 0.5592875480651855, "kl": 1.2230463027954102, "learning_rate": 4.929645769889704e-06, "loss": 0.0489, "reward": 1.8117026090621948, "reward_std": 1.2372390031814575, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9382973909378052, "rewards/wrapped_format_reward": 0.75, "step": 269 }, { "completion_length": 666.0, "epoch": 10.8, "grad_norm": 0.6668244004249573, "kl": 1.1006290912628174, "learning_rate": 4.928355174533153e-06, "loss": 0.044, "reward": 0.7610301971435547, "reward_std": 1.6584932804107666, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.113969564437866, "rewards/wrapped_format_reward": 0.875, "step": 270 }, { "completion_length": 750.0, "epoch": 10.84, "grad_norm": 0.9733495712280273, "kl": 0.7359632253646851, "learning_rate": 4.927053021067321e-06, "loss": 0.0294, "reward": 2.9857444763183594, "reward_std": 0.5656386613845825, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.11074452847242355, "rewards/wrapped_format_reward": 0.875, "step": 271 }, { "completion_length": 750.0, "epoch": 10.88, "grad_norm": 0.9902675747871399, "kl": 0.6186420321464539, "learning_rate": 4.925739315689991e-06, "loss": 0.0247, "reward": 2.7555360794067383, "reward_std": 0.022727251052856445, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9886363744735718, "rewards/wrapped_driving_reward": 0.7668997645378113, "rewards/wrapped_format_reward": 0.0, "step": 272 }, { "completion_length": 750.0, "epoch": 10.92, "grad_norm": 0.5004103183746338, "kl": 1.1375739574432373, "learning_rate": 4.924414064653938e-06, "loss": 0.0455, "reward": 2.6497280597686768, "reward_std": 0.5490097999572754, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14972800016403198, "rewards/wrapped_format_reward": 0.5, "step": 273 }, { "completion_length": 615.0, "epoch": 10.96, "grad_norm": 0.686726450920105, "kl": 0.96458899974823, "learning_rate": 4.923077274266886e-06, "loss": 0.0386, "reward": 2.8679394721984863, "reward_std": 0.5836524367332458, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9027777910232544, "rewards/wrapped_driving_reward": 0.2151617854833603, "rewards/wrapped_format_reward": 0.75, "step": 274 }, { "completion_length": 750.0, "epoch": 11.0, "grad_norm": 0.41074368357658386, "kl": 0.7832292914390564, "learning_rate": 4.9217289508914836e-06, "loss": 0.0313, "reward": 2.4006309509277344, "reward_std": 0.9599378108978271, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.34936898946762085, "rewards/wrapped_format_reward": 0.75, "step": 275 }, { "completion_length": 417.0, "epoch": 11.04, "grad_norm": 0.6297289133071899, "kl": 0.5816258788108826, "learning_rate": 4.92036910094527e-06, "loss": 0.0233, "reward": 2.59334397315979, "reward_std": 0.5557723045349121, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09334398806095123, "rewards/wrapped_format_reward": 0.5, "step": 276 }, { "completion_length": 534.0, "epoch": 11.08, "grad_norm": 0.5123348832130432, "kl": 0.8924129605293274, "learning_rate": 4.91899773090065e-06, "loss": 0.0357, "reward": 1.080291986465454, "reward_std": 1.076037883758545, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9861111044883728, "rewards/wrapped_driving_reward": -1.7808191776275635, "rewards/wrapped_format_reward": 0.875, "step": 277 }, { "completion_length": 599.0, "epoch": 11.12, "grad_norm": 0.46222391724586487, "kl": 0.4757728576660156, "learning_rate": 4.917614847284858e-06, "loss": 0.019, "reward": 2.672243118286133, "reward_std": 0.4222791790962219, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -0.3069234788417816, "rewards/wrapped_format_reward": 1.0, "step": 278 }, { "completion_length": 749.0, "epoch": 11.16, "grad_norm": 0.586867094039917, "kl": 1.4221863746643066, "learning_rate": 4.91622045667993e-06, "loss": 0.0569, "reward": 2.0654354095458984, "reward_std": 2.719116687774658, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6845643520355225, "rewards/wrapped_format_reward": 0.75, "step": 279 }, { "completion_length": 750.0, "epoch": 11.2, "grad_norm": 0.4978845417499542, "kl": 0.7794169783592224, "learning_rate": 4.914814565722671e-06, "loss": 0.0312, "reward": 3.286668300628662, "reward_std": 0.5568961501121521, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5366683602333069, "rewards/wrapped_format_reward": 0.75, "step": 280 }, { "completion_length": 750.0, "epoch": 11.24, "grad_norm": 1.042169213294983, "kl": 1.359074592590332, "learning_rate": 4.913397181104623e-06, "loss": 0.0544, "reward": 0.6235643029212952, "reward_std": 2.8482930660247803, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.6264357566833496, "rewards/wrapped_format_reward": 0.75, "step": 281 }, { "completion_length": 750.0, "epoch": 11.28, "grad_norm": 0.6336880922317505, "kl": 1.260665774345398, "learning_rate": 4.9119683095720325e-06, "loss": 0.0504, "reward": 3.2773139476776123, "reward_std": 0.8147690892219543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.6523139476776123, "rewards/wrapped_format_reward": 0.875, "step": 282 }, { "completion_length": 750.0, "epoch": 11.32, "grad_norm": 0.5326210260391235, "kl": 1.2928297519683838, "learning_rate": 4.9105279579258234e-06, "loss": 0.0517, "reward": 3.1767990589141846, "reward_std": 0.6041759252548218, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4267989993095398, "rewards/wrapped_format_reward": 0.75, "step": 283 }, { "completion_length": 520.0, "epoch": 11.36, "grad_norm": 0.48858872056007385, "kl": 0.4190160632133484, "learning_rate": 4.909076133021558e-06, "loss": 0.0168, "reward": -1.4375, "reward_std": 0.5907269716262817, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 284 }, { "completion_length": 676.0, "epoch": 11.4, "grad_norm": 0.43427374958992004, "kl": 1.0146484375, "learning_rate": 4.907612841769407e-06, "loss": 0.0406, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 285 }, { "completion_length": 615.0, "epoch": 11.44, "grad_norm": 0.5486767888069153, "kl": 0.7701943516731262, "learning_rate": 4.906138091134118e-06, "loss": 0.0308, "reward": 3.0628390312194824, "reward_std": 0.11028631031513214, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06283894926309586, "rewards/wrapped_format_reward": 1.0, "step": 286 }, { "completion_length": 595.0, "epoch": 11.48, "grad_norm": 0.4506373107433319, "kl": 0.901443600654602, "learning_rate": 4.904651888134982e-06, "loss": 0.0361, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 287 }, { "completion_length": 744.0, "epoch": 11.52, "grad_norm": 0.4597472846508026, "kl": 1.1629990339279175, "learning_rate": 4.903154239845798e-06, "loss": 0.0465, "reward": 2.7203586101531982, "reward_std": 0.7249525785446167, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.84375, "rewards/wrapped_driving_reward": 0.0016086697578430176, "rewards/wrapped_format_reward": 0.875, "step": 288 }, { "completion_length": 750.0, "epoch": 11.56, "grad_norm": 0.5143890976905823, "kl": 0.9513099789619446, "learning_rate": 4.901645153394838e-06, "loss": 0.0381, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 289 }, { "completion_length": 750.0, "epoch": 11.6, "grad_norm": 0.532035768032074, "kl": 0.6954802870750427, "learning_rate": 4.900124635964823e-06, "loss": 0.0278, "reward": 3.240325689315796, "reward_std": 0.25314152240753174, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.2681034803390503, "rewards/wrapped_format_reward": 1.0, "step": 290 }, { "completion_length": 750.0, "epoch": 11.64, "grad_norm": 0.7568380832672119, "kl": 0.890608549118042, "learning_rate": 4.898592694792871e-06, "loss": 0.0356, "reward": 3.097019672393799, "reward_std": 0.5597526431083679, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22201967239379883, "rewards/wrapped_format_reward": 0.875, "step": 291 }, { "completion_length": 597.0, "epoch": 11.68, "grad_norm": 0.5061165690422058, "kl": 0.8536003232002258, "learning_rate": 4.897049337170483e-06, "loss": 0.0341, "reward": 2.722294330596924, "reward_std": 0.21757638454437256, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9444444179534912, "rewards/wrapped_driving_reward": 0.15284982323646545, "rewards/wrapped_format_reward": 0.625, "step": 292 }, { "completion_length": 750.0, "epoch": 11.72, "grad_norm": 0.5000802278518677, "kl": 0.9599359035491943, "learning_rate": 4.895494570443492e-06, "loss": 0.0384, "reward": 2.7536168098449707, "reward_std": 0.6582252383232117, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.12138298898935318, "rewards/wrapped_format_reward": 0.875, "step": 293 }, { "completion_length": 750.0, "epoch": 11.76, "grad_norm": 0.5710813999176025, "kl": 0.9540033340454102, "learning_rate": 4.8939284020120365e-06, "loss": 0.0382, "reward": 2.571502208709717, "reward_std": 0.4067968428134918, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.029835540801286697, "rewards/wrapped_format_reward": 0.625, "step": 294 }, { "completion_length": 750.0, "epoch": 11.8, "grad_norm": 1.1970958709716797, "kl": 1.2357957363128662, "learning_rate": 4.8923508393305224e-06, "loss": 0.0494, "reward": 0.6461101770401001, "reward_std": 2.581754207611084, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.8538898229599, "rewards/wrapped_format_reward": 1.0, "step": 295 }, { "completion_length": 738.0, "epoch": 11.84, "grad_norm": 0.3895174264907837, "kl": 0.5224874019622803, "learning_rate": 4.890761889907589e-06, "loss": 0.0209, "reward": 2.2970166206359863, "reward_std": 0.6513614058494568, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.16131654381752014, "rewards/wrapped_format_reward": 0.5, "step": 296 }, { "completion_length": 581.0, "epoch": 11.88, "grad_norm": 0.5378090739250183, "kl": 0.9656004309654236, "learning_rate": 4.8891615613060715e-06, "loss": 0.0386, "reward": 2.449643135070801, "reward_std": 1.1916462182998657, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5503568649291992, "rewards/wrapped_format_reward": 1.0, "step": 297 }, { "completion_length": 750.0, "epoch": 11.92, "grad_norm": 1.0380306243896484, "kl": 1.3637113571166992, "learning_rate": 4.887549861142967e-06, "loss": 0.0545, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 298 }, { "completion_length": 660.0, "epoch": 11.96, "grad_norm": 0.5935966968536377, "kl": 1.596313238143921, "learning_rate": 4.885926797089396e-06, "loss": 0.0639, "reward": 1.3272292613983154, "reward_std": 2.9181787967681885, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9227706789970398, "rewards/wrapped_format_reward": 0.75, "step": 299 }, { "completion_length": 605.0, "epoch": 12.0, "grad_norm": 0.5052400231361389, "kl": 1.1359528303146362, "learning_rate": 4.884292376870567e-06, "loss": 0.0454, "reward": 1.1219098567962646, "reward_std": 2.8648571968078613, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.3780901432037354, "rewards/wrapped_format_reward": 1.0, "step": 300 }, { "completion_length": 750.0, "epoch": 12.04, "grad_norm": 0.42255502939224243, "kl": 1.3561640977859497, "learning_rate": 4.882646608265743e-06, "loss": 0.0542, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 301 }, { "completion_length": 672.0, "epoch": 12.08, "grad_norm": 0.7515414953231812, "kl": 1.0097947120666504, "learning_rate": 4.880989499108196e-06, "loss": 0.0404, "reward": 2.7926979064941406, "reward_std": 0.243763267993927, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4176979064941406, "rewards/wrapped_format_reward": 0.375, "step": 302 }, { "completion_length": 750.0, "epoch": 12.12, "grad_norm": 0.482781320810318, "kl": 1.0105189085006714, "learning_rate": 4.8793210572851795e-06, "loss": 0.0404, "reward": 1.3847792148590088, "reward_std": 1.7117525339126587, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.4902207851409912, "rewards/wrapped_format_reward": 0.875, "step": 303 }, { "completion_length": 561.0, "epoch": 12.16, "grad_norm": 2.1605520248413086, "kl": 0.9172693490982056, "learning_rate": 4.8776412907378845e-06, "loss": 0.0367, "reward": 2.826827049255371, "reward_std": 0.5229109525680542, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.025445779785513878, "rewards/wrapped_format_reward": 0.875, "step": 304 }, { "completion_length": 750.0, "epoch": 12.2, "grad_norm": 0.7399263381958008, "kl": 1.4161840677261353, "learning_rate": 4.875950207461403e-06, "loss": 0.0566, "reward": 2.040844202041626, "reward_std": 3.3612470626831055, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7321428656578064, "rewards/wrapped_driving_reward": -0.44129857420921326, "rewards/wrapped_format_reward": 1.0, "step": 305 }, { "completion_length": 750.0, "epoch": 12.24, "grad_norm": 0.5619301795959473, "kl": 1.1429786682128906, "learning_rate": 4.874247815504693e-06, "loss": 0.0457, "reward": 3.1347854137420654, "reward_std": 0.2799624502658844, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9861111044883728, "rewards/wrapped_driving_reward": 0.1486743986606598, "rewards/wrapped_format_reward": 1.0, "step": 306 }, { "completion_length": 532.0, "epoch": 12.28, "grad_norm": 1.447466492652893, "kl": 0.9073767066001892, "learning_rate": 4.872534122970536e-06, "loss": 0.0363, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 307 }, { "completion_length": 397.0, "epoch": 12.32, "grad_norm": 3.610201597213745, "kl": 0.4948464035987854, "learning_rate": 4.870809138015499e-06, "loss": 0.0198, "reward": 1.3169913291931152, "reward_std": 3.2238197326660156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9330087304115295, "rewards/wrapped_format_reward": 0.75, "step": 308 }, { "completion_length": 631.0, "epoch": 12.36, "grad_norm": 1.619842529296875, "kl": 0.6367069482803345, "learning_rate": 4.8690728688499e-06, "loss": 0.0255, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 309 }, { "completion_length": 597.0, "epoch": 12.4, "grad_norm": 1.2359195947647095, "kl": 0.6074169278144836, "learning_rate": 4.867325323737765e-06, "loss": 0.0243, "reward": 2.357518196105957, "reward_std": 0.7375664710998535, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6424819231033325, "rewards/wrapped_format_reward": 1.0, "step": 310 }, { "completion_length": 469.0, "epoch": 12.44, "grad_norm": 4.1004719734191895, "kl": 0.5925214290618896, "learning_rate": 4.865566510996787e-06, "loss": 0.0237, "reward": 2.563359022140503, "reward_std": 0.33094266057014465, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -0.15539111196994781, "rewards/wrapped_format_reward": 0.75, "step": 311 }, { "completion_length": 740.0, "epoch": 12.48, "grad_norm": 11.807194709777832, "kl": 1.1617281436920166, "learning_rate": 4.863796438998293e-06, "loss": 0.0465, "reward": 2.8380610942840576, "reward_std": 0.5543819069862366, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08806122839450836, "rewards/wrapped_format_reward": 0.75, "step": 312 }, { "completion_length": 750.0, "epoch": 12.52, "grad_norm": 5.792469024658203, "kl": 0.404826819896698, "learning_rate": 4.862015116167195e-06, "loss": 0.0162, "reward": 2.552396774291992, "reward_std": 0.6585280895233154, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1976032257080078, "rewards/wrapped_format_reward": 0.75, "step": 313 }, { "completion_length": 750.0, "epoch": 12.56, "grad_norm": 8.077740669250488, "kl": 0.5904800295829773, "learning_rate": 4.860222550981961e-06, "loss": 0.0236, "reward": 2.346454381942749, "reward_std": 0.9884578585624695, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.403545618057251, "rewards/wrapped_format_reward": 0.75, "step": 314 }, { "completion_length": 750.0, "epoch": 12.6, "grad_norm": 4.022387504577637, "kl": 0.7494456768035889, "learning_rate": 4.858418751974564e-06, "loss": 0.03, "reward": -1.1458332538604736, "reward_std": 0.1717960387468338, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8541666865348816, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 315 }, { "completion_length": 750.0, "epoch": 12.64, "grad_norm": 4.89057731628418, "kl": 0.7992514967918396, "learning_rate": 4.856603727730446e-06, "loss": 0.032, "reward": 2.9716575145721436, "reward_std": 0.8948715329170227, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.39665764570236206, "rewards/wrapped_format_reward": 0.625, "step": 316 }, { "completion_length": 750.0, "epoch": 12.68, "grad_norm": 4.50545072555542, "kl": 0.706625759601593, "learning_rate": 4.854777486888481e-06, "loss": 0.0283, "reward": -0.9577881693840027, "reward_std": 2.518893003463745, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.0827882289886475, "rewards/wrapped_format_reward": 0.625, "step": 317 }, { "completion_length": 750.0, "epoch": 12.72, "grad_norm": 1.048317790031433, "kl": 0.4814412593841553, "learning_rate": 4.852940038140927e-06, "loss": 0.0193, "reward": 2.9507060050964355, "reward_std": 0.5745217204093933, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.32570600509643555, "rewards/wrapped_format_reward": 0.625, "step": 318 }, { "completion_length": 561.0, "epoch": 12.76, "grad_norm": 2.460906744003296, "kl": 0.2864713668823242, "learning_rate": 4.8510913902333876e-06, "loss": 0.0115, "reward": 2.4077253341674805, "reward_std": 0.4348691999912262, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.5297746658325195, "rewards/wrapped_format_reward": 1.0, "step": 319 }, { "completion_length": 469.0, "epoch": 12.8, "grad_norm": 4.8929314613342285, "kl": 0.5974794626235962, "learning_rate": 4.849231551964771e-06, "loss": 0.0239, "reward": 2.981731653213501, "reward_std": 0.12611062824726105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.02339838445186615, "rewards/wrapped_format_reward": 1.0, "step": 320 }, { "completion_length": 750.0, "epoch": 12.84, "grad_norm": 1.4380724430084229, "kl": 0.7370598912239075, "learning_rate": 4.8473605321872484e-06, "loss": 0.0295, "reward": 2.465222120285034, "reward_std": 0.3577950894832611, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.034777797758579254, "rewards/wrapped_format_reward": 0.5, "step": 321 }, { "completion_length": 750.0, "epoch": 12.88, "grad_norm": 0.7334043979644775, "kl": 0.6930970549583435, "learning_rate": 4.845478339806211e-06, "loss": 0.0277, "reward": 2.6659016609191895, "reward_std": 0.4846745729446411, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04090156406164169, "rewards/wrapped_format_reward": 0.625, "step": 322 }, { "completion_length": 704.0, "epoch": 12.92, "grad_norm": 2.4858033657073975, "kl": 0.7707926034927368, "learning_rate": 4.843584983780225e-06, "loss": 0.0308, "reward": 1.1393108367919922, "reward_std": 2.8294146060943604, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.2356891632080078, "rewards/wrapped_format_reward": 1.0, "step": 323 }, { "completion_length": 750.0, "epoch": 12.96, "grad_norm": 2.020785093307495, "kl": 1.2838598489761353, "learning_rate": 4.841680473120994e-06, "loss": 0.0514, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 324 }, { "completion_length": 739.0, "epoch": 13.0, "grad_norm": 0.6641396284103394, "kl": 1.4216065406799316, "learning_rate": 4.839764816893315e-06, "loss": 0.0569, "reward": 2.8785228729248047, "reward_std": 0.4891079366207123, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2535229027271271, "rewards/wrapped_format_reward": 0.625, "step": 325 }, { "completion_length": 477.0, "epoch": 13.04, "grad_norm": 1.4598071575164795, "kl": 0.6853589415550232, "learning_rate": 4.83783802421503e-06, "loss": 0.0274, "reward": 1.5590646266937256, "reward_std": 3.725144386291504, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6909353733062744, "rewards/wrapped_format_reward": 0.75, "step": 326 }, { "completion_length": 750.0, "epoch": 13.08, "grad_norm": 0.4631199240684509, "kl": 1.2964632511138916, "learning_rate": 4.835900104256989e-06, "loss": 0.0519, "reward": 0.21913164854049683, "reward_std": 2.63443660736084, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.0308685302734375, "rewards/wrapped_format_reward": 0.75, "step": 327 }, { "completion_length": 559.0, "epoch": 13.12, "grad_norm": 0.8331103324890137, "kl": 0.8320713639259338, "learning_rate": 4.833951066243004e-06, "loss": 0.0333, "reward": -1.850000023841858, "reward_std": 1.12101149559021, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 328 }, { "completion_length": 745.0, "epoch": 13.16, "grad_norm": 0.05458225682377815, "kl": 1.0023459196090698, "learning_rate": 4.831990919449806e-06, "loss": 0.0401, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 329 }, { "completion_length": 705.0, "epoch": 13.2, "grad_norm": 0.8195479512214661, "kl": 1.466266393661499, "learning_rate": 4.830019673206997e-06, "loss": 0.0587, "reward": 0.984784722328186, "reward_std": 3.389918565750122, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7222222089767456, "rewards/wrapped_driving_reward": -0.8624374866485596, "rewards/wrapped_format_reward": 0.375, "step": 330 }, { "completion_length": 605.0, "epoch": 13.24, "grad_norm": 0.4812968969345093, "kl": 1.0380324125289917, "learning_rate": 4.828037336897009e-06, "loss": 0.0415, "reward": 2.9450440406799316, "reward_std": 0.32202810049057007, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.07004398852586746, "rewards/wrapped_format_reward": 0.875, "step": 331 }, { "completion_length": 750.0, "epoch": 13.28, "grad_norm": 0.5454792380332947, "kl": 1.0666571855545044, "learning_rate": 4.826043919955062e-06, "loss": 0.0427, "reward": 3.0907585620880127, "reward_std": 0.5051584839820862, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09075860679149628, "rewards/wrapped_format_reward": 1.0, "step": 332 }, { "completion_length": 534.0, "epoch": 13.32, "grad_norm": 0.6665737628936768, "kl": 0.9193040132522583, "learning_rate": 4.824039431869112e-06, "loss": 0.0368, "reward": 3.3314661979675293, "reward_std": 0.48887741565704346, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.5814663171768188, "rewards/wrapped_format_reward": 0.875, "step": 333 }, { "completion_length": 750.0, "epoch": 13.36, "grad_norm": 0.8595104217529297, "kl": 0.8587233424186707, "learning_rate": 4.822023882179811e-06, "loss": 0.0343, "reward": 3.173095226287842, "reward_std": 0.4861666262149811, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6730952262878418, "rewards/wrapped_format_reward": 0.5, "step": 334 }, { "completion_length": 750.0, "epoch": 13.4, "grad_norm": 0.661893367767334, "kl": 0.8537664413452148, "learning_rate": 4.8199972804804615e-06, "loss": 0.0342, "reward": 1.0913279056549072, "reward_std": 3.0786876678466797, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9086720943450928, "rewards/wrapped_format_reward": 0.5, "step": 335 }, { "completion_length": 725.0, "epoch": 13.44, "grad_norm": 0.5498742461204529, "kl": 0.8321976065635681, "learning_rate": 4.817959636416969e-06, "loss": 0.0333, "reward": 3.1832504272460938, "reward_std": 0.1209394559264183, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18325048685073853, "rewards/wrapped_format_reward": 1.0, "step": 336 }, { "completion_length": 750.0, "epoch": 13.48, "grad_norm": 1.1294459104537964, "kl": 1.4946460723876953, "learning_rate": 4.815910959687795e-06, "loss": 0.0598, "reward": 2.673915386199951, "reward_std": 0.5557320713996887, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.20108462870121002, "rewards/wrapped_format_reward": 0.875, "step": 337 }, { "completion_length": 750.0, "epoch": 13.52, "grad_norm": 0.8809202909469604, "kl": 1.7684038877487183, "learning_rate": 4.8138512600439165e-06, "loss": 0.0707, "reward": 1.2998652458190918, "reward_std": 3.222731351852417, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9501346945762634, "rewards/wrapped_format_reward": 0.75, "step": 338 }, { "completion_length": 528.0, "epoch": 13.56, "grad_norm": 0.6754530072212219, "kl": 0.7506433129310608, "learning_rate": 4.8117805472887706e-06, "loss": 0.03, "reward": 2.930798053741455, "reward_std": 0.7055428624153137, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4307979345321655, "rewards/wrapped_format_reward": 0.5, "step": 339 }, { "completion_length": 634.0, "epoch": 13.6, "grad_norm": 0.6422954797744751, "kl": 1.1495076417922974, "learning_rate": 4.809698831278217e-06, "loss": 0.046, "reward": 2.855012893676758, "reward_std": 0.6868377923965454, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.16751310229301453, "rewards/wrapped_format_reward": 0.75, "step": 340 }, { "completion_length": 750.0, "epoch": 13.64, "grad_norm": 0.8380364775657654, "kl": 0.7830958962440491, "learning_rate": 4.807606121920486e-06, "loss": 0.0313, "reward": 2.7610697746276855, "reward_std": 0.17693300545215607, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.046783991158008575, "rewards/wrapped_format_reward": 0.75, "step": 341 }, { "completion_length": 750.0, "epoch": 13.68, "grad_norm": 0.45640650391578674, "kl": 0.6177964210510254, "learning_rate": 4.80550242917613e-06, "loss": 0.0247, "reward": 0.28038734197616577, "reward_std": 2.0201590061187744, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -2.2946126461029053, "rewards/wrapped_format_reward": 0.625, "step": 342 }, { "completion_length": 750.0, "epoch": 13.72, "grad_norm": 0.8011844158172607, "kl": 0.8235171437263489, "learning_rate": 4.803387763057981e-06, "loss": 0.0329, "reward": 3.0518081188201904, "reward_std": 0.5246614217758179, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.17680811882019043, "rewards/wrapped_format_reward": 0.875, "step": 343 }, { "completion_length": 750.0, "epoch": 13.76, "grad_norm": 0.8099798560142517, "kl": 1.021049976348877, "learning_rate": 4.801262133631101e-06, "loss": 0.0408, "reward": -1.625, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 344 }, { "completion_length": 668.0, "epoch": 13.8, "grad_norm": 0.9226036667823792, "kl": 0.9379998445510864, "learning_rate": 4.799125551012731e-06, "loss": 0.0375, "reward": 2.803008556365967, "reward_std": 0.44537705183029175, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1780085265636444, "rewards/wrapped_format_reward": 0.625, "step": 345 }, { "completion_length": 750.0, "epoch": 13.84, "grad_norm": 0.6966492533683777, "kl": 1.2720391750335693, "learning_rate": 4.796978025372247e-06, "loss": 0.0509, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 346 }, { "completion_length": 547.0, "epoch": 13.88, "grad_norm": 0.5560488700866699, "kl": 0.5576459169387817, "learning_rate": 4.794819566931107e-06, "loss": 0.0223, "reward": 2.6760520935058594, "reward_std": 0.14635096490383148, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -0.1676977276802063, "rewards/wrapped_format_reward": 0.875, "step": 347 }, { "completion_length": 606.0, "epoch": 13.92, "grad_norm": 0.1242881491780281, "kl": 0.9755803942680359, "learning_rate": 4.79265018596281e-06, "loss": 0.039, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 348 }, { "completion_length": 476.0, "epoch": 13.96, "grad_norm": 0.9491596221923828, "kl": 0.4472344219684601, "learning_rate": 4.79046989279284e-06, "loss": 0.0179, "reward": 2.4583888053894043, "reward_std": 1.3171792030334473, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7321428656578064, "rewards/wrapped_driving_reward": 0.2262459248304367, "rewards/wrapped_format_reward": 0.5, "step": 349 }, { "completion_length": 750.0, "epoch": 14.0, "grad_norm": 0.43453091382980347, "kl": 0.9528393149375916, "learning_rate": 4.788278697798619e-06, "loss": 0.0381, "reward": 1.0954630374908447, "reward_std": 1.8647366762161255, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.7795369625091553, "rewards/wrapped_format_reward": 0.875, "step": 350 }, { "completion_length": 750.0, "epoch": 14.04, "grad_norm": 0.5005083084106445, "kl": 0.8873422145843506, "learning_rate": 4.7860766114094555e-06, "loss": 0.0355, "reward": 2.2282726764678955, "reward_std": 0.6381257176399231, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.27172738313674927, "rewards/wrapped_format_reward": 0.5, "step": 351 }, { "completion_length": 736.0, "epoch": 14.08, "grad_norm": 0.6733227968215942, "kl": 0.902958333492279, "learning_rate": 4.783863644106502e-06, "loss": 0.0361, "reward": 2.6390929222106934, "reward_std": 0.7452251315116882, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": 0.11409273743629456, "rewards/wrapped_format_reward": 0.625, "step": 352 }, { "completion_length": 750.0, "epoch": 14.12, "grad_norm": 0.3890395164489746, "kl": 0.21702060103416443, "learning_rate": 4.781639806422699e-06, "loss": 0.0087, "reward": 2.652873992919922, "reward_std": 0.42699992656707764, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.027873992919921875, "rewards/wrapped_format_reward": 0.625, "step": 353 }, { "completion_length": 750.0, "epoch": 14.16, "grad_norm": 2.0187084674835205, "kl": 1.372572660446167, "learning_rate": 4.779405108942722e-06, "loss": 0.0549, "reward": 2.4973678588867188, "reward_std": 0.37870702147483826, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.002632094081491232, "rewards/wrapped_format_reward": 0.5, "step": 354 }, { "completion_length": 528.0, "epoch": 14.2, "grad_norm": 0.6127402782440186, "kl": 0.7212586998939514, "learning_rate": 4.77715956230294e-06, "loss": 0.0288, "reward": -1.0456349849700928, "reward_std": 0.055128760635852814, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.954365074634552, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 355 }, { "completion_length": 750.0, "epoch": 14.24, "grad_norm": 0.7625749111175537, "kl": 1.294076681137085, "learning_rate": 4.774903177191358e-06, "loss": 0.0518, "reward": 2.880377769470215, "reward_std": 0.6150814890861511, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.13037782907485962, "rewards/wrapped_format_reward": 0.75, "step": 356 }, { "completion_length": 750.0, "epoch": 14.28, "grad_norm": 1.118186116218567, "kl": 1.3434715270996094, "learning_rate": 4.77263596434757e-06, "loss": 0.0537, "reward": 3.2224440574645996, "reward_std": 0.4224244952201843, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.38315847516059875, "rewards/wrapped_format_reward": 0.875, "step": 357 }, { "completion_length": 750.0, "epoch": 14.32, "grad_norm": 0.5385660529136658, "kl": 1.1639105081558228, "learning_rate": 4.770357934562704e-06, "loss": 0.0466, "reward": 3.100466728210449, "reward_std": 0.24647371470928192, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.16296681761741638, "rewards/wrapped_format_reward": 1.0, "step": 358 }, { "completion_length": 750.0, "epoch": 14.36, "grad_norm": 0.7925487756729126, "kl": 1.1513047218322754, "learning_rate": 4.7680690986793734e-06, "loss": 0.0461, "reward": -2.125, "reward_std": 1.0307763814926147, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 359 }, { "completion_length": 750.0, "epoch": 14.4, "grad_norm": 0.5221444368362427, "kl": 0.9014317989349365, "learning_rate": 4.765769467591626e-06, "loss": 0.0361, "reward": 3.1284172534942627, "reward_std": 0.13471664488315582, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1284172236919403, "rewards/wrapped_format_reward": 1.0, "step": 360 }, { "completion_length": 619.0, "epoch": 14.44, "grad_norm": 0.7254578471183777, "kl": 1.2201420068740845, "learning_rate": 4.7634590522448886e-06, "loss": 0.0488, "reward": -1.28125, "reward_std": 0.4827762544155121, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 361 }, { "completion_length": 750.0, "epoch": 14.48, "grad_norm": 0.6827868819236755, "kl": 1.4321162700653076, "learning_rate": 4.761137863635921e-06, "loss": 0.0573, "reward": 3.0515081882476807, "reward_std": 0.42571601271629333, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.051508113741874695, "rewards/wrapped_format_reward": 1.0, "step": 362 }, { "completion_length": 589.0, "epoch": 14.52, "grad_norm": 0.8641681671142578, "kl": 1.0050054788589478, "learning_rate": 4.758805912812755e-06, "loss": 0.0402, "reward": 3.0080392360687256, "reward_std": 0.8377665281295776, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.25803929567337036, "rewards/wrapped_format_reward": 0.75, "step": 363 }, { "completion_length": 666.0, "epoch": 14.56, "grad_norm": 0.9824696779251099, "kl": 0.7488256692886353, "learning_rate": 4.7564632108746524e-06, "loss": 0.03, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 364 }, { "completion_length": 750.0, "epoch": 14.6, "grad_norm": 0.3840194344520569, "kl": 1.0556010007858276, "learning_rate": 4.75410976897204e-06, "loss": 0.0422, "reward": 3.167947292327881, "reward_std": 0.44172561168670654, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1679472178220749, "rewards/wrapped_format_reward": 1.0, "step": 365 }, { "completion_length": 630.0, "epoch": 14.64, "grad_norm": 0.4588969647884369, "kl": 0.5971285104751587, "learning_rate": 4.7517455983064694e-06, "loss": 0.0239, "reward": -1.0227272510528564, "reward_std": 0.04545450210571289, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 366 }, { "completion_length": 730.0, "epoch": 14.68, "grad_norm": 0.834652841091156, "kl": 1.3273468017578125, "learning_rate": 4.7493707101305545e-06, "loss": 0.0531, "reward": 2.759453773498535, "reward_std": 0.5071139335632324, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.1761203557252884, "rewards/wrapped_format_reward": 0.625, "step": 367 }, { "completion_length": 750.0, "epoch": 14.72, "grad_norm": 0.5011023879051208, "kl": 1.0096584558486938, "learning_rate": 4.746985115747918e-06, "loss": 0.0404, "reward": 2.915914535522461, "reward_std": 0.5742588043212891, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.7016288638114929, "rewards/wrapped_format_reward": 0.25, "step": 368 }, { "completion_length": 750.0, "epoch": 14.76, "grad_norm": 0.6958830952644348, "kl": 1.282517910003662, "learning_rate": 4.744588826513145e-06, "loss": 0.0513, "reward": 2.461658477783203, "reward_std": 1.062435269355774, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2883416414260864, "rewards/wrapped_format_reward": 0.75, "step": 369 }, { "completion_length": 750.0, "epoch": 14.8, "grad_norm": 0.5009940266609192, "kl": 0.7665615081787109, "learning_rate": 4.742181853831721e-06, "loss": 0.0307, "reward": 3.0534534454345703, "reward_std": 0.42255762219429016, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.10345339775085449, "rewards/wrapped_format_reward": 1.0, "step": 370 }, { "completion_length": 750.0, "epoch": 14.84, "grad_norm": 0.3792387545108795, "kl": 0.9433217644691467, "learning_rate": 4.739764209159984e-06, "loss": 0.0377, "reward": 1.4562020301818848, "reward_std": 3.3310694694519043, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7937980890274048, "rewards/wrapped_format_reward": 0.75, "step": 371 }, { "completion_length": 750.0, "epoch": 14.88, "grad_norm": 0.6532329320907593, "kl": 1.3375815153121948, "learning_rate": 4.737335904005063e-06, "loss": 0.0535, "reward": 1.3177659511566162, "reward_std": 2.252401113510132, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1822340488433838, "rewards/wrapped_format_reward": 0.5, "step": 372 }, { "completion_length": 564.0, "epoch": 14.92, "grad_norm": 0.6528195738792419, "kl": 0.9802216291427612, "learning_rate": 4.734896949924831e-06, "loss": 0.0392, "reward": 1.9856492280960083, "reward_std": 0.9138504266738892, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.2643508017063141, "rewards/wrapped_format_reward": 0.375, "step": 373 }, { "completion_length": 560.0, "epoch": 14.96, "grad_norm": 0.613873302936554, "kl": 1.106533169746399, "learning_rate": 4.732447358527843e-06, "loss": 0.0443, "reward": 3.070741891860962, "reward_std": 0.6192570328712463, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.19574186205863953, "rewards/wrapped_format_reward": 0.875, "step": 374 }, { "completion_length": 742.0, "epoch": 15.0, "grad_norm": 0.5938997268676758, "kl": 1.087781310081482, "learning_rate": 4.729987141473286e-06, "loss": 0.0435, "reward": 3.193850040435791, "reward_std": 0.5507637858390808, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.6188501119613647, "rewards/wrapped_format_reward": 0.625, "step": 375 }, { "completion_length": 527.0, "epoch": 15.04, "grad_norm": 0.44303637742996216, "kl": 0.551745593547821, "learning_rate": 4.72751631047092e-06, "loss": 0.0221, "reward": 2.2447049617767334, "reward_std": 0.40859082341194153, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3802950978279114, "rewards/wrapped_format_reward": 0.625, "step": 376 }, { "completion_length": 504.0, "epoch": 15.08, "grad_norm": 0.4766407310962677, "kl": 0.5846800804138184, "learning_rate": 4.725034877281025e-06, "loss": 0.0234, "reward": 2.4882283210754395, "reward_std": 0.9573307633399963, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5117717385292053, "rewards/wrapped_format_reward": 1.0, "step": 377 }, { "completion_length": 741.0, "epoch": 15.12, "grad_norm": 1.0294671058654785, "kl": 0.909957230091095, "learning_rate": 4.7225428537143414e-06, "loss": 0.0364, "reward": 2.707183361053467, "reward_std": 0.2954871654510498, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9615384340286255, "rewards/wrapped_driving_reward": 0.12064509093761444, "rewards/wrapped_format_reward": 0.625, "step": 378 }, { "completion_length": 725.0, "epoch": 15.16, "grad_norm": 1.9352197647094727, "kl": 1.4188382625579834, "learning_rate": 4.720040251632019e-06, "loss": 0.0568, "reward": 2.6802406311035156, "reward_std": 0.7824314832687378, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8434343338012695, "rewards/wrapped_driving_reward": 0.08680635690689087, "rewards/wrapped_format_reward": 0.75, "step": 379 }, { "completion_length": 680.0, "epoch": 15.2, "grad_norm": 1.030442237854004, "kl": 0.9825220704078674, "learning_rate": 4.717527082945555e-06, "loss": 0.0393, "reward": 3.433924436569214, "reward_std": 0.5535728335380554, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6839244961738586, "rewards/wrapped_format_reward": 0.75, "step": 380 }, { "completion_length": 750.0, "epoch": 15.24, "grad_norm": 0.6829676032066345, "kl": 1.046954870223999, "learning_rate": 4.715003359616741e-06, "loss": 0.0419, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 381 }, { "completion_length": 750.0, "epoch": 15.28, "grad_norm": 0.5402231812477112, "kl": 0.7217246294021606, "learning_rate": 4.712469093657605e-06, "loss": 0.0289, "reward": 2.7015037536621094, "reward_std": 0.6882119178771973, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17349615693092346, "rewards/wrapped_format_reward": 0.875, "step": 382 }, { "completion_length": 656.0, "epoch": 15.32, "grad_norm": 0.8153801560401917, "kl": 1.4001483917236328, "learning_rate": 4.709924297130354e-06, "loss": 0.056, "reward": -1.2777777910232544, "reward_std": 0.48432207107543945, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 383 }, { "completion_length": 663.0, "epoch": 15.36, "grad_norm": 0.3903270661830902, "kl": 1.1370939016342163, "learning_rate": 4.707368982147318e-06, "loss": 0.0455, "reward": 0.018291592597961426, "reward_std": 2.9197871685028076, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -2.2067084312438965, "rewards/wrapped_format_reward": 0.75, "step": 384 }, { "completion_length": 750.0, "epoch": 15.4, "grad_norm": 0.49858081340789795, "kl": 0.7010886073112488, "learning_rate": 4.704803160870888e-06, "loss": 0.028, "reward": 2.679309844970703, "reward_std": 0.4782131314277649, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.07069025933742523, "rewards/wrapped_format_reward": 0.75, "step": 385 }, { "completion_length": 750.0, "epoch": 15.44, "grad_norm": 0.6627248525619507, "kl": 1.3270666599273682, "learning_rate": 4.702226845513465e-06, "loss": 0.0531, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 386 }, { "completion_length": 750.0, "epoch": 15.48, "grad_norm": 0.7167011499404907, "kl": 1.1115460395812988, "learning_rate": 4.699640048337394e-06, "loss": 0.0445, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 387 }, { "completion_length": 750.0, "epoch": 15.52, "grad_norm": 0.33501261472702026, "kl": 1.264574408531189, "learning_rate": 4.697042781654913e-06, "loss": 0.0506, "reward": 2.486931324005127, "reward_std": 0.23576277494430542, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1380685567855835, "rewards/wrapped_format_reward": 0.625, "step": 388 }, { "completion_length": 750.0, "epoch": 15.56, "grad_norm": 0.5400851368904114, "kl": 1.1933934688568115, "learning_rate": 4.694435057828092e-06, "loss": 0.0477, "reward": 1.214620590209961, "reward_std": 2.837777614593506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0353795289993286, "rewards/wrapped_format_reward": 0.75, "step": 389 }, { "completion_length": 750.0, "epoch": 15.6, "grad_norm": 0.44655412435531616, "kl": 0.6380826830863953, "learning_rate": 4.69181688926877e-06, "loss": 0.0255, "reward": 3.148019313812256, "reward_std": 0.17191006243228912, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1480191946029663, "rewards/wrapped_format_reward": 1.0, "step": 390 }, { "completion_length": 690.0, "epoch": 15.64, "grad_norm": 0.6045768857002258, "kl": 1.1987158060073853, "learning_rate": 4.6891882884384994e-06, "loss": 0.0479, "reward": 1.2684307098388672, "reward_std": 2.863405704498291, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.825319230556488, "rewards/wrapped_format_reward": 0.625, "step": 391 }, { "completion_length": 741.0, "epoch": 15.68, "grad_norm": 0.4550888240337372, "kl": 1.3106390237808228, "learning_rate": 4.68654926784849e-06, "loss": 0.0524, "reward": 2.7248356342315674, "reward_std": 0.2842490077018738, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22483548521995544, "rewards/wrapped_format_reward": 0.5, "step": 392 }, { "completion_length": 750.0, "epoch": 15.72, "grad_norm": 0.49233898520469666, "kl": 1.008697509765625, "learning_rate": 4.683899840059543e-06, "loss": 0.0403, "reward": -1.6673097610473633, "reward_std": 1.2686203718185425, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -3.6673097610473633, "rewards/wrapped_format_reward": 0.75, "step": 393 }, { "completion_length": 750.0, "epoch": 15.76, "grad_norm": 0.5187327861785889, "kl": 1.2990044355392456, "learning_rate": 4.681240017681994e-06, "loss": 0.052, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 394 }, { "completion_length": 506.0, "epoch": 15.8, "grad_norm": 0.6884944438934326, "kl": 0.7200514078140259, "learning_rate": 4.678569813375654e-06, "loss": 0.0288, "reward": 3.613212823867798, "reward_std": 0.14553166925907135, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6132129430770874, "rewards/wrapped_format_reward": 1.0, "step": 395 }, { "completion_length": 635.0, "epoch": 15.84, "grad_norm": 0.5450335741043091, "kl": 1.369678258895874, "learning_rate": 4.675889239849749e-06, "loss": 0.0548, "reward": 2.0136775970458984, "reward_std": 1.1577099561691284, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6113223433494568, "rewards/wrapped_format_reward": 0.625, "step": 396 }, { "completion_length": 750.0, "epoch": 15.88, "grad_norm": 0.4165554642677307, "kl": 1.6805375814437866, "learning_rate": 4.67319830986286e-06, "loss": 0.0672, "reward": 3.094829559326172, "reward_std": 0.6602194309234619, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3448294997215271, "rewards/wrapped_format_reward": 0.75, "step": 397 }, { "completion_length": 750.0, "epoch": 15.92, "grad_norm": 0.4113013744354248, "kl": 0.983104944229126, "learning_rate": 4.670497036222856e-06, "loss": 0.0393, "reward": 2.9975690841674805, "reward_std": 0.22446000576019287, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.00243115215562284, "rewards/wrapped_format_reward": 1.0, "step": 398 }, { "completion_length": 750.0, "epoch": 15.96, "grad_norm": 0.8818913102149963, "kl": 1.1384490728378296, "learning_rate": 4.667785431786843e-06, "loss": 0.0455, "reward": 3.1137232780456543, "reward_std": 0.4699656367301941, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.538723349571228, "rewards/wrapped_format_reward": 0.625, "step": 399 }, { "completion_length": 750.0, "epoch": 16.0, "grad_norm": 0.41796690225601196, "kl": 1.118818759918213, "learning_rate": 4.665063509461098e-06, "loss": 0.0448, "reward": 2.9344351291656494, "reward_std": 0.2725065052509308, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.05943508818745613, "rewards/wrapped_format_reward": 0.875, "step": 400 }, { "completion_length": 510.0, "epoch": 16.04, "grad_norm": 0.5188367366790771, "kl": 0.7825106382369995, "learning_rate": 4.662331282201002e-06, "loss": 0.0313, "reward": 2.8269712924957275, "reward_std": 0.5569232106208801, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.03530450910329819, "rewards/wrapped_format_reward": 0.875, "step": 401 }, { "completion_length": 673.0, "epoch": 16.08, "grad_norm": 0.06149033084511757, "kl": 0.8966481685638428, "learning_rate": 4.65958876301099e-06, "loss": 0.0359, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 402 }, { "completion_length": 750.0, "epoch": 16.12, "grad_norm": 0.5415775775909424, "kl": 1.1003456115722656, "learning_rate": 4.65683596494448e-06, "loss": 0.044, "reward": -1.3068182468414307, "reward_std": 0.24159422516822815, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9431818127632141, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 403 }, { "completion_length": 620.0, "epoch": 16.16, "grad_norm": 0.42184168100357056, "kl": 1.1518326997756958, "learning_rate": 4.654072901103815e-06, "loss": 0.0461, "reward": 3.293445348739624, "reward_std": 0.3596523404121399, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.29344528913497925, "rewards/wrapped_format_reward": 1.0, "step": 404 }, { "completion_length": 738.0, "epoch": 16.2, "grad_norm": 0.5668161511421204, "kl": 1.5937199592590332, "learning_rate": 4.651299584640198e-06, "loss": 0.0637, "reward": -1.2777777910232544, "reward_std": 0.5555555820465088, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 405 }, { "completion_length": 582.0, "epoch": 16.24, "grad_norm": 0.46643057465553284, "kl": 1.0229408740997314, "learning_rate": 4.648516028753632e-06, "loss": 0.0409, "reward": 2.7324419021606445, "reward_std": 0.4210711717605591, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -0.23630811274051666, "rewards/wrapped_format_reward": 1.0, "step": 406 }, { "completion_length": 750.0, "epoch": 16.28, "grad_norm": 0.7909368872642517, "kl": 1.144740104675293, "learning_rate": 4.645722246692856e-06, "loss": 0.0458, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 407 }, { "completion_length": 750.0, "epoch": 16.32, "grad_norm": 0.40184658765792847, "kl": 1.196555733680725, "learning_rate": 4.642918251755281e-06, "loss": 0.0479, "reward": 2.915738582611084, "reward_std": 0.2729151248931885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.08426124602556229, "rewards/wrapped_format_reward": 1.0, "step": 408 }, { "completion_length": 750.0, "epoch": 16.36, "grad_norm": 0.38038399815559387, "kl": 0.5281871557235718, "learning_rate": 4.6401040572869295e-06, "loss": 0.0211, "reward": 2.0221829414367676, "reward_std": 0.7135373950004578, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.7278171181678772, "rewards/wrapped_format_reward": 0.75, "step": 409 }, { "completion_length": 750.0, "epoch": 16.4, "grad_norm": 0.6318146586418152, "kl": 1.4290169477462769, "learning_rate": 4.637279676682367e-06, "loss": 0.0572, "reward": 2.355543851852417, "reward_std": 1.3740731477737427, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3944561779499054, "rewards/wrapped_format_reward": 0.75, "step": 410 }, { "completion_length": 750.0, "epoch": 16.44, "grad_norm": 0.8606741428375244, "kl": 1.26045560836792, "learning_rate": 4.634445123384644e-06, "loss": 0.0504, "reward": 2.573911190032959, "reward_std": 0.6645153760910034, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.07391127943992615, "rewards/wrapped_format_reward": 0.5, "step": 411 }, { "completion_length": 750.0, "epoch": 16.48, "grad_norm": 0.40005964040756226, "kl": 1.007511019706726, "learning_rate": 4.631600410885231e-06, "loss": 0.0403, "reward": 0.41647136211395264, "reward_std": 1.9502681493759155, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -2.0335285663604736, "rewards/wrapped_format_reward": 0.5, "step": 412 }, { "completion_length": 593.0, "epoch": 16.52, "grad_norm": 0.5277693271636963, "kl": 1.2088240385055542, "learning_rate": 4.6287455527239475e-06, "loss": 0.0484, "reward": 2.7194738388061523, "reward_std": 0.4919568598270416, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09447365999221802, "rewards/wrapped_format_reward": 0.625, "step": 413 }, { "completion_length": 750.0, "epoch": 16.56, "grad_norm": 0.4249366223812103, "kl": 0.8536944389343262, "learning_rate": 4.625880562488908e-06, "loss": 0.0341, "reward": 3.555159091949463, "reward_std": 0.2293834686279297, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6801592111587524, "rewards/wrapped_format_reward": 0.875, "step": 414 }, { "completion_length": 750.0, "epoch": 16.6, "grad_norm": 0.6364783644676208, "kl": 1.720573902130127, "learning_rate": 4.623005453816447e-06, "loss": 0.0688, "reward": 2.6921558380126953, "reward_std": 0.33053314685821533, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06715589016675949, "rewards/wrapped_format_reward": 0.625, "step": 415 }, { "completion_length": 750.0, "epoch": 16.64, "grad_norm": 0.5456056594848633, "kl": 1.410339117050171, "learning_rate": 4.620120240391065e-06, "loss": 0.0564, "reward": 2.0680339336395264, "reward_std": 1.2569472789764404, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.681966245174408, "rewards/wrapped_format_reward": 0.75, "step": 416 }, { "completion_length": 590.0, "epoch": 16.68, "grad_norm": 0.5361428260803223, "kl": 0.9750890731811523, "learning_rate": 4.617224935945354e-06, "loss": 0.039, "reward": 3.0615792274475098, "reward_std": 0.41482946276664734, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18657910823822021, "rewards/wrapped_format_reward": 0.875, "step": 417 }, { "completion_length": 750.0, "epoch": 16.72, "grad_norm": 0.3962303102016449, "kl": 1.1152567863464355, "learning_rate": 4.614319554259934e-06, "loss": 0.0446, "reward": 2.119654417037964, "reward_std": 0.8136077523231506, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.38034552335739136, "rewards/wrapped_format_reward": 0.5, "step": 418 }, { "completion_length": 650.0, "epoch": 16.76, "grad_norm": 0.4923894703388214, "kl": 0.6831648945808411, "learning_rate": 4.611404109163392e-06, "loss": 0.0273, "reward": 2.7168376445770264, "reward_std": 0.5373032093048096, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.03316231817007065, "rewards/wrapped_format_reward": 0.75, "step": 419 }, { "completion_length": 750.0, "epoch": 16.8, "grad_norm": 0.5763388872146606, "kl": 1.2227815389633179, "learning_rate": 4.608478614532215e-06, "loss": 0.0489, "reward": 3.023646354675293, "reward_std": 0.09265323728322983, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14864633977413177, "rewards/wrapped_format_reward": 0.875, "step": 420 }, { "completion_length": 472.0, "epoch": 16.84, "grad_norm": 0.7298195958137512, "kl": 1.2331024408340454, "learning_rate": 4.605543084290716e-06, "loss": 0.0493, "reward": 3.369495153427124, "reward_std": 0.40679696202278137, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.49449509382247925, "rewards/wrapped_format_reward": 0.875, "step": 421 }, { "completion_length": 750.0, "epoch": 16.88, "grad_norm": 0.5339130163192749, "kl": 1.6134005784988403, "learning_rate": 4.602597532410982e-06, "loss": 0.0645, "reward": 3.0414419174194336, "reward_std": 0.6409664154052734, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.31644195318222046, "rewards/wrapped_format_reward": 0.75, "step": 422 }, { "completion_length": 750.0, "epoch": 16.92, "grad_norm": 0.43448933959007263, "kl": 1.194190263748169, "learning_rate": 4.599641972912791e-06, "loss": 0.0478, "reward": 1.0178461074829102, "reward_std": 3.0411243438720703, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -0.9321538805961609, "rewards/wrapped_format_reward": 0.5, "step": 423 }, { "completion_length": 750.0, "epoch": 16.96, "grad_norm": 0.65577232837677, "kl": 1.1475871801376343, "learning_rate": 4.596676419863561e-06, "loss": 0.0459, "reward": 3.7684073448181152, "reward_std": 0.06588174402713776, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.7684074640274048, "rewards/wrapped_format_reward": 1.0, "step": 424 }, { "completion_length": 750.0, "epoch": 17.0, "grad_norm": 0.4012209177017212, "kl": 1.4444128274917603, "learning_rate": 4.59370088737827e-06, "loss": 0.0578, "reward": -1.0416667461395264, "reward_std": 0.08333337306976318, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 425 }, { "completion_length": 520.0, "epoch": 17.04, "grad_norm": 0.5853270888328552, "kl": 1.0507853031158447, "learning_rate": 4.590715389619399e-06, "loss": 0.042, "reward": 2.998222827911377, "reward_std": 0.3459242880344391, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.942307710647583, "rewards/wrapped_driving_reward": 0.18091517686843872, "rewards/wrapped_format_reward": 0.875, "step": 426 }, { "completion_length": 517.0, "epoch": 17.08, "grad_norm": 0.47325876355171204, "kl": 0.7153458595275879, "learning_rate": 4.587719940796858e-06, "loss": 0.0286, "reward": 3.0170512199401855, "reward_std": 0.35850608348846436, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.01705138385295868, "rewards/wrapped_format_reward": 1.0, "step": 427 }, { "completion_length": 750.0, "epoch": 17.12, "grad_norm": 0.46970948576927185, "kl": 0.750531792640686, "learning_rate": 4.584714555167921e-06, "loss": 0.03, "reward": 1.5067301988601685, "reward_std": 2.1269452571868896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.493269681930542, "rewards/wrapped_format_reward": 1.0, "step": 428 }, { "completion_length": 690.0, "epoch": 17.16, "grad_norm": 0.4938046336174011, "kl": 0.9764611124992371, "learning_rate": 4.581699247037157e-06, "loss": 0.0391, "reward": 3.021587371826172, "reward_std": 0.5245987772941589, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": 0.28721246123313904, "rewards/wrapped_format_reward": 0.75, "step": 429 }, { "completion_length": 750.0, "epoch": 17.2, "grad_norm": 0.40477994084358215, "kl": 1.7532377243041992, "learning_rate": 4.578674030756364e-06, "loss": 0.0701, "reward": 2.8085033893585205, "reward_std": 0.7948499917984009, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.30850329995155334, "rewards/wrapped_format_reward": 0.625, "step": 430 }, { "completion_length": 444.0, "epoch": 17.24, "grad_norm": 0.7702034115791321, "kl": 0.8725172877311707, "learning_rate": 4.5756389207244965e-06, "loss": 0.0349, "reward": 2.688070774078369, "reward_std": 0.7989228963851929, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.90625, "rewards/wrapped_driving_reward": 0.15682078897953033, "rewards/wrapped_format_reward": 0.625, "step": 431 }, { "completion_length": 750.0, "epoch": 17.28, "grad_norm": 0.5551993250846863, "kl": 0.5444793105125427, "learning_rate": 4.572593931387604e-06, "loss": 0.0218, "reward": -1.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 432 }, { "completion_length": 750.0, "epoch": 17.32, "grad_norm": 0.4201129078865051, "kl": 1.078975796699524, "learning_rate": 4.569539077238756e-06, "loss": 0.0432, "reward": 1.402123212814331, "reward_std": 2.957822561264038, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.737500011920929, "rewards/wrapped_driving_reward": -0.8353768587112427, "rewards/wrapped_format_reward": 0.75, "step": 433 }, { "completion_length": 750.0, "epoch": 17.36, "grad_norm": 0.47643131017684937, "kl": 1.2594619989395142, "learning_rate": 4.566474372817971e-06, "loss": 0.0504, "reward": 3.108989715576172, "reward_std": 0.7875818014144897, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.23398981988430023, "rewards/wrapped_format_reward": 0.875, "step": 434 }, { "completion_length": 750.0, "epoch": 17.4, "grad_norm": 0.4843199551105499, "kl": 0.4984547793865204, "learning_rate": 4.5633998327121595e-06, "loss": 0.0199, "reward": 0.8982076644897461, "reward_std": 3.3170034885406494, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7321428656578064, "rewards/wrapped_driving_reward": -1.083935260772705, "rewards/wrapped_format_reward": 0.5, "step": 435 }, { "completion_length": 544.0, "epoch": 17.44, "grad_norm": 0.49070167541503906, "kl": 0.45354408025741577, "learning_rate": 4.560315471555039e-06, "loss": 0.0181, "reward": 1.0802078247070312, "reward_std": 2.0147366523742676, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.9197921752929688, "rewards/wrapped_format_reward": 1.0, "step": 436 }, { "completion_length": 750.0, "epoch": 17.48, "grad_norm": 0.4404445290565491, "kl": 1.0045150518417358, "learning_rate": 4.557221304027077e-06, "loss": 0.0402, "reward": 2.6005172729492188, "reward_std": 0.8598195910453796, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.10051736980676651, "rewards/wrapped_format_reward": 0.75, "step": 437 }, { "completion_length": 750.0, "epoch": 17.52, "grad_norm": 0.501968264579773, "kl": 1.408727765083313, "learning_rate": 4.55411734485541e-06, "loss": 0.0563, "reward": 2.800401210784912, "reward_std": 0.44490158557891846, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.05040114372968674, "rewards/wrapped_format_reward": 1.0, "step": 438 }, { "completion_length": 750.0, "epoch": 17.56, "grad_norm": 0.5490036010742188, "kl": 1.1226475238800049, "learning_rate": 4.551003608813784e-06, "loss": 0.0449, "reward": 0.49088820815086365, "reward_std": 1.7767810821533203, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.3841118812561035, "rewards/wrapped_format_reward": 0.875, "step": 439 }, { "completion_length": 716.0, "epoch": 17.6, "grad_norm": 0.636411726474762, "kl": 1.195053219795227, "learning_rate": 4.54788011072248e-06, "loss": 0.0478, "reward": 1.8398276567459106, "reward_std": 1.4837794303894043, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1601723432540894, "rewards/wrapped_format_reward": 1.0, "step": 440 }, { "completion_length": 511.0, "epoch": 17.64, "grad_norm": 0.6527817845344543, "kl": 0.8692322969436646, "learning_rate": 4.544746865448239e-06, "loss": 0.0348, "reward": -1.2857142686843872, "reward_std": 0.48092880845069885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 441 }, { "completion_length": 594.0, "epoch": 17.68, "grad_norm": 0.6898093223571777, "kl": 1.2306536436080933, "learning_rate": 4.541603887904198e-06, "loss": 0.0492, "reward": -1.0625, "reward_std": 0.125, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 442 }, { "completion_length": 750.0, "epoch": 17.72, "grad_norm": 0.3844476640224457, "kl": 0.9789438247680664, "learning_rate": 4.538451193049814e-06, "loss": 0.0392, "reward": 2.9422659873962402, "reward_std": 0.6176936030387878, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -0.02995637059211731, "rewards/wrapped_format_reward": 1.0, "step": 443 }, { "completion_length": 750.0, "epoch": 17.76, "grad_norm": 0.3810977339744568, "kl": 0.8543868660926819, "learning_rate": 4.535288795890799e-06, "loss": 0.0342, "reward": -1.75, "reward_std": 1.5, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 444 }, { "completion_length": 750.0, "epoch": 17.8, "grad_norm": 0.5191684365272522, "kl": 0.4153221845626831, "learning_rate": 4.532116711479039e-06, "loss": 0.0166, "reward": 2.4012069702148438, "reward_std": 0.41621556878089905, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2237929105758667, "rewards/wrapped_format_reward": 0.625, "step": 445 }, { "completion_length": 630.0, "epoch": 17.84, "grad_norm": 0.5015482902526855, "kl": 0.5795023441314697, "learning_rate": 4.528934954912531e-06, "loss": 0.0232, "reward": 2.996004819869995, "reward_std": 0.16818639636039734, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.984375, "rewards/wrapped_driving_reward": 0.011629827320575714, "rewards/wrapped_format_reward": 1.0, "step": 446 }, { "completion_length": 556.0, "epoch": 17.88, "grad_norm": 0.9043763279914856, "kl": 1.600888967514038, "learning_rate": 4.525743541335309e-06, "loss": 0.064, "reward": 3.2926249504089355, "reward_std": 0.6622192859649658, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.417624831199646, "rewards/wrapped_format_reward": 0.875, "step": 447 }, { "completion_length": 750.0, "epoch": 17.92, "grad_norm": 0.5094466209411621, "kl": 0.9114040732383728, "learning_rate": 4.522542485937369e-06, "loss": 0.0365, "reward": -1.2035714387893677, "reward_std": 0.21614274382591248, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9214285612106323, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 448 }, { "completion_length": 653.0, "epoch": 17.96, "grad_norm": 0.6721723675727844, "kl": 1.3184740543365479, "learning_rate": 4.519331803954599e-06, "loss": 0.0527, "reward": 2.8614964485168457, "reward_std": 0.2383938878774643, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1114964485168457, "rewards/wrapped_format_reward": 0.75, "step": 449 }, { "completion_length": 750.0, "epoch": 18.0, "grad_norm": 0.4677048623561859, "kl": 0.28004857897758484, "learning_rate": 4.516111510668707e-06, "loss": 0.0112, "reward": -0.9574483036994934, "reward_std": 3.5327820777893066, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.2074482440948486, "rewards/wrapped_format_reward": 0.25, "step": 450 }, { "completion_length": 723.0, "epoch": 18.04, "grad_norm": 0.8021937608718872, "kl": 1.2376339435577393, "learning_rate": 4.512881621407146e-06, "loss": 0.0495, "reward": 2.9628384113311768, "reward_std": 0.32038041949272156, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9226190447807312, "rewards/wrapped_driving_reward": 0.16521935164928436, "rewards/wrapped_format_reward": 0.875, "step": 451 }, { "completion_length": 750.0, "epoch": 18.08, "grad_norm": 0.6682026386260986, "kl": 0.9013283252716064, "learning_rate": 4.509642151543043e-06, "loss": 0.0361, "reward": 2.2376132011413574, "reward_std": 1.6770848035812378, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5123868584632874, "rewards/wrapped_format_reward": 0.75, "step": 452 }, { "completion_length": 750.0, "epoch": 18.12, "grad_norm": 0.3643920123577118, "kl": 0.7929832935333252, "learning_rate": 4.506393116495128e-06, "loss": 0.0317, "reward": 2.7846744060516357, "reward_std": 0.17307978868484497, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9027777910232544, "rewards/wrapped_driving_reward": 0.006896574050188065, "rewards/wrapped_format_reward": 0.875, "step": 453 }, { "completion_length": 734.0, "epoch": 18.16, "grad_norm": 1.3546857833862305, "kl": 1.7289873361587524, "learning_rate": 4.503134531727652e-06, "loss": 0.0692, "reward": 3.0436387062072754, "reward_std": 0.573936939239502, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9083333015441895, "rewards/wrapped_driving_reward": 0.3853055536746979, "rewards/wrapped_format_reward": 0.75, "step": 454 }, { "completion_length": 750.0, "epoch": 18.2, "grad_norm": 0.9518548250198364, "kl": 1.1346514225006104, "learning_rate": 4.499866412750324e-06, "loss": 0.0454, "reward": 2.846243381500244, "reward_std": 0.40401574969291687, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.90625, "rewards/wrapped_driving_reward": 0.18999344110488892, "rewards/wrapped_format_reward": 0.75, "step": 455 }, { "completion_length": 528.0, "epoch": 18.24, "grad_norm": 0.5974892973899841, "kl": 1.403336763381958, "learning_rate": 4.496588775118232e-06, "loss": 0.0561, "reward": 2.7360715866088867, "reward_std": 0.7795047760009766, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2360716164112091, "rewards/wrapped_format_reward": 0.5, "step": 456 }, { "completion_length": 750.0, "epoch": 18.28, "grad_norm": 0.7698376774787903, "kl": 0.9995352625846863, "learning_rate": 4.493301634431768e-06, "loss": 0.04, "reward": 0.8301829099655151, "reward_std": 3.2255442142486572, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.692307710647583, "rewards/wrapped_driving_reward": -1.2371246814727783, "rewards/wrapped_format_reward": 0.625, "step": 457 }, { "completion_length": 750.0, "epoch": 18.32, "grad_norm": 0.7532805800437927, "kl": 1.3239738941192627, "learning_rate": 4.490005006336555e-06, "loss": 0.053, "reward": 1.7292661666870117, "reward_std": 1.0531554222106934, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.2707338333129883, "rewards/wrapped_format_reward": 1.0, "step": 458 }, { "completion_length": 609.0, "epoch": 18.36, "grad_norm": 1.3088675737380981, "kl": 1.1671568155288696, "learning_rate": 4.486698906523375e-06, "loss": 0.0467, "reward": -1.375, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 459 }, { "completion_length": 565.0, "epoch": 18.4, "grad_norm": 0.5267041325569153, "kl": 0.8863846659660339, "learning_rate": 4.4833833507280884e-06, "loss": 0.0355, "reward": 1.512794852256775, "reward_std": 3.00907826423645, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9872051477432251, "rewards/wrapped_format_reward": 1.0, "step": 460 }, { "completion_length": 750.0, "epoch": 18.44, "grad_norm": 0.4046444296836853, "kl": 1.289667010307312, "learning_rate": 4.4800583547315654e-06, "loss": 0.0516, "reward": -1.0499999523162842, "reward_std": 0.10000002384185791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 461 }, { "completion_length": 592.0, "epoch": 18.48, "grad_norm": 0.6363309621810913, "kl": 0.770276665687561, "learning_rate": 4.476723934359609e-06, "loss": 0.0308, "reward": 2.9551913738250732, "reward_std": 0.18403679132461548, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08019141852855682, "rewards/wrapped_format_reward": 0.875, "step": 462 }, { "completion_length": 341.0, "epoch": 18.52, "grad_norm": 0.6210933327674866, "kl": 0.530462384223938, "learning_rate": 4.473380105482875e-06, "loss": 0.0212, "reward": 3.187347650527954, "reward_std": 0.11080538481473923, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1873476356267929, "rewards/wrapped_format_reward": 1.0, "step": 463 }, { "completion_length": 483.0, "epoch": 18.56, "grad_norm": 0.5502752661705017, "kl": 1.0583032369613647, "learning_rate": 4.470026884016805e-06, "loss": 0.0423, "reward": 1.718454122543335, "reward_std": 1.522723913192749, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9065459370613098, "rewards/wrapped_format_reward": 0.625, "step": 464 }, { "completion_length": 656.0, "epoch": 18.6, "grad_norm": 0.5941818952560425, "kl": 1.023593544960022, "learning_rate": 4.466664285921543e-06, "loss": 0.0409, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 465 }, { "completion_length": 750.0, "epoch": 18.64, "grad_norm": 0.5130367875099182, "kl": 1.292258858680725, "learning_rate": 4.463292327201862e-06, "loss": 0.0517, "reward": 2.46085524559021, "reward_std": 0.42149004340171814, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4141446650028229, "rewards/wrapped_format_reward": 0.875, "step": 466 }, { "completion_length": 750.0, "epoch": 18.68, "grad_norm": 0.592551589012146, "kl": 0.7206482291221619, "learning_rate": 4.459911023907092e-06, "loss": 0.0288, "reward": 2.441016674041748, "reward_std": 0.3532237708568573, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06601664423942566, "rewards/wrapped_format_reward": 0.375, "step": 467 }, { "completion_length": 750.0, "epoch": 18.72, "grad_norm": 0.8326399326324463, "kl": 0.8357914090156555, "learning_rate": 4.456520392131035e-06, "loss": 0.0334, "reward": 2.6499860286712646, "reward_std": 0.7204391956329346, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.2749861180782318, "rewards/wrapped_format_reward": 0.5, "step": 468 }, { "completion_length": 750.0, "epoch": 18.76, "grad_norm": 0.5404223799705505, "kl": 0.6306906938552856, "learning_rate": 4.453120448011897e-06, "loss": 0.0252, "reward": -1.2777777910232544, "reward_std": 0.48432207107543945, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 469 }, { "completion_length": 750.0, "epoch": 18.8, "grad_norm": 0.42292672395706177, "kl": 1.1823399066925049, "learning_rate": 4.4497112077322045e-06, "loss": 0.0473, "reward": -1.1458332538604736, "reward_std": 0.23935678601264954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 470 }, { "completion_length": 413.0, "epoch": 18.84, "grad_norm": 0.5878785848617554, "kl": 0.6619639992713928, "learning_rate": 4.446292687518734e-06, "loss": 0.0265, "reward": 2.993729591369629, "reward_std": 0.3032970428466797, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.006270239129662514, "rewards/wrapped_format_reward": 1.0, "step": 471 }, { "completion_length": 522.0, "epoch": 18.88, "grad_norm": 0.616311252117157, "kl": 1.3041799068450928, "learning_rate": 4.442864903642428e-06, "loss": 0.0522, "reward": 2.797008514404297, "reward_std": 0.7625215649604797, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9886363744735718, "rewards/wrapped_driving_reward": 0.18337202072143555, "rewards/wrapped_format_reward": 0.625, "step": 472 }, { "completion_length": 576.0, "epoch": 18.92, "grad_norm": 0.6430866122245789, "kl": 0.9943831562995911, "learning_rate": 4.439427872418321e-06, "loss": 0.0398, "reward": 3.5124568939208984, "reward_std": 0.24802593886852264, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5124570727348328, "rewards/wrapped_format_reward": 1.0, "step": 473 }, { "completion_length": 750.0, "epoch": 18.96, "grad_norm": 0.42341163754463196, "kl": 0.9567270278930664, "learning_rate": 4.435981610205464e-06, "loss": 0.0383, "reward": 2.988504648208618, "reward_std": 0.34725552797317505, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1135048121213913, "rewards/wrapped_format_reward": 0.875, "step": 474 }, { "completion_length": 750.0, "epoch": 19.0, "grad_norm": 0.5046776533126831, "kl": 1.2646851539611816, "learning_rate": 4.432526133406843e-06, "loss": 0.0506, "reward": 3.1114625930786133, "reward_std": 0.835891842842102, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6114627122879028, "rewards/wrapped_format_reward": 0.5, "step": 475 }, { "completion_length": 750.0, "epoch": 19.04, "grad_norm": 0.4383629262447357, "kl": 0.8644335269927979, "learning_rate": 4.4290614584693005e-06, "loss": 0.0346, "reward": 2.6626029014587402, "reward_std": 0.4813341796398163, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.07331724464893341, "rewards/wrapped_format_reward": 0.625, "step": 476 }, { "completion_length": 750.0, "epoch": 19.08, "grad_norm": 0.46059325337409973, "kl": 0.7996255159378052, "learning_rate": 4.425587601883461e-06, "loss": 0.032, "reward": 1.2542250156402588, "reward_std": 3.507305383682251, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7272727489471436, "rewards/wrapped_driving_reward": -0.9730477929115295, "rewards/wrapped_format_reward": 0.75, "step": 477 }, { "completion_length": 750.0, "epoch": 19.12, "grad_norm": 0.48959171772003174, "kl": 2.0407848358154297, "learning_rate": 4.422104580183649e-06, "loss": 0.0816, "reward": 3.022857189178467, "reward_std": 0.16870427131652832, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.07285712659358978, "rewards/wrapped_format_reward": 1.0, "step": 478 }, { "completion_length": 750.0, "epoch": 19.16, "grad_norm": 0.4211970567703247, "kl": 0.9752914309501648, "learning_rate": 4.418612409947814e-06, "loss": 0.039, "reward": 3.0421531200408936, "reward_std": 0.3262219727039337, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8928571343421936, "rewards/wrapped_driving_reward": 0.14929600059986115, "rewards/wrapped_format_reward": 1.0, "step": 479 }, { "completion_length": 750.0, "epoch": 19.2, "grad_norm": 0.3921389877796173, "kl": 0.8673704862594604, "learning_rate": 4.415111107797445e-06, "loss": 0.0347, "reward": -1.6749999523162842, "reward_std": 0.39475730061531067, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.824999988079071, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 480 }, { "completion_length": 750.0, "epoch": 19.24, "grad_norm": 0.49681219458580017, "kl": 1.1280070543289185, "learning_rate": 4.4116006903975015e-06, "loss": 0.0451, "reward": 2.109313726425171, "reward_std": 1.7568812370300293, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9045454263687134, "rewards/wrapped_driving_reward": -0.5452316403388977, "rewards/wrapped_format_reward": 0.75, "step": 481 }, { "completion_length": 750.0, "epoch": 19.28, "grad_norm": 0.5449213981628418, "kl": 1.4810718297958374, "learning_rate": 4.408081174456322e-06, "loss": 0.0592, "reward": 2.895340919494629, "reward_std": 0.3929472267627716, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.14534106850624084, "rewards/wrapped_format_reward": 0.75, "step": 482 }, { "completion_length": 750.0, "epoch": 19.32, "grad_norm": 0.46818554401397705, "kl": 1.1236234903335571, "learning_rate": 4.404552576725557e-06, "loss": 0.0449, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 483 }, { "completion_length": 554.0, "epoch": 19.36, "grad_norm": 0.6208794116973877, "kl": 1.3854421377182007, "learning_rate": 4.401014914000078e-06, "loss": 0.0554, "reward": 3.256164073944092, "reward_std": 0.4106229543685913, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5061641335487366, "rewards/wrapped_format_reward": 0.75, "step": 484 }, { "completion_length": 511.0, "epoch": 19.4, "grad_norm": 0.5070251822471619, "kl": 1.0283787250518799, "learning_rate": 4.397468203117905e-06, "loss": 0.0411, "reward": 3.588742256164551, "reward_std": 0.2591555416584015, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.6244565844535828, "rewards/wrapped_format_reward": 1.0, "step": 485 }, { "completion_length": 750.0, "epoch": 19.44, "grad_norm": 0.4414016604423523, "kl": 1.3710557222366333, "learning_rate": 4.393912460960125e-06, "loss": 0.0548, "reward": 2.9028897285461426, "reward_std": 0.3639879524707794, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.09711016714572906, "rewards/wrapped_format_reward": 1.0, "step": 486 }, { "completion_length": 750.0, "epoch": 19.48, "grad_norm": 0.8334219455718994, "kl": 1.6127806901931763, "learning_rate": 4.3903477044508066e-06, "loss": 0.0645, "reward": -1.5, "reward_std": 0.40824830532073975, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 487 }, { "completion_length": 656.0, "epoch": 19.52, "grad_norm": 0.4386281371116638, "kl": 0.916235089302063, "learning_rate": 4.386773950556931e-06, "loss": 0.0366, "reward": 2.9088566303253174, "reward_std": 0.24869580566883087, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454382896423, "rewards/wrapped_driving_reward": -0.045688893646001816, "rewards/wrapped_format_reward": 1.0, "step": 488 }, { "completion_length": 741.0, "epoch": 19.56, "grad_norm": 0.4149533212184906, "kl": 0.9770787358283997, "learning_rate": 4.3831912162882946e-06, "loss": 0.0391, "reward": 1.848495602607727, "reward_std": 2.5709009170532227, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.901504397392273, "rewards/wrapped_format_reward": 0.75, "step": 489 }, { "completion_length": 587.0, "epoch": 19.6, "grad_norm": 0.4931059181690216, "kl": 0.6863958835601807, "learning_rate": 4.379599518697444e-06, "loss": 0.0275, "reward": 1.1175979375839233, "reward_std": 2.750753402709961, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2574020624160767, "rewards/wrapped_format_reward": 0.875, "step": 490 }, { "completion_length": 543.0, "epoch": 19.64, "grad_norm": 0.5057757496833801, "kl": 1.1397721767425537, "learning_rate": 4.375998874879585e-06, "loss": 0.0456, "reward": 3.4474494457244873, "reward_std": 0.37866705656051636, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4474495053291321, "rewards/wrapped_format_reward": 1.0, "step": 491 }, { "completion_length": 750.0, "epoch": 19.68, "grad_norm": 0.46242570877075195, "kl": 1.0390926599502563, "learning_rate": 4.372389301972506e-06, "loss": 0.0416, "reward": 1.491578459739685, "reward_std": 3.673814058303833, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7584214806556702, "rewards/wrapped_format_reward": 0.75, "step": 492 }, { "completion_length": 750.0, "epoch": 19.72, "grad_norm": 0.38337427377700806, "kl": 0.9781420230865479, "learning_rate": 4.368770817156493e-06, "loss": 0.0391, "reward": 1.3044135570526123, "reward_std": 2.237086534500122, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -1.2928086519241333, "rewards/wrapped_format_reward": 0.625, "step": 493 }, { "completion_length": 528.0, "epoch": 19.76, "grad_norm": 0.5733584761619568, "kl": 0.773429811000824, "learning_rate": 4.365143437654249e-06, "loss": 0.0309, "reward": 3.358289957046509, "reward_std": 0.4428289532661438, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.35829001665115356, "rewards/wrapped_format_reward": 1.0, "step": 494 }, { "completion_length": 750.0, "epoch": 19.8, "grad_norm": 0.38473784923553467, "kl": 0.9460413455963135, "learning_rate": 4.3615071807308165e-06, "loss": 0.0378, "reward": 2.6372628211975098, "reward_std": 0.507134735584259, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9027777910232544, "rewards/wrapped_driving_reward": -0.015514791011810303, "rewards/wrapped_format_reward": 0.75, "step": 495 }, { "completion_length": 750.0, "epoch": 19.84, "grad_norm": 0.5427440404891968, "kl": 1.3654718399047852, "learning_rate": 4.357862063693486e-06, "loss": 0.0546, "reward": -1.053030252456665, "reward_std": 0.07872962206602097, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9469696879386902, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 496 }, { "completion_length": 750.0, "epoch": 19.88, "grad_norm": 0.6441575884819031, "kl": 1.5351756811141968, "learning_rate": 4.354208103891723e-06, "loss": 0.0614, "reward": 2.9220848083496094, "reward_std": 0.5418745875358582, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.46375155448913574, "rewards/wrapped_format_reward": 0.5, "step": 497 }, { "completion_length": 750.0, "epoch": 19.92, "grad_norm": 0.3897554874420166, "kl": 0.6224919557571411, "learning_rate": 4.350545318717081e-06, "loss": 0.0249, "reward": 0.7557200193405151, "reward_std": 2.027337074279785, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.1192798614501953, "rewards/wrapped_format_reward": 0.875, "step": 498 }, { "completion_length": 539.0, "epoch": 19.96, "grad_norm": 0.501442015171051, "kl": 0.8325724601745605, "learning_rate": 4.3468737256031155e-06, "loss": 0.0333, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 499 }, { "completion_length": 750.0, "epoch": 20.0, "grad_norm": 0.3670285940170288, "kl": 1.4814090728759766, "learning_rate": 4.34319334202531e-06, "loss": 0.0593, "reward": 2.8780927658081055, "reward_std": 0.0776260644197464, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -0.0969071239233017, "rewards/wrapped_format_reward": 1.0, "step": 500 }, { "completion_length": 720.0, "epoch": 20.04, "grad_norm": 0.5521990060806274, "kl": 1.595299243927002, "learning_rate": 4.339504185500984e-06, "loss": 0.0638, "reward": 1.5800509452819824, "reward_std": 3.053708791732788, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9199489951133728, "rewards/wrapped_format_reward": 1.0, "step": 501 }, { "completion_length": 750.0, "epoch": 20.08, "grad_norm": 0.45915642380714417, "kl": 1.1580920219421387, "learning_rate": 4.335806273589214e-06, "loss": 0.0463, "reward": -1.4583332538604736, "reward_std": 0.5335936546325684, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 502 }, { "completion_length": 515.0, "epoch": 20.12, "grad_norm": 0.5437774062156677, "kl": 1.1531405448913574, "learning_rate": 4.332099623890749e-06, "loss": 0.0461, "reward": 3.0214743614196777, "reward_std": 0.1465667486190796, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": 0.052724581211805344, "rewards/wrapped_format_reward": 1.0, "step": 503 }, { "completion_length": 750.0, "epoch": 20.16, "grad_norm": 0.661466121673584, "kl": 1.5797182321548462, "learning_rate": 4.328384254047927e-06, "loss": 0.0632, "reward": 3.216904640197754, "reward_std": 0.47574853897094727, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.935606062412262, "rewards/wrapped_driving_reward": 0.2812984585762024, "rewards/wrapped_format_reward": 1.0, "step": 504 }, { "completion_length": 619.0, "epoch": 20.2, "grad_norm": 0.535003662109375, "kl": 1.1837904453277588, "learning_rate": 4.324660181744589e-06, "loss": 0.0474, "reward": 2.576167345046997, "reward_std": 0.9729455709457397, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4238327443599701, "rewards/wrapped_format_reward": 1.0, "step": 505 }, { "completion_length": 679.0, "epoch": 20.24, "grad_norm": 0.6109884977340698, "kl": 0.4138145446777344, "learning_rate": 4.320927424706001e-06, "loss": 0.0166, "reward": 2.9425160884857178, "reward_std": 0.9789575338363647, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8333333730697632, "rewards/wrapped_driving_reward": 0.35918280482292175, "rewards/wrapped_format_reward": 0.75, "step": 506 }, { "completion_length": 483.0, "epoch": 20.28, "grad_norm": 0.4926319122314453, "kl": 0.7010176777839661, "learning_rate": 4.317186000698761e-06, "loss": 0.028, "reward": 2.29917049407959, "reward_std": 0.5825653076171875, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.6781020760536194, "rewards/wrapped_format_reward": 1.0, "step": 507 }, { "completion_length": 696.0, "epoch": 20.32, "grad_norm": 0.3954756557941437, "kl": 0.993597686290741, "learning_rate": 4.313435927530719e-06, "loss": 0.0397, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 508 }, { "completion_length": 381.0, "epoch": 20.36, "grad_norm": 0.5946969985961914, "kl": 0.3980226218700409, "learning_rate": 4.309677223050895e-06, "loss": 0.0159, "reward": 1.2032806873321533, "reward_std": 2.8211827278137207, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1717194318771362, "rewards/wrapped_format_reward": 0.875, "step": 509 }, { "completion_length": 750.0, "epoch": 20.4, "grad_norm": 0.4983665943145752, "kl": 0.5070443749427795, "learning_rate": 4.305909905149389e-06, "loss": 0.0203, "reward": -1.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 510 }, { "completion_length": 750.0, "epoch": 20.44, "grad_norm": 0.43326103687286377, "kl": 1.5419466495513916, "learning_rate": 4.3021339917572975e-06, "loss": 0.0617, "reward": 2.3160929679870605, "reward_std": 0.9564554691314697, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5589069128036499, "rewards/wrapped_format_reward": 0.875, "step": 511 }, { "completion_length": 750.0, "epoch": 20.48, "grad_norm": 0.5289414525032043, "kl": 1.1936659812927246, "learning_rate": 4.2983495008466285e-06, "loss": 0.0477, "reward": 0.6631726026535034, "reward_std": 1.2805912494659424, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.086827278137207, "rewards/wrapped_format_reward": 0.75, "step": 512 }, { "completion_length": 630.0, "epoch": 20.52, "grad_norm": 0.7556226253509521, "kl": 1.03369140625, "learning_rate": 4.294556450430216e-06, "loss": 0.0413, "reward": 2.696643352508545, "reward_std": 0.5111956596374512, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": -0.2676423490047455, "rewards/wrapped_format_reward": 1.0, "step": 513 }, { "completion_length": 750.0, "epoch": 20.56, "grad_norm": 0.6190569996833801, "kl": 0.35693028569221497, "learning_rate": 4.290754858561636e-06, "loss": 0.0143, "reward": 2.3937301635742188, "reward_std": 0.33194005489349365, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.143730029463768, "rewards/wrapped_format_reward": 0.25, "step": 514 }, { "completion_length": 428.0, "epoch": 20.6, "grad_norm": 0.7335030436515808, "kl": 0.48697853088378906, "learning_rate": 4.2869447433351165e-06, "loss": 0.0195, "reward": 3.3049356937408447, "reward_std": 0.6103650331497192, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4299355745315552, "rewards/wrapped_format_reward": 0.875, "step": 515 }, { "completion_length": 750.0, "epoch": 20.64, "grad_norm": 0.5130233764648438, "kl": 1.0960686206817627, "learning_rate": 4.283126122885455e-06, "loss": 0.0438, "reward": 2.9208106994628906, "reward_std": 0.15237730741500854, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.17081058025360107, "rewards/wrapped_format_reward": 0.75, "step": 516 }, { "completion_length": 682.0, "epoch": 20.68, "grad_norm": 1.5805164575576782, "kl": 1.2938767671585083, "learning_rate": 4.2792990153879286e-06, "loss": 0.0518, "reward": 2.5840139389038086, "reward_std": 0.5336389541625977, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08401414752006531, "rewards/wrapped_format_reward": 0.5, "step": 517 }, { "completion_length": 518.0, "epoch": 20.72, "grad_norm": 0.795953094959259, "kl": 1.048414945602417, "learning_rate": 4.275463439058214e-06, "loss": 0.0419, "reward": 3.040872097015381, "reward_std": 0.6307140588760376, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8392857313156128, "rewards/wrapped_driving_reward": 0.20158648490905762, "rewards/wrapped_format_reward": 1.0, "step": 518 }, { "completion_length": 750.0, "epoch": 20.76, "grad_norm": 1.237001657485962, "kl": 1.0653915405273438, "learning_rate": 4.271619412152293e-06, "loss": 0.0426, "reward": 2.947390079498291, "reward_std": 0.2647410035133362, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.03072343021631241, "rewards/wrapped_format_reward": 1.0, "step": 519 }, { "completion_length": 750.0, "epoch": 20.8, "grad_norm": 2.3102457523345947, "kl": 1.6519412994384766, "learning_rate": 4.267766952966369e-06, "loss": 0.0661, "reward": 2.849301338195801, "reward_std": 0.4467388987541199, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09930121898651123, "rewards/wrapped_format_reward": 0.75, "step": 520 }, { "completion_length": 450.0, "epoch": 20.84, "grad_norm": 0.6221670508384705, "kl": 0.32503193616867065, "learning_rate": 4.2639060798367835e-06, "loss": 0.013, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 521 }, { "completion_length": 690.0, "epoch": 20.88, "grad_norm": 0.8544734120368958, "kl": 0.9178805351257324, "learning_rate": 4.260036811139922e-06, "loss": 0.0367, "reward": 1.6469182968139648, "reward_std": 3.2096569538116455, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6030816435813904, "rewards/wrapped_format_reward": 0.75, "step": 522 }, { "completion_length": 750.0, "epoch": 20.92, "grad_norm": 1.0491329431533813, "kl": 1.2910516262054443, "learning_rate": 4.25615916529213e-06, "loss": 0.0516, "reward": 2.478778839111328, "reward_std": 0.7036353945732117, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.1878880113363266, "rewards/wrapped_format_reward": 0.75, "step": 523 }, { "completion_length": 639.0, "epoch": 20.96, "grad_norm": 0.7800697088241577, "kl": 0.911841869354248, "learning_rate": 4.2522731607496275e-06, "loss": 0.0365, "reward": 3.115720272064209, "reward_std": 0.08318884670734406, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.11572031676769257, "rewards/wrapped_format_reward": 1.0, "step": 524 }, { "completion_length": 750.0, "epoch": 21.0, "grad_norm": 0.4819512367248535, "kl": 0.9187918901443481, "learning_rate": 4.248378816008418e-06, "loss": 0.0368, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 525 }, { "completion_length": 750.0, "epoch": 21.04, "grad_norm": 1.4163870811462402, "kl": 1.6186529397964478, "learning_rate": 4.244476149604201e-06, "loss": 0.0647, "reward": 2.948265552520752, "reward_std": 0.6075405478477478, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9545454382896423, "rewards/wrapped_driving_reward": 0.11871998757123947, "rewards/wrapped_format_reward": 0.875, "step": 526 }, { "completion_length": 750.0, "epoch": 21.08, "grad_norm": 0.9614571332931519, "kl": 0.9680662751197815, "learning_rate": 4.2405651801122835e-06, "loss": 0.0387, "reward": -1.6165865659713745, "reward_std": 0.4794272482395172, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8834134340286255, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 527 }, { "completion_length": 750.0, "epoch": 21.12, "grad_norm": 0.7541413307189941, "kl": 1.3542858362197876, "learning_rate": 4.236645926147493e-06, "loss": 0.0542, "reward": 2.6069326400756836, "reward_std": 0.21937526762485504, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -0.372234046459198, "rewards/wrapped_format_reward": 1.0, "step": 528 }, { "completion_length": 512.0, "epoch": 21.16, "grad_norm": 0.5188172459602356, "kl": 0.6754733920097351, "learning_rate": 4.2327184063640905e-06, "loss": 0.027, "reward": 3.0157723426818848, "reward_std": 0.20528268814086914, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.14077217876911163, "rewards/wrapped_format_reward": 1.0, "step": 529 }, { "completion_length": 750.0, "epoch": 21.2, "grad_norm": 0.5220324397087097, "kl": 1.3572345972061157, "learning_rate": 4.228782639455674e-06, "loss": 0.0543, "reward": 2.6850199699401855, "reward_std": 0.41600048542022705, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06001979485154152, "rewards/wrapped_format_reward": 0.625, "step": 530 }, { "completion_length": 750.0, "epoch": 21.24, "grad_norm": 0.5855657458305359, "kl": 0.9850445985794067, "learning_rate": 4.224838644155099e-06, "loss": 0.0394, "reward": 0.7505922317504883, "reward_std": 3.3031928539276123, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.4994077682495117, "rewards/wrapped_format_reward": 0.75, "step": 531 }, { "completion_length": 560.0, "epoch": 21.28, "grad_norm": 0.5909777283668518, "kl": 1.309668779373169, "learning_rate": 4.220886439234385e-06, "loss": 0.0524, "reward": 1.8188085556030273, "reward_std": 3.256636619567871, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.43119144439697266, "rewards/wrapped_format_reward": 0.75, "step": 532 }, { "completion_length": 750.0, "epoch": 21.32, "grad_norm": 0.38810762763023376, "kl": 1.0575664043426514, "learning_rate": 4.216926043504626e-06, "loss": 0.0423, "reward": 2.893254041671753, "reward_std": 0.7650725245475769, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2682541012763977, "rewards/wrapped_format_reward": 0.625, "step": 533 }, { "completion_length": 711.0, "epoch": 21.36, "grad_norm": 0.560529887676239, "kl": 1.4421268701553345, "learning_rate": 4.212957475815898e-06, "loss": 0.0577, "reward": 3.511518955230713, "reward_std": 0.41801729798316956, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.5740189552307129, "rewards/wrapped_format_reward": 1.0, "step": 534 }, { "completion_length": 486.0, "epoch": 21.4, "grad_norm": 0.9958294630050659, "kl": 0.6733448505401611, "learning_rate": 4.2089807550571786e-06, "loss": 0.0269, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 535 }, { "completion_length": 750.0, "epoch": 21.44, "grad_norm": 0.5459821820259094, "kl": 1.3347562551498413, "learning_rate": 4.204995900156247e-06, "loss": 0.0534, "reward": 3.5074427127838135, "reward_std": 0.6772680878639221, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5074427127838135, "rewards/wrapped_format_reward": 1.0, "step": 536 }, { "completion_length": 750.0, "epoch": 21.48, "grad_norm": 0.5361037850379944, "kl": 1.1742392778396606, "learning_rate": 4.2010029300795986e-06, "loss": 0.047, "reward": -1.625, "reward_std": 1.25, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 537 }, { "completion_length": 516.0, "epoch": 21.52, "grad_norm": 12.987887382507324, "kl": 2.4212050437927246, "learning_rate": 4.197001863832355e-06, "loss": 0.0968, "reward": 2.888948678970337, "reward_std": 0.5455219745635986, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": -0.039622798562049866, "rewards/wrapped_format_reward": 1.0, "step": 538 }, { "completion_length": 750.0, "epoch": 21.56, "grad_norm": 0.40430188179016113, "kl": 0.3120957911014557, "learning_rate": 4.192992720458172e-06, "loss": 0.0125, "reward": 2.160168170928955, "reward_std": 0.5079329013824463, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21483179926872253, "rewards/wrapped_format_reward": 0.375, "step": 539 }, { "completion_length": 735.0, "epoch": 21.6, "grad_norm": 0.48742803931236267, "kl": 1.1038862466812134, "learning_rate": 4.188975519039151e-06, "loss": 0.0442, "reward": 2.9028220176696777, "reward_std": 0.44946083426475525, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": 0.20282205939292908, "rewards/wrapped_format_reward": 0.75, "step": 540 }, { "completion_length": 750.0, "epoch": 21.64, "grad_norm": 0.40997225046157837, "kl": 1.483988642692566, "learning_rate": 4.184950278695745e-06, "loss": 0.0594, "reward": 3.176731586456299, "reward_std": 1.1836580038070679, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.426731675863266, "rewards/wrapped_format_reward": 0.75, "step": 541 }, { "completion_length": 750.0, "epoch": 21.68, "grad_norm": 0.6240445375442505, "kl": 1.3704639673233032, "learning_rate": 4.18091701858667e-06, "loss": 0.0548, "reward": 1.4026062488555908, "reward_std": 3.274519920349121, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7223937511444092, "rewards/wrapped_format_reward": 0.625, "step": 542 }, { "completion_length": 750.0, "epoch": 21.72, "grad_norm": 0.4963267147541046, "kl": 1.1881283521652222, "learning_rate": 4.1768757579088145e-06, "loss": 0.0475, "reward": 2.731823682785034, "reward_std": 0.5495507717132568, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1068236380815506, "rewards/wrapped_format_reward": 0.625, "step": 543 }, { "completion_length": 566.0, "epoch": 21.76, "grad_norm": 0.6541410088539124, "kl": 1.4289416074752808, "learning_rate": 4.172826515897146e-06, "loss": 0.0572, "reward": -1.1666667461395264, "reward_std": 0.3333333730697632, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 544 }, { "completion_length": 540.0, "epoch": 21.8, "grad_norm": 0.46453943848609924, "kl": 0.6854754686355591, "learning_rate": 4.168769311824619e-06, "loss": 0.0274, "reward": 3.0061306953430176, "reward_std": 0.44360265135765076, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.028858043253421783, "rewards/wrapped_format_reward": 1.0, "step": 545 }, { "completion_length": 750.0, "epoch": 21.84, "grad_norm": 0.42537233233451843, "kl": 1.2528382539749146, "learning_rate": 4.164704165002086e-06, "loss": 0.0501, "reward": 2.6121349334716797, "reward_std": 0.5790495276451111, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.1378651112318039, "rewards/wrapped_format_reward": 0.75, "step": 546 }, { "completion_length": 750.0, "epoch": 21.88, "grad_norm": 0.42918747663497925, "kl": 1.3132213354110718, "learning_rate": 4.160631094778205e-06, "loss": 0.0525, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 547 }, { "completion_length": 750.0, "epoch": 21.92, "grad_norm": 0.6208239793777466, "kl": 1.4367201328277588, "learning_rate": 4.1565501205393445e-06, "loss": 0.0575, "reward": 2.702597141265869, "reward_std": 0.7576584219932556, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17240279912948608, "rewards/wrapped_format_reward": 0.875, "step": 548 }, { "completion_length": 750.0, "epoch": 21.96, "grad_norm": 0.514549732208252, "kl": 1.6398608684539795, "learning_rate": 4.152461261709494e-06, "loss": 0.0656, "reward": 2.376189947128296, "reward_std": 0.6441675424575806, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.2488100230693817, "rewards/wrapped_format_reward": 0.625, "step": 549 }, { "completion_length": 451.0, "epoch": 22.0, "grad_norm": 0.5030242800712585, "kl": 1.0479674339294434, "learning_rate": 4.1483645377501726e-06, "loss": 0.0419, "reward": 3.0598669052124023, "reward_std": 0.3327578008174896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.18486672639846802, "rewards/wrapped_format_reward": 0.875, "step": 550 }, { "completion_length": 622.0, "epoch": 22.04, "grad_norm": 0.38805630803108215, "kl": 0.7481611371040344, "learning_rate": 4.144259968160332e-06, "loss": 0.0299, "reward": 3.129121780395508, "reward_std": 0.25338175892829895, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.12912192940711975, "rewards/wrapped_format_reward": 1.0, "step": 551 }, { "completion_length": 494.0, "epoch": 22.08, "grad_norm": 0.550291895866394, "kl": 1.145378828048706, "learning_rate": 4.140147572476269e-06, "loss": 0.0458, "reward": 2.7440075874328613, "reward_std": 0.8928554654121399, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.25599223375320435, "rewards/wrapped_format_reward": 1.0, "step": 552 }, { "completion_length": 542.0, "epoch": 22.12, "grad_norm": 0.47073522210121155, "kl": 1.0155128240585327, "learning_rate": 4.136027370271526e-06, "loss": 0.0406, "reward": 1.1688520908355713, "reward_std": 2.8173623085021973, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -0.9353145360946655, "rewards/wrapped_format_reward": 0.75, "step": 553 }, { "completion_length": 750.0, "epoch": 22.16, "grad_norm": 0.7193614840507507, "kl": 1.4088541269302368, "learning_rate": 4.1318993811568065e-06, "loss": 0.0564, "reward": 3.4254112243652344, "reward_std": 0.43707841634750366, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4254113435745239, "rewards/wrapped_format_reward": 1.0, "step": 554 }, { "completion_length": 750.0, "epoch": 22.2, "grad_norm": 0.3912142515182495, "kl": 1.4723032712936401, "learning_rate": 4.127763624779873e-06, "loss": 0.0589, "reward": 0.24002844095230103, "reward_std": 1.8333266973495483, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.6349716186523438, "rewards/wrapped_format_reward": 0.875, "step": 555 }, { "completion_length": 612.0, "epoch": 22.24, "grad_norm": 0.8369085192680359, "kl": 1.3998736143112183, "learning_rate": 4.123620120825459e-06, "loss": 0.056, "reward": 2.403351306915283, "reward_std": 0.4577612578868866, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.054981887340545654, "rewards/wrapped_format_reward": 0.5, "step": 556 }, { "completion_length": 477.0, "epoch": 22.28, "grad_norm": 0.5052926540374756, "kl": 0.6547082662582397, "learning_rate": 4.119468889015175e-06, "loss": 0.0262, "reward": 1.30754554271698, "reward_std": 2.665022611618042, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.69245445728302, "rewards/wrapped_format_reward": 1.0, "step": 557 }, { "completion_length": 739.0, "epoch": 22.32, "grad_norm": 0.6290760636329651, "kl": 1.5631662607192993, "learning_rate": 4.11530994910741e-06, "loss": 0.0625, "reward": -1.0499999523162842, "reward_std": 0.10000002384185791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 558 }, { "completion_length": 364.0, "epoch": 22.36, "grad_norm": 0.6056944727897644, "kl": 0.5589709877967834, "learning_rate": 4.111143320897244e-06, "loss": 0.0224, "reward": 3.2965807914733887, "reward_std": 0.5819244384765625, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2965807318687439, "rewards/wrapped_format_reward": 1.0, "step": 559 }, { "completion_length": 682.0, "epoch": 22.4, "grad_norm": 0.4512479305267334, "kl": 1.2240394353866577, "learning_rate": 4.106969024216348e-06, "loss": 0.049, "reward": 2.993767738342285, "reward_std": 0.24015867710113525, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.006232157349586487, "rewards/wrapped_format_reward": 1.0, "step": 560 }, { "completion_length": 750.0, "epoch": 22.44, "grad_norm": 0.4886869490146637, "kl": 1.3827241659164429, "learning_rate": 4.102787078932896e-06, "loss": 0.0553, "reward": 2.665478229522705, "reward_std": 0.4790554344654083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.08452148735523224, "rewards/wrapped_format_reward": 0.75, "step": 561 }, { "completion_length": 673.0, "epoch": 22.48, "grad_norm": 0.3894011080265045, "kl": 1.2513083219528198, "learning_rate": 4.098597504951462e-06, "loss": 0.0501, "reward": -1.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 562 }, { "completion_length": 631.0, "epoch": 22.52, "grad_norm": 0.4578774869441986, "kl": 1.1499855518341064, "learning_rate": 4.094400322212933e-06, "loss": 0.046, "reward": 2.9452905654907227, "reward_std": 0.7577587366104126, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.05470933020114899, "rewards/wrapped_format_reward": 1.0, "step": 563 }, { "completion_length": 750.0, "epoch": 22.56, "grad_norm": 0.423875093460083, "kl": 0.8647013902664185, "learning_rate": 4.09019555069441e-06, "loss": 0.0346, "reward": 2.480205535888672, "reward_std": 0.4056702256202698, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.10520540177822113, "rewards/wrapped_format_reward": 0.375, "step": 564 }, { "completion_length": 750.0, "epoch": 22.6, "grad_norm": 0.39600902795791626, "kl": 1.215203881263733, "learning_rate": 4.085983210409114e-06, "loss": 0.0486, "reward": 2.527067184448242, "reward_std": 0.6684492826461792, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8928571343421936, "rewards/wrapped_driving_reward": 0.13421006500720978, "rewards/wrapped_format_reward": 0.5, "step": 565 }, { "completion_length": 492.0, "epoch": 22.64, "grad_norm": 4.348442554473877, "kl": 0.9590518474578857, "learning_rate": 4.081763321406291e-06, "loss": 0.0384, "reward": -1.0499999523162842, "reward_std": 0.10000002384185791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 566 }, { "completion_length": 750.0, "epoch": 22.68, "grad_norm": 0.5797974467277527, "kl": 1.4900603294372559, "learning_rate": 4.077535903771115e-06, "loss": 0.0596, "reward": 3.0958666801452637, "reward_std": 0.3664493262767792, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.22086681425571442, "rewards/wrapped_format_reward": 0.875, "step": 567 }, { "completion_length": 600.0, "epoch": 22.72, "grad_norm": 0.5184460878372192, "kl": 1.305143117904663, "learning_rate": 4.073300977624594e-06, "loss": 0.0522, "reward": 2.9294841289520264, "reward_std": 0.5147085189819336, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8958333134651184, "rewards/wrapped_driving_reward": 0.28365081548690796, "rewards/wrapped_format_reward": 0.75, "step": 568 }, { "completion_length": 750.0, "epoch": 22.76, "grad_norm": 0.9449083209037781, "kl": 1.0648530721664429, "learning_rate": 4.069058563123476e-06, "loss": 0.0426, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 569 }, { "completion_length": 750.0, "epoch": 22.8, "grad_norm": 0.6300418376922607, "kl": 1.4819085597991943, "learning_rate": 4.064808680460149e-06, "loss": 0.0593, "reward": 2.6562626361846924, "reward_std": 0.36848315596580505, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.21873745322227478, "rewards/wrapped_format_reward": 0.875, "step": 570 }, { "completion_length": 681.0, "epoch": 22.84, "grad_norm": 0.45206159353256226, "kl": 0.7465510368347168, "learning_rate": 4.060551349862545e-06, "loss": 0.0299, "reward": 2.4971559047698975, "reward_std": 0.5323460102081299, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3778441548347473, "rewards/wrapped_format_reward": 0.875, "step": 571 }, { "completion_length": 750.0, "epoch": 22.88, "grad_norm": 0.49330395460128784, "kl": 1.0091811418533325, "learning_rate": 4.056286591594049e-06, "loss": 0.0404, "reward": 1.2864571809768677, "reward_std": 2.8978612422943115, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9635427594184875, "rewards/wrapped_format_reward": 0.75, "step": 572 }, { "completion_length": 750.0, "epoch": 22.92, "grad_norm": 1.236677885055542, "kl": 1.20204758644104, "learning_rate": 4.052014425953399e-06, "loss": 0.0481, "reward": -1.2884615659713745, "reward_std": 0.4798709750175476, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9615384340286255, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 573 }, { "completion_length": 750.0, "epoch": 22.96, "grad_norm": 0.36019617319107056, "kl": 1.6590219736099243, "learning_rate": 4.047734873274586e-06, "loss": 0.0664, "reward": 3.00655460357666, "reward_std": 0.28616681694984436, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.13155463337898254, "rewards/wrapped_format_reward": 0.875, "step": 574 }, { "completion_length": 577.0, "epoch": 23.0, "grad_norm": 1.064710259437561, "kl": 1.6679836511611938, "learning_rate": 4.043447953926763e-06, "loss": 0.0667, "reward": 3.2319164276123047, "reward_std": 0.5175439119338989, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.23191656172275543, "rewards/wrapped_format_reward": 1.0, "step": 575 }, { "completion_length": 750.0, "epoch": 23.04, "grad_norm": 0.46895185112953186, "kl": 1.807165265083313, "learning_rate": 4.039153688314146e-06, "loss": 0.0723, "reward": -1.0833332538604736, "reward_std": 0.16666662693023682, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 576 }, { "completion_length": 750.0, "epoch": 23.08, "grad_norm": 0.3290964365005493, "kl": 1.4155315160751343, "learning_rate": 4.034852096875917e-06, "loss": 0.0566, "reward": 1.61959969997406, "reward_std": 2.1258249282836914, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.00540030002594, "rewards/wrapped_format_reward": 0.625, "step": 577 }, { "completion_length": 750.0, "epoch": 23.12, "grad_norm": 0.5254642367362976, "kl": 0.9085728526115417, "learning_rate": 4.0305432000861236e-06, "loss": 0.0363, "reward": 2.5806849002838135, "reward_std": 0.534013032913208, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.018184844404459, "rewards/wrapped_format_reward": 0.625, "step": 578 }, { "completion_length": 485.0, "epoch": 23.16, "grad_norm": 0.5562568306922913, "kl": 1.061842918395996, "learning_rate": 4.026227018453587e-06, "loss": 0.0425, "reward": 1.7475911378860474, "reward_std": 0.9556852579116821, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.0024088621139526, "rewards/wrapped_format_reward": 0.75, "step": 579 }, { "completion_length": 750.0, "epoch": 23.2, "grad_norm": 0.7456584572792053, "kl": 1.79913330078125, "learning_rate": 4.021903572521802e-06, "loss": 0.072, "reward": 3.1010560989379883, "reward_std": 0.11367816478013992, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.10105594992637634, "rewards/wrapped_format_reward": 1.0, "step": 580 }, { "completion_length": 750.0, "epoch": 23.24, "grad_norm": 0.3952648341655731, "kl": 1.7831506729125977, "learning_rate": 4.0175728828688355e-06, "loss": 0.0713, "reward": 3.3662846088409424, "reward_std": 0.44146811962127686, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9565972089767456, "rewards/wrapped_driving_reward": 0.40968745946884155, "rewards/wrapped_format_reward": 1.0, "step": 581 }, { "completion_length": 750.0, "epoch": 23.28, "grad_norm": 0.5361789464950562, "kl": 1.3672832250595093, "learning_rate": 4.013234970107236e-06, "loss": 0.0547, "reward": 2.194763422012329, "reward_std": 0.9637414216995239, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8052366375923157, "rewards/wrapped_format_reward": 1.0, "step": 582 }, { "completion_length": 559.0, "epoch": 23.32, "grad_norm": 0.5380634069442749, "kl": 1.4674052000045776, "learning_rate": 4.0088898548839285e-06, "loss": 0.0587, "reward": 2.7343053817749023, "reward_std": 0.47148850560188293, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.10930530726909637, "rewards/wrapped_format_reward": 0.625, "step": 583 }, { "completion_length": 750.0, "epoch": 23.36, "grad_norm": 0.4427616000175476, "kl": 0.7964285612106323, "learning_rate": 4.0045375578801216e-06, "loss": 0.0319, "reward": 1.0508185625076294, "reward_std": 3.381551742553711, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7142857313156128, "rewards/wrapped_driving_reward": -1.0384670495986938, "rewards/wrapped_format_reward": 0.625, "step": 584 }, { "completion_length": 750.0, "epoch": 23.4, "grad_norm": 0.4768621623516083, "kl": 0.8902795910835266, "learning_rate": 4.000178099811203e-06, "loss": 0.0356, "reward": 0.9182654619216919, "reward_std": 2.618046760559082, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9567345976829529, "rewards/wrapped_format_reward": 0.375, "step": 585 }, { "completion_length": 561.0, "epoch": 23.44, "grad_norm": 0.4203995168209076, "kl": 0.7094324231147766, "learning_rate": 3.995811501426648e-06, "loss": 0.0284, "reward": 2.076686143875122, "reward_std": 2.1102030277252197, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.9233137965202332, "rewards/wrapped_format_reward": 1.0, "step": 586 }, { "completion_length": 655.0, "epoch": 23.48, "grad_norm": 0.6416264772415161, "kl": 1.6330536603927612, "learning_rate": 3.991437783509916e-06, "loss": 0.0653, "reward": 3.16111421585083, "reward_std": 0.5641868710517883, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.3486141860485077, "rewards/wrapped_format_reward": 0.875, "step": 587 }, { "completion_length": 750.0, "epoch": 23.52, "grad_norm": 0.5578856468200684, "kl": 1.2766082286834717, "learning_rate": 3.987056966878354e-06, "loss": 0.0511, "reward": -1.5277777910232544, "reward_std": 0.3643020987510681, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 588 }, { "completion_length": 580.0, "epoch": 23.56, "grad_norm": 0.6553155779838562, "kl": 1.447812795639038, "learning_rate": 3.982669072383093e-06, "loss": 0.0579, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 589 }, { "completion_length": 472.0, "epoch": 23.6, "grad_norm": 0.515069842338562, "kl": 0.9640145897865295, "learning_rate": 3.978274120908957e-06, "loss": 0.0386, "reward": -1.21875, "reward_std": 0.21347814798355103, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.90625, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 590 }, { "completion_length": 610.0, "epoch": 23.64, "grad_norm": 0.4059649109840393, "kl": 1.3692982196807861, "learning_rate": 3.973872133374354e-06, "loss": 0.0548, "reward": 3.1105618476867676, "reward_std": 0.07002107799053192, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9270833134651184, "rewards/wrapped_driving_reward": 0.18347838521003723, "rewards/wrapped_format_reward": 1.0, "step": 591 }, { "completion_length": 750.0, "epoch": 23.68, "grad_norm": 0.7165226936340332, "kl": 1.6977932453155518, "learning_rate": 3.969463130731183e-06, "loss": 0.0679, "reward": 2.785404682159424, "reward_std": 0.4484281837940216, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.03540457785129547, "rewards/wrapped_format_reward": 0.75, "step": 592 }, { "completion_length": 510.0, "epoch": 23.72, "grad_norm": 0.5600489377975464, "kl": 0.7112762928009033, "learning_rate": 3.965047133964735e-06, "loss": 0.0285, "reward": 2.8892576694488525, "reward_std": 0.4215315580368042, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.014257688075304031, "rewards/wrapped_format_reward": 0.875, "step": 593 }, { "completion_length": 750.0, "epoch": 23.76, "grad_norm": 1.5231815576553345, "kl": 1.4100277423858643, "learning_rate": 3.960624164093587e-06, "loss": 0.0564, "reward": 3.6343533992767334, "reward_std": 0.24779343605041504, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6343532800674438, "rewards/wrapped_format_reward": 1.0, "step": 594 }, { "completion_length": 750.0, "epoch": 23.8, "grad_norm": 0.5205547213554382, "kl": 1.2938545942306519, "learning_rate": 3.956194242169506e-06, "loss": 0.0518, "reward": 3.1121644973754883, "reward_std": 0.49655723571777344, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.36216452717781067, "rewards/wrapped_format_reward": 0.75, "step": 595 }, { "completion_length": 606.0, "epoch": 23.84, "grad_norm": 1.4387381076812744, "kl": 0.6888614892959595, "learning_rate": 3.951757389277349e-06, "loss": 0.0276, "reward": 3.201009750366211, "reward_std": 0.42461612820625305, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2010095715522766, "rewards/wrapped_format_reward": 1.0, "step": 596 }, { "completion_length": 750.0, "epoch": 23.88, "grad_norm": 0.6438161730766296, "kl": 1.303391695022583, "learning_rate": 3.947313626534965e-06, "loss": 0.0521, "reward": 2.792616844177246, "reward_std": 0.48476383090019226, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.042616650462150574, "rewards/wrapped_format_reward": 0.75, "step": 597 }, { "completion_length": 696.0, "epoch": 23.92, "grad_norm": 0.4826262891292572, "kl": 0.8998121023178101, "learning_rate": 3.942862975093085e-06, "loss": 0.036, "reward": 3.009612798690796, "reward_std": 0.4383618235588074, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1346127986907959, "rewards/wrapped_format_reward": 0.875, "step": 598 }, { "completion_length": 750.0, "epoch": 23.96, "grad_norm": 0.3816401958465576, "kl": 0.34214648604393005, "learning_rate": 3.938405456135231e-06, "loss": 0.0137, "reward": -1.53125, "reward_std": 0.5436661839485168, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.96875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 599 }, { "completion_length": 624.0, "epoch": 24.0, "grad_norm": 0.48791933059692383, "kl": 0.9707455039024353, "learning_rate": 3.933941090877615e-06, "loss": 0.0388, "reward": 2.9003190994262695, "reward_std": 0.7066404819488525, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.4360334277153015, "rewards/wrapped_format_reward": 0.5, "step": 600 }, { "completion_length": 750.0, "epoch": 24.04, "grad_norm": 0.4375697672367096, "kl": 0.7251780033111572, "learning_rate": 3.929469900569031e-06, "loss": 0.029, "reward": 1.5899887084960938, "reward_std": 0.8056603670120239, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -1.1183446645736694, "rewards/wrapped_format_reward": 0.75, "step": 601 }, { "completion_length": 750.0, "epoch": 24.08, "grad_norm": 0.6219246983528137, "kl": 0.916982114315033, "learning_rate": 3.924991906490758e-06, "loss": 0.0367, "reward": 1.9977405071258545, "reward_std": 1.210959553718567, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.37725937366485596, "rewards/wrapped_format_reward": 0.375, "step": 602 }, { "completion_length": 617.0, "epoch": 24.12, "grad_norm": 0.5328007340431213, "kl": 0.8875964283943176, "learning_rate": 3.92050712995646e-06, "loss": 0.0355, "reward": 3.042351245880127, "reward_std": 0.12825651466846466, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04235142096877098, "rewards/wrapped_format_reward": 1.0, "step": 603 }, { "completion_length": 616.0, "epoch": 24.16, "grad_norm": 0.5093056559562683, "kl": 1.141879677772522, "learning_rate": 3.916015592312083e-06, "loss": 0.0457, "reward": 2.9928762912750244, "reward_std": 0.5530683994293213, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.11787624657154083, "rewards/wrapped_format_reward": 0.875, "step": 604 }, { "completion_length": 750.0, "epoch": 24.2, "grad_norm": 0.4226742386817932, "kl": 1.135231614112854, "learning_rate": 3.911517314935752e-06, "loss": 0.0454, "reward": 2.794490098953247, "reward_std": 0.5061540007591248, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.04449019581079483, "rewards/wrapped_format_reward": 0.75, "step": 605 }, { "completion_length": 617.0, "epoch": 24.24, "grad_norm": 0.5197967290878296, "kl": 1.5424551963806152, "learning_rate": 3.907012319237672e-06, "loss": 0.0617, "reward": 0.6565590500831604, "reward_std": 1.6452194452285767, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -1.7934409379959106, "rewards/wrapped_format_reward": 0.5, "step": 606 }, { "completion_length": 651.0, "epoch": 24.28, "grad_norm": 0.551070511341095, "kl": 1.370877981185913, "learning_rate": 3.902500626660025e-06, "loss": 0.0548, "reward": 3.000037670135498, "reward_std": 0.07534631341695786, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": 0.022764792665839195, "rewards/wrapped_format_reward": 1.0, "step": 607 }, { "completion_length": 750.0, "epoch": 24.32, "grad_norm": 1.6095973253250122, "kl": 2.3942067623138428, "learning_rate": 3.897982258676867e-06, "loss": 0.0958, "reward": 1.3801480531692505, "reward_std": 3.5869948863983154, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8698518872261047, "rewards/wrapped_format_reward": 0.75, "step": 608 }, { "completion_length": 750.0, "epoch": 24.36, "grad_norm": 0.35611942410469055, "kl": 1.5788023471832275, "learning_rate": 3.8934572367940285e-06, "loss": 0.0632, "reward": 3.4379539489746094, "reward_std": 0.21014034748077393, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": 0.5837870836257935, "rewards/wrapped_format_reward": 0.875, "step": 609 }, { "completion_length": 452.0, "epoch": 24.4, "grad_norm": 0.5418919920921326, "kl": 0.8879813551902771, "learning_rate": 3.888925582549006e-06, "loss": 0.0355, "reward": 2.4353699684143066, "reward_std": 0.7164378762245178, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5646300315856934, "rewards/wrapped_format_reward": 1.0, "step": 610 }, { "completion_length": 750.0, "epoch": 24.44, "grad_norm": 0.6341983079910278, "kl": 1.4851973056793213, "learning_rate": 3.8843873175108685e-06, "loss": 0.0594, "reward": 3.246654987335205, "reward_std": 0.551415205001831, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.930555522441864, "rewards/wrapped_driving_reward": 0.6910994052886963, "rewards/wrapped_format_reward": 0.625, "step": 611 }, { "completion_length": 750.0, "epoch": 24.48, "grad_norm": 0.6838855147361755, "kl": 1.131606936454773, "learning_rate": 3.879842463280146e-06, "loss": 0.0453, "reward": -1.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 612 }, { "completion_length": 484.0, "epoch": 24.52, "grad_norm": 0.5183650851249695, "kl": 1.2218328714370728, "learning_rate": 3.875291041488734e-06, "loss": 0.0489, "reward": 3.104029655456543, "reward_std": 0.5371575951576233, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.229029580950737, "rewards/wrapped_format_reward": 0.875, "step": 613 }, { "completion_length": 750.0, "epoch": 24.56, "grad_norm": 0.39312297105789185, "kl": 1.4679391384124756, "learning_rate": 3.870733073799785e-06, "loss": 0.0587, "reward": 1.5885634422302246, "reward_std": 3.0621588230133057, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9114365577697754, "rewards/wrapped_format_reward": 1.0, "step": 614 }, { "completion_length": 750.0, "epoch": 24.6, "grad_norm": 0.4172166883945465, "kl": 1.2957038879394531, "learning_rate": 3.866168581907609e-06, "loss": 0.0518, "reward": 2.2104885578155518, "reward_std": 0.834479033946991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.414511501789093, "rewards/wrapped_format_reward": 0.625, "step": 615 }, { "completion_length": 750.0, "epoch": 24.64, "grad_norm": 1.0627772808074951, "kl": 0.836889922618866, "learning_rate": 3.861597587537568e-06, "loss": 0.0335, "reward": -2.125, "reward_std": 1.314977765083313, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 616 }, { "completion_length": 686.0, "epoch": 24.68, "grad_norm": 0.41289910674095154, "kl": 1.7737808227539062, "learning_rate": 3.8570201124459745e-06, "loss": 0.071, "reward": 2.3616647720336914, "reward_std": 0.9351847171783447, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.6383353471755981, "rewards/wrapped_format_reward": 1.0, "step": 617 }, { "completion_length": 581.0, "epoch": 24.72, "grad_norm": 0.779534101486206, "kl": 1.1413559913635254, "learning_rate": 3.8524361784199855e-06, "loss": 0.0457, "reward": 3.272754192352295, "reward_std": 0.6635096669197083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": 0.5852542519569397, "rewards/wrapped_format_reward": 0.75, "step": 618 }, { "completion_length": 750.0, "epoch": 24.76, "grad_norm": 0.42929840087890625, "kl": 1.5810688734054565, "learning_rate": 3.847845807277501e-06, "loss": 0.0632, "reward": -1.5, "reward_std": 0.5773502588272095, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 619 }, { "completion_length": 557.0, "epoch": 24.8, "grad_norm": 0.42557451128959656, "kl": 0.7237415313720703, "learning_rate": 3.8432490208670605e-06, "loss": 0.0289, "reward": 3.082545280456543, "reward_std": 0.1998668611049652, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.0825452134013176, "rewards/wrapped_format_reward": 1.0, "step": 620 }, { "completion_length": 750.0, "epoch": 24.84, "grad_norm": 0.5002950429916382, "kl": 1.1155179738998413, "learning_rate": 3.838645841067735e-06, "loss": 0.0446, "reward": -1.149999976158142, "reward_std": 0.29999998211860657, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8500000238418579, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 621 }, { "completion_length": 615.0, "epoch": 24.88, "grad_norm": 0.44429194927215576, "kl": 1.1357996463775635, "learning_rate": 3.83403628978903e-06, "loss": 0.0454, "reward": 3.2363839149475098, "reward_std": 0.5219359993934631, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.36138415336608887, "rewards/wrapped_format_reward": 0.875, "step": 622 }, { "completion_length": 665.0, "epoch": 24.92, "grad_norm": 0.41479218006134033, "kl": 1.0186771154403687, "learning_rate": 3.829420388970772e-06, "loss": 0.0407, "reward": 2.8114256858825684, "reward_std": 0.4211501479148865, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.06142570078372955, "rewards/wrapped_format_reward": 0.75, "step": 623 }, { "completion_length": 620.0, "epoch": 24.96, "grad_norm": 0.5567065477371216, "kl": 0.8860252499580383, "learning_rate": 3.824798160583012e-06, "loss": 0.0354, "reward": -0.21129226684570312, "reward_std": 2.2909140586853027, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.987500011920929, "rewards/wrapped_driving_reward": -2.9487922191619873, "rewards/wrapped_format_reward": 0.75, "step": 624 }, { "completion_length": 750.0, "epoch": 25.0, "grad_norm": 0.6249330043792725, "kl": 1.2577399015426636, "learning_rate": 3.82016962662592e-06, "loss": 0.0503, "reward": -1.1666667461395264, "reward_std": 0.235702246427536, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 625 }, { "completion_length": 750.0, "epoch": 25.04, "grad_norm": 0.4327258765697479, "kl": 1.0329591035842896, "learning_rate": 3.815534809129674e-06, "loss": 0.0413, "reward": -0.3447999358177185, "reward_std": 3.6673803329467773, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0947999954223633, "rewards/wrapped_format_reward": 0.75, "step": 626 }, { "completion_length": 735.0, "epoch": 25.08, "grad_norm": 0.9782822728157043, "kl": 1.77318274974823, "learning_rate": 3.8108937301543613e-06, "loss": 0.0709, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 627 }, { "completion_length": 750.0, "epoch": 25.12, "grad_norm": 0.4559270143508911, "kl": 0.4221249222755432, "learning_rate": 3.806246411789872e-06, "loss": 0.0169, "reward": 1.5232298374176025, "reward_std": 3.688981056213379, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6017701625823975, "rewards/wrapped_format_reward": 0.625, "step": 628 }, { "completion_length": 523.0, "epoch": 25.16, "grad_norm": 2.418384552001953, "kl": 1.021903395652771, "learning_rate": 3.8015928761557937e-06, "loss": 0.0409, "reward": -1.5, "reward_std": 0.40824830532073975, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 629 }, { "completion_length": 481.0, "epoch": 25.2, "grad_norm": 1.3654940128326416, "kl": 0.7937284708023071, "learning_rate": 3.796933145401304e-06, "loss": 0.0317, "reward": 2.0573642253875732, "reward_std": 3.373218059539795, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.44263583421707153, "rewards/wrapped_format_reward": 1.0, "step": 630 }, { "completion_length": 750.0, "epoch": 25.24, "grad_norm": 6.166470527648926, "kl": 1.57492995262146, "learning_rate": 3.7922672417050687e-06, "loss": 0.063, "reward": 3.1056036949157715, "reward_std": 0.28644248843193054, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": 0.2556036412715912, "rewards/wrapped_format_reward": 0.875, "step": 631 }, { "completion_length": 444.0, "epoch": 25.28, "grad_norm": 0.11741337180137634, "kl": 0.7082597017288208, "learning_rate": 3.787595187275136e-06, "loss": 0.0283, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 632 }, { "completion_length": 671.0, "epoch": 25.32, "grad_norm": 0.5336841344833374, "kl": 1.2229433059692383, "learning_rate": 3.782917004348826e-06, "loss": 0.0489, "reward": 2.960865020751953, "reward_std": 0.3422490060329437, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08586501330137253, "rewards/wrapped_format_reward": 0.875, "step": 633 }, { "completion_length": 750.0, "epoch": 25.36, "grad_norm": 0.7414114475250244, "kl": 1.5562310218811035, "learning_rate": 3.77823271519263e-06, "loss": 0.0622, "reward": 1.009045124053955, "reward_std": 3.342817783355713, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.7409549355506897, "rewards/wrapped_format_reward": 0.375, "step": 634 }, { "completion_length": 750.0, "epoch": 25.4, "grad_norm": 0.40905439853668213, "kl": 1.2026870250701904, "learning_rate": 3.773542342102105e-06, "loss": 0.0481, "reward": 3.4216301441192627, "reward_std": 0.28488659858703613, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.4216301441192627, "rewards/wrapped_format_reward": 1.0, "step": 635 }, { "completion_length": 750.0, "epoch": 25.44, "grad_norm": 0.44360411167144775, "kl": 1.3418991565704346, "learning_rate": 3.768845907401761e-06, "loss": 0.0537, "reward": 1.978645920753479, "reward_std": 2.0448219776153564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8963539600372314, "rewards/wrapped_format_reward": 0.875, "step": 636 }, { "completion_length": 750.0, "epoch": 25.48, "grad_norm": 0.5750876069068909, "kl": 1.2612106800079346, "learning_rate": 3.764143433444962e-06, "loss": 0.0504, "reward": -1.75, "reward_std": 1.1902379989624023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 637 }, { "completion_length": 593.0, "epoch": 25.52, "grad_norm": 0.6619950532913208, "kl": 1.4557205438613892, "learning_rate": 3.759434942613816e-06, "loss": 0.0582, "reward": 2.910365104675293, "reward_std": 0.15222014486789703, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.03536504879593849, "rewards/wrapped_format_reward": 0.875, "step": 638 }, { "completion_length": 445.0, "epoch": 25.56, "grad_norm": 0.5441882610321045, "kl": 0.7263493537902832, "learning_rate": 3.75472045731907e-06, "loss": 0.0291, "reward": 1.806579828262329, "reward_std": 1.926430344581604, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.068420171737671, "rewards/wrapped_format_reward": 0.875, "step": 639 }, { "completion_length": 750.0, "epoch": 25.6, "grad_norm": 0.37299302220344543, "kl": 1.4050265550613403, "learning_rate": 3.7500000000000005e-06, "loss": 0.0562, "reward": 3.081347703933716, "reward_std": 0.26734286546707153, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.08134761452674866, "rewards/wrapped_format_reward": 1.0, "step": 640 }, { "completion_length": 658.0, "epoch": 25.64, "grad_norm": 0.4773707687854767, "kl": 1.1113680601119995, "learning_rate": 3.7452735931243108e-06, "loss": 0.0445, "reward": 3.2923827171325684, "reward_std": 0.3292248547077179, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2923825681209564, "rewards/wrapped_format_reward": 1.0, "step": 641 }, { "completion_length": 750.0, "epoch": 25.68, "grad_norm": 0.47026556730270386, "kl": 1.7575290203094482, "learning_rate": 3.7405412591880213e-06, "loss": 0.0703, "reward": -1.774999976158142, "reward_std": 1.4840823411941528, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 642 }, { "completion_length": 750.0, "epoch": 25.72, "grad_norm": 0.4005660116672516, "kl": 1.021146297454834, "learning_rate": 3.735803020715362e-06, "loss": 0.0408, "reward": 0.7224711775779724, "reward_std": 1.9892507791519165, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.152529001235962, "rewards/wrapped_format_reward": 0.875, "step": 643 }, { "completion_length": 492.0, "epoch": 25.76, "grad_norm": 0.9656462669372559, "kl": 0.7833800315856934, "learning_rate": 3.7310589002586683e-06, "loss": 0.0313, "reward": 1.5308680534362793, "reward_std": 3.0411288738250732, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9691319465637207, "rewards/wrapped_format_reward": 1.0, "step": 644 }, { "completion_length": 750.0, "epoch": 25.8, "grad_norm": 0.36192595958709717, "kl": 1.2042280435562134, "learning_rate": 3.7263089203982698e-06, "loss": 0.0482, "reward": 0.9078962802886963, "reward_std": 2.2103357315063477, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.8421037197113037, "rewards/wrapped_format_reward": 0.75, "step": 645 }, { "completion_length": 750.0, "epoch": 25.84, "grad_norm": 0.38677889108657837, "kl": 1.4992326498031616, "learning_rate": 3.721553103742388e-06, "loss": 0.06, "reward": 0.9938499927520752, "reward_std": 3.3306725025177, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1311500072479248, "rewards/wrapped_format_reward": 0.625, "step": 646 }, { "completion_length": 750.0, "epoch": 25.88, "grad_norm": 0.6154844164848328, "kl": 1.3798651695251465, "learning_rate": 3.7167914729270205e-06, "loss": 0.0552, "reward": 2.4758121967315674, "reward_std": 0.3415810167789459, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -0.2533544898033142, "rewards/wrapped_format_reward": 0.75, "step": 647 }, { "completion_length": 750.0, "epoch": 25.92, "grad_norm": 0.3917384743690491, "kl": 0.8261408805847168, "learning_rate": 3.7120240506158433e-06, "loss": 0.033, "reward": -0.25806379318237305, "reward_std": 1.679756999015808, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": -3.1866352558135986, "rewards/wrapped_format_reward": 1.0, "step": 648 }, { "completion_length": 562.0, "epoch": 25.96, "grad_norm": 0.5384634137153625, "kl": 1.1905111074447632, "learning_rate": 3.7072508595000935e-06, "loss": 0.0476, "reward": 2.2796902656555176, "reward_std": 0.8608195781707764, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8678571581840515, "rewards/wrapped_driving_reward": -0.46316689252853394, "rewards/wrapped_format_reward": 0.875, "step": 649 }, { "completion_length": 750.0, "epoch": 26.0, "grad_norm": 0.36869606375694275, "kl": 1.1743124723434448, "learning_rate": 3.7024719222984696e-06, "loss": 0.047, "reward": 2.691631317138672, "reward_std": 0.5718726515769958, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.84375, "rewards/wrapped_driving_reward": 0.09788113832473755, "rewards/wrapped_format_reward": 0.75, "step": 650 }, { "completion_length": 582.0, "epoch": 26.04, "grad_norm": 0.8777485489845276, "kl": 1.086169719696045, "learning_rate": 3.6976872617570163e-06, "loss": 0.0434, "reward": 3.3562309741973877, "reward_std": 0.4827421307563782, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6062309741973877, "rewards/wrapped_format_reward": 0.75, "step": 651 }, { "completion_length": 695.0, "epoch": 26.08, "grad_norm": 0.8116427063941956, "kl": 1.1721394062042236, "learning_rate": 3.6928969006490212e-06, "loss": 0.0469, "reward": 3.1609246730804443, "reward_std": 0.4948073923587799, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2859245240688324, "rewards/wrapped_format_reward": 0.875, "step": 652 }, { "completion_length": 750.0, "epoch": 26.12, "grad_norm": 0.6845850348472595, "kl": 0.6479750275611877, "learning_rate": 3.6881008617749042e-06, "loss": 0.0259, "reward": 0.7977969646453857, "reward_std": 3.241457223892212, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.9105363488197327, "rewards/wrapped_format_reward": 0.25, "step": 653 }, { "completion_length": 750.0, "epoch": 26.16, "grad_norm": 0.402665376663208, "kl": 1.176146388053894, "learning_rate": 3.6832991679621087e-06, "loss": 0.047, "reward": -1.3214285373687744, "reward_std": 0.38905078172683716, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9285714626312256, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 654 }, { "completion_length": 615.0, "epoch": 26.2, "grad_norm": 0.4569839835166931, "kl": 1.0099406242370605, "learning_rate": 3.6784918420649952e-06, "loss": 0.0404, "reward": 2.8477678298950195, "reward_std": 0.21331287920475006, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.02723217010498047, "rewards/wrapped_format_reward": 0.875, "step": 655 }, { "completion_length": 448.0, "epoch": 26.24, "grad_norm": 0.6247033476829529, "kl": 0.5885288715362549, "learning_rate": 3.6736789069647273e-06, "loss": 0.0235, "reward": 2.829052448272705, "reward_std": 0.3819313943386078, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8157051205635071, "rewards/wrapped_driving_reward": 0.2633473873138428, "rewards/wrapped_format_reward": 0.75, "step": 656 }, { "completion_length": 526.0, "epoch": 26.28, "grad_norm": 0.5222458839416504, "kl": 1.7896028757095337, "learning_rate": 3.6688603855691713e-06, "loss": 0.0716, "reward": 2.5255279541015625, "reward_std": 0.7786571979522705, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7613636255264282, "rewards/wrapped_driving_reward": 0.014164302498102188, "rewards/wrapped_format_reward": 0.75, "step": 657 }, { "completion_length": 745.0, "epoch": 26.32, "grad_norm": 0.6088178753852844, "kl": 1.4155036211013794, "learning_rate": 3.664036300812779e-06, "loss": 0.0566, "reward": 1.3521000146865845, "reward_std": 3.5682103633880615, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8978999853134155, "rewards/wrapped_format_reward": 0.75, "step": 658 }, { "completion_length": 686.0, "epoch": 26.36, "grad_norm": 1.1373714208602905, "kl": 0.8373534083366394, "learning_rate": 3.6592066756564825e-06, "loss": 0.0335, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 659 }, { "completion_length": 613.0, "epoch": 26.4, "grad_norm": 0.40486952662467957, "kl": 0.6469566226005554, "learning_rate": 3.654371533087586e-06, "loss": 0.0259, "reward": -0.7124611735343933, "reward_std": 2.128596067428589, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -3.212461233139038, "rewards/wrapped_format_reward": 1.0, "step": 660 }, { "completion_length": 750.0, "epoch": 26.44, "grad_norm": 0.4986326992511749, "kl": 1.450907588005066, "learning_rate": 3.64953089611965e-06, "loss": 0.058, "reward": 2.939197301864624, "reward_std": 0.35693034529685974, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.884615421295166, "rewards/wrapped_driving_reward": 0.17958186566829681, "rewards/wrapped_format_reward": 0.875, "step": 661 }, { "completion_length": 533.0, "epoch": 26.48, "grad_norm": 0.6424423456192017, "kl": 1.0434520244598389, "learning_rate": 3.6446847877923917e-06, "loss": 0.0417, "reward": 2.148531913757324, "reward_std": 0.4976847171783447, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8514681458473206, "rewards/wrapped_format_reward": 1.0, "step": 662 }, { "completion_length": 495.0, "epoch": 26.52, "grad_norm": 0.4873075783252716, "kl": 0.7196347713470459, "learning_rate": 3.639833231171569e-06, "loss": 0.0288, "reward": 2.8510382175445557, "reward_std": 0.5779574513435364, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.023961812257766724, "rewards/wrapped_format_reward": 0.875, "step": 663 }, { "completion_length": 750.0, "epoch": 26.56, "grad_norm": 0.46740320324897766, "kl": 1.3179888725280762, "learning_rate": 3.634976249348867e-06, "loss": 0.0527, "reward": 0.5123102068901062, "reward_std": 3.0649352073669434, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -1.6960229873657227, "rewards/wrapped_format_reward": 0.75, "step": 664 }, { "completion_length": 750.0, "epoch": 26.6, "grad_norm": 0.40568238496780396, "kl": 1.0916098356246948, "learning_rate": 3.6301138654418e-06, "loss": 0.0437, "reward": 2.6497511863708496, "reward_std": 0.5616340041160583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.27475130558013916, "rewards/wrapped_format_reward": 0.375, "step": 665 }, { "completion_length": 750.0, "epoch": 26.64, "grad_norm": 0.43641456961631775, "kl": 1.1734923124313354, "learning_rate": 3.625246102593588e-06, "loss": 0.0469, "reward": 0.9107986688613892, "reward_std": 3.306107759475708, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0892013311386108, "rewards/wrapped_format_reward": 0.5, "step": 666 }, { "completion_length": 531.0, "epoch": 26.68, "grad_norm": 0.0914665013551712, "kl": 0.9140002727508545, "learning_rate": 3.6203729839730567e-06, "loss": 0.0366, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 667 }, { "completion_length": 750.0, "epoch": 26.72, "grad_norm": 0.5565376877784729, "kl": 2.005063772201538, "learning_rate": 3.6154945327745223e-06, "loss": 0.0802, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 668 }, { "completion_length": 716.0, "epoch": 26.76, "grad_norm": 0.5163022875785828, "kl": 1.0322647094726562, "learning_rate": 3.610610772217682e-06, "loss": 0.0413, "reward": 1.4180800914764404, "reward_std": 1.708691954612732, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.5819199085235596, "rewards/wrapped_format_reward": 1.0, "step": 669 }, { "completion_length": 750.0, "epoch": 26.8, "grad_norm": 0.5534092783927917, "kl": 0.7664154171943665, "learning_rate": 3.6057217255475034e-06, "loss": 0.0307, "reward": 3.130662441253662, "reward_std": 0.6534955501556396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.925000011920929, "rewards/wrapped_driving_reward": 0.5806624889373779, "rewards/wrapped_format_reward": 0.625, "step": 670 }, { "completion_length": 750.0, "epoch": 26.84, "grad_norm": 0.5211420655250549, "kl": 1.697385311126709, "learning_rate": 3.600827416034115e-06, "loss": 0.0679, "reward": 1.2811267375946045, "reward_std": 3.542926549911499, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.737500011920929, "rewards/wrapped_driving_reward": -0.8313732743263245, "rewards/wrapped_format_reward": 0.625, "step": 671 }, { "completion_length": 750.0, "epoch": 26.88, "grad_norm": 0.4372680187225342, "kl": 1.1519041061401367, "learning_rate": 3.595927866972694e-06, "loss": 0.0461, "reward": 2.822664260864258, "reward_std": 0.1902877241373062, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.19766449928283691, "rewards/wrapped_format_reward": 0.625, "step": 672 }, { "completion_length": 750.0, "epoch": 26.92, "grad_norm": 0.4989381730556488, "kl": 1.5017751455307007, "learning_rate": 3.591023101683355e-06, "loss": 0.0601, "reward": -1.0277777910232544, "reward_std": 0.05555558204650879, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 673 }, { "completion_length": 750.0, "epoch": 26.96, "grad_norm": 0.58172607421875, "kl": 1.2829159498214722, "learning_rate": 3.586113143511043e-06, "loss": 0.0513, "reward": 2.83713960647583, "reward_std": 0.4975849390029907, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.03786037862300873, "rewards/wrapped_format_reward": 0.875, "step": 674 }, { "completion_length": 750.0, "epoch": 27.0, "grad_norm": 0.49302029609680176, "kl": 1.1698321104049683, "learning_rate": 3.5811980158254156e-06, "loss": 0.0468, "reward": 1.1822489500045776, "reward_std": 3.4572482109069824, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0677510499954224, "rewards/wrapped_format_reward": 0.75, "step": 675 }, { "completion_length": 750.0, "epoch": 27.04, "grad_norm": 0.4445314407348633, "kl": 1.0804122686386108, "learning_rate": 3.5762777420207382e-06, "loss": 0.0432, "reward": -1.1875, "reward_std": 0.23935678601264954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 676 }, { "completion_length": 489.0, "epoch": 27.08, "grad_norm": 0.4960654675960541, "kl": 1.2753853797912598, "learning_rate": 3.5713523455157686e-06, "loss": 0.051, "reward": 0.35062074661254883, "reward_std": 1.9192057847976685, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -2.524379253387451, "rewards/wrapped_format_reward": 0.875, "step": 677 }, { "completion_length": 750.0, "epoch": 27.12, "grad_norm": 0.3840659260749817, "kl": 0.7700368165969849, "learning_rate": 3.566421849753646e-06, "loss": 0.0308, "reward": 1.9260783195495605, "reward_std": 1.246964931488037, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.5739217400550842, "rewards/wrapped_format_reward": 0.5, "step": 678 }, { "completion_length": 637.0, "epoch": 27.16, "grad_norm": 0.3894844651222229, "kl": 0.9075636863708496, "learning_rate": 3.5614862782017833e-06, "loss": 0.0363, "reward": 2.386613368988037, "reward_std": 1.434007167816162, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -0.5906594395637512, "rewards/wrapped_format_reward": 1.0, "step": 679 }, { "completion_length": 750.0, "epoch": 27.2, "grad_norm": 2.2594847679138184, "kl": 1.5272691249847412, "learning_rate": 3.556545654351749e-06, "loss": 0.0611, "reward": 3.106175422668457, "reward_std": 0.4387306869029999, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9821428656578064, "rewards/wrapped_driving_reward": 0.12403266131877899, "rewards/wrapped_format_reward": 1.0, "step": 680 }, { "completion_length": 750.0, "epoch": 27.24, "grad_norm": 0.37824928760528564, "kl": 1.3627076148986816, "learning_rate": 3.551600001719161e-06, "loss": 0.0545, "reward": -1.2204545736312866, "reward_std": 0.20708855986595154, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9045454263687134, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 681 }, { "completion_length": 750.0, "epoch": 27.28, "grad_norm": 0.3919558525085449, "kl": 1.2976757287979126, "learning_rate": 3.5466493438435707e-06, "loss": 0.0519, "reward": -1.375, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 682 }, { "completion_length": 750.0, "epoch": 27.32, "grad_norm": 0.49715036153793335, "kl": 0.9081407189369202, "learning_rate": 3.541693704288355e-06, "loss": 0.0363, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 683 }, { "completion_length": 750.0, "epoch": 27.36, "grad_norm": 0.540151834487915, "kl": 1.1362556219100952, "learning_rate": 3.536733106640598e-06, "loss": 0.0455, "reward": 1.1239418983459473, "reward_std": 3.4454872608184814, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0010579824447632, "rewards/wrapped_format_reward": 0.625, "step": 684 }, { "completion_length": 564.0, "epoch": 27.4, "grad_norm": 0.41077589988708496, "kl": 1.017673134803772, "learning_rate": 3.531767574510987e-06, "loss": 0.0407, "reward": 2.436887264251709, "reward_std": 0.47174349427223206, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.43811261653900146, "rewards/wrapped_format_reward": 0.875, "step": 685 }, { "completion_length": 687.0, "epoch": 27.44, "grad_norm": 0.3432943522930145, "kl": 1.0499712228775024, "learning_rate": 3.5267971315336936e-06, "loss": 0.042, "reward": 2.8152427673339844, "reward_std": 0.34102752804756165, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9722222089767456, "rewards/wrapped_driving_reward": 0.21802052855491638, "rewards/wrapped_format_reward": 0.625, "step": 686 }, { "completion_length": 644.0, "epoch": 27.48, "grad_norm": 0.41452446579933167, "kl": 0.9771077632904053, "learning_rate": 3.5218218013662626e-06, "loss": 0.0391, "reward": 2.786472797393799, "reward_std": 0.441410094499588, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.036472804844379425, "rewards/wrapped_format_reward": 0.75, "step": 687 }, { "completion_length": 750.0, "epoch": 27.52, "grad_norm": 0.3727835714817047, "kl": 1.0077673196792603, "learning_rate": 3.516841607689501e-06, "loss": 0.0403, "reward": 2.619081974029541, "reward_std": 0.729070246219635, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.25591808557510376, "rewards/wrapped_format_reward": 0.875, "step": 688 }, { "completion_length": 750.0, "epoch": 27.56, "grad_norm": 0.38604769110679626, "kl": 0.7550218105316162, "learning_rate": 3.511856574207364e-06, "loss": 0.0302, "reward": -0.4012797772884369, "reward_std": 3.317986488342285, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.276279926300049, "rewards/wrapped_format_reward": 0.875, "step": 689 }, { "completion_length": 750.0, "epoch": 27.6, "grad_norm": 0.7709413170814514, "kl": 1.3966618776321411, "learning_rate": 3.5068667246468437e-06, "loss": 0.0559, "reward": 3.257936716079712, "reward_std": 0.17135444283485413, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2579367756843567, "rewards/wrapped_format_reward": 1.0, "step": 690 }, { "completion_length": 750.0, "epoch": 27.64, "grad_norm": 0.4005296230316162, "kl": 0.7381643056869507, "learning_rate": 3.5018720827578523e-06, "loss": 0.0295, "reward": 1.0149767398834229, "reward_std": 1.716369867324829, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8794642686843872, "rewards/wrapped_driving_reward": -1.3644875288009644, "rewards/wrapped_format_reward": 0.5, "step": 691 }, { "completion_length": 538.0, "epoch": 27.68, "grad_norm": 0.6042305827140808, "kl": 0.7873980402946472, "learning_rate": 3.496872672313116e-06, "loss": 0.0315, "reward": 3.5595736503601074, "reward_std": 0.09154798090457916, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9303977489471436, "rewards/wrapped_driving_reward": 0.6291758418083191, "rewards/wrapped_format_reward": 1.0, "step": 692 }, { "completion_length": 660.0, "epoch": 27.72, "grad_norm": 1.7775932550430298, "kl": 0.5605735778808594, "learning_rate": 3.491868517108053e-06, "loss": 0.0224, "reward": 3.6727917194366455, "reward_std": 0.36999544501304626, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9642857313156128, "rewards/wrapped_driving_reward": 0.7085059285163879, "rewards/wrapped_format_reward": 1.0, "step": 693 }, { "completion_length": 750.0, "epoch": 27.76, "grad_norm": 0.5017414093017578, "kl": 0.5329836010932922, "learning_rate": 3.486859640960668e-06, "loss": 0.0213, "reward": 2.7044076919555664, "reward_std": 0.31959784030914307, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.17059244215488434, "rewards/wrapped_format_reward": 0.875, "step": 694 }, { "completion_length": 750.0, "epoch": 27.8, "grad_norm": 0.5945215225219727, "kl": 2.0084989070892334, "learning_rate": 3.481846067711436e-06, "loss": 0.0803, "reward": 2.8925669193267822, "reward_std": 0.6678995490074158, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.39256682991981506, "rewards/wrapped_format_reward": 0.5, "step": 695 }, { "completion_length": 566.0, "epoch": 27.84, "grad_norm": 0.4806859791278839, "kl": 1.042332649230957, "learning_rate": 3.476827821223184e-06, "loss": 0.0417, "reward": 3.4408185482025146, "reward_std": 0.12357556074857712, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.44081851840019226, "rewards/wrapped_format_reward": 1.0, "step": 696 }, { "completion_length": 737.0, "epoch": 27.88, "grad_norm": 0.638775646686554, "kl": 1.0922187566757202, "learning_rate": 3.4718049253809894e-06, "loss": 0.0437, "reward": 2.549114465713501, "reward_std": 0.2564372420310974, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.07588561624288559, "rewards/wrapped_format_reward": 0.625, "step": 697 }, { "completion_length": 750.0, "epoch": 27.92, "grad_norm": 0.4154502749443054, "kl": 0.6086506843566895, "learning_rate": 3.466777404092052e-06, "loss": 0.0243, "reward": -1.25, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 698 }, { "completion_length": 750.0, "epoch": 27.96, "grad_norm": 0.41160765290260315, "kl": 1.0010474920272827, "learning_rate": 3.4617452812855908e-06, "loss": 0.04, "reward": 1.1086905002593994, "reward_std": 3.1339128017425537, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.0163094997406006, "rewards/wrapped_format_reward": 0.625, "step": 699 }, { "completion_length": 750.0, "epoch": 28.0, "grad_norm": 0.5900842547416687, "kl": 0.7559553384780884, "learning_rate": 3.4567085809127247e-06, "loss": 0.0302, "reward": 2.649343729019165, "reward_std": 0.5213807225227356, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.3506563901901245, "rewards/wrapped_format_reward": 1.0, "step": 700 }, { "completion_length": 696.0, "epoch": 28.04, "grad_norm": 0.360720157623291, "kl": 0.8732097744941711, "learning_rate": 3.4516673269463617e-06, "loss": 0.0349, "reward": 2.775423049926758, "reward_std": 0.3682582676410675, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.06708974391222, "rewards/wrapped_format_reward": 0.75, "step": 701 }, { "completion_length": 750.0, "epoch": 28.08, "grad_norm": 0.47541409730911255, "kl": 1.2193118333816528, "learning_rate": 3.4466215433810827e-06, "loss": 0.0488, "reward": -0.1097484827041626, "reward_std": 2.7219316959381104, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -2.359748363494873, "rewards/wrapped_format_reward": 0.75, "step": 702 }, { "completion_length": 426.0, "epoch": 28.12, "grad_norm": 0.49380800127983093, "kl": 0.5956392884254456, "learning_rate": 3.441571254233027e-06, "loss": 0.0238, "reward": 2.9899373054504395, "reward_std": 0.14259079098701477, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.010062739253044128, "rewards/wrapped_format_reward": 1.0, "step": 703 }, { "completion_length": 750.0, "epoch": 28.16, "grad_norm": 0.4521055817604065, "kl": 1.294080376625061, "learning_rate": 3.436516483539781e-06, "loss": 0.0518, "reward": -1.2708332538604736, "reward_std": 0.4876958429813385, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9791666865348816, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 704 }, { "completion_length": 750.0, "epoch": 28.2, "grad_norm": 0.4913451373577118, "kl": 1.2871887683868408, "learning_rate": 3.4314572553602577e-06, "loss": 0.0515, "reward": -1.524999976158142, "reward_std": 0.04999999329447746, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 705 }, { "completion_length": 492.0, "epoch": 28.24, "grad_norm": 0.4417870044708252, "kl": 0.6419669985771179, "learning_rate": 3.426393593774591e-06, "loss": 0.0257, "reward": -1.0499999523162842, "reward_std": 0.10000002384185791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 706 }, { "completion_length": 750.0, "epoch": 28.28, "grad_norm": 0.43232548236846924, "kl": 1.069222331047058, "learning_rate": 3.421325522884013e-06, "loss": 0.0428, "reward": 2.791191816329956, "reward_std": 0.3710920810699463, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8999999761581421, "rewards/wrapped_driving_reward": 0.14119189977645874, "rewards/wrapped_format_reward": 0.75, "step": 707 }, { "completion_length": 750.0, "epoch": 28.32, "grad_norm": 0.3766399919986725, "kl": 0.7563959360122681, "learning_rate": 3.4162530668107435e-06, "loss": 0.0303, "reward": 3.1373729705810547, "reward_std": 0.3874880373477936, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.3873729705810547, "rewards/wrapped_format_reward": 0.75, "step": 708 }, { "completion_length": 750.0, "epoch": 28.36, "grad_norm": 0.4037916660308838, "kl": 1.1408348083496094, "learning_rate": 3.4111762496978753e-06, "loss": 0.0456, "reward": 2.582378625869751, "reward_std": 0.47437480092048645, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.20737852156162262, "rewards/wrapped_format_reward": 0.375, "step": 709 }, { "completion_length": 571.0, "epoch": 28.4, "grad_norm": 0.43852096796035767, "kl": 1.0089608430862427, "learning_rate": 3.406095095709254e-06, "loss": 0.0404, "reward": 1.6179953813552856, "reward_std": 3.1362671852111816, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.6320046186447144, "rewards/wrapped_format_reward": 0.75, "step": 710 }, { "completion_length": 750.0, "epoch": 28.44, "grad_norm": 0.5104399919509888, "kl": 1.659734845161438, "learning_rate": 3.401009629029375e-06, "loss": 0.0664, "reward": 3.0378050804138184, "reward_std": 0.20842501521110535, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.03780514374375343, "rewards/wrapped_format_reward": 1.0, "step": 711 }, { "completion_length": 750.0, "epoch": 28.48, "grad_norm": 0.4440138041973114, "kl": 1.08497154712677, "learning_rate": 3.39591987386325e-06, "loss": 0.0434, "reward": 1.182783603668213, "reward_std": 3.174339771270752, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9422163367271423, "rewards/wrapped_format_reward": 0.625, "step": 712 }, { "completion_length": 750.0, "epoch": 28.52, "grad_norm": 0.5364874601364136, "kl": 1.3724675178527832, "learning_rate": 3.3908258544363145e-06, "loss": 0.0549, "reward": 1.609416127204895, "reward_std": 3.407630443572998, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7655838131904602, "rewards/wrapped_format_reward": 0.875, "step": 713 }, { "completion_length": 656.0, "epoch": 28.56, "grad_norm": 0.4007849395275116, "kl": 1.350628137588501, "learning_rate": 3.3857275949942896e-06, "loss": 0.054, "reward": 0.9743061065673828, "reward_std": 1.7018401622772217, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.7756938934326172, "rewards/wrapped_format_reward": 0.75, "step": 714 }, { "completion_length": 750.0, "epoch": 28.6, "grad_norm": 0.5446045398712158, "kl": 1.2371965646743774, "learning_rate": 3.3806251198030843e-06, "loss": 0.0495, "reward": 2.513890027999878, "reward_std": 0.7349424958229065, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666269302368, "rewards/wrapped_driving_reward": 0.09722331911325455, "rewards/wrapped_format_reward": 0.5, "step": 715 }, { "completion_length": 750.0, "epoch": 28.64, "grad_norm": 0.5443968772888184, "kl": 1.1408532857894897, "learning_rate": 3.375518453148669e-06, "loss": 0.0456, "reward": 3.2692654132843018, "reward_std": 0.6697713732719421, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.5192654132843018, "rewards/wrapped_format_reward": 0.75, "step": 716 }, { "completion_length": 750.0, "epoch": 28.68, "grad_norm": 0.3911254405975342, "kl": 1.4690412282943726, "learning_rate": 3.370407619336966e-06, "loss": 0.0588, "reward": -1.2867647409439087, "reward_std": 0.33652594685554504, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8382353186607361, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 717 }, { "completion_length": 750.0, "epoch": 28.72, "grad_norm": 0.403899610042572, "kl": 0.8253454566001892, "learning_rate": 3.3652926426937327e-06, "loss": 0.033, "reward": 1.4513518810272217, "reward_std": 2.3746840953826904, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -1.1736482381820679, "rewards/wrapped_format_reward": 0.625, "step": 718 }, { "completion_length": 750.0, "epoch": 28.76, "grad_norm": 0.9827480316162109, "kl": 1.8536406755447388, "learning_rate": 3.360173547564442e-06, "loss": 0.0741, "reward": 3.377265214920044, "reward_std": 0.47734910249710083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6272653341293335, "rewards/wrapped_format_reward": 0.75, "step": 719 }, { "completion_length": 750.0, "epoch": 28.8, "grad_norm": 6.140714645385742, "kl": 1.9834489822387695, "learning_rate": 3.3550503583141726e-06, "loss": 0.0793, "reward": -2.075000047683716, "reward_std": 1.2867920398712158, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.5, "step": 720 }, { "completion_length": 750.0, "epoch": 28.84, "grad_norm": 0.5307136178016663, "kl": 0.8859113454818726, "learning_rate": 3.3499230993274857e-06, "loss": 0.0354, "reward": 2.316016435623169, "reward_std": 0.2834460735321045, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4339835047721863, "rewards/wrapped_format_reward": 0.75, "step": 721 }, { "completion_length": 613.0, "epoch": 28.88, "grad_norm": 0.3882230222225189, "kl": 1.422294020652771, "learning_rate": 3.344791795008318e-06, "loss": 0.0569, "reward": 1.4253931045532227, "reward_std": 3.3321378231048584, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": -0.8496068120002747, "rewards/wrapped_format_reward": 0.875, "step": 722 }, { "completion_length": 371.0, "epoch": 28.92, "grad_norm": 0.5566720366477966, "kl": 0.9256033897399902, "learning_rate": 3.339656469779856e-06, "loss": 0.037, "reward": 3.1350290775299072, "reward_std": 0.2685109078884125, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.1350291520357132, "rewards/wrapped_format_reward": 1.0, "step": 723 }, { "completion_length": 580.0, "epoch": 28.96, "grad_norm": 0.42029738426208496, "kl": 0.6244791746139526, "learning_rate": 3.3345171480844275e-06, "loss": 0.025, "reward": 3.0256998538970947, "reward_std": 0.4103839099407196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.025699838995933533, "rewards/wrapped_format_reward": 1.0, "step": 724 }, { "completion_length": 596.0, "epoch": 29.0, "grad_norm": 0.5300978422164917, "kl": 1.0864909887313843, "learning_rate": 3.3293738543833807e-06, "loss": 0.0435, "reward": 1.6714462041854858, "reward_std": 3.4748284816741943, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.7035538554191589, "rewards/wrapped_format_reward": 0.875, "step": 725 }, { "completion_length": 750.0, "epoch": 29.04, "grad_norm": 0.5925276875495911, "kl": 1.2335766553878784, "learning_rate": 3.3242266131569685e-06, "loss": 0.0493, "reward": 1.1432609558105469, "reward_std": 3.1195735931396484, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2317389249801636, "rewards/wrapped_format_reward": 0.875, "step": 726 }, { "completion_length": 623.0, "epoch": 29.08, "grad_norm": 0.4075430929660797, "kl": 0.5598421692848206, "learning_rate": 3.3190754489042343e-06, "loss": 0.0224, "reward": 2.513282537460327, "reward_std": 0.3340396583080292, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.4867174029350281, "rewards/wrapped_format_reward": 1.0, "step": 727 }, { "completion_length": 750.0, "epoch": 29.12, "grad_norm": 111.52384948730469, "kl": 5.0521063804626465, "learning_rate": 3.313920386142892e-06, "loss": 0.2021, "reward": 2.7720518112182617, "reward_std": 0.9143410325050354, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.39705172181129456, "rewards/wrapped_format_reward": 0.375, "step": 728 }, { "completion_length": 597.0, "epoch": 29.16, "grad_norm": 0.4811553955078125, "kl": 1.3416039943695068, "learning_rate": 3.308761449409213e-06, "loss": 0.0537, "reward": 3.2421016693115234, "reward_std": 0.36916667222976685, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.367101788520813, "rewards/wrapped_format_reward": 0.875, "step": 729 }, { "completion_length": 750.0, "epoch": 29.2, "grad_norm": 0.41855716705322266, "kl": 1.2373117208480835, "learning_rate": 3.303598663257904e-06, "loss": 0.0495, "reward": 2.902125835418701, "reward_std": 0.21973761916160583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.027125656604766846, "rewards/wrapped_format_reward": 0.875, "step": 730 }, { "completion_length": 496.0, "epoch": 29.24, "grad_norm": 0.5248737931251526, "kl": 1.0021467208862305, "learning_rate": 3.298432052261998e-06, "loss": 0.0401, "reward": 3.6586220264434814, "reward_std": 0.3303280174732208, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.6586220264434814, "rewards/wrapped_format_reward": 1.0, "step": 731 }, { "completion_length": 750.0, "epoch": 29.28, "grad_norm": 0.4102158546447754, "kl": 1.7108817100524902, "learning_rate": 3.293261641012731e-06, "loss": 0.0684, "reward": 2.710984230041504, "reward_std": 0.6047499775886536, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.28901582956314087, "rewards/wrapped_format_reward": 1.0, "step": 732 }, { "completion_length": 750.0, "epoch": 29.32, "grad_norm": 0.4515760540962219, "kl": 1.3170658349990845, "learning_rate": 3.288087454119425e-06, "loss": 0.0527, "reward": -0.36018359661102295, "reward_std": 3.6547343730926514, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.9851834774017334, "rewards/wrapped_format_reward": 0.625, "step": 733 }, { "completion_length": 563.0, "epoch": 29.36, "grad_norm": 0.0380023717880249, "kl": 0.8448745012283325, "learning_rate": 3.282909516209374e-06, "loss": 0.0338, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 734 }, { "completion_length": 750.0, "epoch": 29.4, "grad_norm": 0.8251069188117981, "kl": 1.3977208137512207, "learning_rate": 3.277727851927727e-06, "loss": 0.0559, "reward": -1.3977272510528564, "reward_std": 0.4886803925037384, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9772727489471436, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 735 }, { "completion_length": 639.0, "epoch": 29.44, "grad_norm": 0.5299381613731384, "kl": 1.5736980438232422, "learning_rate": 3.272542485937369e-06, "loss": 0.0629, "reward": 2.132500648498535, "reward_std": 2.141465902328491, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.8674995303153992, "rewards/wrapped_format_reward": 1.0, "step": 736 }, { "completion_length": 750.0, "epoch": 29.48, "grad_norm": 0.49962857365608215, "kl": 0.6524366140365601, "learning_rate": 3.2673534429188005e-06, "loss": 0.0261, "reward": 3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 1.0, "rewards/wrapped_format_reward": 0.875, "step": 737 }, { "completion_length": 573.0, "epoch": 29.52, "grad_norm": 0.3988845646381378, "kl": 1.1060757637023926, "learning_rate": 3.2621607475700272e-06, "loss": 0.0442, "reward": 3.0906763076782227, "reward_std": 0.19022376835346222, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.09067624807357788, "rewards/wrapped_format_reward": 1.0, "step": 738 }, { "completion_length": 750.0, "epoch": 29.56, "grad_norm": 0.59360671043396, "kl": 1.5008355379104614, "learning_rate": 3.256964424606437e-06, "loss": 0.06, "reward": -1.25, "reward_std": 0.28867512941360474, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.75, "step": 739 }, { "completion_length": 750.0, "epoch": 29.6, "grad_norm": 0.42812031507492065, "kl": 0.4313167333602905, "learning_rate": 3.2517644987606827e-06, "loss": 0.0173, "reward": 2.9479095935821533, "reward_std": 0.5999106168746948, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.5729095935821533, "rewards/wrapped_format_reward": 0.5, "step": 740 }, { "completion_length": 633.0, "epoch": 29.64, "grad_norm": 0.4350597858428955, "kl": 1.3522546291351318, "learning_rate": 3.2465609947825692e-06, "loss": 0.0541, "reward": 2.7952303886413574, "reward_std": 0.5983152389526367, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": -0.1631031036376953, "rewards/wrapped_format_reward": 1.0, "step": 741 }, { "completion_length": 512.0, "epoch": 29.68, "grad_norm": 0.5423455238342285, "kl": 0.5965325236320496, "learning_rate": 3.2413539374389275e-06, "loss": 0.0239, "reward": 2.7277941703796387, "reward_std": 0.38129425048828125, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.022205986082553864, "rewards/wrapped_format_reward": 0.75, "step": 742 }, { "completion_length": 750.0, "epoch": 29.72, "grad_norm": 0.40301433205604553, "kl": 1.3201624155044556, "learning_rate": 3.2361433515135053e-06, "loss": 0.0528, "reward": 1.1487705707550049, "reward_std": 3.1022140979766846, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.2262296676635742, "rewards/wrapped_format_reward": 0.875, "step": 743 }, { "completion_length": 750.0, "epoch": 29.76, "grad_norm": 0.5065982341766357, "kl": 1.7616549730300903, "learning_rate": 3.230929261806842e-06, "loss": 0.0705, "reward": 2.8082611560821533, "reward_std": 0.5932582020759583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.058261215686798096, "rewards/wrapped_format_reward": 0.75, "step": 744 }, { "completion_length": 553.0, "epoch": 29.8, "grad_norm": 0.06308308243751526, "kl": 1.113646388053894, "learning_rate": 3.225711693136156e-06, "loss": 0.0445, "reward": -1.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 1.0, "step": 745 }, { "completion_length": 546.0, "epoch": 29.84, "grad_norm": 0.4473098814487457, "kl": 0.48006105422973633, "learning_rate": 3.2204906703352236e-06, "loss": 0.0192, "reward": -1.125, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.875, "step": 746 }, { "completion_length": 750.0, "epoch": 29.88, "grad_norm": 0.4587174654006958, "kl": 1.3874768018722534, "learning_rate": 3.215266218254261e-06, "loss": 0.0555, "reward": 2.485217809677124, "reward_std": 1.0316221714019775, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": -0.514782190322876, "rewards/wrapped_format_reward": 1.0, "step": 747 }, { "completion_length": 460.0, "epoch": 29.92, "grad_norm": 0.47659605741500854, "kl": 0.39824411273002625, "learning_rate": 3.2100383617598075e-06, "loss": 0.0159, "reward": 2.951653480529785, "reward_std": 0.28944987058639526, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9895833134651184, "rewards/wrapped_driving_reward": -0.03792976588010788, "rewards/wrapped_format_reward": 1.0, "step": 748 }, { "completion_length": 714.0, "epoch": 29.96, "grad_norm": 0.47844377160072327, "kl": 1.2070376873016357, "learning_rate": 3.2048071257346043e-06, "loss": 0.0483, "reward": 0.6153709888458252, "reward_std": 1.9917418956756592, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9750000238418579, "rewards/wrapped_driving_reward": -2.1096291542053223, "rewards/wrapped_format_reward": 0.75, "step": 749 }, { "completion_length": 750.0, "epoch": 30.0, "grad_norm": 0.741182804107666, "kl": 1.9630638360977173, "learning_rate": 3.199572535077481e-06, "loss": 0.0785, "reward": 3.7558329105377197, "reward_std": 0.19139595329761505, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9608585834503174, "rewards/wrapped_driving_reward": 0.7949742674827576, "rewards/wrapped_format_reward": 1.0, "step": 750 } ], "logging_steps": 1, "max_steps": 1600, "num_input_tokens_seen": 0, "num_train_epochs": 64, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }