{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 250.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 500.0, "epoch": 0.2, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0833333333333335e-08, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 1 }, { "completion_length": 500.0, "epoch": 0.4, "grad_norm": 0.4331946074962616, "kl": 0.0, "learning_rate": 4.166666666666667e-08, "loss": -0.0, "reward": -2.195732355117798, "reward_std": 2.577455759048462, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -3.258232355117798, "rewards/wrapped_format_reward": 0.125, "step": 2 }, { "completion_length": 500.0, "epoch": 0.6, "grad_norm": 0.43272849917411804, "kl": 0.0009923786856234074, "learning_rate": 6.250000000000001e-08, "loss": 0.0, "reward": -1.0750184059143066, "reward_std": 3.4081573486328125, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2232142835855484, "rewards/wrapped_driving_reward": -1.9232327938079834, "rewards/wrapped_format_reward": 0.125, "step": 3 }, { "completion_length": 500.0, "epoch": 0.8, "grad_norm": 10.146150588989258, "kl": 0.0021735988557338715, "learning_rate": 8.333333333333334e-08, "loss": 0.0001, "reward": 1.4993090629577637, "reward_std": 3.6849262714385986, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": -0.3444410264492035, "rewards/wrapped_format_reward": 0.5, "step": 4 }, { "completion_length": 500.0, "epoch": 1.0, "grad_norm": 3.436992883682251, "kl": 0.001074329949915409, "learning_rate": 1.0416666666666667e-07, "loss": 0.0, "reward": -1.1928160190582275, "reward_std": 3.2491295337677, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3055555522441864, "rewards/wrapped_driving_reward": -1.9983716011047363, "rewards/wrapped_format_reward": 0.0, "step": 5 }, { "completion_length": 500.0, "epoch": 1.2, "grad_norm": 0.0012362411944195628, "kl": 0.0008123984443955123, "learning_rate": 1.2500000000000002e-07, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 6 }, { "completion_length": 500.0, "epoch": 1.4, "grad_norm": 3.972846031188965, "kl": 0.0018862163415178657, "learning_rate": 1.4583333333333335e-07, "loss": 0.0001, "reward": -1.2093137502670288, "reward_std": 3.227802038192749, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3901515007019043, "rewards/wrapped_driving_reward": -2.0994651317596436, "rewards/wrapped_format_reward": 0.0, "step": 7 }, { "completion_length": 500.0, "epoch": 1.6, "grad_norm": 1.9137581586837769, "kl": 0.0010808327933773398, "learning_rate": 1.6666666666666668e-07, "loss": 0.0, "reward": -1.0708822011947632, "reward_std": 3.423583745956421, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666567325592, "rewards/wrapped_driving_reward": -2.112548828125, "rewards/wrapped_format_reward": 0.25, "step": 8 }, { "completion_length": 500.0, "epoch": 1.8, "grad_norm": 8.79112720489502, "kl": 0.0014885602286085486, "learning_rate": 1.875e-07, "loss": 0.0001, "reward": -2.4617133140563965, "reward_std": 3.076573371887207, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.8367133140563965, "rewards/wrapped_format_reward": 0.0, "step": 9 }, { "completion_length": 500.0, "epoch": 2.0, "grad_norm": 0.0010773384710773826, "kl": 0.0006336356163956225, "learning_rate": 2.0833333333333333e-07, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 10 }, { "completion_length": 500.0, "epoch": 2.2, "grad_norm": 2.8661322593688965, "kl": 0.001619816990569234, "learning_rate": 2.2916666666666666e-07, "loss": 0.0001, "reward": 0.19541072845458984, "reward_std": 2.79884672164917, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4732142686843872, "rewards/wrapped_driving_reward": -1.0278035402297974, "rewards/wrapped_format_reward": 0.0, "step": 11 }, { "completion_length": 500.0, "epoch": 2.4, "grad_norm": 3.63022518157959, "kl": 0.0011069196043536067, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": -0.7504351139068604, "reward_std": 3.4696340560913086, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1770833432674408, "rewards/wrapped_driving_reward": -1.9275184869766235, "rewards/wrapped_format_reward": 0.5, "step": 12 }, { "completion_length": 500.0, "epoch": 2.6, "grad_norm": 1.3208059072494507, "kl": 0.0007781968452036381, "learning_rate": 2.7083333333333337e-07, "loss": 0.0, "reward": -0.5810263752937317, "reward_std": 3.9478907585144043, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5810264348983765, "rewards/wrapped_format_reward": 0.125, "step": 13 }, { "completion_length": 500.0, "epoch": 2.8, "grad_norm": 4.958929061889648, "kl": 0.0007627051090821624, "learning_rate": 2.916666666666667e-07, "loss": 0.0, "reward": 0.09956195950508118, "reward_std": 2.970421552658081, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4571428596973419, "rewards/wrapped_driving_reward": -1.3575809001922607, "rewards/wrapped_format_reward": 0.25, "step": 14 }, { "completion_length": 500.0, "epoch": 3.0, "grad_norm": 9.031028747558594, "kl": 0.0017744852229952812, "learning_rate": 3.125e-07, "loss": 0.0001, "reward": -0.7453500032424927, "reward_std": 3.2129933834075928, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9953501224517822, "rewards/wrapped_format_reward": 0.375, "step": 15 }, { "completion_length": 500.0, "epoch": 3.2, "grad_norm": 4.5779242515563965, "kl": 0.0011605366598814726, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": -0.6313304901123047, "reward_std": 3.0313284397125244, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.0063304901123047, "rewards/wrapped_format_reward": 0.5, "step": 16 }, { "completion_length": 500.0, "epoch": 3.4, "grad_norm": 4.1963629722595215, "kl": 0.0015463099116459489, "learning_rate": 3.541666666666667e-07, "loss": 0.0001, "reward": -1.0474066734313965, "reward_std": 3.410123825073242, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.089073419570923, "rewards/wrapped_format_reward": 0.125, "step": 17 }, { "completion_length": 500.0, "epoch": 3.6, "grad_norm": 2.828728675842285, "kl": 0.0015656519681215286, "learning_rate": 3.75e-07, "loss": 0.0001, "reward": -0.7301186323165894, "reward_std": 3.2286598682403564, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.980118751525879, "rewards/wrapped_format_reward": 0.25, "step": 18 }, { "completion_length": 500.0, "epoch": 3.8, "grad_norm": 2.103822946548462, "kl": 0.0015016624238342047, "learning_rate": 3.9583333333333334e-07, "loss": 0.0001, "reward": 0.5382012724876404, "reward_std": 2.7029671669006348, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.9201321005821228, "rewards/wrapped_format_reward": 0.125, "step": 19 }, { "completion_length": 500.0, "epoch": 4.0, "grad_norm": 1.4944398403167725, "kl": 0.0011990186758339405, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "reward": -2.170128107070923, "reward_std": 3.0023434162139893, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.795128107070923, "rewards/wrapped_format_reward": 0.25, "step": 20 }, { "completion_length": 500.0, "epoch": 4.2, "grad_norm": 0.00705456268042326, "kl": 0.0014940756373107433, "learning_rate": 4.375e-07, "loss": 0.0001, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 21 }, { "completion_length": 500.0, "epoch": 4.4, "grad_norm": 0.3451469838619232, "kl": 0.0007462741923518479, "learning_rate": 4.583333333333333e-07, "loss": 0.0, "reward": -1.0851657390594482, "reward_std": 2.5890932083129883, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6679292917251587, "rewards/wrapped_driving_reward": -2.6280951499938965, "rewards/wrapped_format_reward": 0.125, "step": 22 }, { "completion_length": 500.0, "epoch": 4.6, "grad_norm": 50.23689651489258, "kl": 0.0034247653093189, "learning_rate": 4.791666666666667e-07, "loss": 0.0001, "reward": -0.5867406129837036, "reward_std": 3.6583194732666016, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3901515007019043, "rewards/wrapped_driving_reward": -1.601892113685608, "rewards/wrapped_format_reward": 0.125, "step": 23 }, { "completion_length": 500.0, "epoch": 4.8, "grad_norm": 0.6233353614807129, "kl": 0.0010215980000793934, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": -1.15825355052948, "reward_std": 3.295187473297119, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2857142686843872, "rewards/wrapped_driving_reward": -2.068967819213867, "rewards/wrapped_format_reward": 0.125, "step": 24 }, { "completion_length": 500.0, "epoch": 5.0, "grad_norm": 0.3912212550640106, "kl": 0.000855346501339227, "learning_rate": 5.208333333333334e-07, "loss": 0.0, "reward": 0.6220214366912842, "reward_std": 2.7480878829956055, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.734375, "rewards/wrapped_driving_reward": -0.9873536229133606, "rewards/wrapped_format_reward": 0.125, "step": 25 }, { "completion_length": 500.0, "epoch": 5.2, "grad_norm": 0.9482459425926208, "kl": 0.0009874895913526416, "learning_rate": 5.416666666666667e-07, "loss": 0.0, "reward": -2.19559645652771, "reward_std": 2.9796664714813232, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0535714291036129, "rewards/wrapped_driving_reward": -2.9991679191589355, "rewards/wrapped_format_reward": 0.5, "step": 26 }, { "completion_length": 500.0, "epoch": 5.4, "grad_norm": 4.4480109214782715, "kl": 0.0021480382420122623, "learning_rate": 5.625e-07, "loss": 0.0001, "reward": 0.7437606453895569, "reward_std": 2.506401300430298, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6515151262283325, "rewards/wrapped_driving_reward": -0.9077544212341309, "rewards/wrapped_format_reward": 0.25, "step": 27 }, { "completion_length": 500.0, "epoch": 5.6, "grad_norm": 2.110179901123047, "kl": 0.0010772650130093098, "learning_rate": 5.833333333333334e-07, "loss": 0.0, "reward": -2.052107810974121, "reward_std": 3.263345241546631, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.802107810974121, "rewards/wrapped_format_reward": 0.25, "step": 28 }, { "completion_length": 500.0, "epoch": 5.8, "grad_norm": 0.4768023192882538, "kl": 0.0009159984765574336, "learning_rate": 6.041666666666667e-07, "loss": 0.0, "reward": 0.34745848178863525, "reward_std": 2.903594493865967, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5719696879386902, "rewards/wrapped_driving_reward": -0.9745111465454102, "rewards/wrapped_format_reward": 0.0, "step": 29 }, { "completion_length": 500.0, "epoch": 6.0, "grad_norm": 0.3444426357746124, "kl": 0.000862763321492821, "learning_rate": 6.25e-07, "loss": 0.0, "reward": -0.7680141925811768, "reward_std": 3.4674270153045654, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -1.9555140733718872, "rewards/wrapped_format_reward": 0.375, "step": 30 }, { "completion_length": 500.0, "epoch": 6.2, "grad_norm": 1.7202939987182617, "kl": 0.0010583762777969241, "learning_rate": 6.458333333333334e-07, "loss": 0.0, "reward": -1.404773235321045, "reward_std": 3.095156192779541, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.467273235321045, "rewards/wrapped_format_reward": 0.125, "step": 31 }, { "completion_length": 500.0, "epoch": 6.4, "grad_norm": 0.7919694781303406, "kl": 0.0008548864279873669, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": 0.7130440473556519, "reward_std": 3.1504263877868652, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6428571343421936, "rewards/wrapped_driving_reward": -0.804813027381897, "rewards/wrapped_format_reward": 0.125, "step": 32 }, { "completion_length": 500.0, "epoch": 6.6, "grad_norm": 3.1069979667663574, "kl": 0.0012187480460852385, "learning_rate": 6.875000000000001e-07, "loss": 0.0, "reward": -2.399667739868164, "reward_std": 2.877002477645874, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.06818182021379471, "rewards/wrapped_driving_reward": -2.9678494930267334, "rewards/wrapped_format_reward": 0.25, "step": 33 }, { "completion_length": 500.0, "epoch": 6.8, "grad_norm": 0.730955183506012, "kl": 0.0008993800729513168, "learning_rate": 7.083333333333334e-07, "loss": 0.0, "reward": -2.6763646602630615, "reward_std": 2.647270679473877, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -3.108182907104492, "rewards/wrapped_format_reward": 0.0, "step": 34 }, { "completion_length": 500.0, "epoch": 7.0, "grad_norm": 5.383649826049805, "kl": 0.006175986025482416, "learning_rate": 7.291666666666667e-07, "loss": 0.0002, "reward": 3.2691361904144287, "reward_std": 0.3647014796733856, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9017857313156128, "rewards/wrapped_driving_reward": 0.8673505187034607, "rewards/wrapped_format_reward": 0.5, "step": 35 }, { "completion_length": 500.0, "epoch": 7.2, "grad_norm": 0.41836127638816833, "kl": 0.0009351923363283277, "learning_rate": 7.5e-07, "loss": 0.0, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 36 }, { "completion_length": 500.0, "epoch": 7.4, "grad_norm": 1.9916703701019287, "kl": 0.0017392473528161645, "learning_rate": 7.708333333333334e-07, "loss": 0.0001, "reward": -2.597564935684204, "reward_std": 2.804870128631592, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.097564935684204, "rewards/wrapped_format_reward": 0.0, "step": 37 }, { "completion_length": 500.0, "epoch": 7.6, "grad_norm": 0.433552622795105, "kl": 0.0009353554341942072, "learning_rate": 7.916666666666667e-07, "loss": 0.0, "reward": -2.5462069511413574, "reward_std": 2.5850212574005127, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.0462067127227783, "rewards/wrapped_format_reward": 0.125, "step": 38 }, { "completion_length": 500.0, "epoch": 7.8, "grad_norm": 1.2196906805038452, "kl": 0.0008541917777620256, "learning_rate": 8.125000000000001e-07, "loss": 0.0, "reward": -1.038360834121704, "reward_std": 3.1846678256988525, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2083333283662796, "rewards/wrapped_driving_reward": -1.9966940879821777, "rewards/wrapped_format_reward": 0.25, "step": 39 }, { "completion_length": 500.0, "epoch": 8.0, "grad_norm": 1.281370997428894, "kl": 0.0012979316525161266, "learning_rate": 8.333333333333333e-07, "loss": 0.0001, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 40 }, { "completion_length": 500.0, "epoch": 8.2, "grad_norm": 0.6021063923835754, "kl": 0.0010175962233915925, "learning_rate": 8.541666666666667e-07, "loss": 0.0, "reward": 0.5651559829711914, "reward_std": 3.080660820007324, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.9348440170288086, "rewards/wrapped_format_reward": 0.25, "step": 41 }, { "completion_length": 500.0, "epoch": 8.4, "grad_norm": 0.3452380895614624, "kl": 0.0009200552594847977, "learning_rate": 8.75e-07, "loss": 0.0, "reward": -2.3739185333251953, "reward_std": 3.2521629333496094, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -2.7905852794647217, "rewards/wrapped_format_reward": 0.0, "step": 42 }, { "completion_length": 500.0, "epoch": 8.6, "grad_norm": 2.9192402362823486, "kl": 0.0010521383956074715, "learning_rate": 8.958333333333334e-07, "loss": 0.0, "reward": -2.741067886352539, "reward_std": 2.197209596633911, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.015625, "rewards/wrapped_driving_reward": -3.131692886352539, "rewards/wrapped_format_reward": 0.125, "step": 43 }, { "completion_length": 500.0, "epoch": 8.8, "grad_norm": 1.652388572692871, "kl": 0.001037920475937426, "learning_rate": 9.166666666666666e-07, "loss": 0.0, "reward": -0.9824157357215881, "reward_std": 3.4965455532073975, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.914233922958374, "rewards/wrapped_format_reward": 0.0, "step": 44 }, { "completion_length": 500.0, "epoch": 9.0, "grad_norm": 1.7152076959609985, "kl": 0.0010146588319912553, "learning_rate": 9.375000000000001e-07, "loss": 0.0, "reward": 1.4902138710021973, "reward_std": 0.6985129117965698, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3847861886024475, "rewards/wrapped_format_reward": 0.125, "step": 45 }, { "completion_length": 500.0, "epoch": 9.2, "grad_norm": 0.43010541796684265, "kl": 0.0009054208057932556, "learning_rate": 9.583333333333334e-07, "loss": 0.0, "reward": 0.844261884689331, "reward_std": 2.5686442852020264, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3854166567325592, "rewards/wrapped_driving_reward": -1.0411547422409058, "rewards/wrapped_format_reward": 0.75, "step": 46 }, { "completion_length": 500.0, "epoch": 9.4, "grad_norm": 1.6765927076339722, "kl": 0.0007561460370197892, "learning_rate": 9.791666666666667e-07, "loss": 0.0, "reward": -0.8482348918914795, "reward_std": 3.6395058631896973, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.9107348918914795, "rewards/wrapped_format_reward": 0.125, "step": 47 }, { "completion_length": 500.0, "epoch": 9.6, "grad_norm": 0.39042791724205017, "kl": 0.0008848680299706757, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": -1.178413987159729, "reward_std": 3.0193841457366943, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2770833373069763, "rewards/wrapped_driving_reward": -2.2054972648620605, "rewards/wrapped_format_reward": 0.25, "step": 48 }, { "completion_length": 500.0, "epoch": 9.8, "grad_norm": 0.8740907907485962, "kl": 0.0009530234383419156, "learning_rate": 1.0208333333333334e-06, "loss": 0.0, "reward": -1.9666993618011475, "reward_std": 2.7736213207244873, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9666993618011475, "rewards/wrapped_format_reward": 0.5, "step": 49 }, { "completion_length": 500.0, "epoch": 10.0, "grad_norm": 0.3344237208366394, "kl": 0.0007590156164951622, "learning_rate": 1.0416666666666667e-06, "loss": 0.0, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 50 }, { "completion_length": 500.0, "epoch": 10.2, "grad_norm": 0.32544267177581787, "kl": 0.0008944781147874892, "learning_rate": 1.0625e-06, "loss": 0.0, "reward": 0.4576635956764221, "reward_std": 2.982259511947632, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5952380895614624, "rewards/wrapped_driving_reward": -1.012574553489685, "rewards/wrapped_format_reward": 0.125, "step": 51 }, { "completion_length": 500.0, "epoch": 10.4, "grad_norm": 5.682590007781982, "kl": 0.0026439097709953785, "learning_rate": 1.0833333333333335e-06, "loss": 0.0001, "reward": -1.0434162616729736, "reward_std": 3.424217462539673, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9184162616729736, "rewards/wrapped_format_reward": 0.0, "step": 52 }, { "completion_length": 500.0, "epoch": 10.6, "grad_norm": 0.5044906735420227, "kl": 0.0010058499174192548, "learning_rate": 1.1041666666666668e-06, "loss": 0.0, "reward": 1.684746265411377, "reward_std": 0.37403419613838196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": -0.12775368988513947, "rewards/wrapped_format_reward": 0.0, "step": 53 }, { "completion_length": 500.0, "epoch": 10.8, "grad_norm": 3.0662996768951416, "kl": 0.0013428920647129416, "learning_rate": 1.125e-06, "loss": 0.0001, "reward": -0.6375584602355957, "reward_std": 3.885220527648926, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.6943767070770264, "rewards/wrapped_format_reward": 0.125, "step": 54 }, { "completion_length": 500.0, "epoch": 11.0, "grad_norm": 8.792601585388184, "kl": 0.002546559553593397, "learning_rate": 1.1458333333333333e-06, "loss": 0.0001, "reward": -1.0800423622131348, "reward_std": 3.3800711631774902, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.9300422668457031, "rewards/wrapped_format_reward": 0.0, "step": 55 }, { "completion_length": 500.0, "epoch": 11.2, "grad_norm": 0.4684543013572693, "kl": 0.0008864883566275239, "learning_rate": 1.1666666666666668e-06, "loss": 0.0, "reward": 0.3871076703071594, "reward_std": 3.0099942684173584, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.2128922939300537, "rewards/wrapped_format_reward": 0.25, "step": 56 }, { "completion_length": 500.0, "epoch": 11.4, "grad_norm": 3.0129141807556152, "kl": 0.001087621902115643, "learning_rate": 1.1875e-06, "loss": 0.0, "reward": -0.7230278253555298, "reward_std": 3.5053629875183105, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.898027777671814, "rewards/wrapped_format_reward": 0.25, "step": 57 }, { "completion_length": 500.0, "epoch": 11.6, "grad_norm": 4.786574363708496, "kl": 0.001360047492198646, "learning_rate": 1.2083333333333333e-06, "loss": 0.0001, "reward": -1.245678186416626, "reward_std": 3.180633544921875, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.995678186416626, "rewards/wrapped_format_reward": 0.0, "step": 58 }, { "completion_length": 500.0, "epoch": 11.8, "grad_norm": 17.00609588623047, "kl": 0.0034802549052983522, "learning_rate": 1.2291666666666666e-06, "loss": 0.0001, "reward": 1.0649046897888184, "reward_std": 3.3774664402008057, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.37259531021118164, "rewards/wrapped_format_reward": 0.125, "step": 59 }, { "completion_length": 500.0, "epoch": 12.0, "grad_norm": 3.239806652069092, "kl": 0.003311566775664687, "learning_rate": 1.25e-06, "loss": 0.0001, "reward": 0.46374011039733887, "reward_std": 3.0523617267608643, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": -1.1195932626724243, "rewards/wrapped_format_reward": 0.25, "step": 60 }, { "completion_length": 500.0, "epoch": 12.2, "grad_norm": 0.33646300435066223, "kl": 0.000738381699193269, "learning_rate": 1.2708333333333334e-06, "loss": 0.0, "reward": -1.9281842708587646, "reward_std": 2.4151525497436523, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -3.0031843185424805, "rewards/wrapped_format_reward": 0.375, "step": 61 }, { "completion_length": 500.0, "epoch": 12.4, "grad_norm": 3.0721795558929443, "kl": 0.00422726571559906, "learning_rate": 1.2916666666666669e-06, "loss": 0.0002, "reward": -2.1507418155670166, "reward_std": 2.7292230129241943, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.0257418155670166, "rewards/wrapped_format_reward": 0.375, "step": 62 }, { "completion_length": 500.0, "epoch": 12.6, "grad_norm": 0.3383236527442932, "kl": 0.0008720280602574348, "learning_rate": 1.3125000000000001e-06, "loss": 0.0, "reward": 0.4363464117050171, "reward_std": 2.9959378242492676, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5255681872367859, "rewards/wrapped_driving_reward": -0.964221715927124, "rewards/wrapped_format_reward": 0.125, "step": 63 }, { "completion_length": 500.0, "epoch": 12.8, "grad_norm": 1.1513729095458984, "kl": 0.0009778901003301144, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": -1.2415308952331543, "reward_std": 3.18522047996521, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.1790308952331543, "rewards/wrapped_format_reward": 0.0, "step": 64 }, { "completion_length": 500.0, "epoch": 13.0, "grad_norm": 0.3557533025741577, "kl": 0.0007779670413583517, "learning_rate": 1.3541666666666667e-06, "loss": 0.0, "reward": -2.398324966430664, "reward_std": 2.8796792030334473, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -3.0649914741516113, "rewards/wrapped_format_reward": 0.25, "step": 65 }, { "completion_length": 500.0, "epoch": 13.2, "grad_norm": 3.0979981422424316, "kl": 0.002764773555099964, "learning_rate": 1.3750000000000002e-06, "loss": 0.0001, "reward": 0.25424015522003174, "reward_std": 2.8697030544281006, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.870759904384613, "rewards/wrapped_format_reward": 0.0, "step": 66 }, { "completion_length": 500.0, "epoch": 13.4, "grad_norm": 0.0013586197746917605, "kl": 0.0009537562145851552, "learning_rate": 1.3958333333333335e-06, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 67 }, { "completion_length": 500.0, "epoch": 13.6, "grad_norm": 2.009368419647217, "kl": 0.002123823156580329, "learning_rate": 1.4166666666666667e-06, "loss": 0.0001, "reward": -3.625, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 68 }, { "completion_length": 500.0, "epoch": 13.8, "grad_norm": 4.284774303436279, "kl": 0.0029185714665800333, "learning_rate": 1.4375e-06, "loss": 0.0001, "reward": 0.39181816577911377, "reward_std": 3.028280019760132, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5583333373069763, "rewards/wrapped_driving_reward": -1.1665152311325073, "rewards/wrapped_format_reward": 0.25, "step": 69 }, { "completion_length": 500.0, "epoch": 14.0, "grad_norm": 8.606492042541504, "kl": 0.0037718252278864384, "learning_rate": 1.4583333333333335e-06, "loss": 0.0002, "reward": -1.2599546909332275, "reward_std": 3.1716644763946533, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2818181812763214, "rewards/wrapped_driving_reward": -2.0417728424072266, "rewards/wrapped_format_reward": 0.0, "step": 70 }, { "completion_length": 500.0, "epoch": 14.2, "grad_norm": 1.687968373298645, "kl": 0.0014080241089686751, "learning_rate": 1.4791666666666668e-06, "loss": 0.0001, "reward": 1.1474496126174927, "reward_std": 3.4946651458740234, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.390625, "rewards/wrapped_driving_reward": -0.3681753873825073, "rewards/wrapped_format_reward": 0.375, "step": 71 }, { "completion_length": 500.0, "epoch": 14.4, "grad_norm": 2.8704042434692383, "kl": 0.0016583104152232409, "learning_rate": 1.5e-06, "loss": 0.0001, "reward": 0.42686039209365845, "reward_std": 2.952040910720825, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3083333373069763, "rewards/wrapped_driving_reward": -0.8814729452133179, "rewards/wrapped_format_reward": 0.25, "step": 72 }, { "completion_length": 500.0, "epoch": 14.6, "grad_norm": 0.32328060269355774, "kl": 0.0007918566698208451, "learning_rate": 1.5208333333333333e-06, "loss": 0.0, "reward": -0.9926960468292236, "reward_std": 3.4759135246276855, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -2.0495142936706543, "rewards/wrapped_format_reward": 0.25, "step": 73 }, { "completion_length": 500.0, "epoch": 14.8, "grad_norm": 33.80134201049805, "kl": 0.0036835242062807083, "learning_rate": 1.5416666666666668e-06, "loss": 0.0001, "reward": -0.7441283464431763, "reward_std": 3.4832684993743896, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9941283464431763, "rewards/wrapped_format_reward": 0.5, "step": 74 }, { "completion_length": 500.0, "epoch": 15.0, "grad_norm": 2.6355581283569336, "kl": 0.003368059406057, "learning_rate": 1.5625e-06, "loss": 0.0001, "reward": -2.332406997680664, "reward_std": 3.335186243057251, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.957406997680664, "rewards/wrapped_format_reward": 0.125, "step": 75 }, { "completion_length": 500.0, "epoch": 15.2, "grad_norm": 0.3515971302986145, "kl": 0.0008471008623018861, "learning_rate": 1.5833333333333333e-06, "loss": 0.0, "reward": 1.0511770248413086, "reward_std": 3.3706252574920654, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.550000011920929, "rewards/wrapped_driving_reward": -0.3738229274749756, "rewards/wrapped_format_reward": 0.125, "step": 76 }, { "completion_length": 500.0, "epoch": 15.4, "grad_norm": 2.122354745864868, "kl": 0.002714228816330433, "learning_rate": 1.6041666666666668e-06, "loss": 0.0001, "reward": 1.80057692527771, "reward_std": 0.6607468128204346, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.24108988046646118, "rewards/wrapped_format_reward": 0.125, "step": 77 }, { "completion_length": 500.0, "epoch": 15.6, "grad_norm": 13.85558032989502, "kl": 0.006086647976189852, "learning_rate": 1.6250000000000001e-06, "loss": 0.0002, "reward": -0.6382275819778442, "reward_std": 3.3426644802093506, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -1.9819775819778442, "rewards/wrapped_format_reward": 0.375, "step": 78 }, { "completion_length": 500.0, "epoch": 15.8, "grad_norm": 2.019861936569214, "kl": 0.0009517069556750357, "learning_rate": 1.6458333333333334e-06, "loss": 0.0, "reward": -1.128483533859253, "reward_std": 3.035717010498047, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.1853017807006836, "rewards/wrapped_format_reward": 0.125, "step": 79 }, { "completion_length": 500.0, "epoch": 16.0, "grad_norm": 2.5614662170410156, "kl": 0.0013113848399370909, "learning_rate": 1.6666666666666667e-06, "loss": 0.0001, "reward": -2.397922992706299, "reward_std": 3.2041540145874023, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.17499999701976776, "rewards/wrapped_driving_reward": -2.947923183441162, "rewards/wrapped_format_reward": 0.125, "step": 80 }, { "completion_length": 500.0, "epoch": 16.2, "grad_norm": 0.3375092148780823, "kl": 0.0006573036080226302, "learning_rate": 1.6875000000000001e-06, "loss": 0.0, "reward": 1.6859958171844482, "reward_std": 0.27753275632858276, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": 0.0041775694116950035, "rewards/wrapped_format_reward": 0.0, "step": 81 }, { "completion_length": 500.0, "epoch": 16.4, "grad_norm": 0.3876532316207886, "kl": 0.000868563074618578, "learning_rate": 1.7083333333333334e-06, "loss": 0.0, "reward": -0.7118014097213745, "reward_std": 3.8014400005340576, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5868014097213745, "rewards/wrapped_format_reward": 0.0, "step": 82 }, { "completion_length": 500.0, "epoch": 16.6, "grad_norm": 0.4464600682258606, "kl": 0.0009050997905433178, "learning_rate": 1.7291666666666667e-06, "loss": 0.0, "reward": -2.545438289642334, "reward_std": 2.909123182296753, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.045438289642334, "rewards/wrapped_format_reward": 0.0, "step": 83 }, { "completion_length": 500.0, "epoch": 16.8, "grad_norm": 7.183985710144043, "kl": 0.006316404789686203, "learning_rate": 1.75e-06, "loss": 0.0003, "reward": -1.0390408039093018, "reward_std": 3.426302433013916, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -1.8307075500488281, "rewards/wrapped_format_reward": 0.0, "step": 84 }, { "completion_length": 500.0, "epoch": 17.0, "grad_norm": 4.626237869262695, "kl": 0.0024934238754212856, "learning_rate": 1.7708333333333337e-06, "loss": 0.0001, "reward": -0.7597904205322266, "reward_std": 3.164623260498047, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.8847904205322266, "rewards/wrapped_format_reward": 0.375, "step": 85 }, { "completion_length": 500.0, "epoch": 17.2, "grad_norm": 3.646864175796509, "kl": 0.006598799955099821, "learning_rate": 1.7916666666666667e-06, "loss": 0.0003, "reward": -0.8650339841842651, "reward_std": 3.405834436416626, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.1150341033935547, "rewards/wrapped_format_reward": 0.375, "step": 86 }, { "completion_length": 500.0, "epoch": 17.4, "grad_norm": 0.38579967617988586, "kl": 0.0009482253226451576, "learning_rate": 1.8125e-06, "loss": 0.0, "reward": -1.179398536682129, "reward_std": 2.9798529148101807, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3333333134651184, "rewards/wrapped_driving_reward": -2.1377317905426025, "rewards/wrapped_format_reward": 0.125, "step": 87 }, { "completion_length": 500.0, "epoch": 17.6, "grad_norm": 5.8937668800354, "kl": 0.0027326152194291353, "learning_rate": 1.8333333333333333e-06, "loss": 0.0001, "reward": 2.8879857063293457, "reward_std": 0.8419812321662903, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.898809552192688, "rewards/wrapped_driving_reward": 0.48917609453201294, "rewards/wrapped_format_reward": 0.5, "step": 88 }, { "completion_length": 500.0, "epoch": 17.8, "grad_norm": 0.4111790359020233, "kl": 0.0008414680487476289, "learning_rate": 1.854166666666667e-06, "loss": 0.0, "reward": -2.695969581604004, "reward_std": 2.608060836791992, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -3.0459694862365723, "rewards/wrapped_format_reward": 0.0, "step": 89 }, { "completion_length": 500.0, "epoch": 18.0, "grad_norm": 1.19579017162323, "kl": 0.0015203645452857018, "learning_rate": 1.8750000000000003e-06, "loss": 0.0001, "reward": -2.6511423587799072, "reward_std": 2.6977152824401855, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.9948923587799072, "rewards/wrapped_format_reward": 0.0, "step": 90 }, { "completion_length": 500.0, "epoch": 18.2, "grad_norm": 3.6108834743499756, "kl": 0.019146539270877838, "learning_rate": 1.8958333333333333e-06, "loss": 0.0008, "reward": -0.8020716905593872, "reward_std": 2.8371593952178955, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3031249940395355, "rewards/wrapped_driving_reward": -1.9801967144012451, "rewards/wrapped_format_reward": 0.375, "step": 91 }, { "completion_length": 500.0, "epoch": 18.4, "grad_norm": 1.2253782749176025, "kl": 0.0009205341921187937, "learning_rate": 1.916666666666667e-06, "loss": 0.0, "reward": -0.9882822036743164, "reward_std": 3.4778919219970703, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.28928571939468384, "rewards/wrapped_driving_reward": -1.9025678634643555, "rewards/wrapped_format_reward": 0.125, "step": 92 }, { "completion_length": 500.0, "epoch": 18.6, "grad_norm": 0.396138995885849, "kl": 0.0009751519537530839, "learning_rate": 1.9375e-06, "loss": 0.0, "reward": -1.189629077911377, "reward_std": 3.282799243927002, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1805555522441864, "rewards/wrapped_driving_reward": -1.9951846599578857, "rewards/wrapped_format_reward": 0.125, "step": 93 }, { "completion_length": 500.0, "epoch": 18.8, "grad_norm": 5.001523971557617, "kl": 0.0025945594534277916, "learning_rate": 1.9583333333333334e-06, "loss": 0.0001, "reward": -0.38306254148483276, "reward_std": 4.17663049697876, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.34375, "rewards/wrapped_driving_reward": -1.6018126010894775, "rewards/wrapped_format_reward": 0.375, "step": 94 }, { "completion_length": 500.0, "epoch": 19.0, "grad_norm": 2.349184274673462, "kl": 0.009762048721313477, "learning_rate": 1.9791666666666666e-06, "loss": 0.0004, "reward": -2.3886988162994385, "reward_std": 2.599043846130371, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -3.3386988639831543, "rewards/wrapped_format_reward": 0.5, "step": 95 }, { "completion_length": 500.0, "epoch": 19.2, "grad_norm": 2.266725778579712, "kl": 0.0014983770670369267, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "reward": -2.6680169105529785, "reward_std": 2.663965940475464, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -3.018017053604126, "rewards/wrapped_format_reward": 0.0, "step": 96 }, { "completion_length": 500.0, "epoch": 19.4, "grad_norm": 1.2776412963867188, "kl": 0.0011785050155594945, "learning_rate": 2.0208333333333336e-06, "loss": 0.0, "reward": -2.4784326553344727, "reward_std": 2.42277193069458, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -2.9784326553344727, "rewards/wrapped_format_reward": 0.25, "step": 97 }, { "completion_length": 500.0, "epoch": 19.6, "grad_norm": 5.367588996887207, "kl": 0.0031056797597557306, "learning_rate": 2.041666666666667e-06, "loss": 0.0001, "reward": -2.46954345703125, "reward_std": 3.0609130859375, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.203125, "rewards/wrapped_driving_reward": -3.04766845703125, "rewards/wrapped_format_reward": 0.125, "step": 98 }, { "completion_length": 500.0, "epoch": 19.8, "grad_norm": 0.42096036672592163, "kl": 0.0008003456750884652, "learning_rate": 2.0625e-06, "loss": 0.0, "reward": -2.5779201984405518, "reward_std": 2.521865129470825, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0833333358168602, "rewards/wrapped_driving_reward": -3.0362536907196045, "rewards/wrapped_format_reward": 0.125, "step": 99 }, { "completion_length": 500.0, "epoch": 20.0, "grad_norm": 2.5479648113250732, "kl": 0.0011869773734360933, "learning_rate": 2.0833333333333334e-06, "loss": 0.0, "reward": 0.8239460587501526, "reward_std": 3.235414743423462, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5308441519737244, "rewards/wrapped_driving_reward": -0.7068981528282166, "rewards/wrapped_format_reward": 0.25, "step": 100 }, { "completion_length": 500.0, "epoch": 20.2, "grad_norm": 0.386593222618103, "kl": 0.0008260260801762342, "learning_rate": 2.1041666666666667e-06, "loss": 0.0, "reward": -2.655496120452881, "reward_std": 2.689007520675659, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1428571492433548, "rewards/wrapped_driving_reward": -3.048353433609009, "rewards/wrapped_format_reward": 0.0, "step": 101 }, { "completion_length": 500.0, "epoch": 20.4, "grad_norm": 1.7718898057937622, "kl": 0.03829975798726082, "learning_rate": 2.125e-06, "loss": 0.0015, "reward": -2.821061134338379, "reward_std": 1.417920708656311, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.17307692766189575, "rewards/wrapped_driving_reward": -3.61913800239563, "rewards/wrapped_format_reward": 0.375, "step": 102 }, { "completion_length": 500.0, "epoch": 20.6, "grad_norm": 0.3890434205532074, "kl": 0.0009301622048951685, "learning_rate": 2.1458333333333333e-06, "loss": 0.0, "reward": -1.0485785007476807, "reward_std": 3.4387564659118652, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -2.223578453063965, "rewards/wrapped_format_reward": 0.25, "step": 103 }, { "completion_length": 500.0, "epoch": 20.8, "grad_norm": 1.8730741739273071, "kl": 0.003595958696678281, "learning_rate": 2.166666666666667e-06, "loss": 0.0001, "reward": -1.01571524143219, "reward_std": 3.184199333190918, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2708333134651184, "rewards/wrapped_driving_reward": -1.9115486145019531, "rewards/wrapped_format_reward": 0.125, "step": 104 }, { "completion_length": 500.0, "epoch": 21.0, "grad_norm": 9.170056343078613, "kl": 0.01782212406396866, "learning_rate": 2.1875000000000002e-06, "loss": 0.0007, "reward": -0.8671887516975403, "reward_std": 3.62764048576355, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.992188811302185, "rewards/wrapped_format_reward": 0.375, "step": 105 }, { "completion_length": 500.0, "epoch": 21.2, "grad_norm": 0.5751691460609436, "kl": 0.0012470635119825602, "learning_rate": 2.2083333333333335e-06, "loss": 0.0, "reward": 2.0378382205963135, "reward_std": 0.5405812859535217, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.781818151473999, "rewards/wrapped_driving_reward": 0.006020138971507549, "rewards/wrapped_format_reward": 0.25, "step": 106 }, { "completion_length": 500.0, "epoch": 21.4, "grad_norm": 0.3796798288822174, "kl": 0.0008278587483800948, "learning_rate": 2.2291666666666668e-06, "loss": 0.0, "reward": -0.9299300909042358, "reward_std": 3.0426578521728516, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -2.111748456954956, "rewards/wrapped_format_reward": 0.5, "step": 107 }, { "completion_length": 500.0, "epoch": 21.6, "grad_norm": 3.6200075149536133, "kl": 0.00310003524646163, "learning_rate": 2.25e-06, "loss": 0.0001, "reward": -0.7615724205970764, "reward_std": 3.7413580417633057, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.6365723609924316, "rewards/wrapped_format_reward": 0.0, "step": 108 }, { "completion_length": 500.0, "epoch": 21.8, "grad_norm": 5.526390075683594, "kl": 0.0022852052934467793, "learning_rate": 2.2708333333333333e-06, "loss": 0.0001, "reward": -2.3737564086914062, "reward_std": 2.928654193878174, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9987564086914062, "rewards/wrapped_format_reward": 0.25, "step": 109 }, { "completion_length": 500.0, "epoch": 22.0, "grad_norm": 3.324976921081543, "kl": 0.027398547157645226, "learning_rate": 2.2916666666666666e-06, "loss": 0.0011, "reward": -1.607568621635437, "reward_std": 2.372925281524658, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4427083432674408, "rewards/wrapped_driving_reward": -2.9252769947052, "rewards/wrapped_format_reward": 0.125, "step": 110 }, { "completion_length": 500.0, "epoch": 22.2, "grad_norm": 37.2985954284668, "kl": 0.02997448667883873, "learning_rate": 2.3125000000000003e-06, "loss": 0.0012, "reward": -3.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 111 }, { "completion_length": 500.0, "epoch": 22.4, "grad_norm": 1.5794304609298706, "kl": 0.003044416196644306, "learning_rate": 2.3333333333333336e-06, "loss": 0.0001, "reward": -2.8739120960235596, "reward_std": 2.252175807952881, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -3.290578842163086, "rewards/wrapped_format_reward": 0.0, "step": 112 }, { "completion_length": 500.0, "epoch": 22.6, "grad_norm": 4.685129642486572, "kl": 0.003984578885138035, "learning_rate": 2.354166666666667e-06, "loss": 0.0002, "reward": -0.772267758846283, "reward_std": 3.1760997772216797, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0222678184509277, "rewards/wrapped_format_reward": 0.25, "step": 113 }, { "completion_length": 500.0, "epoch": 22.8, "grad_norm": 12.547566413879395, "kl": 0.030495142564177513, "learning_rate": 2.375e-06, "loss": 0.0012, "reward": 1.7730681896209717, "reward_std": 0.5380395650863647, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.02306819148361683, "rewards/wrapped_format_reward": 0.0, "step": 114 }, { "completion_length": 500.0, "epoch": 23.0, "grad_norm": 2.9405810832977295, "kl": 0.001207878114655614, "learning_rate": 2.395833333333334e-06, "loss": 0.0, "reward": -1.1156203746795654, "reward_std": 3.3368895053863525, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9906203746795654, "rewards/wrapped_format_reward": 0.0, "step": 115 }, { "completion_length": 500.0, "epoch": 23.2, "grad_norm": 1.6627004146575928, "kl": 0.0010354293044656515, "learning_rate": 2.4166666666666667e-06, "loss": 0.0, "reward": -0.9815444350242615, "reward_std": 3.485572576522827, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -2.012794256210327, "rewards/wrapped_format_reward": 0.125, "step": 116 }, { "completion_length": 500.0, "epoch": 23.4, "grad_norm": 15.359031677246094, "kl": 0.01042311079800129, "learning_rate": 2.4375e-06, "loss": 0.0004, "reward": -2.5663912296295166, "reward_std": 2.867217540740967, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9413912296295166, "rewards/wrapped_format_reward": 0.0, "step": 117 }, { "completion_length": 500.0, "epoch": 23.6, "grad_norm": 1.0322840213775635, "kl": 0.0010455718729645014, "learning_rate": 2.4583333333333332e-06, "loss": 0.0, "reward": -0.7912392616271973, "reward_std": 3.706042528152466, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -1.973057508468628, "rewards/wrapped_format_reward": 0.375, "step": 118 }, { "completion_length": 500.0, "epoch": 23.8, "grad_norm": 2.897580862045288, "kl": 0.0041581131517887115, "learning_rate": 2.479166666666667e-06, "loss": 0.0002, "reward": -0.5823937058448792, "reward_std": 3.6703875064849854, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.5823936462402344, "rewards/wrapped_format_reward": 0.25, "step": 119 }, { "completion_length": 500.0, "epoch": 24.0, "grad_norm": 0.8678112030029297, "kl": 0.005057378206402063, "learning_rate": 2.5e-06, "loss": 0.0002, "reward": 1.8686788082122803, "reward_std": 0.3760830760002136, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": -0.06313925981521606, "rewards/wrapped_format_reward": 0.25, "step": 120 }, { "completion_length": 500.0, "epoch": 24.2, "grad_norm": 0.45261016488075256, "kl": 0.0010572966421023011, "learning_rate": 2.5208333333333335e-06, "loss": 0.0, "reward": -2.406386137008667, "reward_std": 3.187227725982666, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -2.968886137008667, "rewards/wrapped_format_reward": 0.125, "step": 121 }, { "completion_length": 500.0, "epoch": 24.4, "grad_norm": 1.2322733402252197, "kl": 0.001089164288714528, "learning_rate": 2.5416666666666668e-06, "loss": 0.0, "reward": -1.1155859231948853, "reward_std": 3.3368074893951416, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9905859231948853, "rewards/wrapped_format_reward": 0.0, "step": 122 }, { "completion_length": 500.0, "epoch": 24.6, "grad_norm": 0.3946419060230255, "kl": 0.0009771620389074087, "learning_rate": 2.5625e-06, "loss": 0.0, "reward": -2.3373544216156006, "reward_std": 2.7000937461853027, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.9311044216156006, "rewards/wrapped_format_reward": 0.25, "step": 123 }, { "completion_length": 500.0, "epoch": 24.8, "grad_norm": 0.416446328163147, "kl": 0.0009063539328053594, "learning_rate": 2.5833333333333337e-06, "loss": 0.0, "reward": -2.6977710723876953, "reward_std": 2.6044580936431885, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0625, "rewards/wrapped_driving_reward": -3.0102710723876953, "rewards/wrapped_format_reward": 0.0, "step": 124 }, { "completion_length": 500.0, "epoch": 25.0, "grad_norm": 0.37381142377853394, "kl": 0.0009684949764050543, "learning_rate": 2.604166666666667e-06, "loss": 0.0, "reward": -0.5735440254211426, "reward_std": 3.6742913722991943, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3020833134651184, "rewards/wrapped_driving_reward": -1.6256272792816162, "rewards/wrapped_format_reward": 0.25, "step": 125 }, { "completion_length": 500.0, "epoch": 25.2, "grad_norm": 0.3716869354248047, "kl": 0.0008447925210930407, "learning_rate": 2.6250000000000003e-06, "loss": 0.0, "reward": -0.8787834644317627, "reward_std": 3.3303563594818115, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -1.920450210571289, "rewards/wrapped_format_reward": 0.25, "step": 126 }, { "completion_length": 500.0, "epoch": 25.4, "grad_norm": 1.7830382585525513, "kl": 0.0016754590906202793, "learning_rate": 2.6458333333333336e-06, "loss": 0.0001, "reward": 2.153042793273926, "reward_std": 0.3702898919582367, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.15304280817508698, "rewards/wrapped_format_reward": 0.375, "step": 127 }, { "completion_length": 500.0, "epoch": 25.6, "grad_norm": 0.40261051058769226, "kl": 0.0009311916655860841, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "reward": -2.4677088260650635, "reward_std": 2.7414004802703857, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -2.942708730697632, "rewards/wrapped_format_reward": 0.125, "step": 128 }, { "completion_length": 500.0, "epoch": 25.8, "grad_norm": 6.308819770812988, "kl": 0.00869703572243452, "learning_rate": 2.6875e-06, "loss": 0.0003, "reward": -0.47751596570014954, "reward_std": 3.236863136291504, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.41818180680274963, "rewards/wrapped_driving_reward": -2.020697832107544, "rewards/wrapped_format_reward": 0.625, "step": 129 }, { "completion_length": 500.0, "epoch": 26.0, "grad_norm": 3.463308572769165, "kl": 0.03951391950249672, "learning_rate": 2.7083333333333334e-06, "loss": 0.0016, "reward": -2.465014934539795, "reward_std": 2.109844207763672, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0416666679084301, "rewards/wrapped_driving_reward": -3.1316816806793213, "rewards/wrapped_format_reward": 0.375, "step": 130 }, { "completion_length": 500.0, "epoch": 26.2, "grad_norm": 0.37268656492233276, "kl": 0.0010275749955326319, "learning_rate": 2.7291666666666667e-06, "loss": 0.0, "reward": -2.0108275413513184, "reward_std": 3.3450615406036377, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.15625, "rewards/wrapped_driving_reward": -2.7920775413513184, "rewards/wrapped_format_reward": 0.375, "step": 131 }, { "completion_length": 500.0, "epoch": 26.4, "grad_norm": 0.3964381217956543, "kl": 0.001012228662148118, "learning_rate": 2.7500000000000004e-06, "loss": 0.0, "reward": -1.2674376964569092, "reward_std": 3.008662223815918, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.267437696456909, "rewards/wrapped_format_reward": 0.125, "step": 132 }, { "completion_length": 500.0, "epoch": 26.6, "grad_norm": 10.317098617553711, "kl": 0.03428129479289055, "learning_rate": 2.7708333333333336e-06, "loss": 0.0014, "reward": -1.227315902709961, "reward_std": 2.922797679901123, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.26704543828964233, "rewards/wrapped_driving_reward": -2.119361162185669, "rewards/wrapped_format_reward": 0.125, "step": 133 }, { "completion_length": 500.0, "epoch": 26.8, "grad_norm": 1.919240951538086, "kl": 0.00681189214810729, "learning_rate": 2.791666666666667e-06, "loss": 0.0003, "reward": 0.37134385108947754, "reward_std": 2.954814910888672, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.003656268119812, "rewards/wrapped_format_reward": 0.125, "step": 134 }, { "completion_length": 500.0, "epoch": 27.0, "grad_norm": 4.358583927154541, "kl": 0.003853685688227415, "learning_rate": 2.8125e-06, "loss": 0.0002, "reward": -3.180288314819336, "reward_std": 0.9465946555137634, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3197115361690521, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 135 }, { "completion_length": 500.0, "epoch": 27.2, "grad_norm": 4.834251880645752, "kl": 0.01798548549413681, "learning_rate": 2.8333333333333335e-06, "loss": 0.0007, "reward": -0.00926351547241211, "reward_std": 2.662809133529663, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4791666865348816, "rewards/wrapped_driving_reward": -1.238430142402649, "rewards/wrapped_format_reward": 0.0, "step": 136 }, { "completion_length": 500.0, "epoch": 27.4, "grad_norm": 1.3516547679901123, "kl": 0.0019082785584032536, "learning_rate": 2.8541666666666667e-06, "loss": 0.0001, "reward": -0.8441787958145142, "reward_std": 3.6675186157226562, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.0259971618652344, "rewards/wrapped_format_reward": 0.25, "step": 137 }, { "completion_length": 500.0, "epoch": 27.6, "grad_norm": 0.3614887595176697, "kl": 0.0007737466366961598, "learning_rate": 2.875e-06, "loss": 0.0, "reward": -1.5659279823303223, "reward_std": 2.8618316650390625, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1354166716337204, "rewards/wrapped_driving_reward": -2.3263447284698486, "rewards/wrapped_format_reward": 0.125, "step": 138 }, { "completion_length": 500.0, "epoch": 27.8, "grad_norm": 0.44359442591667175, "kl": 0.0009050779044628143, "learning_rate": 2.8958333333333337e-06, "loss": 0.0, "reward": 0.5474408864974976, "reward_std": 2.7063920497894287, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5249999761581421, "rewards/wrapped_driving_reward": -0.9775590300559998, "rewards/wrapped_format_reward": 0.25, "step": 139 }, { "completion_length": 500.0, "epoch": 28.0, "grad_norm": 1.6172740459442139, "kl": 0.001949971541762352, "learning_rate": 2.916666666666667e-06, "loss": 0.0001, "reward": -2.165811061859131, "reward_std": 3.6683778762817383, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.790811061859131, "rewards/wrapped_format_reward": 0.125, "step": 140 }, { "completion_length": 500.0, "epoch": 28.2, "grad_norm": 1.963468074798584, "kl": 0.004007345996797085, "learning_rate": 2.9375000000000003e-06, "loss": 0.0002, "reward": -2.3711447715759277, "reward_std": 3.2577102184295654, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9961447715759277, "rewards/wrapped_format_reward": 0.25, "step": 141 }, { "completion_length": 500.0, "epoch": 28.4, "grad_norm": 2.7539641857147217, "kl": 0.00553749967366457, "learning_rate": 2.9583333333333335e-06, "loss": 0.0002, "reward": -0.709455668926239, "reward_std": 3.2490553855895996, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4129464328289032, "rewards/wrapped_driving_reward": -1.8724020719528198, "rewards/wrapped_format_reward": 0.25, "step": 142 }, { "completion_length": 500.0, "epoch": 28.6, "grad_norm": 3.479469060897827, "kl": 0.033329278230667114, "learning_rate": 2.979166666666667e-06, "loss": 0.0013, "reward": -0.8142069578170776, "reward_std": 3.702267646789551, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3482142686843872, "rewards/wrapped_driving_reward": -1.9124212265014648, "rewards/wrapped_format_reward": 0.25, "step": 143 }, { "completion_length": 500.0, "epoch": 28.8, "grad_norm": 3.352234363555908, "kl": 0.010930047370493412, "learning_rate": 3e-06, "loss": 0.0004, "reward": 0.9485020637512207, "reward_std": 3.3182427883148193, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3333333432674408, "rewards/wrapped_driving_reward": -0.38483119010925293, "rewards/wrapped_format_reward": 0.25, "step": 144 }, { "completion_length": 500.0, "epoch": 29.0, "grad_norm": 3.932000160217285, "kl": 0.03878607600927353, "learning_rate": 3.0208333333333334e-06, "loss": 0.0016, "reward": 0.1870807409286499, "reward_std": 2.7980854511260986, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.104585886001587, "rewards/wrapped_format_reward": 0.125, "step": 145 }, { "completion_length": 500.0, "epoch": 29.2, "grad_norm": 0.0015932704554870725, "kl": 0.001104721101000905, "learning_rate": 3.0416666666666666e-06, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 146 }, { "completion_length": 500.0, "epoch": 29.4, "grad_norm": 7.533337593078613, "kl": 0.04573941230773926, "learning_rate": 3.0625000000000003e-06, "loss": 0.0018, "reward": 0.9483587741851807, "reward_std": 3.3045196533203125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.8016412258148193, "rewards/wrapped_format_reward": 0.5, "step": 147 }, { "completion_length": 500.0, "epoch": 29.6, "grad_norm": 2.8840458393096924, "kl": 0.03999151289463043, "learning_rate": 3.0833333333333336e-06, "loss": 0.0016, "reward": 0.3635343313217163, "reward_std": 2.5867059230804443, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1364656686782837, "rewards/wrapped_format_reward": 0.125, "step": 148 }, { "completion_length": 500.0, "epoch": 29.8, "grad_norm": 3.9054391384124756, "kl": 0.038462840020656586, "learning_rate": 3.104166666666667e-06, "loss": 0.0015, "reward": 0.5946906208992004, "reward_std": 3.0916733741760254, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46590909361839294, "rewards/wrapped_driving_reward": -0.8712185025215149, "rewards/wrapped_format_reward": 0.25, "step": 149 }, { "completion_length": 500.0, "epoch": 30.0, "grad_norm": 1.3047268390655518, "kl": 0.003437787527218461, "learning_rate": 3.125e-06, "loss": 0.0001, "reward": -1.262099027633667, "reward_std": 3.1614561080932617, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -1.9808489084243774, "rewards/wrapped_format_reward": 0.0, "step": 150 }, { "completion_length": 500.0, "epoch": 30.2, "grad_norm": 19.819337844848633, "kl": 0.02368580363690853, "learning_rate": 3.1458333333333334e-06, "loss": 0.0009, "reward": -0.593837320804596, "reward_std": 3.3858869075775146, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9688372611999512, "rewards/wrapped_format_reward": 0.5, "step": 151 }, { "completion_length": 500.0, "epoch": 30.4, "grad_norm": 0.36471259593963623, "kl": 0.0008758799522183836, "learning_rate": 3.1666666666666667e-06, "loss": 0.0, "reward": 0.32638394832611084, "reward_std": 2.8899521827697754, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6636363863945007, "rewards/wrapped_driving_reward": -1.0872524976730347, "rewards/wrapped_format_reward": 0.0, "step": 152 }, { "completion_length": 500.0, "epoch": 30.6, "grad_norm": 3.6062331199645996, "kl": 0.010750241577625275, "learning_rate": 3.1875e-06, "loss": 0.0004, "reward": 0.9008145332336426, "reward_std": 3.3484742641448975, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3392857313156128, "rewards/wrapped_driving_reward": -0.8134711980819702, "rewards/wrapped_format_reward": 0.625, "step": 153 }, { "completion_length": 500.0, "epoch": 30.8, "grad_norm": 3.008584976196289, "kl": 0.004083903506398201, "learning_rate": 3.2083333333333337e-06, "loss": 0.0002, "reward": 0.6495532393455505, "reward_std": 3.1026666164398193, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8504468202590942, "rewards/wrapped_format_reward": 0.0, "step": 154 }, { "completion_length": 500.0, "epoch": 31.0, "grad_norm": 4.257124423980713, "kl": 0.007256774697452784, "learning_rate": 3.229166666666667e-06, "loss": 0.0003, "reward": 1.5835230350494385, "reward_std": 0.16466718912124634, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5773809552192688, "rewards/wrapped_driving_reward": 0.006142044439911842, "rewards/wrapped_format_reward": 0.0, "step": 155 }, { "completion_length": 500.0, "epoch": 31.2, "grad_norm": 2.010960578918457, "kl": 0.02292311191558838, "learning_rate": 3.2500000000000002e-06, "loss": 0.0009, "reward": 1.8099894523620605, "reward_std": 0.18248680233955383, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7178758978843689, "rewards/wrapped_driving_reward": 0.09211361408233643, "rewards/wrapped_format_reward": 0.0, "step": 156 }, { "completion_length": 500.0, "epoch": 31.4, "grad_norm": 0.4033740758895874, "kl": 0.0010355673730373383, "learning_rate": 3.2708333333333335e-06, "loss": 0.0, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 157 }, { "completion_length": 500.0, "epoch": 31.6, "grad_norm": 17.625986099243164, "kl": 0.04252302274107933, "learning_rate": 3.2916666666666668e-06, "loss": 0.0017, "reward": -2.468630075454712, "reward_std": 2.739564895629883, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.2142857164144516, "rewards/wrapped_driving_reward": -3.0579159259796143, "rewards/wrapped_format_reward": 0.125, "step": 158 }, { "completion_length": 500.0, "epoch": 31.8, "grad_norm": 6.371521949768066, "kl": 0.02408897504210472, "learning_rate": 3.3125e-06, "loss": 0.001, "reward": -0.5439435243606567, "reward_std": 4.002199172973633, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -1.6064435243606567, "rewards/wrapped_format_reward": 0.25, "step": 159 }, { "completion_length": 500.0, "epoch": 32.0, "grad_norm": 0.8184000253677368, "kl": 0.027131706476211548, "learning_rate": 3.3333333333333333e-06, "loss": 0.0011, "reward": 1.6387406587600708, "reward_std": 0.7142547965049744, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8103895783424377, "rewards/wrapped_driving_reward": -0.17164897918701172, "rewards/wrapped_format_reward": 0.0, "step": 160 }, { "completion_length": 500.0, "epoch": 32.2, "grad_norm": 0.3655173182487488, "kl": 0.0010108003625646234, "learning_rate": 3.3541666666666666e-06, "loss": 0.0, "reward": -0.9831902980804443, "reward_std": 3.484320640563965, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3166666626930237, "rewards/wrapped_driving_reward": -2.0498569011688232, "rewards/wrapped_format_reward": 0.25, "step": 161 }, { "completion_length": 500.0, "epoch": 32.4, "grad_norm": 2.8332362174987793, "kl": 0.025723226368427277, "learning_rate": 3.3750000000000003e-06, "loss": 0.001, "reward": -0.8011678457260132, "reward_std": 3.6942384243011475, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2857142686843872, "rewards/wrapped_driving_reward": -1.5868821144104004, "rewards/wrapped_format_reward": 0.0, "step": 162 }, { "completion_length": 500.0, "epoch": 32.6, "grad_norm": 0.0015628942055627704, "kl": 0.0009882381418719888, "learning_rate": 3.3958333333333336e-06, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 163 }, { "completion_length": 500.0, "epoch": 32.8, "grad_norm": 0.22536346316337585, "kl": 0.014067083597183228, "learning_rate": 3.416666666666667e-06, "loss": 0.0006, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 164 }, { "completion_length": 500.0, "epoch": 33.0, "grad_norm": 2.8652682304382324, "kl": 0.0301588773727417, "learning_rate": 3.4375e-06, "loss": 0.0012, "reward": 0.6044188737869263, "reward_std": 3.0828073024749756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.48571425676345825, "rewards/wrapped_driving_reward": -1.0062953233718872, "rewards/wrapped_format_reward": 0.375, "step": 165 }, { "completion_length": 500.0, "epoch": 33.2, "grad_norm": 14.549288749694824, "kl": 0.03827816992998123, "learning_rate": 3.4583333333333334e-06, "loss": 0.0015, "reward": 0.20543813705444336, "reward_std": 2.8183021545410156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.0195618867874146, "rewards/wrapped_format_reward": 0.125, "step": 166 }, { "completion_length": 500.0, "epoch": 33.4, "grad_norm": 23.31963539123535, "kl": 0.0236306581646204, "learning_rate": 3.4791666666666667e-06, "loss": 0.0009, "reward": -1.3284555673599243, "reward_std": 2.110996961593628, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.4534554481506348, "rewards/wrapped_format_reward": 0.0, "step": 167 }, { "completion_length": 500.0, "epoch": 33.6, "grad_norm": 0.4104408323764801, "kl": 0.0013704978628084064, "learning_rate": 3.5e-06, "loss": 0.0001, "reward": -2.5163447856903076, "reward_std": 2.9673104286193848, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.0163450241088867, "rewards/wrapped_format_reward": 0.125, "step": 168 }, { "completion_length": 500.0, "epoch": 33.8, "grad_norm": 7.642683982849121, "kl": 0.03837261348962784, "learning_rate": 3.520833333333334e-06, "loss": 0.0015, "reward": 0.8579701781272888, "reward_std": 3.2719130516052246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.36666667461395264, "rewards/wrapped_driving_reward": -0.508696436882019, "rewards/wrapped_format_reward": 0.25, "step": 169 }, { "completion_length": 500.0, "epoch": 34.0, "grad_norm": 8.982998847961426, "kl": 0.06479807198047638, "learning_rate": 3.5416666666666673e-06, "loss": 0.0026, "reward": 1.5301133394241333, "reward_std": 1.1181119680404663, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": -0.5011366009712219, "rewards/wrapped_format_reward": 0.375, "step": 170 }, { "completion_length": 500.0, "epoch": 34.2, "grad_norm": 2.2494914531707764, "kl": 0.0033163258340209723, "learning_rate": 3.5625e-06, "loss": 0.0001, "reward": -0.6762468814849854, "reward_std": 3.5722789764404297, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3020833134651184, "rewards/wrapped_driving_reward": -1.853330135345459, "rewards/wrapped_format_reward": 0.375, "step": 171 }, { "completion_length": 500.0, "epoch": 34.4, "grad_norm": 0.90104740858078, "kl": 0.006812056060880423, "learning_rate": 3.5833333333333335e-06, "loss": 0.0003, "reward": -2.3616232872009277, "reward_std": 3.2767534255981445, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9866232872009277, "rewards/wrapped_format_reward": 0.125, "step": 172 }, { "completion_length": 500.0, "epoch": 34.6, "grad_norm": 0.3947916030883789, "kl": 0.0011263922788202763, "learning_rate": 3.6041666666666667e-06, "loss": 0.0, "reward": -0.5480427742004395, "reward_std": 4.001402854919434, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.6105427742004395, "rewards/wrapped_format_reward": 0.125, "step": 173 }, { "completion_length": 500.0, "epoch": 34.8, "grad_norm": 0.6156601309776306, "kl": 0.01154815312474966, "learning_rate": 3.625e-06, "loss": 0.0005, "reward": -2.329235315322876, "reward_std": 3.017416000366211, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.079235315322876, "rewards/wrapped_format_reward": 0.25, "step": 174 }, { "completion_length": 500.0, "epoch": 35.0, "grad_norm": 0.4099337160587311, "kl": 0.0011381495278328657, "learning_rate": 3.6458333333333333e-06, "loss": 0.0, "reward": -2.365032434463501, "reward_std": 2.083897829055786, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.33181819319725037, "rewards/wrapped_driving_reward": -3.1968507766723633, "rewards/wrapped_format_reward": 0.0, "step": 175 }, { "completion_length": 500.0, "epoch": 35.2, "grad_norm": 4.004872798919678, "kl": 0.029580948874354362, "learning_rate": 3.6666666666666666e-06, "loss": 0.0012, "reward": -1.2987949848175049, "reward_std": 3.119826316833496, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -2.111294746398926, "rewards/wrapped_format_reward": 0.0, "step": 176 }, { "completion_length": 500.0, "epoch": 35.4, "grad_norm": 1.671083927154541, "kl": 0.014694800600409508, "learning_rate": 3.6875000000000007e-06, "loss": 0.0006, "reward": -1.2172389030456543, "reward_std": 3.2450075149536133, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9672389030456543, "rewards/wrapped_format_reward": 0.0, "step": 177 }, { "completion_length": 500.0, "epoch": 35.6, "grad_norm": 0.4049564301967621, "kl": 0.0011514219222590327, "learning_rate": 3.708333333333334e-06, "loss": 0.0, "reward": 0.8965679407119751, "reward_std": 3.293184757232666, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": -0.3846820592880249, "rewards/wrapped_format_reward": 0.0, "step": 178 }, { "completion_length": 500.0, "epoch": 35.8, "grad_norm": 0.0426897257566452, "kl": 0.011578786186873913, "learning_rate": 3.7291666666666672e-06, "loss": 0.0005, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 179 }, { "completion_length": 500.0, "epoch": 36.0, "grad_norm": 2.753361701965332, "kl": 0.03497646003961563, "learning_rate": 3.7500000000000005e-06, "loss": 0.0014, "reward": -0.8905968070030212, "reward_std": 3.5913023948669434, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.953096866607666, "rewards/wrapped_format_reward": 0.125, "step": 180 }, { "completion_length": 500.0, "epoch": 36.2, "grad_norm": 4.126710891723633, "kl": 0.023542242124676704, "learning_rate": 3.7708333333333334e-06, "loss": 0.0009, "reward": -2.637735366821289, "reward_std": 2.724529266357422, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.981485366821289, "rewards/wrapped_format_reward": 0.0, "step": 181 }, { "completion_length": 500.0, "epoch": 36.4, "grad_norm": 2.1164910793304443, "kl": 0.03186760097742081, "learning_rate": 3.7916666666666666e-06, "loss": 0.0013, "reward": -0.6355093121528625, "reward_std": 3.886032819747925, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.32499998807907104, "rewards/wrapped_driving_reward": -1.5855093002319336, "rewards/wrapped_format_reward": 0.125, "step": 182 }, { "completion_length": 500.0, "epoch": 36.6, "grad_norm": 0.9481387734413147, "kl": 0.02689046412706375, "learning_rate": 3.8125e-06, "loss": 0.0011, "reward": 0.9345278143882751, "reward_std": 3.2953097820281982, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5880681872367859, "rewards/wrapped_driving_reward": -0.778540313243866, "rewards/wrapped_format_reward": 0.375, "step": 183 }, { "completion_length": 500.0, "epoch": 36.8, "grad_norm": 0.3303874135017395, "kl": 0.0011166415642946959, "learning_rate": 3.833333333333334e-06, "loss": 0.0, "reward": -2.608830213546753, "reward_std": 2.782339572906494, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -3.046330213546753, "rewards/wrapped_format_reward": 0.0, "step": 184 }, { "completion_length": 500.0, "epoch": 37.0, "grad_norm": 0.5033643245697021, "kl": 0.001327235484495759, "learning_rate": 3.854166666666667e-06, "loss": 0.0001, "reward": -1.2191828489303589, "reward_std": 3.2157087326049805, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.23863635957241058, "rewards/wrapped_driving_reward": -1.9578192234039307, "rewards/wrapped_format_reward": 0.0, "step": 185 }, { "completion_length": 500.0, "epoch": 37.2, "grad_norm": 0.3546835780143738, "kl": 0.0012730626622214913, "learning_rate": 3.875e-06, "loss": 0.0001, "reward": -2.6489737033843994, "reward_std": 2.702052593231201, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -2.8989737033843994, "rewards/wrapped_format_reward": 0.0, "step": 186 }, { "completion_length": 500.0, "epoch": 37.4, "grad_norm": 0.3939490020275116, "kl": 0.0011839700164273381, "learning_rate": 3.8958333333333334e-06, "loss": 0.0, "reward": -2.425374746322632, "reward_std": 2.5269436836242676, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.050374746322632, "rewards/wrapped_format_reward": 0.25, "step": 187 }, { "completion_length": 500.0, "epoch": 37.6, "grad_norm": 4.388774394989014, "kl": 0.03779164329171181, "learning_rate": 3.916666666666667e-06, "loss": 0.0015, "reward": -2.095153331756592, "reward_std": 3.8096938133239746, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.2083333283662796, "rewards/wrapped_driving_reward": -2.8034865856170654, "rewards/wrapped_format_reward": 0.25, "step": 188 }, { "completion_length": 500.0, "epoch": 37.8, "grad_norm": 0.3407905697822571, "kl": 0.0012132242554798722, "learning_rate": 3.9375e-06, "loss": 0.0, "reward": 0.9215935468673706, "reward_std": 3.2830312252044678, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": -0.849239706993103, "rewards/wrapped_format_reward": 0.375, "step": 189 }, { "completion_length": 500.0, "epoch": 38.0, "grad_norm": 12.550337791442871, "kl": 0.13451190292835236, "learning_rate": 3.958333333333333e-06, "loss": 0.0054, "reward": 1.0308945178985596, "reward_std": 2.702942371368408, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5249999761581421, "rewards/wrapped_driving_reward": -0.9941054582595825, "rewards/wrapped_format_reward": 0.75, "step": 190 }, { "completion_length": 500.0, "epoch": 38.2, "grad_norm": 13.734210968017578, "kl": 0.07660804688930511, "learning_rate": 3.9791666666666665e-06, "loss": 0.0031, "reward": -2.5775606632232666, "reward_std": 2.844878673553467, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.21590909361839294, "rewards/wrapped_driving_reward": -3.0434696674346924, "rewards/wrapped_format_reward": 0.0, "step": 191 }, { "completion_length": 500.0, "epoch": 38.4, "grad_norm": 0.39941278100013733, "kl": 0.0012940344167873263, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "reward": 1.0290822982788086, "reward_std": 3.354139566421509, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42811357975006104, "rewards/wrapped_driving_reward": -0.3990311622619629, "rewards/wrapped_format_reward": 0.25, "step": 192 }, { "completion_length": 500.0, "epoch": 38.6, "grad_norm": 5.0886054039001465, "kl": 0.1496005654335022, "learning_rate": 4.020833333333334e-06, "loss": 0.006, "reward": 0.23665398359298706, "reward_std": 2.8346645832061768, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4821428656578064, "rewards/wrapped_driving_reward": -0.9954888820648193, "rewards/wrapped_format_reward": 0.0, "step": 193 }, { "completion_length": 500.0, "epoch": 38.8, "grad_norm": 1.0010530948638916, "kl": 0.01472307275980711, "learning_rate": 4.041666666666667e-06, "loss": 0.0006, "reward": -0.31997019052505493, "reward_std": 2.5833802223205566, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4886363744735718, "rewards/wrapped_driving_reward": -1.8086066246032715, "rewards/wrapped_format_reward": 0.25, "step": 194 }, { "completion_length": 500.0, "epoch": 39.0, "grad_norm": 4.434115886688232, "kl": 0.07414662837982178, "learning_rate": 4.0625000000000005e-06, "loss": 0.003, "reward": 1.8734700679779053, "reward_std": 0.7573453783988953, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -0.16819655895233154, "rewards/wrapped_format_reward": 0.375, "step": 195 }, { "completion_length": 500.0, "epoch": 39.2, "grad_norm": 0.35328763723373413, "kl": 0.0011651457753032446, "learning_rate": 4.083333333333334e-06, "loss": 0.0, "reward": -0.9308372139930725, "reward_std": 3.265139579772949, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -2.030837297439575, "rewards/wrapped_format_reward": 0.25, "step": 196 }, { "completion_length": 500.0, "epoch": 39.4, "grad_norm": 2.841628313064575, "kl": 0.1545742154121399, "learning_rate": 4.104166666666667e-06, "loss": 0.0062, "reward": 0.7136911153793335, "reward_std": 3.221494436264038, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4709596037864685, "rewards/wrapped_driving_reward": -0.8822685480117798, "rewards/wrapped_format_reward": 0.375, "step": 197 }, { "completion_length": 500.0, "epoch": 39.6, "grad_norm": 0.3438974618911743, "kl": 0.001065694261342287, "learning_rate": 4.125e-06, "loss": 0.0, "reward": -0.33991003036499023, "reward_std": 4.232908725738525, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.36250001192092896, "rewards/wrapped_driving_reward": -1.5774099826812744, "rewards/wrapped_format_reward": 0.375, "step": 198 }, { "completion_length": 500.0, "epoch": 39.8, "grad_norm": 5.850861549377441, "kl": 0.08895232528448105, "learning_rate": 4.145833333333334e-06, "loss": 0.0036, "reward": -1.0433223247528076, "reward_std": 3.4140782356262207, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -2.0683224201202393, "rewards/wrapped_format_reward": 0.125, "step": 199 }, { "completion_length": 500.0, "epoch": 40.0, "grad_norm": 1.8783408403396606, "kl": 0.06220545247197151, "learning_rate": 4.166666666666667e-06, "loss": 0.0025, "reward": -2.5082778930664062, "reward_std": 2.9834442138671875, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.203125, "rewards/wrapped_driving_reward": -2.9614028930664062, "rewards/wrapped_format_reward": 0.0, "step": 200 }, { "completion_length": 500.0, "epoch": 40.2, "grad_norm": 0.3625878691673279, "kl": 0.0015527592040598392, "learning_rate": 4.1875e-06, "loss": 0.0001, "reward": -2.2801547050476074, "reward_std": 2.7830231189727783, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1944444477558136, "rewards/wrapped_driving_reward": -2.9745991230010986, "rewards/wrapped_format_reward": 0.25, "step": 201 }, { "completion_length": 500.0, "epoch": 40.4, "grad_norm": 1.3591216802597046, "kl": 0.031130777671933174, "learning_rate": 4.208333333333333e-06, "loss": 0.0012, "reward": -0.9100172519683838, "reward_std": 3.5681514739990234, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.432692289352417, "rewards/wrapped_driving_reward": -1.9677093029022217, "rewards/wrapped_format_reward": 0.125, "step": 202 }, { "completion_length": 500.0, "epoch": 40.6, "grad_norm": 0.35409578680992126, "kl": 0.0012434074888005853, "learning_rate": 4.229166666666667e-06, "loss": 0.0, "reward": -2.404740571975708, "reward_std": 2.534834384918213, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.998490571975708, "rewards/wrapped_format_reward": 0.25, "step": 203 }, { "completion_length": 500.0, "epoch": 40.8, "grad_norm": 0.9982916712760925, "kl": 0.05762110650539398, "learning_rate": 4.25e-06, "loss": 0.0023, "reward": -2.0678892135620117, "reward_std": 3.5387465953826904, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -2.79288911819458, "rewards/wrapped_format_reward": 0.375, "step": 204 }, { "completion_length": 500.0, "epoch": 41.0, "grad_norm": 0.5429545640945435, "kl": 0.010603474453091621, "learning_rate": 4.270833333333333e-06, "loss": 0.0004, "reward": -1.1565592288970947, "reward_std": 3.0510413646698, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3214285671710968, "rewards/wrapped_driving_reward": -2.227987766265869, "rewards/wrapped_format_reward": 0.25, "step": 205 }, { "completion_length": 500.0, "epoch": 41.2, "grad_norm": 0.692771315574646, "kl": 0.033038631081581116, "learning_rate": 4.2916666666666665e-06, "loss": 0.0013, "reward": 0.6099777817726135, "reward_std": 3.07470703125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.9525222778320312, "rewards/wrapped_format_reward": 0.125, "step": 206 }, { "completion_length": 500.0, "epoch": 41.4, "grad_norm": 0.5571784973144531, "kl": 0.006391022354364395, "learning_rate": 4.312500000000001e-06, "loss": 0.0003, "reward": -2.5556817054748535, "reward_std": 2.888636350631714, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -2.9931819438934326, "rewards/wrapped_format_reward": 0.0, "step": 207 }, { "completion_length": 500.0, "epoch": 41.6, "grad_norm": 2.603093147277832, "kl": 0.08236520737409592, "learning_rate": 4.333333333333334e-06, "loss": 0.0033, "reward": 0.7872141003608704, "reward_std": 3.2228293418884277, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -0.3877858519554138, "rewards/wrapped_format_reward": 0.0, "step": 208 }, { "completion_length": 500.0, "epoch": 41.8, "grad_norm": 2.8598968982696533, "kl": 0.03701096028089523, "learning_rate": 4.354166666666667e-06, "loss": 0.0015, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 209 }, { "completion_length": 500.0, "epoch": 42.0, "grad_norm": 0.5654446482658386, "kl": 0.016315069049596786, "learning_rate": 4.3750000000000005e-06, "loss": 0.0007, "reward": -1.0942189693450928, "reward_std": 3.3554024696350098, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -2.019218921661377, "rewards/wrapped_format_reward": 0.0, "step": 210 }, { "completion_length": 500.0, "epoch": 42.2, "grad_norm": 1.1546794176101685, "kl": 0.1304369419813156, "learning_rate": 4.395833333333334e-06, "loss": 0.0052, "reward": -2.067227363586426, "reward_std": 2.8944802284240723, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -3.004727363586426, "rewards/wrapped_format_reward": 0.5, "step": 211 }, { "completion_length": 500.0, "epoch": 42.4, "grad_norm": 0.4224312901496887, "kl": 0.001306671998463571, "learning_rate": 4.416666666666667e-06, "loss": 0.0001, "reward": -1.3893475532531738, "reward_std": 3.059168577194214, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.3211658000946045, "rewards/wrapped_format_reward": 0.0, "step": 212 }, { "completion_length": 500.0, "epoch": 42.6, "grad_norm": 14.517871856689453, "kl": 0.15590544044971466, "learning_rate": 4.4375e-06, "loss": 0.0062, "reward": -0.8690509796142578, "reward_std": 3.619702100753784, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.28125, "rewards/wrapped_driving_reward": -2.025300979614258, "rewards/wrapped_format_reward": 0.375, "step": 213 }, { "completion_length": 500.0, "epoch": 42.8, "grad_norm": 0.42172175645828247, "kl": 0.0013596608769148588, "learning_rate": 4.4583333333333336e-06, "loss": 0.0001, "reward": -0.6579513549804688, "reward_std": 3.57773756980896, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -1.5897696018218994, "rewards/wrapped_format_reward": 0.125, "step": 214 }, { "completion_length": 500.0, "epoch": 43.0, "grad_norm": 1.4108898639678955, "kl": 0.05196976661682129, "learning_rate": 4.479166666666667e-06, "loss": 0.0021, "reward": 1.4945721626281738, "reward_std": 0.5106267333030701, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3854166865348816, "rewards/wrapped_driving_reward": -0.14084455370903015, "rewards/wrapped_format_reward": 0.25, "step": 215 }, { "completion_length": 500.0, "epoch": 43.2, "grad_norm": 0.38824036717414856, "kl": 0.0016642104601487517, "learning_rate": 4.5e-06, "loss": 0.0001, "reward": -0.8108178377151489, "reward_std": 3.692883253097534, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3194444477558136, "rewards/wrapped_driving_reward": -2.1302623748779297, "rewards/wrapped_format_reward": 0.5, "step": 216 }, { "completion_length": 500.0, "epoch": 43.4, "grad_norm": 14.23416519165039, "kl": 0.13234129548072815, "learning_rate": 4.520833333333333e-06, "loss": 0.0053, "reward": -2.400029420852661, "reward_std": 2.876281499862671, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.025029182434082, "rewards/wrapped_format_reward": 0.125, "step": 217 }, { "completion_length": 500.0, "epoch": 43.6, "grad_norm": 4.683525562286377, "kl": 0.1954280287027359, "learning_rate": 4.541666666666667e-06, "loss": 0.0078, "reward": -1.0995738506317139, "reward_std": 3.3534717559814453, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2986111044883728, "rewards/wrapped_driving_reward": -1.8981850147247314, "rewards/wrapped_format_reward": 0.0, "step": 218 }, { "completion_length": 500.0, "epoch": 43.8, "grad_norm": 0.3793950080871582, "kl": 0.001844916958361864, "learning_rate": 4.5625e-06, "loss": 0.0001, "reward": -0.7520895004272461, "reward_std": 3.759147882461548, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.34375, "rewards/wrapped_driving_reward": -1.595839500427246, "rewards/wrapped_format_reward": 0.0, "step": 219 }, { "completion_length": 500.0, "epoch": 44.0, "grad_norm": 0.4106435477733612, "kl": 0.0017210771329700947, "learning_rate": 4.583333333333333e-06, "loss": 0.0001, "reward": 1.8025760650634766, "reward_std": 0.23965147137641907, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.49166664481163025, "rewards/wrapped_driving_reward": 0.06090930849313736, "rewards/wrapped_format_reward": 0.25, "step": 220 }, { "completion_length": 500.0, "epoch": 44.2, "grad_norm": 4.90147066116333, "kl": 0.293916791677475, "learning_rate": 4.6041666666666665e-06, "loss": 0.0118, "reward": 1.2585606575012207, "reward_std": 3.5554494857788086, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.3664393424987793, "rewards/wrapped_format_reward": 0.375, "step": 221 }, { "completion_length": 500.0, "epoch": 44.4, "grad_norm": 2.4181244373321533, "kl": 0.09946326911449432, "learning_rate": 4.625000000000001e-06, "loss": 0.004, "reward": -0.7496248483657837, "reward_std": 3.2129709720611572, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3214285671710968, "rewards/wrapped_driving_reward": -1.946053385734558, "rewards/wrapped_format_reward": 0.375, "step": 222 }, { "completion_length": 500.0, "epoch": 44.6, "grad_norm": 0.3938154876232147, "kl": 0.0027729361318051815, "learning_rate": 4.645833333333334e-06, "loss": 0.0001, "reward": 1.8255128860473633, "reward_std": 0.3188542425632477, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6972222328186035, "rewards/wrapped_driving_reward": -0.12170933187007904, "rewards/wrapped_format_reward": 0.25, "step": 223 }, { "completion_length": 500.0, "epoch": 44.8, "grad_norm": 1.139906883239746, "kl": 0.07463247328996658, "learning_rate": 4.666666666666667e-06, "loss": 0.003, "reward": -1.154573917388916, "reward_std": 3.352905511856079, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.3363921642303467, "rewards/wrapped_format_reward": 0.25, "step": 224 }, { "completion_length": 500.0, "epoch": 45.0, "grad_norm": 0.37643423676490784, "kl": 0.0015645886305719614, "learning_rate": 4.6875000000000004e-06, "loss": 0.0001, "reward": -0.12135392427444458, "reward_std": 2.5997867584228516, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.2880206108093262, "rewards/wrapped_format_reward": 0.0, "step": 225 }, { "completion_length": 500.0, "epoch": 45.2, "grad_norm": 0.4397854506969452, "kl": 0.0018841986311599612, "learning_rate": 4.708333333333334e-06, "loss": 0.0001, "reward": -1.1124364137649536, "reward_std": 3.4070234298706055, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -2.0291032791137695, "rewards/wrapped_format_reward": 0.125, "step": 226 }, { "completion_length": 500.0, "epoch": 45.4, "grad_norm": 0.603453516960144, "kl": 0.02336149290204048, "learning_rate": 4.729166666666667e-06, "loss": 0.0009, "reward": 1.599435806274414, "reward_std": 0.5891650915145874, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6500000357627869, "rewards/wrapped_driving_reward": -0.17556418478488922, "rewards/wrapped_format_reward": 0.125, "step": 227 }, { "completion_length": 500.0, "epoch": 45.6, "grad_norm": 2.822200059890747, "kl": 0.33935749530792236, "learning_rate": 4.75e-06, "loss": 0.0136, "reward": -2.294963836669922, "reward_std": 3.08575439453125, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.11363636702299118, "rewards/wrapped_driving_reward": -2.783600091934204, "rewards/wrapped_format_reward": 0.125, "step": 228 }, { "completion_length": 500.0, "epoch": 45.8, "grad_norm": 0.5101578235626221, "kl": 0.013524656184017658, "learning_rate": 4.770833333333334e-06, "loss": 0.0005, "reward": -1.138512134552002, "reward_std": 3.435119390487671, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3636363744735718, "rewards/wrapped_driving_reward": -2.127148389816284, "rewards/wrapped_format_reward": 0.125, "step": 229 }, { "completion_length": 500.0, "epoch": 46.0, "grad_norm": 19.814189910888672, "kl": 0.25205615162849426, "learning_rate": 4.791666666666668e-06, "loss": 0.0101, "reward": 0.36332249641418457, "reward_std": 2.917628049850464, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.9491775631904602, "rewards/wrapped_format_reward": 0.0, "step": 230 }, { "completion_length": 500.0, "epoch": 46.2, "grad_norm": 1.2908904552459717, "kl": 0.10442691296339035, "learning_rate": 4.8125e-06, "loss": 0.0042, "reward": 0.2518876791000366, "reward_std": 2.5249195098876953, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3693181872367859, "rewards/wrapped_driving_reward": -0.992430567741394, "rewards/wrapped_format_reward": 0.125, "step": 231 }, { "completion_length": 500.0, "epoch": 46.4, "grad_norm": 0.37878403067588806, "kl": 0.0021237193141132593, "learning_rate": 4.833333333333333e-06, "loss": 0.0001, "reward": 1.22785484790802, "reward_std": 0.8809930682182312, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8541666865348816, "rewards/wrapped_driving_reward": -0.8763118982315063, "rewards/wrapped_format_reward": 0.25, "step": 232 }, { "completion_length": 500.0, "epoch": 46.6, "grad_norm": 36.95953369140625, "kl": 0.40437987446784973, "learning_rate": 4.854166666666667e-06, "loss": 0.0162, "reward": -1.0619306564331055, "reward_std": 3.3970491886138916, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0619306564331055, "rewards/wrapped_format_reward": 0.25, "step": 233 }, { "completion_length": 500.0, "epoch": 46.8, "grad_norm": 47.118255615234375, "kl": 0.134329155087471, "learning_rate": 4.875e-06, "loss": 0.0054, "reward": -0.5947959423065186, "reward_std": 3.65548038482666, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.8447959423065186, "rewards/wrapped_format_reward": 0.5, "step": 234 }, { "completion_length": 500.0, "epoch": 47.0, "grad_norm": 0.6403725743293762, "kl": 0.023185797035694122, "learning_rate": 4.895833333333333e-06, "loss": 0.0009, "reward": 1.51215398311615, "reward_std": 0.33645421266555786, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6640625, "rewards/wrapped_driving_reward": -0.1519085168838501, "rewards/wrapped_format_reward": 0.0, "step": 235 }, { "completion_length": 500.0, "epoch": 47.2, "grad_norm": 1.1939831972122192, "kl": 0.07250034809112549, "learning_rate": 4.9166666666666665e-06, "loss": 0.0029, "reward": -2.619947910308838, "reward_std": 2.438190460205078, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.171875, "rewards/wrapped_driving_reward": -3.166822910308838, "rewards/wrapped_format_reward": 0.125, "step": 236 }, { "completion_length": 500.0, "epoch": 47.4, "grad_norm": 2.614314556121826, "kl": 0.2160005122423172, "learning_rate": 4.937500000000001e-06, "loss": 0.0086, "reward": -1.6490904092788696, "reward_std": 2.716189384460449, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2929292917251587, "rewards/wrapped_driving_reward": -2.4420197010040283, "rewards/wrapped_format_reward": 0.0, "step": 237 }, { "completion_length": 500.0, "epoch": 47.6, "grad_norm": 78.35983276367188, "kl": 0.09474217891693115, "learning_rate": 4.958333333333334e-06, "loss": 0.0038, "reward": 0.05065804719924927, "reward_std": 2.749250650405884, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45625001192092896, "rewards/wrapped_driving_reward": -1.2805919647216797, "rewards/wrapped_format_reward": 0.125, "step": 238 }, { "completion_length": 500.0, "epoch": 47.8, "grad_norm": 0.3830593526363373, "kl": 0.0018576495349407196, "learning_rate": 4.979166666666667e-06, "loss": 0.0001, "reward": -2.5445353984832764, "reward_std": 2.9109292030334473, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9195353984832764, "rewards/wrapped_format_reward": 0.0, "step": 239 }, { "completion_length": 500.0, "epoch": 48.0, "grad_norm": 3.0679774284362793, "kl": 0.2315346747636795, "learning_rate": 5e-06, "loss": 0.0093, "reward": -0.948199987411499, "reward_std": 3.5248489379882812, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.9982000589370728, "rewards/wrapped_format_reward": 0.125, "step": 240 }, { "completion_length": 500.0, "epoch": 48.2, "grad_norm": 3.1146676540374756, "kl": 0.1782953292131424, "learning_rate": 4.999997355752031e-06, "loss": 0.0071, "reward": -0.7975939512252808, "reward_std": 3.513246774673462, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -1.8600939512252808, "rewards/wrapped_format_reward": 0.375, "step": 241 }, { "completion_length": 500.0, "epoch": 48.4, "grad_norm": 4.299013137817383, "kl": 0.19927579164505005, "learning_rate": 4.999989423013716e-06, "loss": 0.008, "reward": -0.9984800815582275, "reward_std": 3.1900885105133057, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.998479962348938, "rewards/wrapped_format_reward": 0.125, "step": 242 }, { "completion_length": 500.0, "epoch": 48.6, "grad_norm": 0.3299877643585205, "kl": 0.0019166120328009129, "learning_rate": 4.999976201801837e-06, "loss": 0.0001, "reward": -2.3678572177886963, "reward_std": 2.9404144287109375, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -3.049675464630127, "rewards/wrapped_format_reward": 0.25, "step": 243 }, { "completion_length": 500.0, "epoch": 48.8, "grad_norm": 1.1573808193206787, "kl": 0.1410599946975708, "learning_rate": 4.999957692144361e-06, "loss": 0.0056, "reward": 0.8284584283828735, "reward_std": 3.2214972972869873, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.484375, "rewards/wrapped_driving_reward": -0.40591663122177124, "rewards/wrapped_format_reward": 0.0, "step": 244 }, { "completion_length": 500.0, "epoch": 49.0, "grad_norm": 0.47671303153038025, "kl": 0.06269445270299911, "learning_rate": 4.999933894080444e-06, "loss": 0.0025, "reward": 0.38734376430511475, "reward_std": 2.944716215133667, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6477272510528564, "rewards/wrapped_driving_reward": -1.0103834867477417, "rewards/wrapped_format_reward": 0.0, "step": 245 }, { "completion_length": 500.0, "epoch": 49.2, "grad_norm": 0.41526517271995544, "kl": 0.0028381492011249065, "learning_rate": 4.9999048076604286e-06, "loss": 0.0001, "reward": 0.23170125484466553, "reward_std": 2.49680233001709, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2202381044626236, "rewards/wrapped_driving_reward": -0.9885368347167969, "rewards/wrapped_format_reward": 0.25, "step": 246 }, { "completion_length": 500.0, "epoch": 49.4, "grad_norm": 2.5495622158050537, "kl": 0.2154771089553833, "learning_rate": 4.999870432945843e-06, "loss": 0.0086, "reward": -2.2984023094177246, "reward_std": 3.403195381164551, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.7984023094177246, "rewards/wrapped_format_reward": 0.0, "step": 247 }, { "completion_length": 500.0, "epoch": 49.6, "grad_norm": 5.153905868530273, "kl": 0.3080277740955353, "learning_rate": 4.999830770009406e-06, "loss": 0.0123, "reward": 0.2953673005104065, "reward_std": 2.8969929218292236, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4092261791229248, "rewards/wrapped_driving_reward": -0.9888589382171631, "rewards/wrapped_format_reward": 0.125, "step": 248 }, { "completion_length": 500.0, "epoch": 49.8, "grad_norm": 0.5580005049705505, "kl": 0.043262895196676254, "learning_rate": 4.999785818935018e-06, "loss": 0.0017, "reward": -2.4887704849243164, "reward_std": 2.6994357109069824, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -3.045588493347168, "rewards/wrapped_format_reward": 0.125, "step": 249 }, { "completion_length": 500.0, "epoch": 50.0, "grad_norm": 2.3369898796081543, "kl": 0.2971566617488861, "learning_rate": 4.999735579817769e-06, "loss": 0.0119, "reward": -1.1610161066055298, "reward_std": 3.2840569019317627, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9110161066055298, "rewards/wrapped_format_reward": 0.0, "step": 250 }, { "completion_length": 500.0, "epoch": 50.2, "grad_norm": 0.4173508882522583, "kl": 0.003112471429631114, "learning_rate": 4.9996800527639354e-06, "loss": 0.0001, "reward": -1.1911332607269287, "reward_std": 3.311530590057373, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.0661332607269287, "rewards/wrapped_format_reward": 0.0, "step": 251 }, { "completion_length": 500.0, "epoch": 50.4, "grad_norm": 1.8770030736923218, "kl": 0.1331283450126648, "learning_rate": 4.9996192378909785e-06, "loss": 0.0053, "reward": -1.1617484092712402, "reward_std": 2.9961960315704346, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -2.0117483139038086, "rewards/wrapped_format_reward": 0.125, "step": 252 }, { "completion_length": 500.0, "epoch": 50.6, "grad_norm": 5.769409656524658, "kl": 0.2073991745710373, "learning_rate": 4.999553135327546e-06, "loss": 0.0083, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 253 }, { "completion_length": 500.0, "epoch": 50.8, "grad_norm": 0.416715532541275, "kl": 0.003338834270834923, "learning_rate": 4.999481745213471e-06, "loss": 0.0001, "reward": 0.39701712131500244, "reward_std": 2.943756341934204, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4970238208770752, "rewards/wrapped_driving_reward": -1.1000065803527832, "rewards/wrapped_format_reward": 0.25, "step": 254 }, { "completion_length": 500.0, "epoch": 51.0, "grad_norm": 0.4497571587562561, "kl": 0.0044916169717907906, "learning_rate": 4.999405067699773e-06, "loss": 0.0002, "reward": 0.5951623916625977, "reward_std": 3.1178059577941895, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": -0.8631709814071655, "rewards/wrapped_format_reward": 0.125, "step": 255 }, { "completion_length": 500.0, "epoch": 51.2, "grad_norm": 2.1747357845306396, "kl": 0.4025267958641052, "learning_rate": 4.999323102948655e-06, "loss": 0.0161, "reward": -1.957139492034912, "reward_std": 3.11260986328125, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.957139492034912, "rewards/wrapped_format_reward": 0.5, "step": 256 }, { "completion_length": 500.0, "epoch": 51.4, "grad_norm": 1.526734709739685, "kl": 0.15158718824386597, "learning_rate": 4.9992358511335035e-06, "loss": 0.0061, "reward": -1.0233807563781738, "reward_std": 3.175232172012329, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.32499998807907104, "rewards/wrapped_driving_reward": -1.9733808040618896, "rewards/wrapped_format_reward": 0.125, "step": 257 }, { "completion_length": 500.0, "epoch": 51.6, "grad_norm": 2.924349546432495, "kl": 0.13476546108722687, "learning_rate": 4.999143312438893e-06, "loss": 0.0054, "reward": 2.7032151222229004, "reward_std": 0.22770251333713531, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": 0.6782150864601135, "rewards/wrapped_format_reward": 0.375, "step": 258 }, { "completion_length": 500.0, "epoch": 51.8, "grad_norm": 3.879314422607422, "kl": 0.5085331201553345, "learning_rate": 4.99904548706058e-06, "loss": 0.0203, "reward": 2.018110513687134, "reward_std": 0.8205149173736572, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": 0.018110457807779312, "rewards/wrapped_format_reward": 0.5, "step": 259 }, { "completion_length": 500.0, "epoch": 52.0, "grad_norm": 1.5741368532180786, "kl": 0.17756682634353638, "learning_rate": 4.998942375205502e-06, "loss": 0.0071, "reward": 0.5688350200653076, "reward_std": 3.046790599822998, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -1.0144983530044556, "rewards/wrapped_format_reward": 0.25, "step": 260 }, { "completion_length": 500.0, "epoch": 52.2, "grad_norm": 6.336862564086914, "kl": 0.19336561858654022, "learning_rate": 4.998833977091783e-06, "loss": 0.0077, "reward": -0.49781960248947144, "reward_std": 4.047021865844727, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.5978196859359741, "rewards/wrapped_format_reward": 0.375, "step": 261 }, { "completion_length": 500.0, "epoch": 52.4, "grad_norm": 0.774104118347168, "kl": 0.1010366752743721, "learning_rate": 4.998720292948727e-06, "loss": 0.004, "reward": -0.8497058153152466, "reward_std": 3.3681893348693848, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9747058153152466, "rewards/wrapped_format_reward": 0.25, "step": 262 }, { "completion_length": 500.0, "epoch": 52.6, "grad_norm": 0.36933571100234985, "kl": 0.06732728332281113, "learning_rate": 4.998601323016824e-06, "loss": 0.0027, "reward": 0.40385496616363525, "reward_std": 2.9700136184692383, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -0.9961450695991516, "rewards/wrapped_format_reward": 0.25, "step": 263 }, { "completion_length": 500.0, "epoch": 52.8, "grad_norm": 0.39391028881073, "kl": 0.003927405923604965, "learning_rate": 4.99847706754774e-06, "loss": 0.0002, "reward": -0.6202123165130615, "reward_std": 3.9359829425811768, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.8702123165130615, "rewards/wrapped_format_reward": 0.375, "step": 264 }, { "completion_length": 500.0, "epoch": 53.0, "grad_norm": 5.415218830108643, "kl": 0.17419731616973877, "learning_rate": 4.9983475268043254e-06, "loss": 0.007, "reward": -1.1574738025665283, "reward_std": 3.333486557006836, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.1574738025665283, "rewards/wrapped_format_reward": 0.125, "step": 265 }, { "completion_length": 500.0, "epoch": 53.2, "grad_norm": 1.1539275646209717, "kl": 0.3938099145889282, "learning_rate": 4.998212701060612e-06, "loss": 0.0158, "reward": 1.269005298614502, "reward_std": 2.8699021339416504, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -0.38724470138549805, "rewards/wrapped_format_reward": 0.5, "step": 266 }, { "completion_length": 500.0, "epoch": 53.4, "grad_norm": 11.512609481811523, "kl": 0.2840820550918579, "learning_rate": 4.998072590601808e-06, "loss": 0.0114, "reward": 0.8071303963661194, "reward_std": 3.2237534523010254, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5791666507720947, "rewards/wrapped_driving_reward": -1.0220361948013306, "rewards/wrapped_format_reward": 0.5, "step": 267 }, { "completion_length": 500.0, "epoch": 53.6, "grad_norm": 0.42872354388237, "kl": 0.004085747059434652, "learning_rate": 4.9979271957243035e-06, "loss": 0.0002, "reward": -2.604086399078369, "reward_std": 2.7918272018432617, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1428571492433548, "rewards/wrapped_driving_reward": -2.996943473815918, "rewards/wrapped_format_reward": 0.0, "step": 268 }, { "completion_length": 500.0, "epoch": 53.8, "grad_norm": 1.7167718410491943, "kl": 0.2890765964984894, "learning_rate": 4.997776516735667e-06, "loss": 0.0116, "reward": -1.7580829858779907, "reward_std": 2.456212282180786, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4901515245437622, "rewards/wrapped_driving_reward": -3.248234510421753, "rewards/wrapped_format_reward": 0.25, "step": 269 }, { "completion_length": 500.0, "epoch": 54.0, "grad_norm": 0.5007515549659729, "kl": 0.06032608821988106, "learning_rate": 4.997620553954645e-06, "loss": 0.0024, "reward": 0.5694142580032349, "reward_std": 3.0647597312927246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45454543828964233, "rewards/wrapped_driving_reward": -1.0101312398910522, "rewards/wrapped_format_reward": 0.375, "step": 270 }, { "completion_length": 500.0, "epoch": 54.2, "grad_norm": 0.42161646485328674, "kl": 0.003139057895168662, "learning_rate": 4.99745930771116e-06, "loss": 0.0001, "reward": 0.9141009449958801, "reward_std": 2.9500954151153564, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5813717842102051, "rewards/wrapped_driving_reward": -0.7922708988189697, "rewards/wrapped_format_reward": 0.375, "step": 271 }, { "completion_length": 500.0, "epoch": 54.4, "grad_norm": 0.41687673330307007, "kl": 0.00326509028673172, "learning_rate": 4.997292778346312e-06, "loss": 0.0001, "reward": -2.5603370666503906, "reward_std": 2.8793258666992188, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.0603370666503906, "rewards/wrapped_format_reward": 0.0, "step": 272 }, { "completion_length": 500.0, "epoch": 54.6, "grad_norm": 0.3959712088108063, "kl": 0.14926669001579285, "learning_rate": 4.9971209662123774e-06, "loss": 0.006, "reward": 0.6993837952613831, "reward_std": 3.2752537727355957, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5623737573623657, "rewards/wrapped_driving_reward": -0.8629899621009827, "rewards/wrapped_format_reward": 0.25, "step": 273 }, { "completion_length": 500.0, "epoch": 54.8, "grad_norm": 0.3514886498451233, "kl": 0.0030937029514461756, "learning_rate": 4.996943871672807e-06, "loss": 0.0001, "reward": -0.6663916110992432, "reward_std": 3.8655571937561035, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -1.8163914680480957, "rewards/wrapped_format_reward": 0.25, "step": 274 }, { "completion_length": 500.0, "epoch": 55.0, "grad_norm": 0.3679739832878113, "kl": 0.0026038195937871933, "learning_rate": 4.996761495102227e-06, "loss": 0.0001, "reward": -2.3016748428344727, "reward_std": 3.396650552749634, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -2.8016748428344727, "rewards/wrapped_format_reward": 0.25, "step": 275 }, { "completion_length": 500.0, "epoch": 55.2, "grad_norm": 1.3922810554504395, "kl": 0.21797779202461243, "learning_rate": 4.9965738368864345e-06, "loss": 0.0087, "reward": 0.0015410780906677246, "reward_std": 2.685657024383545, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -1.2276256084442139, "rewards/wrapped_format_reward": 0.125, "step": 276 }, { "completion_length": 500.0, "epoch": 55.4, "grad_norm": 1.249841570854187, "kl": 0.2836349904537201, "learning_rate": 4.996380897422405e-06, "loss": 0.0113, "reward": 0.4582923650741577, "reward_std": 2.975569725036621, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.0417077541351318, "rewards/wrapped_format_reward": 0.25, "step": 277 }, { "completion_length": 500.0, "epoch": 55.6, "grad_norm": 4.349127769470215, "kl": 0.6486947536468506, "learning_rate": 4.996182677118278e-06, "loss": 0.0259, "reward": -0.5984437465667725, "reward_std": 3.405778408050537, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3901515007019043, "rewards/wrapped_driving_reward": -1.9885952472686768, "rewards/wrapped_format_reward": 0.5, "step": 278 }, { "completion_length": 467.0, "epoch": 55.8, "grad_norm": 2.509852170944214, "kl": 0.6388497948646545, "learning_rate": 4.995979176393372e-06, "loss": 0.0256, "reward": 2.645427942276001, "reward_std": 0.5950685143470764, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.6454278826713562, "rewards/wrapped_format_reward": 0.25, "step": 279 }, { "completion_length": 500.0, "epoch": 56.0, "grad_norm": 2.851917028427124, "kl": 0.19445133209228516, "learning_rate": 4.995770395678171e-06, "loss": 0.0078, "reward": 0.3463352918624878, "reward_std": 2.9164505004882812, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1536647081375122, "rewards/wrapped_format_reward": 0.125, "step": 280 }, { "completion_length": 500.0, "epoch": 56.2, "grad_norm": 0.42350760102272034, "kl": 0.04766889289021492, "learning_rate": 4.9955563354143285e-06, "loss": 0.0019, "reward": 1.01877760887146, "reward_std": 3.0202696323394775, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -0.39788898825645447, "rewards/wrapped_format_reward": 0.25, "step": 281 }, { "completion_length": 500.0, "epoch": 56.4, "grad_norm": 1.5306605100631714, "kl": 0.2947113513946533, "learning_rate": 4.995336996054668e-06, "loss": 0.0118, "reward": -0.1445428729057312, "reward_std": 2.5796546936035156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.3945426940917969, "rewards/wrapped_format_reward": 0.125, "step": 282 }, { "completion_length": 500.0, "epoch": 56.6, "grad_norm": 0.38275909423828125, "kl": 0.003526146523654461, "learning_rate": 4.99511237806318e-06, "loss": 0.0001, "reward": -1.1158170700073242, "reward_std": 3.336292028427124, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9908171892166138, "rewards/wrapped_format_reward": 0.0, "step": 283 }, { "completion_length": 500.0, "epoch": 56.8, "grad_norm": 0.45490142703056335, "kl": 0.09113866090774536, "learning_rate": 4.994882481915019e-06, "loss": 0.0036, "reward": -0.9650794267654419, "reward_std": 2.967007875442505, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2767857313156128, "rewards/wrapped_driving_reward": -1.9918651580810547, "rewards/wrapped_format_reward": 0.25, "step": 284 }, { "completion_length": 500.0, "epoch": 57.0, "grad_norm": 0.6580032110214233, "kl": 0.20218665897846222, "learning_rate": 4.994647308096509e-06, "loss": 0.0081, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 285 }, { "completion_length": 500.0, "epoch": 57.2, "grad_norm": 8.386362075805664, "kl": 0.4461314380168915, "learning_rate": 4.994406857105136e-06, "loss": 0.0178, "reward": 0.5840720534324646, "reward_std": 3.0974204540252686, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.484375, "rewards/wrapped_driving_reward": -0.9003030061721802, "rewards/wrapped_format_reward": 0.25, "step": 286 }, { "completion_length": 500.0, "epoch": 57.4, "grad_norm": 0.448266863822937, "kl": 0.0026026677805930376, "learning_rate": 4.9941611294495495e-06, "loss": 0.0001, "reward": -2.178774833679199, "reward_std": 3.6424503326416016, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1964285671710968, "rewards/wrapped_driving_reward": -2.8752033710479736, "rewards/wrapped_format_reward": 0.25, "step": 287 }, { "completion_length": 500.0, "epoch": 57.6, "grad_norm": 1.679646372795105, "kl": 0.20971831679344177, "learning_rate": 4.993910125649561e-06, "loss": 0.0084, "reward": 1.969580054283142, "reward_std": 0.40913528203964233, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5220588445663452, "rewards/wrapped_driving_reward": 0.07252118736505508, "rewards/wrapped_format_reward": 0.375, "step": 288 }, { "completion_length": 500.0, "epoch": 57.8, "grad_norm": 0.2821173071861267, "kl": 0.0028246240690350533, "learning_rate": 4.993653846236144e-06, "loss": 0.0001, "reward": 0.9076694250106812, "reward_std": 2.957883358001709, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -0.5298306941986084, "rewards/wrapped_format_reward": 0.25, "step": 289 }, { "completion_length": 500.0, "epoch": 58.0, "grad_norm": 1.2047299146652222, "kl": 0.29308056831359863, "learning_rate": 4.993392291751431e-06, "loss": 0.0117, "reward": -0.6820242404937744, "reward_std": 3.285200834274292, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.0570242404937744, "rewards/wrapped_format_reward": 0.5, "step": 290 }, { "completion_length": 500.0, "epoch": 58.2, "grad_norm": 0.38363662362098694, "kl": 0.0025257065426558256, "learning_rate": 4.993125462748714e-06, "loss": 0.0001, "reward": -0.6149693727493286, "reward_std": 3.9089224338531494, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.589969277381897, "rewards/wrapped_format_reward": 0.25, "step": 291 }, { "completion_length": 500.0, "epoch": 58.4, "grad_norm": 0.8080898523330688, "kl": 0.20920060575008392, "learning_rate": 4.992853359792444e-06, "loss": 0.0084, "reward": -2.0424954891204834, "reward_std": 2.943457841873169, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -2.849313735961914, "rewards/wrapped_format_reward": 0.375, "step": 292 }, { "completion_length": 500.0, "epoch": 58.6, "grad_norm": 0.4658648371696472, "kl": 0.07586698979139328, "learning_rate": 4.9925759834582254e-06, "loss": 0.003, "reward": 0.4167907238006592, "reward_std": 2.305239677429199, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4068181812763214, "rewards/wrapped_driving_reward": -1.1150274276733398, "rewards/wrapped_format_reward": 0.375, "step": 293 }, { "completion_length": 500.0, "epoch": 58.8, "grad_norm": 8.748403549194336, "kl": 0.2903243601322174, "learning_rate": 4.992293334332821e-06, "loss": 0.0116, "reward": 0.17369729280471802, "reward_std": 2.7883899211883545, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.2013027667999268, "rewards/wrapped_format_reward": 0.0, "step": 294 }, { "completion_length": 500.0, "epoch": 59.0, "grad_norm": 0.3755791187286377, "kl": 0.004894225392490625, "learning_rate": 4.9920054130141445e-06, "loss": 0.0002, "reward": 0.47168922424316406, "reward_std": 2.994149684906006, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.512499988079071, "rewards/wrapped_driving_reward": -0.915810763835907, "rewards/wrapped_format_reward": 0.125, "step": 295 }, { "completion_length": 500.0, "epoch": 59.2, "grad_norm": 0.3929916322231293, "kl": 0.004553962033241987, "learning_rate": 4.991712220111265e-06, "loss": 0.0002, "reward": 0.22208917140960693, "reward_std": 2.8153343200683594, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5341880321502686, "rewards/wrapped_driving_reward": -1.062098741531372, "rewards/wrapped_format_reward": 0.0, "step": 296 }, { "completion_length": 500.0, "epoch": 59.4, "grad_norm": 0.41047510504722595, "kl": 0.00469350116327405, "learning_rate": 4.991413756244404e-06, "loss": 0.0002, "reward": -1.1782411336898804, "reward_std": 3.260157346725464, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3035714328289032, "rewards/wrapped_driving_reward": -1.9818124771118164, "rewards/wrapped_format_reward": 0.0, "step": 297 }, { "completion_length": 500.0, "epoch": 59.6, "grad_norm": 3.261988401412964, "kl": 0.3944035470485687, "learning_rate": 4.99111002204493e-06, "loss": 0.0158, "reward": 2.40362548828125, "reward_std": 0.452992707490921, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.5536257028579712, "rewards/wrapped_format_reward": 0.125, "step": 298 }, { "completion_length": 500.0, "epoch": 59.8, "grad_norm": 0.4695773422718048, "kl": 0.004432830028235912, "learning_rate": 4.990801018155361e-06, "loss": 0.0002, "reward": -0.06609618663787842, "reward_std": 2.646121025085449, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2818181812763214, "rewards/wrapped_driving_reward": -1.0979143381118774, "rewards/wrapped_format_reward": 0.0, "step": 299 }, { "completion_length": 500.0, "epoch": 60.0, "grad_norm": 6.382628917694092, "kl": 0.2269033044576645, "learning_rate": 4.990486745229364e-06, "loss": 0.0091, "reward": 0.12916043400764465, "reward_std": 3.019442558288574, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5519230365753174, "rewards/wrapped_driving_reward": -1.4227626323699951, "rewards/wrapped_format_reward": 0.25, "step": 300 }, { "completion_length": 500.0, "epoch": 60.2, "grad_norm": 2.289363384246826, "kl": 0.4308004081249237, "learning_rate": 4.990167203931753e-06, "loss": 0.0172, "reward": 0.4380396008491516, "reward_std": 3.004404067993164, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.6869603395462036, "rewards/wrapped_format_reward": 0.0, "step": 301 }, { "completion_length": 500.0, "epoch": 60.4, "grad_norm": 0.34541359543800354, "kl": 0.0026338391471654177, "learning_rate": 4.989842394938482e-06, "loss": 0.0001, "reward": 0.008644580841064453, "reward_std": 2.6733663082122803, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.1580220460891724, "rewards/wrapped_format_reward": 0.0, "step": 302 }, { "completion_length": 500.0, "epoch": 60.6, "grad_norm": 0.40867820382118225, "kl": 0.0034318449907004833, "learning_rate": 4.989512318936654e-06, "loss": 0.0001, "reward": -0.49820494651794434, "reward_std": 3.4901528358459473, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8732049465179443, "rewards/wrapped_format_reward": 0.375, "step": 303 }, { "completion_length": 500.0, "epoch": 60.8, "grad_norm": 2.397559404373169, "kl": 0.26956814527511597, "learning_rate": 4.989176976624511e-06, "loss": 0.0108, "reward": 0.6745624542236328, "reward_std": 3.1912028789520264, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -0.9254375696182251, "rewards/wrapped_format_reward": 0.25, "step": 304 }, { "completion_length": 500.0, "epoch": 61.0, "grad_norm": 2.3476884365081787, "kl": 0.31312525272369385, "learning_rate": 4.988836368711435e-06, "loss": 0.0125, "reward": 1.7296613454818726, "reward_std": 0.7248514890670776, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.27033859491348267, "rewards/wrapped_format_reward": 0.25, "step": 305 }, { "completion_length": 500.0, "epoch": 61.2, "grad_norm": 0.4137451946735382, "kl": 0.05325795337557793, "learning_rate": 4.988490495917948e-06, "loss": 0.0021, "reward": 0.3951526880264282, "reward_std": 2.953357458114624, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.9798471927642822, "rewards/wrapped_format_reward": 0.25, "step": 306 }, { "completion_length": 500.0, "epoch": 61.4, "grad_norm": 0.6206872463226318, "kl": 0.056155622005462646, "learning_rate": 4.988139358975707e-06, "loss": 0.0022, "reward": -0.9187849164009094, "reward_std": 3.277235507965088, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -2.093784809112549, "rewards/wrapped_format_reward": 0.375, "step": 307 }, { "completion_length": 500.0, "epoch": 61.6, "grad_norm": 1.5620518922805786, "kl": 0.2103491574525833, "learning_rate": 4.987782958627508e-06, "loss": 0.0084, "reward": 1.7064940929412842, "reward_std": 0.3729006052017212, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.47430557012557983, "rewards/wrapped_driving_reward": -0.017811477184295654, "rewards/wrapped_format_reward": 0.25, "step": 308 }, { "completion_length": 500.0, "epoch": 61.8, "grad_norm": 0.45032092928886414, "kl": 0.1164650246500969, "learning_rate": 4.987421295627279e-06, "loss": 0.0047, "reward": 0.4541400074958801, "reward_std": 2.9789493083953857, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1708600521087646, "rewards/wrapped_format_reward": 0.125, "step": 309 }, { "completion_length": 500.0, "epoch": 62.0, "grad_norm": 2.286036252975464, "kl": 0.37701109051704407, "learning_rate": 4.9870543707400835e-06, "loss": 0.0151, "reward": 0.9700980186462402, "reward_std": 3.3365931510925293, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3080357313156128, "rewards/wrapped_driving_reward": -0.462937593460083, "rewards/wrapped_format_reward": 0.375, "step": 310 }, { "completion_length": 500.0, "epoch": 62.2, "grad_norm": 0.9412902593612671, "kl": 0.4719122350215912, "learning_rate": 4.986682184742111e-06, "loss": 0.0189, "reward": -1.1503251791000366, "reward_std": 3.0121943950653076, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3484848737716675, "rewards/wrapped_driving_reward": -2.248810052871704, "rewards/wrapped_format_reward": 0.25, "step": 311 }, { "completion_length": 500.0, "epoch": 62.4, "grad_norm": 0.40759605169296265, "kl": 0.004896071273833513, "learning_rate": 4.986304738420684e-06, "loss": 0.0002, "reward": 2.4090352058410645, "reward_std": 0.401497483253479, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6696428656578064, "rewards/wrapped_driving_reward": -0.010607685893774033, "rewards/wrapped_format_reward": 0.75, "step": 312 }, { "completion_length": 500.0, "epoch": 62.6, "grad_norm": 0.3452640771865845, "kl": 0.003664538264274597, "learning_rate": 4.985922032574252e-06, "loss": 0.0001, "reward": -1.0280461311340332, "reward_std": 3.437983751296997, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.442307710647583, "rewards/wrapped_driving_reward": -2.095353841781616, "rewards/wrapped_format_reward": 0.125, "step": 313 }, { "completion_length": 500.0, "epoch": 62.8, "grad_norm": 33.71268081665039, "kl": 0.42201700806617737, "learning_rate": 4.985534068012391e-06, "loss": 0.0169, "reward": 1.1327743530273438, "reward_std": 3.4218854904174805, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.36722564697265625, "rewards/wrapped_format_reward": 0.0, "step": 314 }, { "completion_length": 500.0, "epoch": 63.0, "grad_norm": 13.974825859069824, "kl": 0.2791546583175659, "learning_rate": 4.985140845555799e-06, "loss": 0.0112, "reward": -1.1279728412628174, "reward_std": 3.347849130630493, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3187499940395355, "rewards/wrapped_driving_reward": -2.071722984313965, "rewards/wrapped_format_reward": 0.125, "step": 315 }, { "completion_length": 500.0, "epoch": 63.2, "grad_norm": 2.090256929397583, "kl": 0.22176861763000488, "learning_rate": 4.9847423660363e-06, "loss": 0.0089, "reward": 1.3426780700683594, "reward_std": 0.3560336232185364, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5166667103767395, "rewards/wrapped_driving_reward": -0.2989885210990906, "rewards/wrapped_format_reward": 0.125, "step": 316 }, { "completion_length": 500.0, "epoch": 63.4, "grad_norm": 0.377633273601532, "kl": 0.005016393028199673, "learning_rate": 4.984338630296836e-06, "loss": 0.0002, "reward": 0.21509039402008057, "reward_std": 2.826265573501587, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.1349096298217773, "rewards/wrapped_format_reward": 0.25, "step": 317 }, { "completion_length": 500.0, "epoch": 63.6, "grad_norm": 6.6817498207092285, "kl": 0.33422529697418213, "learning_rate": 4.9839296391914696e-06, "loss": 0.0134, "reward": 1.0928232669830322, "reward_std": 3.0657143592834473, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5982142686843872, "rewards/wrapped_driving_reward": -0.3803909420967102, "rewards/wrapped_format_reward": 0.125, "step": 318 }, { "completion_length": 500.0, "epoch": 63.8, "grad_norm": 0.9221271872520447, "kl": 0.44944262504577637, "learning_rate": 4.983515393585379e-06, "loss": 0.018, "reward": 1.702825903892517, "reward_std": 0.3978630602359772, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6068181991577148, "rewards/wrapped_driving_reward": -0.028992218896746635, "rewards/wrapped_format_reward": 0.125, "step": 319 }, { "completion_length": 500.0, "epoch": 64.0, "grad_norm": 6.990262985229492, "kl": 0.22952702641487122, "learning_rate": 4.983095894354858e-06, "loss": 0.0092, "reward": 0.6090406775474548, "reward_std": 3.087157964706421, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375000298023224, "rewards/wrapped_driving_reward": -0.9534592628479004, "rewards/wrapped_format_reward": 0.375, "step": 320 }, { "completion_length": 500.0, "epoch": 64.2, "grad_norm": 4.146872520446777, "kl": 0.3527936637401581, "learning_rate": 4.982671142387316e-06, "loss": 0.0141, "reward": -0.8656878471374512, "reward_std": 3.6415810585021973, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.38474026322364807, "rewards/wrapped_driving_reward": -2.0004281997680664, "rewards/wrapped_format_reward": 0.25, "step": 321 }, { "completion_length": 500.0, "epoch": 64.4, "grad_norm": 0.37092164158821106, "kl": 0.004467155784368515, "learning_rate": 4.982241138581273e-06, "loss": 0.0002, "reward": -2.2984180450439453, "reward_std": 3.4031639099121094, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.7984180450439453, "rewards/wrapped_format_reward": 0.125, "step": 322 }, { "completion_length": 500.0, "epoch": 64.6, "grad_norm": 0.6264036893844604, "kl": 0.08939827233552933, "learning_rate": 4.981805883846357e-06, "loss": 0.0036, "reward": -2.4163126945495605, "reward_std": 2.511791467666626, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0833333358168602, "rewards/wrapped_driving_reward": -2.9996461868286133, "rewards/wrapped_format_reward": 0.25, "step": 323 }, { "completion_length": 500.0, "epoch": 64.8, "grad_norm": 0.29659271240234375, "kl": 0.009662508033216, "learning_rate": 4.981365379103306e-06, "loss": 0.0004, "reward": 0.39709973335266113, "reward_std": 2.9523847103118896, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5409451723098755, "rewards/wrapped_driving_reward": -1.0188454389572144, "rewards/wrapped_format_reward": 0.125, "step": 324 }, { "completion_length": 500.0, "epoch": 65.0, "grad_norm": 7.854650020599365, "kl": 0.38552114367485046, "learning_rate": 4.980919625283962e-06, "loss": 0.0154, "reward": -0.9804279804229736, "reward_std": 3.520909547805786, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.2304279804229736, "rewards/wrapped_format_reward": 0.375, "step": 325 }, { "completion_length": 500.0, "epoch": 65.2, "grad_norm": 0.44117897748947144, "kl": 0.006218797527253628, "learning_rate": 4.980468623331273e-06, "loss": 0.0002, "reward": 1.3627103567123413, "reward_std": 0.3762909770011902, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5910714268684387, "rewards/wrapped_driving_reward": -0.353361040353775, "rewards/wrapped_format_reward": 0.125, "step": 326 }, { "completion_length": 500.0, "epoch": 65.4, "grad_norm": 1.4037021398544312, "kl": 0.38747310638427734, "learning_rate": 4.980012374199288e-06, "loss": 0.0155, "reward": -0.6231173276901245, "reward_std": 3.8994014263153076, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.6231173276901245, "rewards/wrapped_format_reward": 0.125, "step": 327 }, { "completion_length": 500.0, "epoch": 65.6, "grad_norm": 2.3190011978149414, "kl": 0.25002360343933105, "learning_rate": 4.979550878853154e-06, "loss": 0.01, "reward": -0.6996442079544067, "reward_std": 3.268649101257324, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2874999940395355, "rewards/wrapped_driving_reward": -1.9871442317962646, "rewards/wrapped_format_reward": 0.5, "step": 328 }, { "completion_length": 500.0, "epoch": 65.8, "grad_norm": 0.37686654925346375, "kl": 0.006784873083233833, "learning_rate": 4.97908413826912e-06, "loss": 0.0003, "reward": -1.1586905717849731, "reward_std": 3.2879345417022705, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0336904525756836, "rewards/wrapped_format_reward": 0.125, "step": 329 }, { "completion_length": 500.0, "epoch": 66.0, "grad_norm": 1.805623173713684, "kl": 0.40524211525917053, "learning_rate": 4.978612153434527e-06, "loss": 0.0162, "reward": -0.08653664588928223, "reward_std": 2.7601754665374756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3380681872367859, "rewards/wrapped_driving_reward": -1.674604892730713, "rewards/wrapped_format_reward": 0.5, "step": 330 }, { "completion_length": 500.0, "epoch": 66.2, "grad_norm": 0.38753998279571533, "kl": 0.005745263770222664, "learning_rate": 4.97813492534781e-06, "loss": 0.0002, "reward": 0.27020639181137085, "reward_std": 2.9552714824676514, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.32499998807907104, "rewards/wrapped_driving_reward": -0.9297935366630554, "rewards/wrapped_format_reward": 0.125, "step": 331 }, { "completion_length": 500.0, "epoch": 66.4, "grad_norm": 0.9115275144577026, "kl": 0.3008367717266083, "learning_rate": 4.9776524550184965e-06, "loss": 0.012, "reward": 0.6117656826972961, "reward_std": 3.0815846920013428, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4444444477558136, "rewards/wrapped_driving_reward": -0.7076786756515503, "rewards/wrapped_format_reward": 0.125, "step": 332 }, { "completion_length": 500.0, "epoch": 66.6, "grad_norm": 0.408721387386322, "kl": 0.006442686542868614, "learning_rate": 4.977164743467206e-06, "loss": 0.0003, "reward": 2.024855852127075, "reward_std": 0.7860668897628784, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.810606062412262, "rewards/wrapped_driving_reward": -0.28575026988983154, "rewards/wrapped_format_reward": 0.5, "step": 333 }, { "completion_length": 500.0, "epoch": 66.8, "grad_norm": 0.4926323890686035, "kl": 0.20976316928863525, "learning_rate": 4.97667179172564e-06, "loss": 0.0084, "reward": 0.6569395065307617, "reward_std": 3.1219942569732666, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4124999940395355, "rewards/wrapped_driving_reward": -0.8805604577064514, "rewards/wrapped_format_reward": 0.375, "step": 334 }, { "completion_length": 500.0, "epoch": 67.0, "grad_norm": 0.5138577222824097, "kl": 0.1235075369477272, "learning_rate": 4.9761736008365906e-06, "loss": 0.0049, "reward": -0.7582014799118042, "reward_std": 3.743921995162964, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3425324559211731, "rewards/wrapped_driving_reward": -1.600733995437622, "rewards/wrapped_format_reward": 0.0, "step": 335 }, { "completion_length": 500.0, "epoch": 67.2, "grad_norm": 0.3995298743247986, "kl": 0.005109015386551619, "learning_rate": 4.975670171853926e-06, "loss": 0.0002, "reward": -1.069366455078125, "reward_std": 3.387301206588745, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -2.019366502761841, "rewards/wrapped_format_reward": 0.0, "step": 336 }, { "completion_length": 500.0, "epoch": 67.4, "grad_norm": 3.9272031784057617, "kl": 0.3972117304801941, "learning_rate": 4.975161505842603e-06, "loss": 0.0159, "reward": 0.659011960029602, "reward_std": 3.177152156829834, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4097222089767456, "rewards/wrapped_driving_reward": -1.0007102489471436, "rewards/wrapped_format_reward": 0.5, "step": 337 }, { "completion_length": 500.0, "epoch": 67.6, "grad_norm": 1.9709585905075073, "kl": 0.34756138920783997, "learning_rate": 4.97464760387865e-06, "loss": 0.0139, "reward": -1.0175073146820068, "reward_std": 3.2108185291290283, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -2.298757314682007, "rewards/wrapped_format_reward": 0.375, "step": 338 }, { "completion_length": 500.0, "epoch": 67.8, "grad_norm": 1.3277071714401245, "kl": 0.7304494380950928, "learning_rate": 4.974128467049177e-06, "loss": 0.0292, "reward": 1.208156704902649, "reward_std": 3.5106117725372314, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3392857313156128, "rewards/wrapped_driving_reward": -0.3811289668083191, "rewards/wrapped_format_reward": 0.5, "step": 339 }, { "completion_length": 500.0, "epoch": 68.0, "grad_norm": 0.5980082154273987, "kl": 0.1545403152704239, "learning_rate": 4.973604096452361e-06, "loss": 0.0062, "reward": 1.781707525253296, "reward_std": 0.38252297043800354, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7374999523162842, "rewards/wrapped_driving_reward": 0.04420744255185127, "rewards/wrapped_format_reward": 0.0, "step": 340 }, { "completion_length": 500.0, "epoch": 68.2, "grad_norm": 5.6277008056640625, "kl": 0.6528012156486511, "learning_rate": 4.97307449319746e-06, "loss": 0.0261, "reward": -2.5417990684509277, "reward_std": 2.9164016246795654, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.041799306869507, "rewards/wrapped_format_reward": 0.0, "step": 341 }, { "completion_length": 500.0, "epoch": 68.4, "grad_norm": 1.4556660652160645, "kl": 0.22342568635940552, "learning_rate": 4.972539658404793e-06, "loss": 0.0089, "reward": 1.7447742223739624, "reward_std": 0.46966177225112915, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": -0.1677258014678955, "rewards/wrapped_format_reward": 0.25, "step": 342 }, { "completion_length": 500.0, "epoch": 68.6, "grad_norm": 0.32919180393218994, "kl": 0.004485347308218479, "learning_rate": 4.971999593205748e-06, "loss": 0.0002, "reward": -0.8774416446685791, "reward_std": 3.6358015537261963, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.28125, "rewards/wrapped_driving_reward": -1.908691644668579, "rewards/wrapped_format_reward": 0.25, "step": 343 }, { "completion_length": 500.0, "epoch": 68.8, "grad_norm": 0.4438071846961975, "kl": 0.022456657141447067, "learning_rate": 4.971454298742779e-06, "loss": 0.0009, "reward": 1.4681932926177979, "reward_std": 3.670335054397583, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -0.4484734535217285, "rewards/wrapped_format_reward": 0.5, "step": 344 }, { "completion_length": 500.0, "epoch": 69.0, "grad_norm": 0.5931668877601624, "kl": 0.21155548095703125, "learning_rate": 4.970903776169403e-06, "loss": 0.0085, "reward": 1.7453134059906006, "reward_std": 0.27375465631484985, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5980769395828247, "rewards/wrapped_driving_reward": 0.1472364068031311, "rewards/wrapped_format_reward": 0.0, "step": 345 }, { "completion_length": 500.0, "epoch": 69.2, "grad_norm": 1.203963041305542, "kl": 0.2919057607650757, "learning_rate": 4.97034802665019e-06, "loss": 0.0117, "reward": -1.129254937171936, "reward_std": 3.315586566925049, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.170921802520752, "rewards/wrapped_format_reward": 0.125, "step": 346 }, { "completion_length": 500.0, "epoch": 69.4, "grad_norm": 0.7416115403175354, "kl": 0.22743701934814453, "learning_rate": 4.969787051360776e-06, "loss": 0.0091, "reward": 0.7157018184661865, "reward_std": 3.1705451011657715, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.578125, "rewards/wrapped_driving_reward": -0.8624231815338135, "rewards/wrapped_format_reward": 0.25, "step": 347 }, { "completion_length": 500.0, "epoch": 69.6, "grad_norm": 0.4830350875854492, "kl": 0.17163938283920288, "learning_rate": 4.9692208514878445e-06, "loss": 0.0069, "reward": -0.09030771255493164, "reward_std": 2.9694387912750244, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -1.8194743394851685, "rewards/wrapped_format_reward": 0.375, "step": 348 }, { "completion_length": 500.0, "epoch": 69.8, "grad_norm": 0.2983047366142273, "kl": 0.004596756771206856, "learning_rate": 4.9686494282291354e-06, "loss": 0.0002, "reward": -2.3006930351257324, "reward_std": 2.772320032119751, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9256930351257324, "rewards/wrapped_format_reward": 0.25, "step": 349 }, { "completion_length": 500.0, "epoch": 70.0, "grad_norm": 0.9741338491439819, "kl": 0.2408900409936905, "learning_rate": 4.968072782793436e-06, "loss": 0.0096, "reward": 2.6951208114624023, "reward_std": 0.5698649883270264, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.421875, "rewards/wrapped_driving_reward": 0.7732457518577576, "rewards/wrapped_format_reward": 0.5, "step": 350 }, { "completion_length": 500.0, "epoch": 70.2, "grad_norm": 4.554775238037109, "kl": 0.4316965341567993, "learning_rate": 4.9674909164005805e-06, "loss": 0.0173, "reward": 1.6274943351745605, "reward_std": 0.7876996397972107, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5681818127632141, "rewards/wrapped_driving_reward": -0.06568745523691177, "rewards/wrapped_format_reward": 0.125, "step": 351 }, { "completion_length": 500.0, "epoch": 70.4, "grad_norm": 2.6662397384643555, "kl": 0.7358002066612244, "learning_rate": 4.966903830281449e-06, "loss": 0.0294, "reward": 0.41893279552459717, "reward_std": 2.621450662612915, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -1.0185672044754028, "rewards/wrapped_format_reward": 0.125, "step": 352 }, { "completion_length": 500.0, "epoch": 70.6, "grad_norm": 5.053070545196533, "kl": 0.6792211532592773, "learning_rate": 4.966311525677961e-06, "loss": 0.0272, "reward": -0.49536943435668945, "reward_std": 4.055595397949219, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3333333134651184, "rewards/wrapped_driving_reward": -1.5787028074264526, "rewards/wrapped_format_reward": 0.25, "step": 353 }, { "completion_length": 500.0, "epoch": 70.8, "grad_norm": 0.6820557117462158, "kl": 0.6210047602653503, "learning_rate": 4.965714003843079e-06, "loss": 0.0248, "reward": 0.41455984115600586, "reward_std": 3.0012035369873047, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3738636374473572, "rewards/wrapped_driving_reward": -0.9593039155006409, "rewards/wrapped_format_reward": 0.25, "step": 354 }, { "completion_length": 500.0, "epoch": 71.0, "grad_norm": 4.2020440101623535, "kl": 0.7750915288925171, "learning_rate": 4.965111266040798e-06, "loss": 0.031, "reward": 2.4024367332458496, "reward_std": 0.6636844277381897, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6281249523162842, "rewards/wrapped_driving_reward": 0.1493116319179535, "rewards/wrapped_format_reward": 0.625, "step": 355 }, { "completion_length": 500.0, "epoch": 71.2, "grad_norm": 0.6401116251945496, "kl": 0.3446768820285797, "learning_rate": 4.964503313546149e-06, "loss": 0.0138, "reward": 0.8104474544525146, "reward_std": 3.2581946849823, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3187499940395355, "rewards/wrapped_driving_reward": -0.5083025097846985, "rewards/wrapped_format_reward": 0.25, "step": 356 }, { "completion_length": 500.0, "epoch": 71.4, "grad_norm": 8.429099082946777, "kl": 0.9006807208061218, "learning_rate": 4.963890147645195e-06, "loss": 0.036, "reward": 1.889892578125, "reward_std": 0.5496275424957275, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4734848439693451, "rewards/wrapped_driving_reward": 0.16640779376029968, "rewards/wrapped_format_reward": 0.25, "step": 357 }, { "completion_length": 500.0, "epoch": 71.6, "grad_norm": 4.372265815734863, "kl": 1.383968710899353, "learning_rate": 4.963271769635024e-06, "loss": 0.0554, "reward": 2.0750181674957275, "reward_std": 0.5676727890968323, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": 0.22501814365386963, "rewards/wrapped_format_reward": 0.375, "step": 358 }, { "completion_length": 500.0, "epoch": 71.8, "grad_norm": 2.1324520111083984, "kl": 0.41738006472587585, "learning_rate": 4.962648180823753e-06, "loss": 0.0167, "reward": -0.9892675876617432, "reward_std": 3.4826595783233643, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3083333373069763, "rewards/wrapped_driving_reward": -1.7976008653640747, "rewards/wrapped_format_reward": 0.0, "step": 359 }, { "completion_length": 500.0, "epoch": 72.0, "grad_norm": 0.6744612455368042, "kl": 0.17285731434822083, "learning_rate": 4.962019382530521e-06, "loss": 0.0069, "reward": 0.3177332878112793, "reward_std": 2.5485317707061768, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5868055820465088, "rewards/wrapped_driving_reward": -1.1440722942352295, "rewards/wrapped_format_reward": 0.125, "step": 360 }, { "completion_length": 500.0, "epoch": 72.2, "grad_norm": 0.6285593509674072, "kl": 0.33941590785980225, "learning_rate": 4.961385376085486e-06, "loss": 0.0136, "reward": 0.2698374390602112, "reward_std": 2.8500523567199707, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5714285373687744, "rewards/wrapped_driving_reward": -1.176591157913208, "rewards/wrapped_format_reward": 0.125, "step": 361 }, { "completion_length": 500.0, "epoch": 72.4, "grad_norm": 0.458164244890213, "kl": 0.09140162914991379, "learning_rate": 4.960746162829825e-06, "loss": 0.0037, "reward": 0.39255642890930176, "reward_std": 2.957616090774536, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5381944179534912, "rewards/wrapped_driving_reward": -0.8956379890441895, "rewards/wrapped_format_reward": 0.0, "step": 362 }, { "completion_length": 500.0, "epoch": 72.6, "grad_norm": 0.8604902029037476, "kl": 0.5563132762908936, "learning_rate": 4.960101744115727e-06, "loss": 0.0223, "reward": 0.682576596736908, "reward_std": 3.148731231689453, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -0.6715900897979736, "rewards/wrapped_format_reward": 0.0, "step": 363 }, { "completion_length": 500.0, "epoch": 72.8, "grad_norm": 0.3864164352416992, "kl": 0.010636747814714909, "learning_rate": 4.959452121306397e-06, "loss": 0.0004, "reward": 2.0540273189544678, "reward_std": 0.3745618760585785, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8484848737716675, "rewards/wrapped_driving_reward": 0.2055424451828003, "rewards/wrapped_format_reward": 0.0, "step": 364 }, { "completion_length": 500.0, "epoch": 73.0, "grad_norm": 0.6751132011413574, "kl": 0.3509099781513214, "learning_rate": 4.958797295776045e-06, "loss": 0.014, "reward": -0.012695908546447754, "reward_std": 2.71622371673584, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6401515007019043, "rewards/wrapped_driving_reward": -1.527847409248352, "rewards/wrapped_format_reward": 0.125, "step": 365 }, { "completion_length": 500.0, "epoch": 73.2, "grad_norm": 0.6991415619850159, "kl": 0.26292094588279724, "learning_rate": 4.958137268909887e-06, "loss": 0.0105, "reward": -0.9175713062286377, "reward_std": 3.619429349899292, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -1.9675713777542114, "rewards/wrapped_format_reward": 0.25, "step": 366 }, { "completion_length": 500.0, "epoch": 73.4, "grad_norm": 0.4259271025657654, "kl": 0.006191871128976345, "learning_rate": 4.957472042104143e-06, "loss": 0.0002, "reward": -1.5161292552947998, "reward_std": 2.4123644828796387, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.578125, "rewards/wrapped_driving_reward": -2.9692542552948, "rewards/wrapped_format_reward": 0.125, "step": 367 }, { "completion_length": 500.0, "epoch": 73.6, "grad_norm": 0.41756653785705566, "kl": 0.190852090716362, "learning_rate": 4.956801616766033e-06, "loss": 0.0076, "reward": 2.1905903816223145, "reward_std": 0.8349334001541138, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": 0.09684042632579803, "rewards/wrapped_format_reward": 0.5, "step": 368 }, { "completion_length": 500.0, "epoch": 73.8, "grad_norm": 0.9244159460067749, "kl": 0.4430541396141052, "learning_rate": 4.956125994313775e-06, "loss": 0.0177, "reward": 1.5870847702026367, "reward_std": 0.30261340737342834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6979166865348816, "rewards/wrapped_driving_reward": -0.11083187907934189, "rewards/wrapped_format_reward": 0.0, "step": 369 }, { "completion_length": 500.0, "epoch": 74.0, "grad_norm": 0.7584779858589172, "kl": 0.37870338559150696, "learning_rate": 4.955445176176577e-06, "loss": 0.0151, "reward": -0.4603646993637085, "reward_std": 4.091524124145508, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5853646993637085, "rewards/wrapped_format_reward": 0.25, "step": 370 }, { "completion_length": 500.0, "epoch": 74.2, "grad_norm": 0.5257065296173096, "kl": 0.14303787052631378, "learning_rate": 4.954759163794642e-06, "loss": 0.0057, "reward": -0.9337491989135742, "reward_std": 3.272832155227661, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -1.9905673265457153, "rewards/wrapped_format_reward": 0.25, "step": 371 }, { "completion_length": 500.0, "epoch": 74.4, "grad_norm": 2.895681858062744, "kl": 0.6397900581359863, "learning_rate": 4.9540679586191605e-06, "loss": 0.0256, "reward": 0.5877273082733154, "reward_std": 3.06107759475708, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9122726917266846, "rewards/wrapped_format_reward": 0.0, "step": 372 }, { "completion_length": 500.0, "epoch": 74.6, "grad_norm": 0.4324706792831421, "kl": 0.1139565035700798, "learning_rate": 4.9533715621123046e-06, "loss": 0.0046, "reward": 0.969906210899353, "reward_std": 3.3792827129364014, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45681819319725037, "rewards/wrapped_driving_reward": -0.611911952495575, "rewards/wrapped_format_reward": 0.375, "step": 373 }, { "completion_length": 500.0, "epoch": 74.8, "grad_norm": 0.48021069169044495, "kl": 0.010700889863073826, "learning_rate": 4.952669975747232e-06, "loss": 0.0004, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 374 }, { "completion_length": 500.0, "epoch": 75.0, "grad_norm": 0.43843770027160645, "kl": 0.12145346403121948, "learning_rate": 4.9519632010080765e-06, "loss": 0.0049, "reward": 0.22283601760864258, "reward_std": 2.8182504177093506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.41874998807907104, "rewards/wrapped_driving_reward": -1.1959140300750732, "rewards/wrapped_format_reward": 0.25, "step": 375 }, { "completion_length": 500.0, "epoch": 75.2, "grad_norm": 0.3983944356441498, "kl": 0.2168252021074295, "learning_rate": 4.951251239389949e-06, "loss": 0.0087, "reward": 0.22715109586715698, "reward_std": 2.8382680416107178, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.3353488445281982, "rewards/wrapped_format_reward": 0.375, "step": 376 }, { "completion_length": 500.0, "epoch": 75.4, "grad_norm": 1.3143914937973022, "kl": 0.49245813488960266, "learning_rate": 4.950534092398931e-06, "loss": 0.0197, "reward": -0.5440161228179932, "reward_std": 3.7116076946258545, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42045456171035767, "rewards/wrapped_driving_reward": -1.589470624923706, "rewards/wrapped_format_reward": 0.125, "step": 377 }, { "completion_length": 500.0, "epoch": 75.6, "grad_norm": 0.7405281662940979, "kl": 0.4652176797389984, "learning_rate": 4.949811761552074e-06, "loss": 0.0186, "reward": -0.9247722625732422, "reward_std": 3.602470874786377, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3888888955116272, "rewards/wrapped_driving_reward": -2.0636610984802246, "rewards/wrapped_format_reward": 0.25, "step": 378 }, { "completion_length": 500.0, "epoch": 75.8, "grad_norm": 1.0121004581451416, "kl": 0.9068050980567932, "learning_rate": 4.9490842483773974e-06, "loss": 0.0363, "reward": 0.36130303144454956, "reward_std": 2.9178526401519775, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2916666567325592, "rewards/wrapped_driving_reward": -0.805363655090332, "rewards/wrapped_format_reward": 0.125, "step": 379 }, { "completion_length": 500.0, "epoch": 76.0, "grad_norm": 0.3849989175796509, "kl": 0.008540408685803413, "learning_rate": 4.948351554413879e-06, "loss": 0.0003, "reward": -0.9468990564346313, "reward_std": 3.264946699142456, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3386363685131073, "rewards/wrapped_driving_reward": -1.9105353355407715, "rewards/wrapped_format_reward": 0.125, "step": 380 }, { "completion_length": 500.0, "epoch": 76.2, "grad_norm": 0.6559692621231079, "kl": 0.7614105343818665, "learning_rate": 4.94761368121146e-06, "loss": 0.0305, "reward": 0.15053200721740723, "reward_std": 2.780266761779785, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -1.1932181119918823, "rewards/wrapped_format_reward": 0.125, "step": 381 }, { "completion_length": 500.0, "epoch": 76.4, "grad_norm": 0.3702482283115387, "kl": 0.20814312994480133, "learning_rate": 4.946870630331035e-06, "loss": 0.0083, "reward": 0.8605266809463501, "reward_std": 3.243834972381592, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.546875, "rewards/wrapped_driving_reward": -0.6863483190536499, "rewards/wrapped_format_reward": 0.25, "step": 382 }, { "completion_length": 500.0, "epoch": 76.6, "grad_norm": 0.7226535081863403, "kl": 0.9034067988395691, "learning_rate": 4.9461224033444544e-06, "loss": 0.0361, "reward": 0.27967095375061035, "reward_std": 2.896880626678467, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5083333253860474, "rewards/wrapped_driving_reward": -1.6036624908447266, "rewards/wrapped_format_reward": 0.625, "step": 383 }, { "completion_length": 500.0, "epoch": 76.8, "grad_norm": 0.6067720651626587, "kl": 0.12315172702074051, "learning_rate": 4.9453690018345144e-06, "loss": 0.0049, "reward": -0.4893726110458374, "reward_std": 3.798448085784912, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.5893726348876953, "rewards/wrapped_format_reward": 0.375, "step": 384 }, { "completion_length": 500.0, "epoch": 77.0, "grad_norm": 0.4272429943084717, "kl": 0.00867474265396595, "learning_rate": 4.94461042739496e-06, "loss": 0.0003, "reward": 0.48846930265426636, "reward_std": 3.02353572845459, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3883928656578064, "rewards/wrapped_driving_reward": -0.89992356300354, "rewards/wrapped_format_reward": 0.25, "step": 385 }, { "completion_length": 477.0, "epoch": 77.2, "grad_norm": 0.476276695728302, "kl": 0.3388914167881012, "learning_rate": 4.943846681630479e-06, "loss": 0.0136, "reward": 0.190776526927948, "reward_std": 2.4944610595703125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.1592234373092651, "rewards/wrapped_format_reward": 0.25, "step": 386 }, { "completion_length": 500.0, "epoch": 77.4, "grad_norm": 0.53319251537323, "kl": 0.5233060717582703, "learning_rate": 4.943077766156698e-06, "loss": 0.0209, "reward": 2.4598963260650635, "reward_std": 0.17958295345306396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.671875, "rewards/wrapped_driving_reward": 0.5380213856697083, "rewards/wrapped_format_reward": 0.25, "step": 387 }, { "completion_length": 500.0, "epoch": 77.6, "grad_norm": 0.4270462393760681, "kl": 0.007158294320106506, "learning_rate": 4.942303682600178e-06, "loss": 0.0003, "reward": 0.5006191730499268, "reward_std": 3.0305392742156982, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4558081030845642, "rewards/wrapped_driving_reward": -0.9551889896392822, "rewards/wrapped_format_reward": 0.25, "step": 388 }, { "completion_length": 500.0, "epoch": 77.8, "grad_norm": 0.6703900098800659, "kl": 0.6842387318611145, "learning_rate": 4.941524432598415e-06, "loss": 0.0274, "reward": -1.2612661123275757, "reward_std": 3.1640005111694336, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0112662315368652, "rewards/wrapped_format_reward": 0.0, "step": 389 }, { "completion_length": 500.0, "epoch": 78.0, "grad_norm": 1.440481185913086, "kl": 0.5535719990730286, "learning_rate": 4.9407400177998335e-06, "loss": 0.0221, "reward": -0.8308386206626892, "reward_std": 3.6611149311065674, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2083333432674408, "rewards/wrapped_driving_reward": -1.9141719341278076, "rewards/wrapped_format_reward": 0.375, "step": 390 }, { "completion_length": 500.0, "epoch": 78.2, "grad_norm": 0.46088123321533203, "kl": 0.4104781746864319, "learning_rate": 4.9399504398637835e-06, "loss": 0.0164, "reward": 2.22438645362854, "reward_std": 0.4959229826927185, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8401514887809753, "rewards/wrapped_driving_reward": 0.2592349946498871, "rewards/wrapped_format_reward": 0.125, "step": 391 }, { "completion_length": 500.0, "epoch": 78.4, "grad_norm": 0.41232195496559143, "kl": 0.27239614725112915, "learning_rate": 4.939155700460536e-06, "loss": 0.0109, "reward": 1.3294200897216797, "reward_std": 3.238236904144287, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5874999761581421, "rewards/wrapped_driving_reward": -0.3830798864364624, "rewards/wrapped_format_reward": 0.375, "step": 392 }, { "completion_length": 500.0, "epoch": 78.6, "grad_norm": 79.61272430419922, "kl": 1.242353916168213, "learning_rate": 4.938355801271282e-06, "loss": 0.0497, "reward": 0.6597337126731873, "reward_std": 3.137744665145874, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6931818127632141, "rewards/wrapped_driving_reward": -1.0334481000900269, "rewards/wrapped_format_reward": 0.25, "step": 393 }, { "completion_length": 500.0, "epoch": 78.8, "grad_norm": 0.6254439353942871, "kl": 0.32317063212394714, "learning_rate": 4.937550743988127e-06, "loss": 0.0129, "reward": 1.7920137643814087, "reward_std": 0.5354746580123901, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625568151473999, "rewards/wrapped_driving_reward": -0.4585544764995575, "rewards/wrapped_format_reward": 0.625, "step": 394 }, { "completion_length": 500.0, "epoch": 79.0, "grad_norm": 0.38457387685775757, "kl": 0.5373590588569641, "learning_rate": 4.936740530314087e-06, "loss": 0.0215, "reward": 0.1730683445930481, "reward_std": 2.877331495285034, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.7587498426437378, "rewards/wrapped_format_reward": 0.75, "step": 395 }, { "completion_length": 500.0, "epoch": 79.2, "grad_norm": 0.6702293753623962, "kl": 0.557269275188446, "learning_rate": 4.935925161963089e-06, "loss": 0.0223, "reward": 1.288943886756897, "reward_std": 0.6816022396087646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5984848737716675, "rewards/wrapped_driving_reward": -0.4345410466194153, "rewards/wrapped_format_reward": 0.125, "step": 396 }, { "completion_length": 500.0, "epoch": 79.4, "grad_norm": 8.266891479492188, "kl": 1.4165321588516235, "learning_rate": 4.935104640659959e-06, "loss": 0.0567, "reward": 0.45650458335876465, "reward_std": 3.0426013469696045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -1.012245535850525, "rewards/wrapped_format_reward": 0.5, "step": 397 }, { "completion_length": 500.0, "epoch": 79.6, "grad_norm": 0.65053790807724, "kl": 0.45794495940208435, "learning_rate": 4.934278968140428e-06, "loss": 0.0183, "reward": -0.4596288204193115, "reward_std": 4.093783855438232, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5846288204193115, "rewards/wrapped_format_reward": 0.125, "step": 398 }, { "completion_length": 500.0, "epoch": 79.8, "grad_norm": 0.46314430236816406, "kl": 0.5075499415397644, "learning_rate": 4.933448146151122e-06, "loss": 0.0203, "reward": 1.7097347974777222, "reward_std": 0.5429360270500183, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.13401514291763306, "rewards/wrapped_format_reward": 0.125, "step": 399 }, { "completion_length": 500.0, "epoch": 80.0, "grad_norm": 0.6950944066047668, "kl": 0.3538559675216675, "learning_rate": 4.93261217644956e-06, "loss": 0.0142, "reward": 0.39778077602386475, "reward_std": 2.652611255645752, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5687500238418579, "rewards/wrapped_driving_reward": -1.4209691286087036, "rewards/wrapped_format_reward": 0.5, "step": 400 }, { "completion_length": 417.0, "epoch": 80.2, "grad_norm": 0.6640767455101013, "kl": 1.0406416654586792, "learning_rate": 4.931771060804152e-06, "loss": 0.0416, "reward": 2.95090651512146, "reward_std": 0.567731499671936, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.786237359046936, "rewards/wrapped_driving_reward": 0.5396692156791687, "rewards/wrapped_format_reward": 0.625, "step": 401 }, { "completion_length": 500.0, "epoch": 80.4, "grad_norm": 0.7255659103393555, "kl": 0.7492356896400452, "learning_rate": 4.930924800994192e-06, "loss": 0.03, "reward": 0.889411211013794, "reward_std": 3.33335280418396, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.9439221620559692, "rewards/wrapped_format_reward": 0.5, "step": 402 }, { "completion_length": 500.0, "epoch": 80.6, "grad_norm": 55.34680938720703, "kl": 0.841439425945282, "learning_rate": 4.930073398809857e-06, "loss": 0.0337, "reward": -2.488555908203125, "reward_std": 2.6998631954193115, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.957305908203125, "rewards/wrapped_format_reward": 0.125, "step": 403 }, { "completion_length": 500.0, "epoch": 80.8, "grad_norm": 0.4021887183189392, "kl": 0.009278661571443081, "learning_rate": 4.929216856052201e-06, "loss": 0.0004, "reward": -0.5718256235122681, "reward_std": 3.405724048614502, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.821825623512268, "rewards/wrapped_format_reward": 0.25, "step": 404 }, { "completion_length": 500.0, "epoch": 81.0, "grad_norm": 3.827162981033325, "kl": 0.7532582879066467, "learning_rate": 4.928355174533153e-06, "loss": 0.0301, "reward": 0.27101099491119385, "reward_std": 2.883357048034668, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -1.1352390050888062, "rewards/wrapped_format_reward": 0.25, "step": 405 }, { "completion_length": 500.0, "epoch": 81.2, "grad_norm": 0.6682533025741577, "kl": 0.9012861847877502, "learning_rate": 4.927488356075515e-06, "loss": 0.0361, "reward": 0.09313106536865234, "reward_std": 2.730103015899658, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.0943689346313477, "rewards/wrapped_format_reward": 0.0, "step": 406 }, { "completion_length": 500.0, "epoch": 81.4, "grad_norm": 0.5417198538780212, "kl": 0.41213977336883545, "learning_rate": 4.926616402512952e-06, "loss": 0.0165, "reward": -1.3022674322128296, "reward_std": 2.9187700748443604, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.552267551422119, "rewards/wrapped_format_reward": 0.375, "step": 407 }, { "completion_length": 500.0, "epoch": 81.6, "grad_norm": 1.2818468809127808, "kl": 0.6974972486495972, "learning_rate": 4.925739315689991e-06, "loss": 0.0279, "reward": 2.659097671508789, "reward_std": 0.6468947529792786, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5107142925262451, "rewards/wrapped_driving_reward": 0.39838337898254395, "rewards/wrapped_format_reward": 0.75, "step": 408 }, { "completion_length": 500.0, "epoch": 81.8, "grad_norm": 0.4666033089160919, "kl": 0.2668006122112274, "learning_rate": 4.924857097462023e-06, "loss": 0.0107, "reward": -0.5389512777328491, "reward_std": 4.004843711853027, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.5957695245742798, "rewards/wrapped_format_reward": 0.125, "step": 409 }, { "completion_length": 500.0, "epoch": 82.0, "grad_norm": 1.2812401056289673, "kl": 0.9626360535621643, "learning_rate": 4.9239697496952904e-06, "loss": 0.0385, "reward": -2.234349489212036, "reward_std": 2.903162717819214, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.984349489212036, "rewards/wrapped_format_reward": 0.25, "step": 410 }, { "completion_length": 500.0, "epoch": 82.2, "grad_norm": 0.644499659538269, "kl": 0.8354526162147522, "learning_rate": 4.923077274266886e-06, "loss": 0.0334, "reward": 0.48837804794311523, "reward_std": 3.006782293319702, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.8866219520568848, "rewards/wrapped_format_reward": 0.25, "step": 411 }, { "completion_length": 500.0, "epoch": 82.4, "grad_norm": 2.690115451812744, "kl": 0.7986117005348206, "learning_rate": 4.922179673064752e-06, "loss": 0.0319, "reward": -0.877657413482666, "reward_std": 3.3253700733184814, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -2.0443239212036133, "rewards/wrapped_format_reward": 0.5, "step": 412 }, { "completion_length": 500.0, "epoch": 82.6, "grad_norm": 0.4054378867149353, "kl": 0.013790788128972054, "learning_rate": 4.921276947987672e-06, "loss": 0.0006, "reward": -0.5940237045288086, "reward_std": 3.6547868251800537, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.6565237045288086, "rewards/wrapped_format_reward": 0.125, "step": 413 }, { "completion_length": 500.0, "epoch": 82.8, "grad_norm": 3.897165060043335, "kl": 1.4948945045471191, "learning_rate": 4.92036910094527e-06, "loss": 0.0598, "reward": 2.222888469696045, "reward_std": 0.5692758560180664, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": 0.0978885143995285, "rewards/wrapped_format_reward": 0.625, "step": 414 }, { "completion_length": 500.0, "epoch": 83.0, "grad_norm": 0.6666826605796814, "kl": 0.5109995603561401, "learning_rate": 4.919456133858003e-06, "loss": 0.0204, "reward": -0.8746355772018433, "reward_std": 3.3271138668060303, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -1.7913023233413696, "rewards/wrapped_format_reward": 0.125, "step": 415 }, { "completion_length": 500.0, "epoch": 83.2, "grad_norm": 1.92827570438385, "kl": 1.322735071182251, "learning_rate": 4.91853804865716e-06, "loss": 0.0529, "reward": -0.5941190719604492, "reward_std": 3.9339089393615723, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2291666567325592, "rewards/wrapped_driving_reward": -1.573285698890686, "rewards/wrapped_format_reward": 0.25, "step": 416 }, { "completion_length": 500.0, "epoch": 83.4, "grad_norm": 0.6745986342430115, "kl": 0.505329430103302, "learning_rate": 4.917614847284858e-06, "loss": 0.0202, "reward": -0.9166472554206848, "reward_std": 3.3589961528778076, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -2.1458141803741455, "rewards/wrapped_format_reward": 0.375, "step": 417 }, { "completion_length": 500.0, "epoch": 83.6, "grad_norm": 4.199557781219482, "kl": 1.2294397354125977, "learning_rate": 4.916686531694035e-06, "loss": 0.0492, "reward": 1.6677517890930176, "reward_std": 0.404989629983902, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6770833134651184, "rewards/wrapped_driving_reward": -0.009331552311778069, "rewards/wrapped_format_reward": 0.0, "step": 418 }, { "completion_length": 500.0, "epoch": 83.8, "grad_norm": 0.5093013644218445, "kl": 0.5093794465065002, "learning_rate": 4.9157531038484494e-06, "loss": 0.0204, "reward": 0.7348726987838745, "reward_std": 2.8353636264801025, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": -0.9838773608207703, "rewards/wrapped_format_reward": 0.375, "step": 419 }, { "completion_length": 500.0, "epoch": 84.0, "grad_norm": 0.4214366376399994, "kl": 0.12739227712154388, "learning_rate": 4.914814565722671e-06, "loss": 0.0051, "reward": 0.06370812654495239, "reward_std": 2.8763251304626465, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3166666626930237, "rewards/wrapped_driving_reward": -1.2529585361480713, "rewards/wrapped_format_reward": 0.25, "step": 420 }, { "completion_length": 500.0, "epoch": 84.2, "grad_norm": 0.6959801316261292, "kl": 1.1503578424453735, "learning_rate": 4.913870919302083e-06, "loss": 0.046, "reward": 1.276258945465088, "reward_std": 3.544874668121338, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.5987411737442017, "rewards/wrapped_format_reward": 0.5, "step": 421 }, { "completion_length": 500.0, "epoch": 84.4, "grad_norm": 0.6742568612098694, "kl": 0.5141565799713135, "learning_rate": 4.912922166582874e-06, "loss": 0.0206, "reward": -1.0406144857406616, "reward_std": 3.4172041416168213, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -1.946864366531372, "rewards/wrapped_format_reward": 0.0, "step": 422 }, { "completion_length": 500.0, "epoch": 84.6, "grad_norm": 0.6518729329109192, "kl": 0.7472659349441528, "learning_rate": 4.9119683095720325e-06, "loss": 0.0299, "reward": 1.693802833557129, "reward_std": 0.29680135846138, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.45681819319725037, "rewards/wrapped_driving_reward": -0.013015310280025005, "rewards/wrapped_format_reward": 0.25, "step": 423 }, { "completion_length": 500.0, "epoch": 84.8, "grad_norm": 0.7331550121307373, "kl": 0.6410544514656067, "learning_rate": 4.911009350287348e-06, "loss": 0.0256, "reward": 1.389469861984253, "reward_std": 0.43002644181251526, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6306818127632141, "rewards/wrapped_driving_reward": -0.24121199548244476, "rewards/wrapped_format_reward": 0.0, "step": 424 }, { "completion_length": 500.0, "epoch": 85.0, "grad_norm": 0.5071043968200684, "kl": 0.2673094868659973, "learning_rate": 4.910045290757399e-06, "loss": 0.0107, "reward": 1.6472147703170776, "reward_std": 0.5979868173599243, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5372024178504944, "rewards/wrapped_driving_reward": -0.01498757116496563, "rewards/wrapped_format_reward": 0.125, "step": 425 }, { "completion_length": 500.0, "epoch": 85.2, "grad_norm": 0.4037346839904785, "kl": 0.2433103322982788, "learning_rate": 4.909076133021558e-06, "loss": 0.0097, "reward": 1.7711191177368164, "reward_std": 0.41978758573532104, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5687500238418579, "rewards/wrapped_driving_reward": -0.04763093218207359, "rewards/wrapped_format_reward": 0.25, "step": 426 }, { "completion_length": 500.0, "epoch": 85.4, "grad_norm": 0.3942338824272156, "kl": 0.014928624033927917, "learning_rate": 4.908101879129977e-06, "loss": 0.0006, "reward": -2.022028923034668, "reward_std": 1.4208987951278687, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.47904038429260254, "rewards/wrapped_driving_reward": -3.6260693073272705, "rewards/wrapped_format_reward": 0.375, "step": 427 }, { "completion_length": 500.0, "epoch": 85.6, "grad_norm": 0.5229224562644958, "kl": 1.1797394752502441, "learning_rate": 4.907122531143595e-06, "loss": 0.0472, "reward": 2.057708740234375, "reward_std": 0.6911286115646362, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5538690090179443, "rewards/wrapped_driving_reward": 0.25383973121643066, "rewards/wrapped_format_reward": 0.25, "step": 428 }, { "completion_length": 500.0, "epoch": 85.8, "grad_norm": 0.4138140380382538, "kl": 0.8060752749443054, "learning_rate": 4.906138091134118e-06, "loss": 0.0322, "reward": 2.941300868988037, "reward_std": 0.7528426051139832, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6656249761581421, "rewards/wrapped_driving_reward": 0.775675892829895, "rewards/wrapped_format_reward": 0.5, "step": 429 }, { "completion_length": 500.0, "epoch": 86.0, "grad_norm": 1.6084057092666626, "kl": 1.5639433860778809, "learning_rate": 4.905148561184033e-06, "loss": 0.0626, "reward": 0.7152248620986938, "reward_std": 2.8380167484283447, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.543749988079071, "rewards/wrapped_driving_reward": -1.0785250663757324, "rewards/wrapped_format_reward": 0.5, "step": 430 }, { "completion_length": 500.0, "epoch": 86.2, "grad_norm": 0.793965756893158, "kl": 1.6652806997299194, "learning_rate": 4.904153943386588e-06, "loss": 0.0666, "reward": 1.4550268650054932, "reward_std": 3.6481876373291016, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.531818151473999, "rewards/wrapped_driving_reward": -0.5767912268638611, "rewards/wrapped_format_reward": 0.75, "step": 431 }, { "completion_length": 500.0, "epoch": 86.4, "grad_norm": 0.4197085499763489, "kl": 0.388515830039978, "learning_rate": 4.903154239845798e-06, "loss": 0.0155, "reward": 2.14121413230896, "reward_std": 0.5500617027282715, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": 0.42246413230895996, "rewards/wrapped_format_reward": 0.0, "step": 432 }, { "completion_length": 500.0, "epoch": 86.6, "grad_norm": 0.8733661770820618, "kl": 1.6929181814193726, "learning_rate": 4.9021494526764315e-06, "loss": 0.0677, "reward": -1.1920154094696045, "reward_std": 3.250979423522949, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.1920154094696045, "rewards/wrapped_format_reward": 0.125, "step": 433 }, { "completion_length": 500.0, "epoch": 86.8, "grad_norm": 0.4056279957294464, "kl": 0.4129205346107483, "learning_rate": 4.901139584004014e-06, "loss": 0.0165, "reward": 0.33355867862701416, "reward_std": 2.93294358253479, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.550000011920929, "rewards/wrapped_driving_reward": -1.09144127368927, "rewards/wrapped_format_reward": 0.125, "step": 434 }, { "completion_length": 500.0, "epoch": 87.0, "grad_norm": 1.0974806547164917, "kl": 1.1918952465057373, "learning_rate": 4.900124635964823e-06, "loss": 0.0477, "reward": -0.3588448762893677, "reward_std": 2.147490978240967, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.45972222089767456, "rewards/wrapped_driving_reward": -1.943567156791687, "rewards/wrapped_format_reward": 0.125, "step": 435 }, { "completion_length": 500.0, "epoch": 87.2, "grad_norm": 0.8155910968780518, "kl": 0.01346185989677906, "learning_rate": 4.899104610705874e-06, "loss": 0.0005, "reward": 0.33636826276779175, "reward_std": 2.900613307952881, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5511363744735718, "rewards/wrapped_driving_reward": -1.0897680521011353, "rewards/wrapped_format_reward": 0.125, "step": 436 }, { "completion_length": 500.0, "epoch": 87.4, "grad_norm": 0.7519292831420898, "kl": 0.420356810092926, "learning_rate": 4.898079510384929e-06, "loss": 0.0168, "reward": 1.5522735118865967, "reward_std": 0.47781333327293396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6753472089767456, "rewards/wrapped_driving_reward": -0.37307366728782654, "rewards/wrapped_format_reward": 0.25, "step": 437 }, { "completion_length": 500.0, "epoch": 87.6, "grad_norm": 2.0137500762939453, "kl": 1.7654343843460083, "learning_rate": 4.897049337170483e-06, "loss": 0.0706, "reward": 0.5818377137184143, "reward_std": 2.7649831771850586, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -0.8431622385978699, "rewards/wrapped_format_reward": 0.375, "step": 438 }, { "completion_length": 500.0, "epoch": 87.8, "grad_norm": 0.48328566551208496, "kl": 0.9588396549224854, "learning_rate": 4.896014093241763e-06, "loss": 0.0384, "reward": 1.3982526063919067, "reward_std": 0.5576097965240479, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4005681872367859, "rewards/wrapped_driving_reward": -0.0023155699018388987, "rewards/wrapped_format_reward": 0.0, "step": 439 }, { "completion_length": 372.0, "epoch": 88.0, "grad_norm": 0.7031254172325134, "kl": 1.1994409561157227, "learning_rate": 4.894973780788722e-06, "loss": 0.048, "reward": 0.5088822245597839, "reward_std": 3.0980026721954346, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -0.9411178827285767, "rewards/wrapped_format_reward": 0.25, "step": 440 }, { "completion_length": 500.0, "epoch": 88.2, "grad_norm": 2.8440561294555664, "kl": 0.9831880331039429, "learning_rate": 4.8939284020120365e-06, "loss": 0.0393, "reward": 0.980705738067627, "reward_std": 2.253283977508545, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.119294285774231, "rewards/wrapped_format_reward": 0.5, "step": 441 }, { "completion_length": 500.0, "epoch": 88.4, "grad_norm": 0.4284060299396515, "kl": 0.8977882862091064, "learning_rate": 4.892877959123097e-06, "loss": 0.0359, "reward": 0.3894255757331848, "reward_std": 2.9320125579833984, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.86057448387146, "rewards/wrapped_format_reward": 0.0, "step": 442 }, { "completion_length": 500.0, "epoch": 88.6, "grad_norm": 0.43213698267936707, "kl": 0.11996592581272125, "learning_rate": 4.89182245434401e-06, "loss": 0.0048, "reward": -1.3073043823242188, "reward_std": 3.148372173309326, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.265625, "rewards/wrapped_driving_reward": -2.0729293823242188, "rewards/wrapped_format_reward": 0.0, "step": 443 }, { "completion_length": 500.0, "epoch": 88.8, "grad_norm": 0.5716166496276855, "kl": 0.3549193739891052, "learning_rate": 4.890761889907589e-06, "loss": 0.0142, "reward": 0.5434389114379883, "reward_std": 3.20829701423645, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.43717533349990845, "rewards/wrapped_driving_reward": -1.0187362432479858, "rewards/wrapped_format_reward": 0.375, "step": 444 }, { "completion_length": 500.0, "epoch": 89.0, "grad_norm": 0.4193194806575775, "kl": 0.20122253894805908, "learning_rate": 4.889696268057349e-06, "loss": 0.008, "reward": -0.5508648753166199, "reward_std": 3.4957611560821533, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9258649349212646, "rewards/wrapped_format_reward": 0.5, "step": 445 }, { "completion_length": 500.0, "epoch": 89.2, "grad_norm": 0.4390241801738739, "kl": 0.015177929773926735, "learning_rate": 4.888625591047505e-06, "loss": 0.0006, "reward": 1.724879264831543, "reward_std": 0.7149723768234253, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3999999761581421, "rewards/wrapped_driving_reward": 0.07487919926643372, "rewards/wrapped_format_reward": 0.25, "step": 446 }, { "completion_length": 312.0, "epoch": 89.4, "grad_norm": 2.1881051063537598, "kl": 1.9536460638046265, "learning_rate": 4.887549861142967e-06, "loss": 0.0781, "reward": 0.2170354127883911, "reward_std": 2.8930094242095947, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3794642686843872, "rewards/wrapped_driving_reward": -1.287428855895996, "rewards/wrapped_format_reward": 0.375, "step": 447 }, { "completion_length": 500.0, "epoch": 89.6, "grad_norm": 1.2952624559402466, "kl": 1.0334632396697998, "learning_rate": 4.88646908061933e-06, "loss": 0.0413, "reward": -0.2442917823791504, "reward_std": 4.344462871551514, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.34375, "rewards/wrapped_driving_reward": -1.5880416631698608, "rewards/wrapped_format_reward": 0.5, "step": 448 }, { "completion_length": 500.0, "epoch": 89.8, "grad_norm": 0.4427298605442047, "kl": 0.6207051873207092, "learning_rate": 4.885383251762877e-06, "loss": 0.0248, "reward": 2.1898977756500244, "reward_std": 0.5821254849433899, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7702991366386414, "rewards/wrapped_driving_reward": -0.2054014354944229, "rewards/wrapped_format_reward": 0.625, "step": 449 }, { "completion_length": 500.0, "epoch": 90.0, "grad_norm": 0.5388855934143066, "kl": 1.0115077495574951, "learning_rate": 4.884292376870567e-06, "loss": 0.0405, "reward": 2.037525177001953, "reward_std": 0.5797269344329834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.643750011920929, "rewards/wrapped_driving_reward": 0.1437750905752182, "rewards/wrapped_format_reward": 0.25, "step": 450 }, { "completion_length": 500.0, "epoch": 90.2, "grad_norm": 1.7166131734848022, "kl": 1.5529717206954956, "learning_rate": 4.883196458250037e-06, "loss": 0.0621, "reward": 2.3594932556152344, "reward_std": 0.8208974599838257, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6888889074325562, "rewards/wrapped_driving_reward": 0.17060428857803345, "rewards/wrapped_format_reward": 0.5, "step": 451 }, { "completion_length": 500.0, "epoch": 90.4, "grad_norm": 0.8398679494857788, "kl": 0.9991617798805237, "learning_rate": 4.8820954982195905e-06, "loss": 0.04, "reward": 1.959275484085083, "reward_std": 0.25932568311691284, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.290724515914917, "rewards/wrapped_format_reward": 0.375, "step": 452 }, { "completion_length": 500.0, "epoch": 90.6, "grad_norm": 0.4421575367450714, "kl": 0.6176390051841736, "learning_rate": 4.880989499108196e-06, "loss": 0.0247, "reward": 0.9777252078056335, "reward_std": 3.386754274368286, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -0.39727485179901123, "rewards/wrapped_format_reward": 0.375, "step": 453 }, { "completion_length": 500.0, "epoch": 90.8, "grad_norm": 0.6734241247177124, "kl": 0.6196687817573547, "learning_rate": 4.879878463255483e-06, "loss": 0.0248, "reward": 0.035056740045547485, "reward_std": 2.8004674911499023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.1607142835855484, "rewards/wrapped_driving_reward": -1.125657558441162, "rewards/wrapped_format_reward": 0.25, "step": 454 }, { "completion_length": 500.0, "epoch": 91.0, "grad_norm": 0.432086706161499, "kl": 0.23082022368907928, "learning_rate": 4.878762393011735e-06, "loss": 0.0092, "reward": 0.7312225103378296, "reward_std": 3.202765703201294, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.518750011920929, "rewards/wrapped_driving_reward": -0.9125275611877441, "rewards/wrapped_format_reward": 0.375, "step": 455 }, { "completion_length": 500.0, "epoch": 91.2, "grad_norm": 0.4672556221485138, "kl": 0.2818826138973236, "learning_rate": 4.8776412907378845e-06, "loss": 0.0113, "reward": -1.3701595067977905, "reward_std": 3.0371835231781006, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -2.088909387588501, "rewards/wrapped_format_reward": 0.0, "step": 456 }, { "completion_length": 500.0, "epoch": 91.4, "grad_norm": 0.8472755551338196, "kl": 1.7518972158432007, "learning_rate": 4.87651515880551e-06, "loss": 0.0701, "reward": 1.7843856811523438, "reward_std": 0.242258682847023, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7857142686843872, "rewards/wrapped_driving_reward": -0.0013285325840115547, "rewards/wrapped_format_reward": 0.0, "step": 457 }, { "completion_length": 500.0, "epoch": 91.6, "grad_norm": 0.48336735367774963, "kl": 0.4738996624946594, "learning_rate": 4.875383999596828e-06, "loss": 0.019, "reward": 2.511242151260376, "reward_std": 0.3284090459346771, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5847222208976746, "rewards/wrapped_driving_reward": 0.5515199899673462, "rewards/wrapped_format_reward": 0.375, "step": 458 }, { "completion_length": 500.0, "epoch": 91.8, "grad_norm": 0.6792411208152771, "kl": 1.5720727443695068, "learning_rate": 4.874247815504693e-06, "loss": 0.0629, "reward": 2.0669398307800293, "reward_std": 0.740801990032196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6636363863945007, "rewards/wrapped_driving_reward": 0.1533033847808838, "rewards/wrapped_format_reward": 0.25, "step": 459 }, { "completion_length": 500.0, "epoch": 92.0, "grad_norm": 1.1329480409622192, "kl": 1.284619927406311, "learning_rate": 4.873106608932585e-06, "loss": 0.0514, "reward": 0.6621590852737427, "reward_std": 1.5119678974151611, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8068181872367859, "rewards/wrapped_driving_reward": -1.2696590423583984, "rewards/wrapped_format_reward": 0.125, "step": 460 }, { "completion_length": 500.0, "epoch": 92.2, "grad_norm": 3.491309642791748, "kl": 1.2074986696243286, "learning_rate": 4.871960382294611e-06, "loss": 0.0483, "reward": 1.2318992614746094, "reward_std": 0.664622962474823, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4752747416496277, "rewards/wrapped_driving_reward": -0.4933754801750183, "rewards/wrapped_format_reward": 0.25, "step": 461 }, { "completion_length": 500.0, "epoch": 92.4, "grad_norm": 0.6012648940086365, "kl": 0.7964257001876831, "learning_rate": 4.870809138015499e-06, "loss": 0.0319, "reward": 0.7039540410041809, "reward_std": 3.1964340209960938, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.9835458993911743, "rewards/wrapped_format_reward": 0.375, "step": 462 }, { "completion_length": 500.0, "epoch": 92.6, "grad_norm": 0.7076395153999329, "kl": 1.378025770187378, "learning_rate": 4.869652878530586e-06, "loss": 0.0551, "reward": 1.9368897676467896, "reward_std": 0.9331097602844238, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6136363744735718, "rewards/wrapped_driving_reward": -0.17674663662910461, "rewards/wrapped_format_reward": 0.5, "step": 463 }, { "completion_length": 500.0, "epoch": 92.8, "grad_norm": 0.37601929903030396, "kl": 0.07725519686937332, "learning_rate": 4.868491606285823e-06, "loss": 0.0031, "reward": 2.537005662918091, "reward_std": 0.17966555058956146, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6748737096786499, "rewards/wrapped_driving_reward": 0.7371318340301514, "rewards/wrapped_format_reward": 0.125, "step": 464 }, { "completion_length": 500.0, "epoch": 93.0, "grad_norm": 0.3961866796016693, "kl": 0.06434271484613419, "learning_rate": 4.867325323737765e-06, "loss": 0.0026, "reward": -0.08214747905731201, "reward_std": 2.7034084796905518, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4636363685131073, "rewards/wrapped_driving_reward": -1.4207837581634521, "rewards/wrapped_format_reward": 0.125, "step": 465 }, { "completion_length": 500.0, "epoch": 93.2, "grad_norm": 0.515503466129303, "kl": 0.7670941948890686, "learning_rate": 4.866154033353561e-06, "loss": 0.0307, "reward": 0.5504806041717529, "reward_std": 3.077503204345703, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42045456171035767, "rewards/wrapped_driving_reward": -1.11997389793396, "rewards/wrapped_format_reward": 0.5, "step": 466 }, { "completion_length": 500.0, "epoch": 93.4, "grad_norm": 0.4547137916088104, "kl": 1.259108543395996, "learning_rate": 4.864977737610959e-06, "loss": 0.0504, "reward": 2.1892905235290527, "reward_std": 0.13488423824310303, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.628125011920929, "rewards/wrapped_driving_reward": 0.18616540729999542, "rewards/wrapped_format_reward": 0.375, "step": 467 }, { "completion_length": 500.0, "epoch": 93.6, "grad_norm": 0.39233630895614624, "kl": 0.8455955982208252, "learning_rate": 4.863796438998293e-06, "loss": 0.0338, "reward": 1.7829570770263672, "reward_std": 1.168465495109558, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.620707094669342, "rewards/wrapped_driving_reward": -0.3377498984336853, "rewards/wrapped_format_reward": 0.5, "step": 468 }, { "completion_length": 500.0, "epoch": 93.8, "grad_norm": 0.4378278851509094, "kl": 0.7639008164405823, "learning_rate": 4.862610140014479e-06, "loss": 0.0306, "reward": 2.175501823425293, "reward_std": 0.711540937423706, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6541666388511658, "rewards/wrapped_driving_reward": 0.2713351547718048, "rewards/wrapped_format_reward": 0.25, "step": 469 }, { "completion_length": 500.0, "epoch": 94.0, "grad_norm": 0.37471047043800354, "kl": 0.33596715331077576, "learning_rate": 4.861418843169012e-06, "loss": 0.0134, "reward": 1.253250002861023, "reward_std": 0.31675300002098083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.46549999713897705, "rewards/wrapped_format_reward": 0.0, "step": 470 }, { "completion_length": 500.0, "epoch": 94.2, "grad_norm": 0.41066431999206543, "kl": 0.7373389005661011, "learning_rate": 4.860222550981961e-06, "loss": 0.0295, "reward": 1.3983137607574463, "reward_std": 3.6222870349884033, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.4766860902309418, "rewards/wrapped_format_reward": 0.5, "step": 471 }, { "completion_length": 500.0, "epoch": 94.4, "grad_norm": 0.5545728802680969, "kl": 1.111506462097168, "learning_rate": 4.859021265983959e-06, "loss": 0.0445, "reward": 0.44755399227142334, "reward_std": 3.0138490200042725, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4422348737716675, "rewards/wrapped_driving_reward": -0.9946808815002441, "rewards/wrapped_format_reward": 0.25, "step": 472 }, { "completion_length": 340.0, "epoch": 94.6, "grad_norm": 0.5120311379432678, "kl": 1.3492473363876343, "learning_rate": 4.8578149907162035e-06, "loss": 0.054, "reward": 2.156475067138672, "reward_std": 0.6874570846557617, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": 0.010641898959875107, "rewards/wrapped_format_reward": 0.5, "step": 473 }, { "completion_length": 500.0, "epoch": 94.8, "grad_norm": 0.44791045784950256, "kl": 0.6434275507926941, "learning_rate": 4.856603727730446e-06, "loss": 0.0257, "reward": -1.7244491577148438, "reward_std": 2.2597899436950684, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -3.074449062347412, "rewards/wrapped_format_reward": 0.25, "step": 474 }, { "completion_length": 500.0, "epoch": 95.0, "grad_norm": 0.8485005497932434, "kl": 1.348606824874878, "learning_rate": 4.855387479588991e-06, "loss": 0.0539, "reward": 1.8527848720550537, "reward_std": 0.7476688623428345, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4541666507720947, "rewards/wrapped_driving_reward": 0.023618236184120178, "rewards/wrapped_format_reward": 0.375, "step": 475 }, { "completion_length": 500.0, "epoch": 95.2, "grad_norm": 0.3853321671485901, "kl": 0.24151970446109772, "learning_rate": 4.854166248864689e-06, "loss": 0.0097, "reward": 0.4717317819595337, "reward_std": 2.6661465167999268, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5151515007019043, "rewards/wrapped_driving_reward": -1.2934197187423706, "rewards/wrapped_format_reward": 0.5, "step": 476 }, { "completion_length": 500.0, "epoch": 95.4, "grad_norm": 0.5518341660499573, "kl": 0.753800630569458, "learning_rate": 4.852940038140927e-06, "loss": 0.0302, "reward": 1.9437934160232544, "reward_std": 0.5748332142829895, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8422619104385376, "rewards/wrapped_driving_reward": -0.2734684646129608, "rewards/wrapped_format_reward": 0.375, "step": 477 }, { "completion_length": 500.0, "epoch": 95.6, "grad_norm": 0.5776206851005554, "kl": 0.9822722673416138, "learning_rate": 4.851708850011631e-06, "loss": 0.0393, "reward": 0.7107340097427368, "reward_std": 3.164551258087158, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3901515305042267, "rewards/wrapped_driving_reward": -0.8044174909591675, "rewards/wrapped_format_reward": 0.375, "step": 478 }, { "completion_length": 500.0, "epoch": 95.8, "grad_norm": 3.6034088134765625, "kl": 1.1710273027420044, "learning_rate": 4.850472687081253e-06, "loss": 0.0468, "reward": 0.6904103755950928, "reward_std": 3.202547311782837, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6590908765792847, "rewards/wrapped_driving_reward": -1.218680500984192, "rewards/wrapped_format_reward": 0.5, "step": 479 }, { "completion_length": 500.0, "epoch": 96.0, "grad_norm": 0.45567023754119873, "kl": 0.40337690711021423, "learning_rate": 4.849231551964771e-06, "loss": 0.0161, "reward": -0.5991268754005432, "reward_std": 3.64422869682312, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2604166865348816, "rewards/wrapped_driving_reward": -1.6095435619354248, "rewards/wrapped_format_reward": 0.25, "step": 480 }, { "completion_length": 500.0, "epoch": 96.2, "grad_norm": 0.5505725741386414, "kl": 0.7656367421150208, "learning_rate": 4.847985447287681e-06, "loss": 0.0306, "reward": 2.3976004123687744, "reward_std": 0.4736846089363098, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7408459782600403, "rewards/wrapped_driving_reward": 0.03175440803170204, "rewards/wrapped_format_reward": 0.625, "step": 481 }, { "completion_length": 500.0, "epoch": 96.4, "grad_norm": 0.45343244075775146, "kl": 0.6199319958686829, "learning_rate": 4.846734375685989e-06, "loss": 0.0248, "reward": 2.3475000858306885, "reward_std": 0.8652529120445251, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6261904835700989, "rewards/wrapped_driving_reward": 0.3463096022605896, "rewards/wrapped_format_reward": 0.375, "step": 482 }, { "completion_length": 500.0, "epoch": 96.6, "grad_norm": 0.3729810416698456, "kl": 0.47601577639579773, "learning_rate": 4.845478339806211e-06, "loss": 0.019, "reward": 0.2215256690979004, "reward_std": 2.8646438121795654, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.38749998807907104, "rewards/wrapped_driving_reward": -1.2909743785858154, "rewards/wrapped_format_reward": 0.375, "step": 483 }, { "completion_length": 500.0, "epoch": 96.8, "grad_norm": 0.43768781423568726, "kl": 0.5539119839668274, "learning_rate": 4.844217342305363e-06, "loss": 0.0222, "reward": -2.151515007019043, "reward_std": 0.9406149983406067, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4734848737716675, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 484 }, { "completion_length": 500.0, "epoch": 97.0, "grad_norm": 1.7781202793121338, "kl": 2.4269611835479736, "learning_rate": 4.842951385850957e-06, "loss": 0.0971, "reward": 1.8921316862106323, "reward_std": 0.6051745414733887, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": 0.017131807282567024, "rewards/wrapped_format_reward": 0.375, "step": 485 }, { "completion_length": 500.0, "epoch": 97.2, "grad_norm": 1.187008023262024, "kl": 1.609662652015686, "learning_rate": 4.841680473120994e-06, "loss": 0.0644, "reward": 1.960210919380188, "reward_std": 0.6634494066238403, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": 0.05396085977554321, "rewards/wrapped_format_reward": 0.375, "step": 486 }, { "completion_length": 500.0, "epoch": 97.4, "grad_norm": 0.5124778151512146, "kl": 0.3628022372722626, "learning_rate": 4.840404606803963e-06, "loss": 0.0145, "reward": 0.39914047718048096, "reward_std": 2.975532054901123, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4023042917251587, "rewards/wrapped_driving_reward": -1.2531636953353882, "rewards/wrapped_format_reward": 0.5, "step": 487 }, { "completion_length": 500.0, "epoch": 97.6, "grad_norm": 0.6879832744598389, "kl": 0.7469982504844666, "learning_rate": 4.839123789598829e-06, "loss": 0.0299, "reward": 0.6401175260543823, "reward_std": 0.6992170214653015, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6193910241127014, "rewards/wrapped_driving_reward": -1.2292735576629639, "rewards/wrapped_format_reward": 0.25, "step": 488 }, { "completion_length": 399.0, "epoch": 97.8, "grad_norm": 0.6237967610359192, "kl": 1.441658616065979, "learning_rate": 4.83783802421503e-06, "loss": 0.0577, "reward": 2.1392810344696045, "reward_std": 0.6474350690841675, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -0.0857190191745758, "rewards/wrapped_format_reward": 0.625, "step": 489 }, { "completion_length": 500.0, "epoch": 98.0, "grad_norm": 0.4759804904460907, "kl": 0.377165824174881, "learning_rate": 4.836547313372472e-06, "loss": 0.0151, "reward": -1.1276339292526245, "reward_std": 3.3167312145233154, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.19696970283985138, "rewards/wrapped_driving_reward": -1.9496036767959595, "rewards/wrapped_format_reward": 0.125, "step": 490 }, { "completion_length": 500.0, "epoch": 98.2, "grad_norm": 0.5299642086029053, "kl": 1.3804445266723633, "learning_rate": 4.835251659801522e-06, "loss": 0.0552, "reward": -1.6679158210754395, "reward_std": 0.3146505653858185, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7202380895614624, "rewards/wrapped_driving_reward": -3.8881540298461914, "rewards/wrapped_format_reward": 0.5, "step": 491 }, { "completion_length": 500.0, "epoch": 98.4, "grad_norm": 0.8449365496635437, "kl": 0.88936448097229, "learning_rate": 4.833951066243004e-06, "loss": 0.0356, "reward": -0.9219812154769897, "reward_std": 3.307687282562256, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3541666567325592, "rewards/wrapped_driving_reward": -2.0261478424072266, "rewards/wrapped_format_reward": 0.25, "step": 492 }, { "completion_length": 500.0, "epoch": 98.6, "grad_norm": 0.49737241864204407, "kl": 0.24093657732009888, "learning_rate": 4.832645535448193e-06, "loss": 0.0096, "reward": -0.3710002899169922, "reward_std": 3.9087464809417725, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4464285671710968, "rewards/wrapped_driving_reward": -1.692428708076477, "rewards/wrapped_format_reward": 0.375, "step": 493 }, { "completion_length": 500.0, "epoch": 98.8, "grad_norm": 0.548768937587738, "kl": 0.6372228264808655, "learning_rate": 4.8313350701788054e-06, "loss": 0.0255, "reward": 0.4414939880371094, "reward_std": 2.9852190017700195, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4437499940395355, "rewards/wrapped_driving_reward": -1.0022560358047485, "rewards/wrapped_format_reward": 0.25, "step": 494 }, { "completion_length": 500.0, "epoch": 99.0, "grad_norm": 0.5245958566665649, "kl": 0.9060522317886353, "learning_rate": 4.830019673206997e-06, "loss": 0.0362, "reward": 1.7761955261230469, "reward_std": 0.7908852696418762, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6812500357627869, "rewards/wrapped_driving_reward": -0.1550544649362564, "rewards/wrapped_format_reward": 0.25, "step": 495 }, { "completion_length": 500.0, "epoch": 99.2, "grad_norm": 0.5090546607971191, "kl": 0.9043675661087036, "learning_rate": 4.828699347315357e-06, "loss": 0.0362, "reward": 0.39988845586776733, "reward_std": 2.9460675716400146, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.369047611951828, "rewards/wrapped_driving_reward": -0.8441591262817383, "rewards/wrapped_format_reward": 0.125, "step": 496 }, { "completion_length": 500.0, "epoch": 99.4, "grad_norm": 0.7547634840011597, "kl": 0.936507523059845, "learning_rate": 4.8273740952969e-06, "loss": 0.0375, "reward": -0.501820981502533, "reward_std": 3.7566311359405518, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.328125, "rewards/wrapped_driving_reward": -1.5799460411071777, "rewards/wrapped_format_reward": 0.25, "step": 497 }, { "completion_length": 500.0, "epoch": 99.6, "grad_norm": 0.6466436982154846, "kl": 1.7943109273910522, "learning_rate": 4.826043919955062e-06, "loss": 0.0718, "reward": 2.4197564125061035, "reward_std": 0.2094937115907669, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.800000011920929, "rewards/wrapped_driving_reward": -0.0052434951066970825, "rewards/wrapped_format_reward": 0.625, "step": 498 }, { "completion_length": 500.0, "epoch": 99.8, "grad_norm": 0.42996838688850403, "kl": 0.36297762393951416, "learning_rate": 4.824708824103694e-06, "loss": 0.0145, "reward": -0.16156989336013794, "reward_std": 2.625190019607544, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.586569905281067, "rewards/wrapped_format_reward": 0.25, "step": 499 }, { "completion_length": 500.0, "epoch": 100.0, "grad_norm": 0.8012004494667053, "kl": 1.8889646530151367, "learning_rate": 4.823368810567056e-06, "loss": 0.0756, "reward": 0.8908319473266602, "reward_std": 2.290397882461548, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": -0.9159862399101257, "rewards/wrapped_format_reward": 0.125, "step": 500 }, { "completion_length": 500.0, "epoch": 100.2, "grad_norm": 0.5142181515693665, "kl": 1.2798620462417603, "learning_rate": 4.822023882179811e-06, "loss": 0.0512, "reward": -0.5808591842651367, "reward_std": 3.67710280418396, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.680859088897705, "rewards/wrapped_format_reward": 0.375, "step": 501 }, { "completion_length": 500.0, "epoch": 100.4, "grad_norm": 0.44062864780426025, "kl": 0.3407534658908844, "learning_rate": 4.820674041787017e-06, "loss": 0.0136, "reward": 0.8439583778381348, "reward_std": 2.5781030654907227, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5401785373687744, "rewards/wrapped_driving_reward": -1.0712201595306396, "rewards/wrapped_format_reward": 0.625, "step": 502 }, { "completion_length": 500.0, "epoch": 100.6, "grad_norm": 0.467164009809494, "kl": 0.2597810626029968, "learning_rate": 4.819319292244125e-06, "loss": 0.0104, "reward": -0.12285977602005005, "reward_std": 2.596940755844116, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4427083134651184, "rewards/wrapped_driving_reward": -1.3155680894851685, "rewards/wrapped_format_reward": 0.0, "step": 503 }, { "completion_length": 500.0, "epoch": 100.8, "grad_norm": 1.7052229642868042, "kl": 1.9896000623703003, "learning_rate": 4.817959636416969e-06, "loss": 0.0796, "reward": -1.3930771350860596, "reward_std": 3.0145397186279297, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -1.993077039718628, "rewards/wrapped_format_reward": 0.0, "step": 504 }, { "completion_length": 500.0, "epoch": 101.0, "grad_norm": 0.6971648335456848, "kl": 1.9582891464233398, "learning_rate": 4.8165950771817635e-06, "loss": 0.0783, "reward": 1.634942889213562, "reward_std": 0.7667340636253357, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6328125, "rewards/wrapped_driving_reward": -0.122869573533535, "rewards/wrapped_format_reward": 0.125, "step": 505 }, { "completion_length": 500.0, "epoch": 101.2, "grad_norm": 0.5006844401359558, "kl": 0.9780154824256897, "learning_rate": 4.815225617425095e-06, "loss": 0.0391, "reward": 2.6801013946533203, "reward_std": 0.4480738043785095, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5218750238418579, "rewards/wrapped_driving_reward": 0.7832264304161072, "rewards/wrapped_format_reward": 0.375, "step": 506 }, { "completion_length": 500.0, "epoch": 101.4, "grad_norm": 0.6148558855056763, "kl": 2.1717538833618164, "learning_rate": 4.8138512600439165e-06, "loss": 0.0869, "reward": 2.133512496948242, "reward_std": 0.4723384976387024, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5873737335205078, "rewards/wrapped_driving_reward": 0.17113864421844482, "rewards/wrapped_format_reward": 0.375, "step": 507 }, { "completion_length": 500.0, "epoch": 101.6, "grad_norm": 0.4935348629951477, "kl": 0.14443062245845795, "learning_rate": 4.81247200794554e-06, "loss": 0.0058, "reward": -0.09753605723381042, "reward_std": 2.5551700592041016, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5723214149475098, "rewards/wrapped_driving_reward": -2.0448575019836426, "rewards/wrapped_format_reward": 0.375, "step": 508 }, { "completion_length": 500.0, "epoch": 101.8, "grad_norm": 0.942416787147522, "kl": 1.2242225408554077, "learning_rate": 4.811087864047636e-06, "loss": 0.049, "reward": 1.8122155666351318, "reward_std": 0.7886653542518616, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5479166507720947, "rewards/wrapped_driving_reward": -0.23570111393928528, "rewards/wrapped_format_reward": 0.5, "step": 509 }, { "completion_length": 500.0, "epoch": 102.0, "grad_norm": 0.7499745488166809, "kl": 1.1658865213394165, "learning_rate": 4.809698831278217e-06, "loss": 0.0466, "reward": -0.7502414584159851, "reward_std": 3.752504825592041, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.00024151802063, "rewards/wrapped_format_reward": 0.5, "step": 510 }, { "completion_length": 500.0, "epoch": 102.2, "grad_norm": 0.43321874737739563, "kl": 0.9147093892097473, "learning_rate": 4.808304912575643e-06, "loss": 0.0366, "reward": -0.9792734980583191, "reward_std": 3.5107717514038086, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9792734384536743, "rewards/wrapped_format_reward": 0.25, "step": 511 }, { "completion_length": 500.0, "epoch": 102.4, "grad_norm": 2.5993704795837402, "kl": 1.413003921508789, "learning_rate": 4.806906110888606e-06, "loss": 0.0565, "reward": 2.7378745079040527, "reward_std": 0.5923124551773071, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5755681991577148, "rewards/wrapped_driving_reward": 0.2873062193393707, "rewards/wrapped_format_reward": 0.875, "step": 512 }, { "completion_length": 340.0, "epoch": 102.6, "grad_norm": 0.5577855110168457, "kl": 1.2526390552520752, "learning_rate": 4.80550242917613e-06, "loss": 0.0501, "reward": -0.5180244445800781, "reward_std": 2.8751118183135986, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3385416865348816, "rewards/wrapped_driving_reward": -1.9815661907196045, "rewards/wrapped_format_reward": 0.625, "step": 513 }, { "completion_length": 500.0, "epoch": 102.8, "grad_norm": 0.4672858417034149, "kl": 0.7466950416564941, "learning_rate": 4.80409387040756e-06, "loss": 0.0299, "reward": 2.8180227279663086, "reward_std": 0.24733002483844757, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.8180227875709534, "rewards/wrapped_format_reward": 0.25, "step": 514 }, { "completion_length": 500.0, "epoch": 103.0, "grad_norm": 0.6199758052825928, "kl": 1.1773701906204224, "learning_rate": 4.802680437562559e-06, "loss": 0.0471, "reward": 0.9091346263885498, "reward_std": 3.331557035446167, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5583333373069763, "rewards/wrapped_driving_reward": -0.8991987109184265, "rewards/wrapped_format_reward": 0.5, "step": 515 }, { "completion_length": 500.0, "epoch": 103.2, "grad_norm": 0.5635596513748169, "kl": 0.7991167306900024, "learning_rate": 4.801262133631101e-06, "loss": 0.032, "reward": 0.9160671234130859, "reward_std": 3.2819769382476807, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.671875, "rewards/wrapped_driving_reward": -1.1308077573776245, "rewards/wrapped_format_reward": 0.625, "step": 516 }, { "completion_length": 500.0, "epoch": 103.4, "grad_norm": 0.7657514214515686, "kl": 2.3280792236328125, "learning_rate": 4.799838961613464e-06, "loss": 0.0931, "reward": 1.3210253715515137, "reward_std": 3.5857467651367188, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4124999940395355, "rewards/wrapped_driving_reward": -0.3414744734764099, "rewards/wrapped_format_reward": 0.5, "step": 517 }, { "completion_length": 500.0, "epoch": 103.6, "grad_norm": 0.6965065002441406, "kl": 1.41036057472229, "learning_rate": 4.798410924520223e-06, "loss": 0.0564, "reward": -0.9389865398406982, "reward_std": 3.56209135055542, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -1.888986587524414, "rewards/wrapped_format_reward": 0.25, "step": 518 }, { "completion_length": 500.0, "epoch": 103.8, "grad_norm": 0.4593130052089691, "kl": 0.536795973777771, "learning_rate": 4.796978025372247e-06, "loss": 0.0215, "reward": 1.5728414058685303, "reward_std": 0.5975411534309387, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5977272987365723, "rewards/wrapped_driving_reward": -0.774885892868042, "rewards/wrapped_format_reward": 0.75, "step": 519 }, { "completion_length": 500.0, "epoch": 104.0, "grad_norm": 0.5618796348571777, "kl": 1.3304004669189453, "learning_rate": 4.7955402672006855e-06, "loss": 0.0532, "reward": -0.9953773617744446, "reward_std": 3.4694395065307617, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -1.9953771829605103, "rewards/wrapped_format_reward": 0.375, "step": 520 }, { "completion_length": 500.0, "epoch": 104.2, "grad_norm": 0.46769797801971436, "kl": 1.3101164102554321, "learning_rate": 4.7940976530469725e-06, "loss": 0.0524, "reward": 2.2564780712127686, "reward_std": 0.6004188060760498, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.13147804141044617, "rewards/wrapped_format_reward": 0.375, "step": 521 }, { "completion_length": 500.0, "epoch": 104.4, "grad_norm": 0.5183075666427612, "kl": 0.4070712924003601, "learning_rate": 4.79265018596281e-06, "loss": 0.0163, "reward": 2.875926971435547, "reward_std": 0.08624047785997391, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8157467842102051, "rewards/wrapped_driving_reward": 0.43518009781837463, "rewards/wrapped_format_reward": 0.625, "step": 522 }, { "completion_length": 500.0, "epoch": 104.6, "grad_norm": 0.4478956460952759, "kl": 0.029893802478909492, "learning_rate": 4.791197869010169e-06, "loss": 0.0012, "reward": -0.8608925342559814, "reward_std": 3.3533756732940674, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.35641026496887207, "rewards/wrapped_driving_reward": -2.0923030376434326, "rewards/wrapped_format_reward": 0.375, "step": 523 }, { "completion_length": 500.0, "epoch": 104.8, "grad_norm": 0.6298028230667114, "kl": 1.1029764413833618, "learning_rate": 4.789740705261278e-06, "loss": 0.0441, "reward": -0.6450392007827759, "reward_std": 3.6097323894500732, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.0617058277130127, "rewards/wrapped_format_reward": 0.5, "step": 524 }, { "completion_length": 310.0, "epoch": 105.0, "grad_norm": 0.6106534600257874, "kl": 0.8999290466308594, "learning_rate": 4.788278697798619e-06, "loss": 0.036, "reward": 3.3090169429779053, "reward_std": 0.3094661831855774, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4958333373069763, "rewards/wrapped_driving_reward": 0.8131834864616394, "rewards/wrapped_format_reward": 1.0, "step": 525 }, { "completion_length": 500.0, "epoch": 105.2, "grad_norm": 0.3837595582008362, "kl": 0.680999219417572, "learning_rate": 4.786811849714918e-06, "loss": 0.0272, "reward": 2.642338752746582, "reward_std": 0.1630004346370697, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.14233872294425964, "rewards/wrapped_format_reward": 0.875, "step": 526 }, { "completion_length": 500.0, "epoch": 105.4, "grad_norm": 0.5555136799812317, "kl": 1.9456011056900024, "learning_rate": 4.785340164113146e-06, "loss": 0.0778, "reward": 1.8795998096466064, "reward_std": 0.7651036381721497, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.683506965637207, "rewards/wrapped_driving_reward": -0.1789071261882782, "rewards/wrapped_format_reward": 0.375, "step": 527 }, { "completion_length": 500.0, "epoch": 105.6, "grad_norm": 0.5619902014732361, "kl": 0.46370929479599, "learning_rate": 4.783863644106502e-06, "loss": 0.0185, "reward": 1.9660954475402832, "reward_std": 0.7242228388786316, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6615190505981445, "rewards/wrapped_driving_reward": 0.05457644537091255, "rewards/wrapped_format_reward": 0.25, "step": 528 }, { "completion_length": 395.0, "epoch": 105.8, "grad_norm": 0.6083903908729553, "kl": 1.742167353630066, "learning_rate": 4.782382292818417e-06, "loss": 0.0697, "reward": 2.2587099075317383, "reward_std": 0.5401583313941956, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6885417103767395, "rewards/wrapped_driving_reward": -0.05483181029558182, "rewards/wrapped_format_reward": 0.625, "step": 529 }, { "completion_length": 500.0, "epoch": 106.0, "grad_norm": 0.5853642225265503, "kl": 1.1986284255981445, "learning_rate": 4.780896113382536e-06, "loss": 0.0479, "reward": 2.7307024002075195, "reward_std": 0.6891061067581177, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6480768918991089, "rewards/wrapped_driving_reward": 0.5826254487037659, "rewards/wrapped_format_reward": 0.5, "step": 530 }, { "completion_length": 500.0, "epoch": 106.2, "grad_norm": 0.5651938319206238, "kl": 1.5314892530441284, "learning_rate": 4.779405108942722e-06, "loss": 0.0613, "reward": 1.9537711143493652, "reward_std": 0.5861890316009521, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8104166984558105, "rewards/wrapped_driving_reward": -0.6066454648971558, "rewards/wrapped_format_reward": 0.75, "step": 531 }, { "completion_length": 500.0, "epoch": 106.4, "grad_norm": 0.3306523263454437, "kl": 0.4132198691368103, "learning_rate": 4.777909282653043e-06, "loss": 0.0165, "reward": 0.8026348352432251, "reward_std": 2.129302501678467, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5701389312744141, "rewards/wrapped_driving_reward": -1.017504096031189, "rewards/wrapped_format_reward": 0.25, "step": 532 }, { "completion_length": 500.0, "epoch": 106.6, "grad_norm": 0.4126357138156891, "kl": 1.6830040216445923, "learning_rate": 4.776408637677768e-06, "loss": 0.0673, "reward": 1.9422099590301514, "reward_std": 0.5061317086219788, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6166666746139526, "rewards/wrapped_driving_reward": 0.07554324716329575, "rewards/wrapped_format_reward": 0.25, "step": 533 }, { "completion_length": 500.0, "epoch": 106.8, "grad_norm": 0.4970954656600952, "kl": 0.8660544753074646, "learning_rate": 4.774903177191358e-06, "loss": 0.0346, "reward": -0.14617103338241577, "reward_std": 1.8106459379196167, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7298611402511597, "rewards/wrapped_driving_reward": -2.3760321140289307, "rewards/wrapped_format_reward": 0.5, "step": 534 }, { "completion_length": 421.0, "epoch": 107.0, "grad_norm": 0.5018829703330994, "kl": 1.2226200103759766, "learning_rate": 4.773392904378463e-06, "loss": 0.0489, "reward": 2.568552255630493, "reward_std": 0.8392052054405212, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": 0.34355229139328003, "rewards/wrapped_format_reward": 0.625, "step": 535 }, { "completion_length": 489.0, "epoch": 107.2, "grad_norm": 0.5290039777755737, "kl": 1.2756959199905396, "learning_rate": 4.7718778224339115e-06, "loss": 0.051, "reward": 2.1997711658477783, "reward_std": 0.5815731883049011, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5788690447807312, "rewards/wrapped_driving_reward": 0.2459021806716919, "rewards/wrapped_format_reward": 0.375, "step": 536 }, { "completion_length": 440.0, "epoch": 107.4, "grad_norm": 0.4976600408554077, "kl": 1.8072471618652344, "learning_rate": 4.770357934562704e-06, "loss": 0.0723, "reward": 0.5550402402877808, "reward_std": 3.05448579788208, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6193181872367859, "rewards/wrapped_driving_reward": -1.3142778873443604, "rewards/wrapped_format_reward": 0.5, "step": 537 }, { "completion_length": 500.0, "epoch": 107.6, "grad_norm": 0.4468809962272644, "kl": 1.2450345754623413, "learning_rate": 4.768833243980009e-06, "loss": 0.0498, "reward": 2.366654396057129, "reward_std": 0.7478683590888977, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": 0.21040436625480652, "rewards/wrapped_format_reward": 0.5, "step": 538 }, { "completion_length": 500.0, "epoch": 107.8, "grad_norm": 0.5830360054969788, "kl": 1.625077486038208, "learning_rate": 4.767303753911156e-06, "loss": 0.065, "reward": 1.1235265731811523, "reward_std": 3.1114587783813477, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2736111283302307, "rewards/wrapped_driving_reward": -0.5250846147537231, "rewards/wrapped_format_reward": 0.625, "step": 539 }, { "completion_length": 500.0, "epoch": 108.0, "grad_norm": 0.5232158303260803, "kl": 1.1184170246124268, "learning_rate": 4.765769467591626e-06, "loss": 0.0447, "reward": 2.236095905303955, "reward_std": 0.9097519516944885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.762499988079071, "rewards/wrapped_driving_reward": -0.2764042019844055, "rewards/wrapped_format_reward": 0.75, "step": 540 }, { "completion_length": 500.0, "epoch": 108.2, "grad_norm": 0.5139183402061462, "kl": 1.1571197509765625, "learning_rate": 4.764230388267043e-06, "loss": 0.0463, "reward": 0.7769237756729126, "reward_std": 2.8545281887054443, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4444444477558136, "rewards/wrapped_driving_reward": -1.0425206422805786, "rewards/wrapped_format_reward": 0.625, "step": 541 }, { "completion_length": 500.0, "epoch": 108.4, "grad_norm": 0.6705910563468933, "kl": 1.2363864183425903, "learning_rate": 4.762686519193175e-06, "loss": 0.0495, "reward": -1.4404573440551758, "reward_std": 2.725029468536377, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -2.502957344055176, "rewards/wrapped_format_reward": 0.375, "step": 542 }, { "completion_length": 264.0, "epoch": 108.6, "grad_norm": 0.6742754578590393, "kl": 0.6286165118217468, "learning_rate": 4.761137863635921e-06, "loss": 0.0251, "reward": 2.884040355682373, "reward_std": 0.3912404477596283, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6547619104385376, "rewards/wrapped_driving_reward": 0.22927850484848022, "rewards/wrapped_format_reward": 1.0, "step": 543 }, { "completion_length": 500.0, "epoch": 108.8, "grad_norm": 0.5893082618713379, "kl": 0.5943479537963867, "learning_rate": 4.759584424871302e-06, "loss": 0.0238, "reward": 0.6722060441970825, "reward_std": 3.2552413940429688, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4040403962135315, "rewards/wrapped_driving_reward": -0.8568344116210938, "rewards/wrapped_format_reward": 0.375, "step": 544 }, { "completion_length": 500.0, "epoch": 109.0, "grad_norm": 0.3778823912143707, "kl": 2.369497299194336, "learning_rate": 4.758026206185461e-06, "loss": 0.0948, "reward": 2.6130523681640625, "reward_std": 0.8472208976745605, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6631944179534912, "rewards/wrapped_driving_reward": 0.4498576819896698, "rewards/wrapped_format_reward": 0.5, "step": 545 }, { "completion_length": 500.0, "epoch": 109.2, "grad_norm": 0.5389312505722046, "kl": 1.9787030220031738, "learning_rate": 4.7564632108746524e-06, "loss": 0.0791, "reward": 2.212176561355591, "reward_std": 0.6097406148910522, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6505681872367859, "rewards/wrapped_driving_reward": 0.0616084523499012, "rewards/wrapped_format_reward": 0.5, "step": 546 }, { "completion_length": 500.0, "epoch": 109.4, "grad_norm": 0.5773468613624573, "kl": 2.1879804134368896, "learning_rate": 4.754895442245232e-06, "loss": 0.0875, "reward": 1.9608826637268066, "reward_std": 0.7703467011451721, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.39375001192092896, "rewards/wrapped_driving_reward": 0.06713273376226425, "rewards/wrapped_format_reward": 0.5, "step": 547 }, { "completion_length": 500.0, "epoch": 109.6, "grad_norm": 0.47151243686676025, "kl": 1.2900985479354858, "learning_rate": 4.7533229036136555e-06, "loss": 0.0516, "reward": -0.8987966775894165, "reward_std": 3.319197177886963, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.211296796798706, "rewards/wrapped_format_reward": 0.375, "step": 548 }, { "completion_length": 500.0, "epoch": 109.8, "grad_norm": 0.7865459322929382, "kl": 0.9134626984596252, "learning_rate": 4.7517455983064694e-06, "loss": 0.0365, "reward": 2.7501766681671143, "reward_std": 0.48907047510147095, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": 0.7751765847206116, "rewards/wrapped_format_reward": 0.375, "step": 549 }, { "completion_length": 500.0, "epoch": 110.0, "grad_norm": 0.705788254737854, "kl": 0.9574757814407349, "learning_rate": 4.750163529660303e-06, "loss": 0.0383, "reward": -2.3608083724975586, "reward_std": 2.3855879306793213, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -3.3608083724975586, "rewards/wrapped_format_reward": 0.125, "step": 550 }, { "completion_length": 500.0, "epoch": 110.2, "grad_norm": 0.554609477519989, "kl": 0.7532652616500854, "learning_rate": 4.748576701021861e-06, "loss": 0.0301, "reward": -0.3499680161476135, "reward_std": 3.936481475830078, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2874999940395355, "rewards/wrapped_driving_reward": -1.637467861175537, "rewards/wrapped_format_reward": 0.5, "step": 551 }, { "completion_length": 318.0, "epoch": 110.4, "grad_norm": 0.6001780033111572, "kl": 1.6765778064727783, "learning_rate": 4.746985115747918e-06, "loss": 0.0671, "reward": 0.056570351123809814, "reward_std": 2.740037679672241, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5229166746139526, "rewards/wrapped_driving_reward": -1.7163463830947876, "rewards/wrapped_format_reward": 0.5, "step": 552 }, { "completion_length": 430.0, "epoch": 110.6, "grad_norm": 0.3936501443386078, "kl": 1.9558618068695068, "learning_rate": 4.745388777205311e-06, "loss": 0.0782, "reward": 2.665126323699951, "reward_std": 0.14663144946098328, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6660714149475098, "rewards/wrapped_driving_reward": 0.124054916203022, "rewards/wrapped_format_reward": 0.875, "step": 553 }, { "completion_length": 500.0, "epoch": 110.8, "grad_norm": 2.804915428161621, "kl": 1.7594517469406128, "learning_rate": 4.7437876887709326e-06, "loss": 0.0704, "reward": 1.1689927577972412, "reward_std": 1.3152838945388794, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7208333611488342, "rewards/wrapped_driving_reward": -1.1768405437469482, "rewards/wrapped_format_reward": 0.625, "step": 554 }, { "completion_length": 284.0, "epoch": 111.0, "grad_norm": 0.6140881180763245, "kl": 0.8588994145393372, "learning_rate": 4.742181853831721e-06, "loss": 0.0344, "reward": 2.3088929653167725, "reward_std": 0.5343014001846313, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5678030252456665, "rewards/wrapped_driving_reward": 0.3660898804664612, "rewards/wrapped_format_reward": 0.375, "step": 555 }, { "completion_length": 500.0, "epoch": 111.2, "grad_norm": 0.4993399381637573, "kl": 0.5447943806648254, "learning_rate": 4.740571275784659e-06, "loss": 0.0218, "reward": 2.0253894329071045, "reward_std": 0.21934282779693604, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.37637364864349365, "rewards/wrapped_driving_reward": 0.1490158885717392, "rewards/wrapped_format_reward": 0.5, "step": 556 }, { "completion_length": 500.0, "epoch": 111.4, "grad_norm": 1.7358314990997314, "kl": 1.2244584560394287, "learning_rate": 4.738955958036759e-06, "loss": 0.049, "reward": 0.34858763217926025, "reward_std": 2.6462531089782715, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5008741617202759, "rewards/wrapped_driving_reward": -1.402286410331726, "rewards/wrapped_format_reward": 0.5, "step": 557 }, { "completion_length": 461.0, "epoch": 111.6, "grad_norm": 0.5092949271202087, "kl": 1.657390832901001, "learning_rate": 4.737335904005063e-06, "loss": 0.0663, "reward": 0.6630178689956665, "reward_std": 3.1516642570495605, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5562770366668701, "rewards/wrapped_driving_reward": -1.0182591676712036, "rewards/wrapped_format_reward": 0.375, "step": 558 }, { "completion_length": 500.0, "epoch": 111.8, "grad_norm": 0.6119469404220581, "kl": 0.85076904296875, "learning_rate": 4.7357111171166295e-06, "loss": 0.034, "reward": 2.1360678672790527, "reward_std": 0.911134660243988, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": -0.05143224447965622, "rewards/wrapped_format_reward": 0.375, "step": 559 }, { "completion_length": 500.0, "epoch": 112.0, "grad_norm": 0.6579754948616028, "kl": 2.398366689682007, "learning_rate": 4.734081600808531e-06, "loss": 0.0959, "reward": 0.9975037574768066, "reward_std": 3.109951972961426, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.8774962425231934, "rewards/wrapped_format_reward": 0.625, "step": 560 }, { "completion_length": 500.0, "epoch": 112.2, "grad_norm": 0.42901667952537537, "kl": 1.1184028387069702, "learning_rate": 4.732447358527843e-06, "loss": 0.0447, "reward": 1.180442452430725, "reward_std": 1.7219181060791016, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6164772510528564, "rewards/wrapped_driving_reward": -1.1860346794128418, "rewards/wrapped_format_reward": 0.75, "step": 561 }, { "completion_length": 500.0, "epoch": 112.4, "grad_norm": 0.6626478433609009, "kl": 1.9512630701065063, "learning_rate": 4.730808393731639e-06, "loss": 0.0781, "reward": -0.29985374212265015, "reward_std": 3.131761074066162, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.174853563308716, "rewards/wrapped_format_reward": 0.875, "step": 562 }, { "completion_length": 500.0, "epoch": 112.6, "grad_norm": 0.5774132013320923, "kl": 1.328711986541748, "learning_rate": 4.7291647098869834e-06, "loss": 0.0531, "reward": 0.6384716033935547, "reward_std": 3.103172540664673, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6388888955116272, "rewards/wrapped_driving_reward": -1.0004172325134277, "rewards/wrapped_format_reward": 0.25, "step": 563 }, { "completion_length": 500.0, "epoch": 112.8, "grad_norm": 0.5996187329292297, "kl": 1.8008451461791992, "learning_rate": 4.72751631047092e-06, "loss": 0.072, "reward": 0.4193859100341797, "reward_std": 3.003329277038574, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3821428716182709, "rewards/wrapped_driving_reward": -1.337756872177124, "rewards/wrapped_format_reward": 0.625, "step": 564 }, { "completion_length": 500.0, "epoch": 113.0, "grad_norm": 1.3308876752853394, "kl": 1.5455653667449951, "learning_rate": 4.725863198970473e-06, "loss": 0.0618, "reward": -0.39976298809051514, "reward_std": 3.6030492782592773, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.7747629880905151, "rewards/wrapped_format_reward": 0.625, "step": 565 }, { "completion_length": 500.0, "epoch": 113.2, "grad_norm": 1.8822331428527832, "kl": 2.873924970626831, "learning_rate": 4.72420537888263e-06, "loss": 0.115, "reward": 2.0858964920043945, "reward_std": 0.7741968035697937, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": 0.2358965277671814, "rewards/wrapped_format_reward": 0.375, "step": 566 }, { "completion_length": 500.0, "epoch": 113.4, "grad_norm": 0.43895184993743896, "kl": 0.9438685774803162, "learning_rate": 4.7225428537143414e-06, "loss": 0.0378, "reward": 0.8183106184005737, "reward_std": 3.246364116668701, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.43958330154418945, "rewards/wrapped_driving_reward": -0.7462728023529053, "rewards/wrapped_format_reward": 0.375, "step": 567 }, { "completion_length": 500.0, "epoch": 113.6, "grad_norm": 0.6256127953529358, "kl": 0.7444883584976196, "learning_rate": 4.720875626982511e-06, "loss": 0.0298, "reward": -0.48403680324554443, "reward_std": 3.7771100997924805, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.0408549308776855, "rewards/wrapped_format_reward": 0.625, "step": 568 }, { "completion_length": 500.0, "epoch": 113.8, "grad_norm": 0.6889533996582031, "kl": 1.5083221197128296, "learning_rate": 4.719203702213986e-06, "loss": 0.0603, "reward": -1.019493579864502, "reward_std": 2.4130260944366455, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.448484867811203, "rewards/wrapped_driving_reward": -2.7179782390594482, "rewards/wrapped_format_reward": 0.5, "step": 569 }, { "completion_length": 418.0, "epoch": 114.0, "grad_norm": 0.5046432614326477, "kl": 1.4851415157318115, "learning_rate": 4.717527082945555e-06, "loss": 0.0594, "reward": 3.070833683013916, "reward_std": 0.8115704655647278, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.5958337783813477, "rewards/wrapped_format_reward": 0.75, "step": 570 }, { "completion_length": 500.0, "epoch": 114.2, "grad_norm": 0.701687216758728, "kl": 0.6930884122848511, "learning_rate": 4.715845772723934e-06, "loss": 0.0277, "reward": -0.01327204704284668, "reward_std": 2.6603503227233887, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4756944477558136, "rewards/wrapped_driving_reward": -1.363966464996338, "rewards/wrapped_format_reward": 0.125, "step": 571 }, { "completion_length": 355.0, "epoch": 114.4, "grad_norm": 0.6142638325691223, "kl": 1.454174280166626, "learning_rate": 4.714159775105766e-06, "loss": 0.0582, "reward": 2.151031494140625, "reward_std": 0.5596021413803101, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.550000011920929, "rewards/wrapped_driving_reward": 0.10103163123130798, "rewards/wrapped_format_reward": 0.5, "step": 572 }, { "completion_length": 500.0, "epoch": 114.6, "grad_norm": 0.6146017909049988, "kl": 1.0724257230758667, "learning_rate": 4.712469093657605e-06, "loss": 0.0429, "reward": -2.56577205657959, "reward_std": 2.251687526702881, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.19077205657959, "rewards/wrapped_format_reward": 0.25, "step": 573 }, { "completion_length": 500.0, "epoch": 114.8, "grad_norm": 0.5557622909545898, "kl": 2.0290610790252686, "learning_rate": 4.710773731955918e-06, "loss": 0.0812, "reward": 3.123636245727539, "reward_std": 0.5092527270317078, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": 0.7736363410949707, "rewards/wrapped_format_reward": 0.875, "step": 574 }, { "completion_length": 500.0, "epoch": 115.0, "grad_norm": 0.4426519572734833, "kl": 0.9794008731842041, "learning_rate": 4.70907369358707e-06, "loss": 0.0392, "reward": 1.6929361820220947, "reward_std": 0.9372192025184631, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6746031641960144, "rewards/wrapped_driving_reward": -0.10666707158088684, "rewards/wrapped_format_reward": 0.125, "step": 575 }, { "completion_length": 359.0, "epoch": 115.2, "grad_norm": 0.722548246383667, "kl": 1.860018014907837, "learning_rate": 4.707368982147318e-06, "loss": 0.0744, "reward": 1.0876185894012451, "reward_std": 3.0673789978027344, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5381944179534912, "rewards/wrapped_driving_reward": -1.075575828552246, "rewards/wrapped_format_reward": 0.875, "step": 576 }, { "completion_length": 500.0, "epoch": 115.4, "grad_norm": 0.8116574883460999, "kl": 1.9904812574386597, "learning_rate": 4.705659601242807e-06, "loss": 0.0796, "reward": 1.0343968868255615, "reward_std": 3.3568084239959717, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.484279602766037, "rewards/wrapped_driving_reward": -0.9498826265335083, "rewards/wrapped_format_reward": 0.75, "step": 577 }, { "completion_length": 500.0, "epoch": 115.6, "grad_norm": 0.9164823889732361, "kl": 3.183170795440674, "learning_rate": 4.703945554489559e-06, "loss": 0.1273, "reward": 1.990584373474121, "reward_std": 1.2147678136825562, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.581250011920929, "rewards/wrapped_driving_reward": -0.09066560864448547, "rewards/wrapped_format_reward": 0.5, "step": 578 }, { "completion_length": 500.0, "epoch": 115.8, "grad_norm": 0.6700800061225891, "kl": 2.131450891494751, "learning_rate": 4.702226845513465e-06, "loss": 0.0853, "reward": 3.0608229637145996, "reward_std": 0.5285645723342896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5791666507720947, "rewards/wrapped_driving_reward": 0.7316560745239258, "rewards/wrapped_format_reward": 0.75, "step": 579 }, { "completion_length": 500.0, "epoch": 116.0, "grad_norm": 0.7118693590164185, "kl": 0.5586986541748047, "learning_rate": 4.700503477950278e-06, "loss": 0.0223, "reward": 0.774085283279419, "reward_std": 3.1828384399414062, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4437499940395355, "rewards/wrapped_driving_reward": -0.9196646809577942, "rewards/wrapped_format_reward": 0.5, "step": 580 }, { "completion_length": 500.0, "epoch": 116.2, "grad_norm": 4.251746654510498, "kl": 2.8645055294036865, "learning_rate": 4.698775455445609e-06, "loss": 0.1146, "reward": 0.2876031994819641, "reward_std": 2.8924572467803955, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": -1.6373969316482544, "rewards/wrapped_format_reward": 0.5, "step": 581 }, { "completion_length": 500.0, "epoch": 116.4, "grad_norm": 1.3924583196640015, "kl": 1.367591381072998, "learning_rate": 4.697042781654913e-06, "loss": 0.0547, "reward": 1.9106945991516113, "reward_std": 0.5295900106430054, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": 0.035694561898708344, "rewards/wrapped_format_reward": 0.5, "step": 582 }, { "completion_length": 500.0, "epoch": 116.6, "grad_norm": 0.787873387336731, "kl": 1.4614962339401245, "learning_rate": 4.695305460243487e-06, "loss": 0.0585, "reward": -0.9608310461044312, "reward_std": 3.5229275226593018, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0858311653137207, "rewards/wrapped_format_reward": 0.375, "step": 583 }, { "completion_length": 500.0, "epoch": 116.8, "grad_norm": 0.742299497127533, "kl": 1.2044023275375366, "learning_rate": 4.693563494886455e-06, "loss": 0.0482, "reward": 0.17274639010429382, "reward_std": 1.9086068868637085, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5738095641136169, "rewards/wrapped_driving_reward": -2.2760632038116455, "rewards/wrapped_format_reward": 0.875, "step": 584 }, { "completion_length": 500.0, "epoch": 117.0, "grad_norm": 0.6664013862609863, "kl": 1.9357571601867676, "learning_rate": 4.69181688926877e-06, "loss": 0.0774, "reward": 0.35599881410598755, "reward_std": 2.6406307220458984, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -1.0690011978149414, "rewards/wrapped_format_reward": 0.375, "step": 585 }, { "completion_length": 500.0, "epoch": 117.2, "grad_norm": 0.7028970122337341, "kl": 2.051926612854004, "learning_rate": 4.690065647085197e-06, "loss": 0.0821, "reward": 1.781968116760254, "reward_std": 0.9806572198867798, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4225108325481415, "rewards/wrapped_driving_reward": -0.14054261147975922, "rewards/wrapped_format_reward": 0.5, "step": 586 }, { "completion_length": 500.0, "epoch": 117.4, "grad_norm": 1.085777759552002, "kl": 2.20784854888916, "learning_rate": 4.688309772040312e-06, "loss": 0.0883, "reward": 1.4857362508773804, "reward_std": 2.3479039669036865, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5708333253860474, "rewards/wrapped_driving_reward": -0.7100971341133118, "rewards/wrapped_format_reward": 0.625, "step": 587 }, { "completion_length": 500.0, "epoch": 117.6, "grad_norm": 0.4722326099872589, "kl": 0.969474196434021, "learning_rate": 4.68654926784849e-06, "loss": 0.0388, "reward": 1.652014970779419, "reward_std": 0.8095440864562988, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7954545617103577, "rewards/wrapped_driving_reward": -0.39343956112861633, "rewards/wrapped_format_reward": 0.25, "step": 588 }, { "completion_length": 500.0, "epoch": 117.8, "grad_norm": 0.5880841016769409, "kl": 2.293309450149536, "learning_rate": 4.684784138233899e-06, "loss": 0.0917, "reward": 0.9533482193946838, "reward_std": 2.969332218170166, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3187499940395355, "rewards/wrapped_driving_reward": -0.9904017448425293, "rewards/wrapped_format_reward": 0.875, "step": 589 }, { "completion_length": 500.0, "epoch": 118.0, "grad_norm": 0.5650131702423096, "kl": 1.4274591207504272, "learning_rate": 4.6830143869304904e-06, "loss": 0.0571, "reward": 1.9011516571044922, "reward_std": 0.5124629735946655, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6944444179534912, "rewards/wrapped_driving_reward": -0.04329286515712738, "rewards/wrapped_format_reward": 0.25, "step": 590 }, { "completion_length": 500.0, "epoch": 118.2, "grad_norm": 0.5348109602928162, "kl": 1.2789981365203857, "learning_rate": 4.681240017681994e-06, "loss": 0.0512, "reward": 0.761728048324585, "reward_std": 3.2140259742736816, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.0882718563079834, "rewards/wrapped_format_reward": 0.5, "step": 591 }, { "completion_length": 500.0, "epoch": 118.4, "grad_norm": 0.861023485660553, "kl": 1.509196162223816, "learning_rate": 4.679461034241906e-06, "loss": 0.0604, "reward": -0.897118091583252, "reward_std": 3.1433229446411133, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -2.0971179008483887, "rewards/wrapped_format_reward": 0.5, "step": 592 }, { "completion_length": 500.0, "epoch": 118.6, "grad_norm": 0.39133670926094055, "kl": 2.0143675804138184, "learning_rate": 4.677677440373488e-06, "loss": 0.0806, "reward": 0.7585340738296509, "reward_std": 2.539468288421631, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.9914658665657043, "rewards/wrapped_format_reward": 0.5, "step": 593 }, { "completion_length": 500.0, "epoch": 118.8, "grad_norm": 0.6113512516021729, "kl": 1.3778290748596191, "learning_rate": 4.675889239849749e-06, "loss": 0.0551, "reward": 1.2398369312286377, "reward_std": 1.4317293167114258, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7599431872367859, "rewards/wrapped_driving_reward": -1.2701061964035034, "rewards/wrapped_format_reward": 0.75, "step": 594 }, { "completion_length": 483.0, "epoch": 119.0, "grad_norm": 0.5425323247909546, "kl": 2.229952573776245, "learning_rate": 4.674096436453448e-06, "loss": 0.0892, "reward": 2.9711570739746094, "reward_std": 0.45285564661026, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4270833432674408, "rewards/wrapped_driving_reward": 0.6690738201141357, "rewards/wrapped_format_reward": 0.875, "step": 595 }, { "completion_length": 500.0, "epoch": 119.2, "grad_norm": 0.5472980737686157, "kl": 1.5082556009292603, "learning_rate": 4.672299033977076e-06, "loss": 0.0603, "reward": 0.427512526512146, "reward_std": 2.7119789123535156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45113635063171387, "rewards/wrapped_driving_reward": -1.0236238241195679, "rewards/wrapped_format_reward": 0.25, "step": 596 }, { "completion_length": 500.0, "epoch": 119.4, "grad_norm": 0.7268889546394348, "kl": 2.009782075881958, "learning_rate": 4.670497036222856e-06, "loss": 0.0804, "reward": 0.5923322439193726, "reward_std": 3.18241024017334, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.543154776096344, "rewards/wrapped_driving_reward": -1.4508225917816162, "rewards/wrapped_format_reward": 0.75, "step": 597 }, { "completion_length": 424.0, "epoch": 119.6, "grad_norm": 2.143587589263916, "kl": 3.168560266494751, "learning_rate": 4.668690447002731e-06, "loss": 0.1267, "reward": -0.33397817611694336, "reward_std": 3.6877365112304688, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9589781761169434, "rewards/wrapped_format_reward": 0.75, "step": 598 }, { "completion_length": 207.0, "epoch": 119.8, "grad_norm": 0.8902432322502136, "kl": 1.1999379396438599, "learning_rate": 4.666879270138358e-06, "loss": 0.048, "reward": 3.102839946746826, "reward_std": 0.3624434769153595, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.44583332538604736, "rewards/wrapped_driving_reward": 0.7820066213607788, "rewards/wrapped_format_reward": 0.875, "step": 599 }, { "completion_length": 373.0, "epoch": 120.0, "grad_norm": 0.7244733572006226, "kl": 2.218181610107422, "learning_rate": 4.665063509461098e-06, "loss": 0.0887, "reward": 2.5216073989868164, "reward_std": 0.705155611038208, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5910714268684387, "rewards/wrapped_driving_reward": -0.06946408003568649, "rewards/wrapped_format_reward": 1.0, "step": 600 }, { "completion_length": 436.0, "epoch": 120.2, "grad_norm": 0.4794348478317261, "kl": 1.6529284715652466, "learning_rate": 4.663243168812005e-06, "loss": 0.0661, "reward": 2.27215838432312, "reward_std": 0.39335283637046814, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": 0.12215844541788101, "rewards/wrapped_format_reward": 0.5, "step": 601 }, { "completion_length": 500.0, "epoch": 120.4, "grad_norm": 0.5198622345924377, "kl": 1.4576425552368164, "learning_rate": 4.661418252041827e-06, "loss": 0.0583, "reward": 2.4116311073303223, "reward_std": 0.6456475853919983, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8416666984558105, "rewards/wrapped_driving_reward": -0.180035799741745, "rewards/wrapped_format_reward": 0.75, "step": 602 }, { "completion_length": 500.0, "epoch": 120.6, "grad_norm": 0.6898744106292725, "kl": 0.7933554649353027, "learning_rate": 4.65958876301099e-06, "loss": 0.0317, "reward": 1.9946479797363281, "reward_std": 0.9817599654197693, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": -0.2928519546985626, "rewards/wrapped_format_reward": 0.625, "step": 603 }, { "completion_length": 500.0, "epoch": 120.8, "grad_norm": 0.679854154586792, "kl": 1.468674659729004, "learning_rate": 4.657754705589591e-06, "loss": 0.0587, "reward": 1.2992442846298218, "reward_std": 2.8911292552948, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3142857253551483, "rewards/wrapped_driving_reward": -0.3900414705276489, "rewards/wrapped_format_reward": 0.625, "step": 604 }, { "completion_length": 500.0, "epoch": 121.0, "grad_norm": 0.7568489909172058, "kl": 0.9791430234909058, "learning_rate": 4.655916083657394e-06, "loss": 0.0392, "reward": 0.7246707677841187, "reward_std": 2.9081780910491943, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4833333492279053, "rewards/wrapped_driving_reward": -1.008662462234497, "rewards/wrapped_format_reward": 0.5, "step": 605 }, { "completion_length": 333.0, "epoch": 121.2, "grad_norm": 0.700584352016449, "kl": 1.8299589157104492, "learning_rate": 4.654072901103815e-06, "loss": 0.0732, "reward": 2.1314048767089844, "reward_std": 0.746786892414093, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.47083330154418945, "rewards/wrapped_driving_reward": -0.33942848443984985, "rewards/wrapped_format_reward": 1.0, "step": 606 }, { "completion_length": 437.0, "epoch": 121.4, "grad_norm": 0.6252568960189819, "kl": 2.579108238220215, "learning_rate": 4.65222516182792e-06, "loss": 0.1032, "reward": 1.616267442703247, "reward_std": 2.0787417888641357, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6714285612106323, "rewards/wrapped_driving_reward": -1.0551612377166748, "rewards/wrapped_format_reward": 1.0, "step": 607 }, { "completion_length": 457.0, "epoch": 121.6, "grad_norm": 0.46883735060691833, "kl": 1.9563435316085815, "learning_rate": 4.650372869738415e-06, "loss": 0.0783, "reward": 1.710051417350769, "reward_std": 3.1737630367279053, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -0.47744858264923096, "rewards/wrapped_format_reward": 1.0, "step": 608 }, { "completion_length": 381.0, "epoch": 121.8, "grad_norm": 0.9286189675331116, "kl": 2.453148365020752, "learning_rate": 4.648516028753632e-06, "loss": 0.0981, "reward": 0.9989381432533264, "reward_std": 2.679488182067871, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5277777910232544, "rewards/wrapped_driving_reward": -1.1538395881652832, "rewards/wrapped_format_reward": 0.875, "step": 609 }, { "completion_length": 500.0, "epoch": 122.0, "grad_norm": 0.5811746716499329, "kl": 1.0126054286956787, "learning_rate": 4.646654642801533e-06, "loss": 0.0405, "reward": 2.4342222213745117, "reward_std": 0.24681280553340912, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7465908527374268, "rewards/wrapped_driving_reward": -0.062368687242269516, "rewards/wrapped_format_reward": 0.75, "step": 610 }, { "completion_length": 500.0, "epoch": 122.2, "grad_norm": 0.8860847353935242, "kl": 2.0405843257904053, "learning_rate": 4.6447887158196905e-06, "loss": 0.0816, "reward": 2.160917282104492, "reward_std": 0.8034979701042175, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.296875, "rewards/wrapped_driving_reward": 0.11404214054346085, "rewards/wrapped_format_reward": 0.75, "step": 611 }, { "completion_length": 500.0, "epoch": 122.4, "grad_norm": 0.5286173224449158, "kl": 0.638897716999054, "learning_rate": 4.642918251755281e-06, "loss": 0.0256, "reward": 0.9057855010032654, "reward_std": 2.952286958694458, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3020833134651184, "rewards/wrapped_driving_reward": -0.8962979316711426, "rewards/wrapped_format_reward": 0.75, "step": 612 }, { "completion_length": 366.0, "epoch": 122.6, "grad_norm": 0.6095961928367615, "kl": 1.6495823860168457, "learning_rate": 4.641043254565083e-06, "loss": 0.066, "reward": 2.488039493560791, "reward_std": 0.6265414357185364, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.11303944885730743, "rewards/wrapped_format_reward": 0.625, "step": 613 }, { "completion_length": 500.0, "epoch": 122.8, "grad_norm": 3.3152482509613037, "kl": 1.307226538658142, "learning_rate": 4.639163728215463e-06, "loss": 0.0523, "reward": 1.7497800588607788, "reward_std": 0.852914571762085, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -0.15647001564502716, "rewards/wrapped_format_reward": 0.5, "step": 614 }, { "completion_length": 500.0, "epoch": 123.0, "grad_norm": 0.7540839910507202, "kl": 1.6758192777633667, "learning_rate": 4.637279676682367e-06, "loss": 0.067, "reward": -0.8324633836746216, "reward_std": 3.3802878856658936, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3333333432674408, "rewards/wrapped_driving_reward": -2.0407967567443848, "rewards/wrapped_format_reward": 0.375, "step": 615 }, { "completion_length": 500.0, "epoch": 123.2, "grad_norm": 0.4686509370803833, "kl": 0.5778969526290894, "learning_rate": 4.635391103951315e-06, "loss": 0.0231, "reward": 0.07920819520950317, "reward_std": 2.5757508277893066, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5746753215789795, "rewards/wrapped_driving_reward": -1.495467185974121, "rewards/wrapped_format_reward": 0.25, "step": 616 }, { "completion_length": 500.0, "epoch": 123.4, "grad_norm": 0.8271039724349976, "kl": 2.0932607650756836, "learning_rate": 4.633498014017389e-06, "loss": 0.0837, "reward": -0.2360994815826416, "reward_std": 3.488034248352051, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -1.6735994815826416, "rewards/wrapped_format_reward": 0.625, "step": 617 }, { "completion_length": 364.0, "epoch": 123.6, "grad_norm": 0.586908757686615, "kl": 1.9832216501235962, "learning_rate": 4.631600410885231e-06, "loss": 0.0793, "reward": 2.3356025218963623, "reward_std": 0.6788437962532043, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5479166507720947, "rewards/wrapped_driving_reward": 0.1626858115196228, "rewards/wrapped_format_reward": 0.625, "step": 618 }, { "completion_length": 500.0, "epoch": 123.8, "grad_norm": 0.32363465428352356, "kl": 2.4831607341766357, "learning_rate": 4.629698298569026e-06, "loss": 0.0993, "reward": 0.8630160093307495, "reward_std": 2.6244120597839355, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5416666269302368, "rewards/wrapped_driving_reward": -1.1786506175994873, "rewards/wrapped_format_reward": 0.75, "step": 619 }, { "completion_length": 425.0, "epoch": 124.0, "grad_norm": 0.6082574725151062, "kl": 1.8717342615127563, "learning_rate": 4.627791681092499e-06, "loss": 0.0749, "reward": -0.08876347541809082, "reward_std": 3.6596477031707764, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40714284777641296, "rewards/wrapped_driving_reward": -1.8709063529968262, "rewards/wrapped_format_reward": 0.875, "step": 620 }, { "completion_length": 431.0, "epoch": 124.2, "grad_norm": 0.5879464745521545, "kl": 1.4940499067306519, "learning_rate": 4.625880562488908e-06, "loss": 0.0598, "reward": 1.0814306735992432, "reward_std": 2.7485666275024414, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": -0.9498193264007568, "rewards/wrapped_format_reward": 0.75, "step": 621 }, { "completion_length": 330.0, "epoch": 124.4, "grad_norm": 0.7117013931274414, "kl": 1.1768485307693481, "learning_rate": 4.623964946801027e-06, "loss": 0.0471, "reward": 2.0680885314941406, "reward_std": 0.7602452039718628, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5071429014205933, "rewards/wrapped_driving_reward": -0.18905426561832428, "rewards/wrapped_format_reward": 0.75, "step": 622 }, { "completion_length": 447.0, "epoch": 124.6, "grad_norm": 0.563094973564148, "kl": 2.0866267681121826, "learning_rate": 4.622044838081148e-06, "loss": 0.0835, "reward": 1.0568556785583496, "reward_std": 2.7327163219451904, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5347222089767456, "rewards/wrapped_driving_reward": -0.8528665900230408, "rewards/wrapped_format_reward": 0.625, "step": 623 }, { "completion_length": 500.0, "epoch": 124.8, "grad_norm": 0.8436673879623413, "kl": 1.9623671770095825, "learning_rate": 4.620120240391065e-06, "loss": 0.0785, "reward": 1.848639965057373, "reward_std": 0.5852110385894775, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6791666746139526, "rewards/wrapped_driving_reward": -0.08052676171064377, "rewards/wrapped_format_reward": 0.25, "step": 624 }, { "completion_length": 500.0, "epoch": 125.0, "grad_norm": 0.8329010009765625, "kl": 1.7306761741638184, "learning_rate": 4.61819115780207e-06, "loss": 0.0692, "reward": 2.5882558822631836, "reward_std": 0.4951888620853424, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4092262089252472, "rewards/wrapped_driving_reward": 0.804029643535614, "rewards/wrapped_format_reward": 0.375, "step": 625 }, { "completion_length": 452.0, "epoch": 125.2, "grad_norm": 0.6543009877204895, "kl": 2.1465976238250732, "learning_rate": 4.61625759439494e-06, "loss": 0.0859, "reward": 3.292895555496216, "reward_std": 0.19293895363807678, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.792895495891571, "rewards/wrapped_format_reward": 0.875, "step": 626 }, { "completion_length": 500.0, "epoch": 125.4, "grad_norm": 0.5550938248634338, "kl": 1.5278606414794922, "learning_rate": 4.614319554259934e-06, "loss": 0.0611, "reward": 2.2828598022460938, "reward_std": 0.6602820158004761, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": 0.07452647387981415, "rewards/wrapped_format_reward": 0.5, "step": 627 }, { "completion_length": 500.0, "epoch": 125.6, "grad_norm": 0.6649501919746399, "kl": 2.589494466781616, "learning_rate": 4.6123770414967765e-06, "loss": 0.1036, "reward": 1.9842591285705566, "reward_std": 0.4864676296710968, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.640625, "rewards/wrapped_driving_reward": -0.2813658118247986, "rewards/wrapped_format_reward": 0.625, "step": 628 }, { "completion_length": 445.0, "epoch": 125.8, "grad_norm": 0.662386417388916, "kl": 1.6613373756408691, "learning_rate": 4.610430060214656e-06, "loss": 0.0665, "reward": 2.0100724697113037, "reward_std": 0.31454190611839294, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6354166865348816, "rewards/wrapped_driving_reward": 0.1246558278799057, "rewards/wrapped_format_reward": 0.25, "step": 629 }, { "completion_length": 500.0, "epoch": 126.0, "grad_norm": 0.3220883905887604, "kl": 1.36296808719635, "learning_rate": 4.608478614532215e-06, "loss": 0.0545, "reward": 0.3127203583717346, "reward_std": 1.642824649810791, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4472222328186035, "rewards/wrapped_driving_reward": -1.8845018148422241, "rewards/wrapped_format_reward": 0.75, "step": 630 }, { "completion_length": 467.0, "epoch": 126.2, "grad_norm": 0.5425591468811035, "kl": 2.051196575164795, "learning_rate": 4.606522708577537e-06, "loss": 0.082, "reward": 2.591169834136963, "reward_std": 1.0145400762557983, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7604166865348816, "rewards/wrapped_driving_reward": 0.33075299859046936, "rewards/wrapped_format_reward": 0.5, "step": 631 }, { "completion_length": 500.0, "epoch": 126.4, "grad_norm": 0.8837341070175171, "kl": 2.851381301879883, "learning_rate": 4.604562346488144e-06, "loss": 0.1141, "reward": 1.51163649559021, "reward_std": 3.6928539276123047, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -0.33211344480514526, "rewards/wrapped_format_reward": 0.625, "step": 632 }, { "completion_length": 500.0, "epoch": 126.6, "grad_norm": 0.4911397397518158, "kl": 1.3893539905548096, "learning_rate": 4.602597532410982e-06, "loss": 0.0556, "reward": 1.5815069675445557, "reward_std": 0.7094965577125549, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5062500238418579, "rewards/wrapped_driving_reward": -0.42474302649497986, "rewards/wrapped_format_reward": 0.5, "step": 633 }, { "completion_length": 500.0, "epoch": 126.8, "grad_norm": 0.5640321373939514, "kl": 2.7671730518341064, "learning_rate": 4.600628270502415e-06, "loss": 0.1107, "reward": 1.0614972114562988, "reward_std": 3.3871705532073975, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.0385026931762695, "rewards/wrapped_format_reward": 0.75, "step": 634 }, { "completion_length": 490.0, "epoch": 127.0, "grad_norm": 0.4937014877796173, "kl": 1.4064245223999023, "learning_rate": 4.5986545649282164e-06, "loss": 0.0563, "reward": 1.2878098487854004, "reward_std": 0.932208240032196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.6496900916099548, "rewards/wrapped_format_reward": 0.375, "step": 635 }, { "completion_length": 500.0, "epoch": 127.2, "grad_norm": 0.437845379114151, "kl": 1.4801615476608276, "learning_rate": 4.596676419863561e-06, "loss": 0.0592, "reward": 2.271235466003418, "reward_std": 1.1963107585906982, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5873737335205078, "rewards/wrapped_driving_reward": -0.06613828241825104, "rewards/wrapped_format_reward": 0.75, "step": 636 }, { "completion_length": 500.0, "epoch": 127.4, "grad_norm": 0.4203113317489624, "kl": 2.1121630668640137, "learning_rate": 4.594693839493012e-06, "loss": 0.0845, "reward": 0.4698132574558258, "reward_std": 2.6874301433563232, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.1551867723464966, "rewards/wrapped_format_reward": 0.5, "step": 637 }, { "completion_length": 500.0, "epoch": 127.6, "grad_norm": 0.527201235294342, "kl": 2.8536908626556396, "learning_rate": 4.592706828010518e-06, "loss": 0.1141, "reward": 2.8774492740631104, "reward_std": 0.4833984375, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": 0.7024492621421814, "rewards/wrapped_format_reward": 0.75, "step": 638 }, { "completion_length": 500.0, "epoch": 127.8, "grad_norm": 0.6176121234893799, "kl": 1.813757061958313, "learning_rate": 4.590715389619399e-06, "loss": 0.0726, "reward": 0.5704012513160706, "reward_std": 3.1261343955993652, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.41458332538604736, "rewards/wrapped_driving_reward": -1.3441821336746216, "rewards/wrapped_format_reward": 0.75, "step": 639 }, { "completion_length": 488.0, "epoch": 128.0, "grad_norm": 0.6150643229484558, "kl": 1.8711895942687988, "learning_rate": 4.588719528532342e-06, "loss": 0.0748, "reward": 2.4459846019744873, "reward_std": 0.8233567476272583, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7440475821495056, "rewards/wrapped_driving_reward": 0.07693709433078766, "rewards/wrapped_format_reward": 0.625, "step": 640 }, { "completion_length": 500.0, "epoch": 128.2, "grad_norm": 0.5856380462646484, "kl": 1.3136426210403442, "learning_rate": 4.586719248971387e-06, "loss": 0.0525, "reward": 2.145009994506836, "reward_std": 0.7691636085510254, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6855769157409668, "rewards/wrapped_driving_reward": -0.2905668616294861, "rewards/wrapped_format_reward": 0.75, "step": 641 }, { "completion_length": 500.0, "epoch": 128.4, "grad_norm": 0.5520462989807129, "kl": 1.793980598449707, "learning_rate": 4.584714555167921e-06, "loss": 0.0718, "reward": 2.248842239379883, "reward_std": 0.23642319440841675, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.512499988079071, "rewards/wrapped_driving_reward": -0.013657848350703716, "rewards/wrapped_format_reward": 0.75, "step": 642 }, { "completion_length": 500.0, "epoch": 128.6, "grad_norm": 0.5082619190216064, "kl": 1.7066545486450195, "learning_rate": 4.582705451362672e-06, "loss": 0.0683, "reward": 1.1921451091766357, "reward_std": 3.1507742404937744, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3187499940395355, "rewards/wrapped_driving_reward": -0.7516048550605774, "rewards/wrapped_format_reward": 0.875, "step": 643 }, { "completion_length": 365.0, "epoch": 128.8, "grad_norm": 0.48745468258857727, "kl": 2.0945849418640137, "learning_rate": 4.580691941805695e-06, "loss": 0.0838, "reward": 2.269286870956421, "reward_std": 0.241103857755661, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3166666626930237, "rewards/wrapped_driving_reward": -0.04737963154911995, "rewards/wrapped_format_reward": 1.0, "step": 644 }, { "completion_length": 500.0, "epoch": 129.0, "grad_norm": 0.4229970872402191, "kl": 1.823197364807129, "learning_rate": 4.578674030756364e-06, "loss": 0.0729, "reward": 2.301126003265381, "reward_std": 1.0769553184509277, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6267857551574707, "rewards/wrapped_driving_reward": -0.07565981149673462, "rewards/wrapped_format_reward": 0.75, "step": 645 }, { "completion_length": 400.0, "epoch": 129.2, "grad_norm": 0.8031338453292847, "kl": 1.4413716793060303, "learning_rate": 4.576651722483364e-06, "loss": 0.0577, "reward": 0.5792556405067444, "reward_std": 3.146937847137451, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.9207443594932556, "rewards/wrapped_format_reward": 0.25, "step": 646 }, { "completion_length": 500.0, "epoch": 129.4, "grad_norm": 0.575326144695282, "kl": 1.3405693769454956, "learning_rate": 4.5746250212646845e-06, "loss": 0.0536, "reward": 1.8609615564346313, "reward_std": 1.1676558256149292, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7234848737716675, "rewards/wrapped_driving_reward": -0.2375233769416809, "rewards/wrapped_format_reward": 0.375, "step": 647 }, { "completion_length": 500.0, "epoch": 129.6, "grad_norm": 1.5227434635162354, "kl": 1.8580933809280396, "learning_rate": 4.572593931387604e-06, "loss": 0.0743, "reward": 1.1107244491577148, "reward_std": 3.418213129043579, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3142857253551483, "rewards/wrapped_driving_reward": -0.45356136560440063, "rewards/wrapped_format_reward": 0.5, "step": 648 }, { "completion_length": 500.0, "epoch": 129.8, "grad_norm": 0.3943979740142822, "kl": 1.2036634683609009, "learning_rate": 4.570558457148689e-06, "loss": 0.0481, "reward": -0.2929380536079407, "reward_std": 2.5635411739349365, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.421875, "rewards/wrapped_driving_reward": -2.214812994003296, "rewards/wrapped_format_reward": 0.75, "step": 649 }, { "completion_length": 500.0, "epoch": 130.0, "grad_norm": 0.609381914138794, "kl": 1.2709250450134277, "learning_rate": 4.568518602853776e-06, "loss": 0.0508, "reward": 0.9115546941757202, "reward_std": 3.0052967071533203, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6294642686843872, "rewards/wrapped_driving_reward": -1.092909574508667, "rewards/wrapped_format_reward": 0.625, "step": 650 }, { "completion_length": 268.0, "epoch": 130.2, "grad_norm": 0.7053789496421814, "kl": 1.1529924869537354, "learning_rate": 4.566474372817971e-06, "loss": 0.0461, "reward": 2.538076877593994, "reward_std": 0.16932101547718048, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5401515364646912, "rewards/wrapped_driving_reward": -0.0020749024115502834, "rewards/wrapped_format_reward": 1.0, "step": 651 }, { "completion_length": 260.0, "epoch": 130.4, "grad_norm": 1.0269075632095337, "kl": 1.1925607919692993, "learning_rate": 4.564425771365636e-06, "loss": 0.0477, "reward": 0.13461077213287354, "reward_std": 3.1838464736938477, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6166666746139526, "rewards/wrapped_driving_reward": -1.9820557832717896, "rewards/wrapped_format_reward": 0.75, "step": 652 }, { "completion_length": 500.0, "epoch": 130.6, "grad_norm": 0.6632705330848694, "kl": 1.5169345140457153, "learning_rate": 4.562372802830376e-06, "loss": 0.0607, "reward": 2.3893017768859863, "reward_std": 0.3469656705856323, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6416667103767395, "rewards/wrapped_driving_reward": -0.0023648329079151154, "rewards/wrapped_format_reward": 0.75, "step": 653 }, { "completion_length": 500.0, "epoch": 130.8, "grad_norm": 0.4717840254306793, "kl": 1.9436031579971313, "learning_rate": 4.560315471555039e-06, "loss": 0.0777, "reward": -0.5293911099433899, "reward_std": 2.971536636352539, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2142857164144516, "rewards/wrapped_driving_reward": -2.2436769008636475, "rewards/wrapped_format_reward": 0.75, "step": 654 }, { "completion_length": 500.0, "epoch": 131.0, "grad_norm": 0.6224848628044128, "kl": 2.232926368713379, "learning_rate": 4.558253781891701e-06, "loss": 0.0893, "reward": 1.4312851428985596, "reward_std": 3.0140573978424072, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -0.38121485710144043, "rewards/wrapped_format_reward": 0.75, "step": 655 }, { "completion_length": 500.0, "epoch": 131.2, "grad_norm": 0.5012187957763672, "kl": 2.527440309524536, "learning_rate": 4.556187738201656e-06, "loss": 0.1011, "reward": 2.6756157875061035, "reward_std": 0.3911329209804535, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.432539701461792, "rewards/wrapped_driving_reward": 0.3680762052536011, "rewards/wrapped_format_reward": 0.875, "step": 656 }, { "completion_length": 500.0, "epoch": 131.4, "grad_norm": 0.6326376795768738, "kl": 0.7759931087493896, "learning_rate": 4.55411734485541e-06, "loss": 0.031, "reward": -1.3146235942840576, "reward_std": 2.605213165283203, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.9396235942840576, "rewards/wrapped_format_reward": 0.375, "step": 657 }, { "completion_length": 500.0, "epoch": 131.6, "grad_norm": 0.5646911263465881, "kl": 2.084763765335083, "learning_rate": 4.5520426062326685e-06, "loss": 0.0834, "reward": 3.4696907997131348, "reward_std": 0.5449883937835693, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8888888359069824, "rewards/wrapped_driving_reward": 0.8308018445968628, "rewards/wrapped_format_reward": 0.75, "step": 658 }, { "completion_length": 500.0, "epoch": 131.8, "grad_norm": 0.6221711039543152, "kl": 1.0228608846664429, "learning_rate": 4.549963526722332e-06, "loss": 0.0409, "reward": 0.8106170892715454, "reward_std": 3.273923635482788, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5982142686843872, "rewards/wrapped_driving_reward": -1.0375971794128418, "rewards/wrapped_format_reward": 0.5, "step": 659 }, { "completion_length": 337.0, "epoch": 132.0, "grad_norm": 0.43772298097610474, "kl": 1.3129949569702148, "learning_rate": 4.54788011072248e-06, "loss": 0.0525, "reward": 1.5600903034210205, "reward_std": 0.5521067976951599, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3645833432674408, "rewards/wrapped_driving_reward": -0.8044930100440979, "rewards/wrapped_format_reward": 1.0, "step": 660 }, { "completion_length": 500.0, "epoch": 132.2, "grad_norm": 0.5554527044296265, "kl": 2.8973710536956787, "learning_rate": 4.5457923626403685e-06, "loss": 0.1159, "reward": 1.3094146251678467, "reward_std": 2.027087688446045, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -1.1072518825531006, "rewards/wrapped_format_reward": 0.75, "step": 661 }, { "completion_length": 500.0, "epoch": 132.4, "grad_norm": 0.656486988067627, "kl": 3.2406187057495117, "learning_rate": 4.543700286892417e-06, "loss": 0.1296, "reward": 2.77382230758667, "reward_std": 0.26443302631378174, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7041667103767395, "rewards/wrapped_driving_reward": 0.06965568661689758, "rewards/wrapped_format_reward": 1.0, "step": 662 }, { "completion_length": 500.0, "epoch": 132.6, "grad_norm": 0.6020192503929138, "kl": 1.9372469186782837, "learning_rate": 4.541603887904198e-06, "loss": 0.0775, "reward": 2.9173097610473633, "reward_std": 0.4557800590991974, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.39772725105285645, "rewards/wrapped_driving_reward": 0.7695826888084412, "rewards/wrapped_format_reward": 0.75, "step": 663 }, { "completion_length": 500.0, "epoch": 132.8, "grad_norm": 0.43070000410079956, "kl": 0.6832436323165894, "learning_rate": 4.539503170110431e-06, "loss": 0.0273, "reward": 2.062741756439209, "reward_std": 0.6955239772796631, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6101190447807312, "rewards/wrapped_driving_reward": -0.1723773032426834, "rewards/wrapped_format_reward": 0.625, "step": 664 }, { "completion_length": 500.0, "epoch": 133.0, "grad_norm": 0.7564574480056763, "kl": 2.0831825733184814, "learning_rate": 4.537398137954971e-06, "loss": 0.0833, "reward": 1.0558321475982666, "reward_std": 3.0471808910369873, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.36098483204841614, "rewards/wrapped_driving_reward": -0.9301527142524719, "rewards/wrapped_format_reward": 0.875, "step": 665 }, { "completion_length": 415.0, "epoch": 133.2, "grad_norm": 0.6062067747116089, "kl": 1.8398441076278687, "learning_rate": 4.535288795890799e-06, "loss": 0.0736, "reward": 1.2171411514282227, "reward_std": 3.5121548175811768, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -0.3828587532043457, "rewards/wrapped_format_reward": 0.5, "step": 666 }, { "completion_length": 378.0, "epoch": 133.4, "grad_norm": 0.7440605759620667, "kl": 1.9756370782852173, "learning_rate": 4.533175148380014e-06, "loss": 0.079, "reward": 2.4865829944610596, "reward_std": 0.26880526542663574, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7978896498680115, "rewards/wrapped_driving_reward": -0.18630655109882355, "rewards/wrapped_format_reward": 0.875, "step": 667 }, { "completion_length": 500.0, "epoch": 133.6, "grad_norm": 1.240096092224121, "kl": 1.0959250926971436, "learning_rate": 4.531057199893824e-06, "loss": 0.0438, "reward": -1.00247323513031, "reward_std": 3.461418628692627, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2142857164144516, "rewards/wrapped_driving_reward": -2.216759204864502, "rewards/wrapped_format_reward": 0.5, "step": 668 }, { "completion_length": 500.0, "epoch": 133.8, "grad_norm": 0.5158315300941467, "kl": 3.460261344909668, "learning_rate": 4.528934954912531e-06, "loss": 0.1384, "reward": 1.6893670558929443, "reward_std": 0.9373288154602051, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": -0.5918828248977661, "rewards/wrapped_format_reward": 0.75, "step": 669 }, { "completion_length": 498.0, "epoch": 134.0, "grad_norm": 0.6073102355003357, "kl": 2.4675910472869873, "learning_rate": 4.526808417925531e-06, "loss": 0.0987, "reward": 0.656061053276062, "reward_std": 3.1601967811584473, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.328125, "rewards/wrapped_driving_reward": -0.922063946723938, "rewards/wrapped_format_reward": 0.5, "step": 670 }, { "completion_length": 500.0, "epoch": 134.2, "grad_norm": 0.4408455789089203, "kl": 1.7826595306396484, "learning_rate": 4.524677593431296e-06, "loss": 0.0713, "reward": 2.8755440711975098, "reward_std": 0.24851343035697937, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.46666663885116577, "rewards/wrapped_driving_reward": 0.658877432346344, "rewards/wrapped_format_reward": 0.75, "step": 671 }, { "completion_length": 500.0, "epoch": 134.4, "grad_norm": 0.42519596219062805, "kl": 2.016735792160034, "learning_rate": 4.522542485937369e-06, "loss": 0.0807, "reward": 2.178114891052246, "reward_std": 1.032065749168396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5922619700431824, "rewards/wrapped_driving_reward": -0.03914716839790344, "rewards/wrapped_format_reward": 0.625, "step": 672 }, { "completion_length": 500.0, "epoch": 134.6, "grad_norm": 1.4886196851730347, "kl": 1.402744174003601, "learning_rate": 4.520403099960352e-06, "loss": 0.0561, "reward": 0.8560357689857483, "reward_std": 3.2442822456359863, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1439642906188965, "rewards/wrapped_format_reward": 0.625, "step": 673 }, { "completion_length": 500.0, "epoch": 134.8, "grad_norm": 0.7325296401977539, "kl": 2.223708152770996, "learning_rate": 4.5182594400259e-06, "loss": 0.0889, "reward": 0.6000238656997681, "reward_std": 3.0800774097442627, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3638888895511627, "rewards/wrapped_driving_reward": -1.0138651132583618, "rewards/wrapped_format_reward": 0.5, "step": 674 }, { "completion_length": 500.0, "epoch": 135.0, "grad_norm": 0.5647112131118774, "kl": 0.942818820476532, "learning_rate": 4.516111510668707e-06, "loss": 0.0377, "reward": 0.16696184873580933, "reward_std": 2.2731566429138184, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4313446879386902, "rewards/wrapped_driving_reward": -1.5143828392028809, "rewards/wrapped_format_reward": 0.25, "step": 675 }, { "completion_length": 500.0, "epoch": 135.2, "grad_norm": 0.6304842829704285, "kl": 1.3673999309539795, "learning_rate": 4.513959316432499e-06, "loss": 0.0547, "reward": 0.21511971950531006, "reward_std": 2.8151655197143555, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.2848801612854004, "rewards/wrapped_format_reward": 0.25, "step": 676 }, { "completion_length": 500.0, "epoch": 135.4, "grad_norm": 0.590766191482544, "kl": 1.9189481735229492, "learning_rate": 4.511802861870025e-06, "loss": 0.0768, "reward": 1.6132283210754395, "reward_std": 3.119673728942871, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -0.4617716073989868, "rewards/wrapped_format_reward": 0.875, "step": 677 }, { "completion_length": 316.0, "epoch": 135.6, "grad_norm": 0.5920143723487854, "kl": 1.698227047920227, "learning_rate": 4.509642151543043e-06, "loss": 0.0679, "reward": 2.163217544555664, "reward_std": 0.1866256445646286, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6196428537368774, "rewards/wrapped_driving_reward": -0.206425279378891, "rewards/wrapped_format_reward": 0.75, "step": 678 }, { "completion_length": 500.0, "epoch": 135.8, "grad_norm": 0.473319411277771, "kl": 1.885067105293274, "learning_rate": 4.50747719002232e-06, "loss": 0.0754, "reward": 0.8563193082809448, "reward_std": 3.319228172302246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4124999940395355, "rewards/wrapped_driving_reward": -0.8061806559562683, "rewards/wrapped_format_reward": 0.5, "step": 679 }, { "completion_length": 500.0, "epoch": 136.0, "grad_norm": 0.5373473167419434, "kl": 0.3416510224342346, "learning_rate": 4.50530798188761e-06, "loss": 0.0137, "reward": 0.8898559212684631, "reward_std": 3.2994210720062256, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5681818127632141, "rewards/wrapped_driving_reward": -0.678325891494751, "rewards/wrapped_format_reward": 0.25, "step": 680 }, { "completion_length": 500.0, "epoch": 136.2, "grad_norm": 0.6305875182151794, "kl": 3.0031747817993164, "learning_rate": 4.503134531727652e-06, "loss": 0.1201, "reward": 3.073108196258545, "reward_std": 0.3107910454273224, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.574999988079071, "rewards/wrapped_driving_reward": 0.7481079697608948, "rewards/wrapped_format_reward": 0.75, "step": 681 }, { "completion_length": 500.0, "epoch": 136.4, "grad_norm": 1.7253578901290894, "kl": 1.7754122018814087, "learning_rate": 4.50095684414016e-06, "loss": 0.071, "reward": 0.7270383834838867, "reward_std": 2.8357577323913574, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42045456171035767, "rewards/wrapped_driving_reward": -0.9434162378311157, "rewards/wrapped_format_reward": 0.5, "step": 682 }, { "completion_length": 314.0, "epoch": 136.6, "grad_norm": 0.9889490604400635, "kl": 2.0324018001556396, "learning_rate": 4.498774923731809e-06, "loss": 0.0813, "reward": 1.0457359552383423, "reward_std": 3.387676239013672, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.9542640447616577, "rewards/wrapped_format_reward": 0.75, "step": 683 }, { "completion_length": 500.0, "epoch": 136.8, "grad_norm": 2.2817821502685547, "kl": 2.853245973587036, "learning_rate": 4.496588775118232e-06, "loss": 0.1141, "reward": 0.7907824516296387, "reward_std": 3.2839391231536865, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -1.1467175483703613, "rewards/wrapped_format_reward": 0.5, "step": 684 }, { "completion_length": 386.0, "epoch": 137.0, "grad_norm": 0.6486660242080688, "kl": 1.3754545450210571, "learning_rate": 4.494398402924004e-06, "loss": 0.055, "reward": 2.4631450176239014, "reward_std": 0.11703906208276749, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6589285731315613, "rewards/wrapped_driving_reward": -0.1957835555076599, "rewards/wrapped_format_reward": 1.0, "step": 685 }, { "completion_length": 500.0, "epoch": 137.2, "grad_norm": 0.6334738731384277, "kl": 1.6304785013198853, "learning_rate": 4.492203811782633e-06, "loss": 0.0652, "reward": 0.28112760186195374, "reward_std": 2.1938462257385254, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.48750001192092896, "rewards/wrapped_driving_reward": -1.2063723802566528, "rewards/wrapped_format_reward": 0.25, "step": 686 }, { "completion_length": 500.0, "epoch": 137.4, "grad_norm": 0.7023109197616577, "kl": 0.08691535145044327, "learning_rate": 4.490005006336555e-06, "loss": 0.0035, "reward": -2.4576797485351562, "reward_std": 3.0846405029296875, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.0826797485351562, "rewards/wrapped_format_reward": 0.125, "step": 687 }, { "completion_length": 409.0, "epoch": 137.6, "grad_norm": 0.41668447852134705, "kl": 2.5656790733337402, "learning_rate": 4.48780199123712e-06, "loss": 0.1026, "reward": 3.512512683868408, "reward_std": 0.17563197016716003, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6671626567840576, "rewards/wrapped_driving_reward": 0.8453500270843506, "rewards/wrapped_format_reward": 1.0, "step": 688 }, { "completion_length": 500.0, "epoch": 137.8, "grad_norm": 0.5395167469978333, "kl": 1.4732530117034912, "learning_rate": 4.4855947711445806e-06, "loss": 0.0589, "reward": 0.42403852939605713, "reward_std": 3.0854415893554688, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.453125, "rewards/wrapped_driving_reward": -1.0290864706039429, "rewards/wrapped_format_reward": 0.25, "step": 689 }, { "completion_length": 500.0, "epoch": 138.0, "grad_norm": 0.51726895570755, "kl": 0.7486550211906433, "learning_rate": 4.4833833507280884e-06, "loss": 0.0299, "reward": -0.6679528951644897, "reward_std": 3.5653388500213623, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3333333432674408, "rewards/wrapped_driving_reward": -2.001286268234253, "rewards/wrapped_format_reward": 0.5, "step": 690 }, { "completion_length": 460.0, "epoch": 138.2, "grad_norm": 1.1171619892120361, "kl": 2.044977903366089, "learning_rate": 4.481167734665678e-06, "loss": 0.0818, "reward": 2.515073776245117, "reward_std": 0.45018860697746277, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": 0.16507390141487122, "rewards/wrapped_format_reward": 0.75, "step": 691 }, { "completion_length": 436.0, "epoch": 138.4, "grad_norm": 0.6236823201179504, "kl": 2.022012948989868, "learning_rate": 4.478947927644259e-06, "loss": 0.0809, "reward": -0.336450457572937, "reward_std": 3.3746840953826904, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3020833134651184, "rewards/wrapped_driving_reward": -2.0135338306427, "rewards/wrapped_format_reward": 0.875, "step": 692 }, { "completion_length": 500.0, "epoch": 138.6, "grad_norm": 0.753580629825592, "kl": 2.32511305809021, "learning_rate": 4.476723934359609e-06, "loss": 0.093, "reward": -1.1542490720748901, "reward_std": 2.9460480213165283, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5307692289352417, "rewards/wrapped_driving_reward": -2.8100180625915527, "rewards/wrapped_format_reward": 0.375, "step": 693 }, { "completion_length": 500.0, "epoch": 138.8, "grad_norm": 0.5850738286972046, "kl": 1.3262362480163574, "learning_rate": 4.4744957595163586e-06, "loss": 0.053, "reward": 0.7404945492744446, "reward_std": 3.183239698410034, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.1345055103302002, "rewards/wrapped_format_reward": 0.625, "step": 694 }, { "completion_length": 500.0, "epoch": 139.0, "grad_norm": 0.46199971437454224, "kl": 1.4109580516815186, "learning_rate": 4.472263407827987e-06, "loss": 0.0564, "reward": 0.9053983688354492, "reward_std": 2.945990562438965, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2874999940395355, "rewards/wrapped_driving_reward": -0.7571016550064087, "rewards/wrapped_format_reward": 0.625, "step": 695 }, { "completion_length": 356.0, "epoch": 139.2, "grad_norm": 0.5959521532058716, "kl": 1.1791226863861084, "learning_rate": 4.470026884016805e-06, "loss": 0.0472, "reward": 2.3219900131225586, "reward_std": 0.6103870272636414, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": -0.1030101329088211, "rewards/wrapped_format_reward": 0.75, "step": 696 }, { "completion_length": 500.0, "epoch": 139.4, "grad_norm": 0.5678549408912659, "kl": 0.7185838222503662, "learning_rate": 4.4677861928139535e-06, "loss": 0.0287, "reward": 2.031836986541748, "reward_std": 0.3398396968841553, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5272321701049805, "rewards/wrapped_driving_reward": 0.004605006892234087, "rewards/wrapped_format_reward": 0.5, "step": 697 }, { "completion_length": 500.0, "epoch": 139.6, "grad_norm": 0.6844885349273682, "kl": 1.7473582029342651, "learning_rate": 4.465541338959386e-06, "loss": 0.0699, "reward": 1.1412525177001953, "reward_std": 3.4600088596343994, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7222222089767456, "rewards/wrapped_driving_reward": -0.9559696912765503, "rewards/wrapped_format_reward": 0.625, "step": 698 }, { "completion_length": 302.0, "epoch": 139.8, "grad_norm": 0.5990707278251648, "kl": 1.7741172313690186, "learning_rate": 4.463292327201862e-06, "loss": 0.071, "reward": 3.2261505126953125, "reward_std": 0.07425826042890549, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6333333253860474, "rewards/wrapped_driving_reward": 0.8428170680999756, "rewards/wrapped_format_reward": 0.75, "step": 699 }, { "completion_length": 500.0, "epoch": 140.0, "grad_norm": 1.0769627094268799, "kl": 2.001673936843872, "learning_rate": 4.46103916229894e-06, "loss": 0.0801, "reward": 0.48460817337036133, "reward_std": 3.084113359451294, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.33522725105285645, "rewards/wrapped_driving_reward": -1.2256190776824951, "rewards/wrapped_format_reward": 0.625, "step": 700 }, { "completion_length": 500.0, "epoch": 140.2, "grad_norm": 0.5872037410736084, "kl": 1.038836121559143, "learning_rate": 4.4587818490169585e-06, "loss": 0.0416, "reward": -0.32788264751434326, "reward_std": 4.246962547302246, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.5778825283050537, "rewards/wrapped_format_reward": 0.5, "step": 701 }, { "completion_length": 500.0, "epoch": 140.4, "grad_norm": 0.6318936944007874, "kl": 2.6195878982543945, "learning_rate": 4.456520392131035e-06, "loss": 0.1048, "reward": 1.1189181804656982, "reward_std": 3.4260780811309814, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5375000238418579, "rewards/wrapped_driving_reward": -0.9185817837715149, "rewards/wrapped_format_reward": 0.75, "step": 702 }, { "completion_length": 500.0, "epoch": 140.6, "grad_norm": 0.6811997890472412, "kl": 1.4545273780822754, "learning_rate": 4.454254796425053e-06, "loss": 0.0582, "reward": -0.7819237112998962, "reward_std": 3.4338369369506836, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3697916865348816, "rewards/wrapped_driving_reward": -2.2767152786254883, "rewards/wrapped_format_reward": 0.625, "step": 703 }, { "completion_length": 500.0, "epoch": 140.8, "grad_norm": 0.6857403516769409, "kl": 2.4222664833068848, "learning_rate": 4.451985066691649e-06, "loss": 0.0969, "reward": 2.3576412200927734, "reward_std": 0.7202218174934387, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6955128312110901, "rewards/wrapped_driving_reward": -0.3378716707229614, "rewards/wrapped_format_reward": 1.0, "step": 704 }, { "completion_length": 500.0, "epoch": 141.0, "grad_norm": 0.7510570287704468, "kl": 1.073608636856079, "learning_rate": 4.4497112077322045e-06, "loss": 0.0429, "reward": 2.0912461280822754, "reward_std": 0.7873207926750183, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5208333730697632, "rewards/wrapped_driving_reward": -0.17958709597587585, "rewards/wrapped_format_reward": 0.75, "step": 705 }, { "completion_length": 500.0, "epoch": 141.2, "grad_norm": 0.696620762348175, "kl": 2.867920398712158, "learning_rate": 4.44743322435684e-06, "loss": 0.1147, "reward": 0.7361401915550232, "reward_std": 3.1636664867401123, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": -0.9888597130775452, "rewards/wrapped_format_reward": 0.5, "step": 706 }, { "completion_length": 500.0, "epoch": 141.4, "grad_norm": 1.5193531513214111, "kl": 3.1368560791015625, "learning_rate": 4.445151121384395e-06, "loss": 0.1255, "reward": 1.0279366970062256, "reward_std": 3.371938705444336, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3083333373069763, "rewards/wrapped_driving_reward": -0.40539664030075073, "rewards/wrapped_format_reward": 0.375, "step": 707 }, { "completion_length": 500.0, "epoch": 141.6, "grad_norm": 0.5330837965011597, "kl": 2.149311065673828, "learning_rate": 4.442864903642428e-06, "loss": 0.086, "reward": 0.7508726119995117, "reward_std": 3.270827293395996, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.659375011920929, "rewards/wrapped_driving_reward": -1.158502459526062, "rewards/wrapped_format_reward": 0.5, "step": 708 }, { "completion_length": 500.0, "epoch": 141.8, "grad_norm": 0.9430155754089355, "kl": 0.734877347946167, "learning_rate": 4.440574575967199e-06, "loss": 0.0294, "reward": 1.7707712650299072, "reward_std": 0.6201328635215759, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5791666507720947, "rewards/wrapped_driving_reward": -0.18339526653289795, "rewards/wrapped_format_reward": 0.375, "step": 709 }, { "completion_length": 500.0, "epoch": 142.0, "grad_norm": 0.5545501708984375, "kl": 0.34012243151664734, "learning_rate": 4.438280143203665e-06, "loss": 0.0136, "reward": -2.095297336578369, "reward_std": 2.3890910148620605, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -3.5119643211364746, "rewards/wrapped_format_reward": 0.5, "step": 710 }, { "completion_length": 338.0, "epoch": 142.2, "grad_norm": 0.545477569103241, "kl": 1.5843563079833984, "learning_rate": 4.435981610205464e-06, "loss": 0.0634, "reward": 3.214670181274414, "reward_std": 0.26854538917541504, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625568151473999, "rewards/wrapped_driving_reward": 0.5891021490097046, "rewards/wrapped_format_reward": 1.0, "step": 711 }, { "completion_length": 500.0, "epoch": 142.4, "grad_norm": 0.4460209608078003, "kl": 1.0748577117919922, "learning_rate": 4.4336789818349105e-06, "loss": 0.043, "reward": 2.6283092498779297, "reward_std": 0.26468104124069214, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8482142686843872, "rewards/wrapped_driving_reward": -0.09490520507097244, "rewards/wrapped_format_reward": 0.875, "step": 712 }, { "completion_length": 500.0, "epoch": 142.6, "grad_norm": 0.4173543155193329, "kl": 1.4964444637298584, "learning_rate": 4.43137226296298e-06, "loss": 0.0599, "reward": 0.5792319774627686, "reward_std": 2.4592177867889404, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.328125, "rewards/wrapped_driving_reward": -0.9988929629325867, "rewards/wrapped_format_reward": 0.5, "step": 713 }, { "completion_length": 500.0, "epoch": 142.8, "grad_norm": 1.1737406253814697, "kl": 1.9200931787490845, "learning_rate": 4.4290614584693005e-06, "loss": 0.0768, "reward": -1.191518783569336, "reward_std": 2.6190543174743652, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": -3.1665186882019043, "rewards/wrapped_format_reward": 0.75, "step": 714 }, { "completion_length": 500.0, "epoch": 143.0, "grad_norm": 0.41345661878585815, "kl": 2.837022066116333, "learning_rate": 4.426746573242145e-06, "loss": 0.1135, "reward": 1.8437055349349976, "reward_std": 0.6069126725196838, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5068181753158569, "rewards/wrapped_driving_reward": -0.4131126403808594, "rewards/wrapped_format_reward": 0.75, "step": 715 }, { "completion_length": 500.0, "epoch": 143.2, "grad_norm": 0.480434387922287, "kl": 1.3712215423583984, "learning_rate": 4.42442761217842e-06, "loss": 0.0548, "reward": 0.1885419487953186, "reward_std": 2.7936906814575195, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4416666626930237, "rewards/wrapped_driving_reward": -1.378124713897705, "rewards/wrapped_format_reward": 0.375, "step": 716 }, { "completion_length": 449.0, "epoch": 143.4, "grad_norm": 0.44977426528930664, "kl": 2.328033685684204, "learning_rate": 4.422104580183649e-06, "loss": 0.0931, "reward": 1.6494495868682861, "reward_std": 1.049676537513733, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3708333373069763, "rewards/wrapped_driving_reward": -0.4713836908340454, "rewards/wrapped_format_reward": 0.75, "step": 717 }, { "completion_length": 500.0, "epoch": 143.6, "grad_norm": 0.4617660343647003, "kl": 2.1286087036132812, "learning_rate": 4.419777482171972e-06, "loss": 0.0851, "reward": 2.2191004753112793, "reward_std": 0.3220396935939789, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": 0.18160034716129303, "rewards/wrapped_format_reward": 0.375, "step": 718 }, { "completion_length": 435.0, "epoch": 143.8, "grad_norm": 0.5595609545707703, "kl": 1.694679617881775, "learning_rate": 4.417446323066127e-06, "loss": 0.0678, "reward": 2.1137912273406982, "reward_std": 0.2863823473453522, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.26120877265930176, "rewards/wrapped_format_reward": 0.625, "step": 719 }, { "completion_length": 500.0, "epoch": 144.0, "grad_norm": 0.6063684821128845, "kl": 2.057292938232422, "learning_rate": 4.415111107797445e-06, "loss": 0.0823, "reward": 1.3950753211975098, "reward_std": 3.602837562561035, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3958333134651184, "rewards/wrapped_driving_reward": -0.3757581114768982, "rewards/wrapped_format_reward": 0.625, "step": 720 }, { "completion_length": 381.0, "epoch": 144.2, "grad_norm": 0.5373388528823853, "kl": 1.459957242012024, "learning_rate": 4.4127718413058375e-06, "loss": 0.0584, "reward": 2.5545997619628906, "reward_std": 0.5723170638084412, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7321428656578064, "rewards/wrapped_driving_reward": 0.07245711982250214, "rewards/wrapped_format_reward": 0.75, "step": 721 }, { "completion_length": 500.0, "epoch": 144.4, "grad_norm": 0.6013333201408386, "kl": 2.579637289047241, "learning_rate": 4.410428528539783e-06, "loss": 0.1032, "reward": 1.461907982826233, "reward_std": 3.642347812652588, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3333333432674408, "rewards/wrapped_driving_reward": -0.3714253008365631, "rewards/wrapped_format_reward": 0.75, "step": 722 }, { "completion_length": 268.0, "epoch": 144.6, "grad_norm": 0.7695161700248718, "kl": 2.719187021255493, "learning_rate": 4.408081174456322e-06, "loss": 0.1088, "reward": 1.0171679258346558, "reward_std": 3.025564193725586, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": -1.0828319787979126, "rewards/wrapped_format_reward": 0.875, "step": 723 }, { "completion_length": 500.0, "epoch": 144.8, "grad_norm": 0.5366469621658325, "kl": 2.3963301181793213, "learning_rate": 4.405729784021046e-06, "loss": 0.0959, "reward": 2.128978729248047, "reward_std": 0.3375060260295868, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5071429014205933, "rewards/wrapped_driving_reward": -0.003164224326610565, "rewards/wrapped_format_reward": 0.625, "step": 724 }, { "completion_length": 500.0, "epoch": 145.0, "grad_norm": 0.4683205187320709, "kl": 1.2654401063919067, "learning_rate": 4.403374362208078e-06, "loss": 0.0506, "reward": 2.3388218879699707, "reward_std": 0.5537890791893005, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7309523820877075, "rewards/wrapped_driving_reward": 0.10786936432123184, "rewards/wrapped_format_reward": 0.5, "step": 725 }, { "completion_length": 354.0, "epoch": 145.2, "grad_norm": 0.5253893136978149, "kl": 1.5235090255737305, "learning_rate": 4.401014914000078e-06, "loss": 0.0609, "reward": 2.349693536758423, "reward_std": 0.35415056347846985, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5189393758773804, "rewards/wrapped_driving_reward": -0.16924580931663513, "rewards/wrapped_format_reward": 1.0, "step": 726 }, { "completion_length": 379.0, "epoch": 145.4, "grad_norm": 0.46271148324012756, "kl": 1.7733136415481567, "learning_rate": 4.398651444388216e-06, "loss": 0.0709, "reward": 0.9300908446311951, "reward_std": 1.7890135049819946, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8068181872367859, "rewards/wrapped_driving_reward": -1.3767273426055908, "rewards/wrapped_format_reward": 0.5, "step": 727 }, { "completion_length": 500.0, "epoch": 145.6, "grad_norm": 0.5508533120155334, "kl": 2.2134132385253906, "learning_rate": 4.396283958372173e-06, "loss": 0.0885, "reward": 0.34229958057403564, "reward_std": 3.0221662521362305, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -1.0639503002166748, "rewards/wrapped_format_reward": 0.25, "step": 728 }, { "completion_length": 500.0, "epoch": 145.8, "grad_norm": 0.37063583731651306, "kl": 1.571765661239624, "learning_rate": 4.393912460960125e-06, "loss": 0.0629, "reward": 0.5917848348617554, "reward_std": 2.4744818210601807, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4791666865348816, "rewards/wrapped_driving_reward": -1.137381911277771, "rewards/wrapped_format_reward": 0.5, "step": 729 }, { "completion_length": 500.0, "epoch": 146.0, "grad_norm": 0.5619032382965088, "kl": 1.441375970840454, "learning_rate": 4.391536957168733e-06, "loss": 0.0577, "reward": 0.5195353031158447, "reward_std": 2.359299421310425, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3854166865348816, "rewards/wrapped_driving_reward": -0.9908813834190369, "rewards/wrapped_format_reward": 0.375, "step": 730 }, { "completion_length": 500.0, "epoch": 146.2, "grad_norm": 0.5752468109130859, "kl": 1.5844868421554565, "learning_rate": 4.389157452023134e-06, "loss": 0.0634, "reward": 1.9300518035888672, "reward_std": 0.41432899236679077, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4958333373069763, "rewards/wrapped_driving_reward": -0.06578151881694794, "rewards/wrapped_format_reward": 0.5, "step": 731 }, { "completion_length": 500.0, "epoch": 146.4, "grad_norm": 0.5859819054603577, "kl": 2.655578374862671, "learning_rate": 4.386773950556931e-06, "loss": 0.1062, "reward": 1.6558188199996948, "reward_std": 3.1272969245910645, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.36250001192092896, "rewards/wrapped_driving_reward": -0.45668113231658936, "rewards/wrapped_format_reward": 1.0, "step": 732 }, { "completion_length": 500.0, "epoch": 146.6, "grad_norm": 0.5299608111381531, "kl": 0.5435693860054016, "learning_rate": 4.384386457812176e-06, "loss": 0.0217, "reward": 2.3482933044433594, "reward_std": 0.23350678384304047, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": 0.004543111193925142, "rewards/wrapped_format_reward": 0.625, "step": 733 }, { "completion_length": 500.0, "epoch": 146.8, "grad_norm": 0.5528538227081299, "kl": 1.7951174974441528, "learning_rate": 4.3819949788393715e-06, "loss": 0.0718, "reward": 0.8419543504714966, "reward_std": 2.9007656574249268, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.0330456495285034, "rewards/wrapped_format_reward": 0.625, "step": 734 }, { "completion_length": 500.0, "epoch": 147.0, "grad_norm": 0.6346167325973511, "kl": 2.1108882427215576, "learning_rate": 4.379599518697444e-06, "loss": 0.0844, "reward": 2.109321117401123, "reward_std": 1.1090701818466187, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4958333373069763, "rewards/wrapped_driving_reward": 0.11348781734704971, "rewards/wrapped_format_reward": 0.5, "step": 735 }, { "completion_length": 500.0, "epoch": 147.2, "grad_norm": 0.4851335287094116, "kl": 2.4532203674316406, "learning_rate": 4.377200082453748e-06, "loss": 0.0981, "reward": 2.2372679710388184, "reward_std": 0.4187573492527008, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": 0.01226797979325056, "rewards/wrapped_format_reward": 0.75, "step": 736 }, { "completion_length": 412.0, "epoch": 147.4, "grad_norm": 0.4031081795692444, "kl": 1.5797117948532104, "learning_rate": 4.3747966751840475e-06, "loss": 0.0632, "reward": 2.355008840560913, "reward_std": 0.503248929977417, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -0.21999120712280273, "rewards/wrapped_format_reward": 0.875, "step": 737 }, { "completion_length": 449.0, "epoch": 147.6, "grad_norm": 0.3564465343952179, "kl": 2.1247503757476807, "learning_rate": 4.372389301972506e-06, "loss": 0.085, "reward": 1.6672782897949219, "reward_std": 3.1267757415771484, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4444444477558136, "rewards/wrapped_driving_reward": -0.4021660089492798, "rewards/wrapped_format_reward": 0.875, "step": 738 }, { "completion_length": 500.0, "epoch": 147.8, "grad_norm": 0.4619370102882385, "kl": 2.4289839267730713, "learning_rate": 4.369977967911676e-06, "loss": 0.0972, "reward": 1.0640186071395874, "reward_std": 2.7171685695648193, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5089285373687744, "rewards/wrapped_driving_reward": -1.069909930229187, "rewards/wrapped_format_reward": 0.875, "step": 739 }, { "completion_length": 500.0, "epoch": 148.0, "grad_norm": 0.43945783376693726, "kl": 1.0754218101501465, "learning_rate": 4.367562678102491e-06, "loss": 0.043, "reward": 1.8312697410583496, "reward_std": 0.32288119196891785, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7234848737716675, "rewards/wrapped_driving_reward": -0.3922150731086731, "rewards/wrapped_format_reward": 0.5, "step": 740 }, { "completion_length": 500.0, "epoch": 148.2, "grad_norm": 0.5091390013694763, "kl": 1.6062896251678467, "learning_rate": 4.365143437654249e-06, "loss": 0.0643, "reward": 1.1088793277740479, "reward_std": 2.7566280364990234, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5151515007019043, "rewards/wrapped_driving_reward": -0.9062721729278564, "rewards/wrapped_format_reward": 0.75, "step": 741 }, { "completion_length": 290.0, "epoch": 148.4, "grad_norm": 0.4643152952194214, "kl": 2.1783127784729004, "learning_rate": 4.36272025168461e-06, "loss": 0.0871, "reward": 1.2315528392791748, "reward_std": 2.8298909664154053, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1434472799301147, "rewards/wrapped_format_reward": 1.0, "step": 742 }, { "completion_length": 500.0, "epoch": 148.6, "grad_norm": 0.6217731833457947, "kl": 1.8467411994934082, "learning_rate": 4.360293125319575e-06, "loss": 0.0739, "reward": 1.6594910621643066, "reward_std": 3.169166088104248, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.550000011920929, "rewards/wrapped_driving_reward": -0.3905089497566223, "rewards/wrapped_format_reward": 0.75, "step": 743 }, { "completion_length": 500.0, "epoch": 148.8, "grad_norm": 0.5691359043121338, "kl": 2.164647340774536, "learning_rate": 4.357862063693486e-06, "loss": 0.0866, "reward": 2.4071640968322754, "reward_std": 0.46048980951309204, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6142857074737549, "rewards/wrapped_driving_reward": 0.04287836700677872, "rewards/wrapped_format_reward": 0.75, "step": 744 }, { "completion_length": 500.0, "epoch": 149.0, "grad_norm": 0.46008655428886414, "kl": 0.5319089889526367, "learning_rate": 4.355427071949004e-06, "loss": 0.0213, "reward": 1.8021718263626099, "reward_std": 0.693901777267456, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8409090638160706, "rewards/wrapped_driving_reward": -0.5387372374534607, "rewards/wrapped_format_reward": 0.5, "step": 745 }, { "completion_length": 500.0, "epoch": 149.2, "grad_norm": 0.5122867226600647, "kl": 1.535349726676941, "learning_rate": 4.352988155237109e-06, "loss": 0.0614, "reward": 0.9291813373565674, "reward_std": 2.972092390060425, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4791666567325592, "rewards/wrapped_driving_reward": -1.1749852895736694, "rewards/wrapped_format_reward": 0.875, "step": 746 }, { "completion_length": 255.0, "epoch": 149.4, "grad_norm": 0.5990192294120789, "kl": 1.036647081375122, "learning_rate": 4.350545318717081e-06, "loss": 0.0415, "reward": 2.0738182067871094, "reward_std": 0.7627832889556885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.40833333134651184, "rewards/wrapped_driving_reward": 0.040485017001628876, "rewards/wrapped_format_reward": 0.625, "step": 747 }, { "completion_length": 371.0, "epoch": 149.6, "grad_norm": 0.47503599524497986, "kl": 2.037416934967041, "learning_rate": 4.34809856755649e-06, "loss": 0.0815, "reward": 3.0756759643554688, "reward_std": 0.6012242436408997, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5687500238418579, "rewards/wrapped_driving_reward": 0.5069259405136108, "rewards/wrapped_format_reward": 1.0, "step": 748 }, { "completion_length": 500.0, "epoch": 149.8, "grad_norm": 0.6611429452896118, "kl": 0.05937589704990387, "learning_rate": 4.345647906931193e-06, "loss": 0.0024, "reward": 0.7192306518554688, "reward_std": 3.1889588832855225, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": -0.9266026616096497, "rewards/wrapped_format_reward": 0.25, "step": 749 }, { "completion_length": 500.0, "epoch": 150.0, "grad_norm": 1.4345812797546387, "kl": 2.214282989501953, "learning_rate": 4.34319334202531e-06, "loss": 0.0886, "reward": 1.2727808952331543, "reward_std": 1.221142053604126, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6937500238418579, "rewards/wrapped_driving_reward": -1.295969009399414, "rewards/wrapped_format_reward": 0.875, "step": 750 }, { "completion_length": 500.0, "epoch": 150.2, "grad_norm": 1.0040264129638672, "kl": 1.755966067314148, "learning_rate": 4.340734878031226e-06, "loss": 0.0702, "reward": 2.463878631591797, "reward_std": 0.4975334405899048, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6708333492279053, "rewards/wrapped_driving_reward": 0.04304514080286026, "rewards/wrapped_format_reward": 0.75, "step": 751 }, { "completion_length": 343.0, "epoch": 150.4, "grad_norm": 0.5416125655174255, "kl": 2.034369945526123, "learning_rate": 4.338272520149572e-06, "loss": 0.0814, "reward": 1.5815997123718262, "reward_std": 0.738770067691803, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.574999988079071, "rewards/wrapped_driving_reward": -0.3684001564979553, "rewards/wrapped_format_reward": 0.375, "step": 752 }, { "completion_length": 330.0, "epoch": 150.6, "grad_norm": 0.6395848393440247, "kl": 1.1736429929733276, "learning_rate": 4.335806273589214e-06, "loss": 0.0469, "reward": 2.9796881675720215, "reward_std": 0.46711617708206177, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.392857164144516, "rewards/wrapped_driving_reward": 0.8368311524391174, "rewards/wrapped_format_reward": 0.75, "step": 753 }, { "completion_length": 500.0, "epoch": 150.8, "grad_norm": 0.6400429010391235, "kl": 1.6994929313659668, "learning_rate": 4.333336143567247e-06, "loss": 0.068, "reward": 2.359811782836914, "reward_std": 0.6050797700881958, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6776515245437622, "rewards/wrapped_driving_reward": -0.0678396001458168, "rewards/wrapped_format_reward": 0.75, "step": 754 }, { "completion_length": 500.0, "epoch": 151.0, "grad_norm": 0.41147860884666443, "kl": 2.3285937309265137, "learning_rate": 4.3308621353089806e-06, "loss": 0.0931, "reward": 2.1614489555358887, "reward_std": 0.7562393546104431, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.671875, "rewards/wrapped_driving_reward": -0.13542607426643372, "rewards/wrapped_format_reward": 0.625, "step": 755 }, { "completion_length": 500.0, "epoch": 151.2, "grad_norm": 0.5263493657112122, "kl": 1.4681487083435059, "learning_rate": 4.328384254047927e-06, "loss": 0.0587, "reward": -0.8270676136016846, "reward_std": 3.383007764816284, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2738095223903656, "rewards/wrapped_driving_reward": -1.9758771657943726, "rewards/wrapped_format_reward": 0.375, "step": 756 }, { "completion_length": 500.0, "epoch": 151.4, "grad_norm": 0.4938522279262543, "kl": 1.1184033155441284, "learning_rate": 4.325902505025792e-06, "loss": 0.0447, "reward": 2.295780897140503, "reward_std": 0.5614804029464722, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5666667222976685, "rewards/wrapped_driving_reward": -0.14588575065135956, "rewards/wrapped_format_reward": 0.875, "step": 757 }, { "completion_length": 500.0, "epoch": 151.6, "grad_norm": 0.4597296118736267, "kl": 1.5768853425979614, "learning_rate": 4.3234168934924634e-06, "loss": 0.0631, "reward": -0.861888587474823, "reward_std": 3.1065499782562256, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.2368886470794678, "rewards/wrapped_format_reward": 0.625, "step": 758 }, { "completion_length": 500.0, "epoch": 151.8, "grad_norm": 0.4385567307472229, "kl": 2.0956034660339355, "learning_rate": 4.320927424706001e-06, "loss": 0.0838, "reward": 2.5754759311676025, "reward_std": 0.4464452266693115, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5806277394294739, "rewards/wrapped_driving_reward": 0.24484820663928986, "rewards/wrapped_format_reward": 0.75, "step": 759 }, { "completion_length": 500.0, "epoch": 152.0, "grad_norm": 0.5834075808525085, "kl": 1.261903166770935, "learning_rate": 4.318434103932622e-06, "loss": 0.0505, "reward": 3.6084847450256348, "reward_std": 0.2288597822189331, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8055555820465088, "rewards/wrapped_driving_reward": 0.8029290437698364, "rewards/wrapped_format_reward": 1.0, "step": 760 }, { "completion_length": 500.0, "epoch": 152.2, "grad_norm": 0.5083420276641846, "kl": 2.5127692222595215, "learning_rate": 4.315936936446694e-06, "loss": 0.1005, "reward": 0.5838586091995239, "reward_std": 3.2157862186431885, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.9161414504051208, "rewards/wrapped_format_reward": 0.25, "step": 761 }, { "completion_length": 500.0, "epoch": 152.4, "grad_norm": 0.5797263979911804, "kl": 0.6266072392463684, "learning_rate": 4.313435927530719e-06, "loss": 0.0251, "reward": 1.1752140522003174, "reward_std": 3.138611316680908, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3541666567325592, "rewards/wrapped_driving_reward": -0.4289526641368866, "rewards/wrapped_format_reward": 0.5, "step": 762 }, { "completion_length": 417.0, "epoch": 152.6, "grad_norm": 0.37527403235435486, "kl": 3.000368595123291, "learning_rate": 4.310931082475331e-06, "loss": 0.12, "reward": 2.424795150756836, "reward_std": 0.22299793362617493, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6354166865348816, "rewards/wrapped_driving_reward": 0.16437844932079315, "rewards/wrapped_format_reward": 0.625, "step": 763 }, { "completion_length": 500.0, "epoch": 152.8, "grad_norm": 0.543470561504364, "kl": 1.3623380661010742, "learning_rate": 4.30842240657927e-06, "loss": 0.0545, "reward": 2.3236351013183594, "reward_std": 0.4697359800338745, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5468073487281799, "rewards/wrapped_driving_reward": 0.026827752590179443, "rewards/wrapped_format_reward": 0.75, "step": 764 }, { "completion_length": 420.0, "epoch": 153.0, "grad_norm": 0.4464316666126251, "kl": 2.909830331802368, "learning_rate": 4.305909905149389e-06, "loss": 0.1164, "reward": 2.5203139781951904, "reward_std": 0.1798373907804489, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6229166984558105, "rewards/wrapped_driving_reward": 0.27239733934402466, "rewards/wrapped_format_reward": 0.625, "step": 765 }, { "completion_length": 452.0, "epoch": 153.2, "grad_norm": 0.5384455323219299, "kl": 1.8738824129104614, "learning_rate": 4.303393583500629e-06, "loss": 0.075, "reward": 1.2647901773452759, "reward_std": 2.3588814735412598, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6443182229995728, "rewards/wrapped_driving_reward": -1.2545280456542969, "rewards/wrapped_format_reward": 0.875, "step": 766 }, { "completion_length": 365.0, "epoch": 153.4, "grad_norm": 0.4524284601211548, "kl": 2.0011465549468994, "learning_rate": 4.300873446956011e-06, "loss": 0.08, "reward": 2.823650360107422, "reward_std": 0.22715435922145844, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": 0.14865019917488098, "rewards/wrapped_format_reward": 1.0, "step": 767 }, { "completion_length": 357.0, "epoch": 153.6, "grad_norm": 0.5021389126777649, "kl": 1.9665089845657349, "learning_rate": 4.2983495008466285e-06, "loss": 0.0787, "reward": 2.5459351539611816, "reward_std": 0.29390469193458557, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5777778029441833, "rewards/wrapped_driving_reward": 0.09315741062164307, "rewards/wrapped_format_reward": 0.875, "step": 768 }, { "completion_length": 419.0, "epoch": 153.8, "grad_norm": 0.44508621096611023, "kl": 1.8401458263397217, "learning_rate": 4.295821750511633e-06, "loss": 0.0736, "reward": 1.6928329467773438, "reward_std": 3.1920254230499268, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5416666865348816, "rewards/wrapped_driving_reward": -0.34883368015289307, "rewards/wrapped_format_reward": 0.75, "step": 769 }, { "completion_length": 242.0, "epoch": 154.0, "grad_norm": 0.7367874383926392, "kl": 1.5703699588775635, "learning_rate": 4.293290201298224e-06, "loss": 0.0628, "reward": 2.305760383605957, "reward_std": 0.3531375527381897, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.4442397356033325, "rewards/wrapped_format_reward": 1.0, "step": 770 }, { "completion_length": 473.0, "epoch": 154.2, "grad_norm": 0.44318413734436035, "kl": 1.721099853515625, "learning_rate": 4.290754858561636e-06, "loss": 0.0688, "reward": 2.0304620265960693, "reward_std": 0.42801910638809204, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7458333373069763, "rewards/wrapped_driving_reward": -0.340371310710907, "rewards/wrapped_format_reward": 0.625, "step": 771 }, { "completion_length": 500.0, "epoch": 154.4, "grad_norm": 0.5513876676559448, "kl": 2.0845999717712402, "learning_rate": 4.288215727665129e-06, "loss": 0.0834, "reward": 2.804028272628784, "reward_std": 0.7167229056358337, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6104166507720947, "rewards/wrapped_driving_reward": 0.568611741065979, "rewards/wrapped_format_reward": 0.625, "step": 772 }, { "completion_length": 500.0, "epoch": 154.6, "grad_norm": 0.4972803294658661, "kl": 1.3273216485977173, "learning_rate": 4.285672813979977e-06, "loss": 0.0531, "reward": 1.5267138481140137, "reward_std": 0.9965893030166626, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6812500357627869, "rewards/wrapped_driving_reward": -0.40453627705574036, "rewards/wrapped_format_reward": 0.25, "step": 773 }, { "completion_length": 347.0, "epoch": 154.8, "grad_norm": 0.5454884171485901, "kl": 0.8703808188438416, "learning_rate": 4.283126122885455e-06, "loss": 0.0348, "reward": 2.0228705406188965, "reward_std": 0.6939378380775452, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7291666865348816, "rewards/wrapped_driving_reward": -0.5812963247299194, "rewards/wrapped_format_reward": 0.875, "step": 774 }, { "completion_length": 500.0, "epoch": 155.0, "grad_norm": 0.4202658236026764, "kl": 2.270378351211548, "learning_rate": 4.280575659768828e-06, "loss": 0.0908, "reward": 1.0306168794631958, "reward_std": 3.046523332595825, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5416666865348816, "rewards/wrapped_driving_reward": -0.8860498666763306, "rewards/wrapped_format_reward": 0.625, "step": 775 }, { "completion_length": 373.0, "epoch": 155.2, "grad_norm": 0.6647098660469055, "kl": 1.4597883224487305, "learning_rate": 4.278021430025343e-06, "loss": 0.0584, "reward": 2.2688865661621094, "reward_std": 0.4073338210582733, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6887626647949219, "rewards/wrapped_driving_reward": -0.41987621784210205, "rewards/wrapped_format_reward": 1.0, "step": 776 }, { "completion_length": 367.0, "epoch": 155.4, "grad_norm": 0.5167476534843445, "kl": 1.7029842138290405, "learning_rate": 4.275463439058214e-06, "loss": 0.0681, "reward": 2.55387020111084, "reward_std": 0.37654924392700195, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5375000238418579, "rewards/wrapped_driving_reward": 0.26637014746665955, "rewards/wrapped_format_reward": 0.75, "step": 777 }, { "completion_length": 500.0, "epoch": 155.6, "grad_norm": 0.5125796794891357, "kl": 1.6133759021759033, "learning_rate": 4.2729016922786095e-06, "loss": 0.0645, "reward": 2.204677104949951, "reward_std": 0.5718064308166504, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5437500476837158, "rewards/wrapped_driving_reward": -0.08907286077737808, "rewards/wrapped_format_reward": 0.75, "step": 778 }, { "completion_length": 388.0, "epoch": 155.8, "grad_norm": 0.5310250520706177, "kl": 1.4088696241378784, "learning_rate": 4.270336195105645e-06, "loss": 0.0564, "reward": 2.2584033012390137, "reward_std": 0.7043437957763672, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": 0.0334031768143177, "rewards/wrapped_format_reward": 0.625, "step": 779 }, { "completion_length": 500.0, "epoch": 156.0, "grad_norm": 0.46999886631965637, "kl": 3.0204784870147705, "learning_rate": 4.267766952966369e-06, "loss": 0.1208, "reward": 2.017543077468872, "reward_std": 0.6146121025085449, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4727272391319275, "rewards/wrapped_driving_reward": -0.33018413186073303, "rewards/wrapped_format_reward": 0.875, "step": 780 }, { "completion_length": 173.0, "epoch": 156.2, "grad_norm": 0.9462590217590332, "kl": 0.8838510513305664, "learning_rate": 4.265193971295752e-06, "loss": 0.0354, "reward": 3.458010196685791, "reward_std": 0.3142799139022827, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": 0.8642601370811462, "rewards/wrapped_format_reward": 1.0, "step": 781 }, { "completion_length": 327.0, "epoch": 156.4, "grad_norm": 0.6192349195480347, "kl": 1.7248393297195435, "learning_rate": 4.262617255536676e-06, "loss": 0.069, "reward": 1.1187782287597656, "reward_std": 3.079193353652954, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.637499988079071, "rewards/wrapped_driving_reward": -1.1437218189239502, "rewards/wrapped_format_reward": 0.875, "step": 782 }, { "completion_length": 500.0, "epoch": 156.6, "grad_norm": 0.4486222565174103, "kl": 0.9365119338035583, "learning_rate": 4.260036811139922e-06, "loss": 0.0375, "reward": -0.7293701171875, "reward_std": 3.7841498851776123, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2708333134651184, "rewards/wrapped_driving_reward": -2.0002033710479736, "rewards/wrapped_format_reward": 0.5, "step": 783 }, { "completion_length": 363.0, "epoch": 156.8, "grad_norm": 0.5218331813812256, "kl": 1.2869641780853271, "learning_rate": 4.257452643564155e-06, "loss": 0.0515, "reward": 1.064315915107727, "reward_std": 1.7001785039901733, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.4106841087341309, "rewards/wrapped_format_reward": 0.875, "step": 784 }, { "completion_length": 500.0, "epoch": 157.0, "grad_norm": 0.5681933164596558, "kl": 2.0852723121643066, "learning_rate": 4.254864758275921e-06, "loss": 0.0834, "reward": 0.7124841213226318, "reward_std": 2.500086545944214, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5375000238418579, "rewards/wrapped_driving_reward": -1.200015902519226, "rewards/wrapped_format_reward": 0.625, "step": 785 }, { "completion_length": 500.0, "epoch": 157.2, "grad_norm": 0.4559946358203888, "kl": 1.3557456731796265, "learning_rate": 4.2522731607496275e-06, "loss": 0.0542, "reward": 1.9311479330062866, "reward_std": 0.2824523150920868, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8207070827484131, "rewards/wrapped_driving_reward": -0.6395591497421265, "rewards/wrapped_format_reward": 0.75, "step": 786 }, { "completion_length": 272.0, "epoch": 157.4, "grad_norm": 0.47390007972717285, "kl": 1.5192948579788208, "learning_rate": 4.249677856467537e-06, "loss": 0.0608, "reward": 3.276240587234497, "reward_std": 0.3651244342327118, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.578125, "rewards/wrapped_driving_reward": 0.8231154680252075, "rewards/wrapped_format_reward": 0.875, "step": 787 }, { "completion_length": 214.0, "epoch": 157.6, "grad_norm": 0.4379805326461792, "kl": 1.4829914569854736, "learning_rate": 4.24707885091975e-06, "loss": 0.0593, "reward": 2.4753520488739014, "reward_std": 0.41669923067092896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": 0.006602026056498289, "rewards/wrapped_format_reward": 1.0, "step": 788 }, { "completion_length": 371.0, "epoch": 157.8, "grad_norm": 0.6933165192604065, "kl": 1.9118812084197998, "learning_rate": 4.244476149604201e-06, "loss": 0.0765, "reward": 0.8855187296867371, "reward_std": 3.2956066131591797, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1144812107086182, "rewards/wrapped_format_reward": 0.5, "step": 789 }, { "completion_length": 273.0, "epoch": 158.0, "grad_norm": 0.6838796734809875, "kl": 0.8689262866973877, "learning_rate": 4.241869758026638e-06, "loss": 0.0348, "reward": 2.694098949432373, "reward_std": 0.4889434278011322, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5776515007019043, "rewards/wrapped_driving_reward": 0.2414475977420807, "rewards/wrapped_format_reward": 0.875, "step": 790 }, { "completion_length": 461.0, "epoch": 158.2, "grad_norm": 1.5503700971603394, "kl": 1.9748976230621338, "learning_rate": 4.239259681700618e-06, "loss": 0.079, "reward": 2.4788031578063965, "reward_std": 0.48520609736442566, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9097222089767456, "rewards/wrapped_driving_reward": 0.19408084452152252, "rewards/wrapped_format_reward": 0.375, "step": 791 }, { "completion_length": 500.0, "epoch": 158.4, "grad_norm": 0.36111530661582947, "kl": 2.072077751159668, "learning_rate": 4.236645926147493e-06, "loss": 0.0829, "reward": -0.7475395202636719, "reward_std": 2.8378074169158936, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6297348737716675, "rewards/wrapped_driving_reward": -2.877274513244629, "rewards/wrapped_format_reward": 0.75, "step": 792 }, { "completion_length": 500.0, "epoch": 158.6, "grad_norm": 0.5936017632484436, "kl": 1.7306230068206787, "learning_rate": 4.234028496896398e-06, "loss": 0.0692, "reward": 0.6696592569351196, "reward_std": 3.121098518371582, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4322916865348816, "rewards/wrapped_driving_reward": -1.1376324892044067, "rewards/wrapped_format_reward": 0.625, "step": 793 }, { "completion_length": 500.0, "epoch": 158.8, "grad_norm": 0.44106271862983704, "kl": 1.4598654508590698, "learning_rate": 4.231407399484236e-06, "loss": 0.0584, "reward": 2.673553466796875, "reward_std": 0.6879240870475769, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6916666626930237, "rewards/wrapped_driving_reward": 0.35688674449920654, "rewards/wrapped_format_reward": 0.625, "step": 794 }, { "completion_length": 500.0, "epoch": 159.0, "grad_norm": 0.37563157081604004, "kl": 1.7261723279953003, "learning_rate": 4.228782639455674e-06, "loss": 0.069, "reward": 0.8295122385025024, "reward_std": 2.6207921504974365, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4431818127632141, "rewards/wrapped_driving_reward": -0.9886695146560669, "rewards/wrapped_format_reward": 0.625, "step": 795 }, { "completion_length": 500.0, "epoch": 159.2, "grad_norm": 0.6870527267456055, "kl": 2.3870997428894043, "learning_rate": 4.226154222363124e-06, "loss": 0.0955, "reward": 0.891355037689209, "reward_std": 3.266871929168701, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6473214626312256, "rewards/wrapped_driving_reward": -1.255966305732727, "rewards/wrapped_format_reward": 0.75, "step": 796 }, { "completion_length": 455.0, "epoch": 159.4, "grad_norm": 0.4349604845046997, "kl": 2.073577404022217, "learning_rate": 4.223522153766737e-06, "loss": 0.0829, "reward": 3.245163917541504, "reward_std": 0.43290379643440247, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": 0.8133454918861389, "rewards/wrapped_format_reward": 0.75, "step": 797 }, { "completion_length": 500.0, "epoch": 159.6, "grad_norm": 0.4623129665851593, "kl": 0.7278208136558533, "learning_rate": 4.220886439234385e-06, "loss": 0.0291, "reward": 2.338970184326172, "reward_std": 0.46193283796310425, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.692307710647583, "rewards/wrapped_driving_reward": 0.021662473678588867, "rewards/wrapped_format_reward": 0.625, "step": 798 }, { "completion_length": 500.0, "epoch": 159.8, "grad_norm": 1.6049169301986694, "kl": 1.601915717124939, "learning_rate": 4.218247084341656e-06, "loss": 0.0641, "reward": -1.058358907699585, "reward_std": 3.428088665008545, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2604166567325592, "rewards/wrapped_driving_reward": -1.9437755346298218, "rewards/wrapped_format_reward": 0.125, "step": 799 }, { "completion_length": 243.0, "epoch": 160.0, "grad_norm": 0.6691842675209045, "kl": 1.261725902557373, "learning_rate": 4.215604094671835e-06, "loss": 0.0505, "reward": 0.6617778539657593, "reward_std": 2.219189167022705, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9125000238418579, "rewards/wrapped_driving_reward": -2.0007221698760986, "rewards/wrapped_format_reward": 0.75, "step": 800 }, { "completion_length": 378.0, "epoch": 160.2, "grad_norm": 0.46809715032577515, "kl": 2.0106468200683594, "learning_rate": 4.212957475815898e-06, "loss": 0.0804, "reward": -0.409631609916687, "reward_std": 3.321371078491211, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4583333134651184, "rewards/wrapped_driving_reward": -1.9929649829864502, "rewards/wrapped_format_reward": 0.625, "step": 801 }, { "completion_length": 500.0, "epoch": 160.4, "grad_norm": 0.3511122763156891, "kl": 1.422855257987976, "learning_rate": 4.2103072333725e-06, "loss": 0.0569, "reward": -2.0055766105651855, "reward_std": 3.01659893989563, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1041666641831398, "rewards/wrapped_driving_reward": -2.984743356704712, "rewards/wrapped_format_reward": 0.625, "step": 802 }, { "completion_length": 500.0, "epoch": 160.6, "grad_norm": 0.46046286821365356, "kl": 2.3452701568603516, "learning_rate": 4.207653372947959e-06, "loss": 0.0938, "reward": 3.2800838947296143, "reward_std": 0.46462494134902954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": 0.6863338351249695, "rewards/wrapped_format_reward": 1.0, "step": 803 }, { "completion_length": 251.0, "epoch": 160.8, "grad_norm": 0.7049965858459473, "kl": 0.937861442565918, "learning_rate": 4.204995900156247e-06, "loss": 0.0375, "reward": 2.2281129360198975, "reward_std": 0.6199620366096497, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6354166865348816, "rewards/wrapped_driving_reward": -0.2823038101196289, "rewards/wrapped_format_reward": 0.875, "step": 804 }, { "completion_length": 262.0, "epoch": 161.0, "grad_norm": 0.5614827275276184, "kl": 0.9679184556007385, "learning_rate": 4.202334820618976e-06, "loss": 0.0387, "reward": 2.6722943782806396, "reward_std": 0.6139424443244934, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7901785373687744, "rewards/wrapped_driving_reward": 0.007115684449672699, "rewards/wrapped_format_reward": 0.875, "step": 805 }, { "completion_length": 500.0, "epoch": 161.2, "grad_norm": 0.45120781660079956, "kl": 1.3372995853424072, "learning_rate": 4.199670139965393e-06, "loss": 0.0535, "reward": -0.08228069543838501, "reward_std": 3.973278522491455, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5822806358337402, "rewards/wrapped_format_reward": 0.625, "step": 806 }, { "completion_length": 500.0, "epoch": 161.4, "grad_norm": 0.4979395568370819, "kl": 1.8079756498336792, "learning_rate": 4.197001863832355e-06, "loss": 0.0723, "reward": 2.27278733253479, "reward_std": 0.6808876991271973, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4583333432674408, "rewards/wrapped_driving_reward": -0.060546088963747025, "rewards/wrapped_format_reward": 0.875, "step": 807 }, { "completion_length": 327.0, "epoch": 161.6, "grad_norm": 0.9964870810508728, "kl": 1.6865053176879883, "learning_rate": 4.194329997864331e-06, "loss": 0.0675, "reward": 2.5463435649871826, "reward_std": 0.353541761636734, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.07865653932094574, "rewards/wrapped_format_reward": 0.875, "step": 808 }, { "completion_length": 500.0, "epoch": 161.8, "grad_norm": 0.4412590265274048, "kl": 0.9222019910812378, "learning_rate": 4.191654547713382e-06, "loss": 0.0369, "reward": -1.0952434539794922, "reward_std": 2.005805253982544, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5772727131843567, "rewards/wrapped_driving_reward": -3.172516345977783, "rewards/wrapped_format_reward": 0.75, "step": 809 }, { "completion_length": 500.0, "epoch": 162.0, "grad_norm": 0.7107343077659607, "kl": 1.3023508787155151, "learning_rate": 4.188975519039151e-06, "loss": 0.0521, "reward": 1.950555682182312, "reward_std": 0.7478029131889343, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5277777910232544, "rewards/wrapped_driving_reward": 0.04777785390615463, "rewards/wrapped_format_reward": 0.375, "step": 810 }, { "completion_length": 288.0, "epoch": 162.2, "grad_norm": 0.6013153195381165, "kl": 2.0154178142547607, "learning_rate": 4.1862929175088505e-06, "loss": 0.0806, "reward": 2.210658550262451, "reward_std": 0.7157384753227234, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5776515007019043, "rewards/wrapped_driving_reward": -0.24199309945106506, "rewards/wrapped_format_reward": 0.875, "step": 811 }, { "completion_length": 305.0, "epoch": 162.4, "grad_norm": 0.4675794541835785, "kl": 1.456557273864746, "learning_rate": 4.183606748797251e-06, "loss": 0.0583, "reward": 2.7761802673339844, "reward_std": 0.534675657749176, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5166666507720947, "rewards/wrapped_driving_reward": 0.2595132887363434, "rewards/wrapped_format_reward": 1.0, "step": 812 }, { "completion_length": 500.0, "epoch": 162.6, "grad_norm": 0.8961451053619385, "kl": 1.8925386667251587, "learning_rate": 4.18091701858667e-06, "loss": 0.0757, "reward": -0.6078506112098694, "reward_std": 3.6464812755584717, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.9078506231307983, "rewards/wrapped_format_reward": 0.375, "step": 813 }, { "completion_length": 445.0, "epoch": 162.8, "grad_norm": 0.47308549284935, "kl": 1.925355076789856, "learning_rate": 4.178223732566959e-06, "loss": 0.077, "reward": 1.8975515365600586, "reward_std": 0.5106257796287537, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7611111402511597, "rewards/wrapped_driving_reward": -0.2385595142841339, "rewards/wrapped_format_reward": 0.375, "step": 814 }, { "completion_length": 500.0, "epoch": 163.0, "grad_norm": 0.5124558806419373, "kl": 1.3224657773971558, "learning_rate": 4.17552689643549e-06, "loss": 0.0529, "reward": 3.306344509124756, "reward_std": 0.2665938436985016, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": 0.8375946283340454, "rewards/wrapped_format_reward": 0.875, "step": 815 }, { "completion_length": 500.0, "epoch": 163.2, "grad_norm": 0.3726499676704407, "kl": 1.8487807512283325, "learning_rate": 4.172826515897146e-06, "loss": 0.074, "reward": 3.01424241065979, "reward_std": 0.20929349958896637, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8482142686843872, "rewards/wrapped_driving_reward": 0.16602809727191925, "rewards/wrapped_format_reward": 1.0, "step": 816 }, { "completion_length": 500.0, "epoch": 163.4, "grad_norm": 0.5630603432655334, "kl": 3.1796255111694336, "learning_rate": 4.170122596664308e-06, "loss": 0.1272, "reward": 1.0359416007995605, "reward_std": 3.3693974018096924, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -1.0473915338516235, "rewards/wrapped_format_reward": 0.75, "step": 817 }, { "completion_length": 500.0, "epoch": 163.6, "grad_norm": 0.5170841217041016, "kl": 1.0113697052001953, "learning_rate": 4.1674151444568404e-06, "loss": 0.0405, "reward": -0.10817074775695801, "reward_std": 2.296278238296509, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.19374999403953552, "rewards/wrapped_driving_reward": -1.551920771598816, "rewards/wrapped_format_reward": 0.5, "step": 818 }, { "completion_length": 500.0, "epoch": 163.8, "grad_norm": 0.48902177810668945, "kl": 0.9528977274894714, "learning_rate": 4.164704165002086e-06, "loss": 0.0381, "reward": -0.916668713092804, "reward_std": 3.3977835178375244, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.14166666567325592, "rewards/wrapped_driving_reward": -1.933335304260254, "rewards/wrapped_format_reward": 0.375, "step": 819 }, { "completion_length": 393.0, "epoch": 164.0, "grad_norm": 0.5517847537994385, "kl": 1.925469160079956, "learning_rate": 4.161989664034844e-06, "loss": 0.077, "reward": -0.004072427749633789, "reward_std": 3.7684214115142822, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.804072380065918, "rewards/wrapped_format_reward": 0.875, "step": 820 }, { "completion_length": 500.0, "epoch": 164.2, "grad_norm": 0.529854416847229, "kl": 3.0172078609466553, "learning_rate": 4.159271647297368e-06, "loss": 0.1207, "reward": 0.9234670400619507, "reward_std": 2.6180100440979004, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.2640329599380493, "rewards/wrapped_format_reward": 1.0, "step": 821 }, { "completion_length": 500.0, "epoch": 164.4, "grad_norm": 0.5819915533065796, "kl": 0.8147578239440918, "learning_rate": 4.1565501205393445e-06, "loss": 0.0326, "reward": -1.302182912826538, "reward_std": 2.6905508041381836, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -3.020932912826538, "rewards/wrapped_format_reward": 0.5, "step": 822 }, { "completion_length": 500.0, "epoch": 164.6, "grad_norm": 0.3606034517288208, "kl": 2.3594508171081543, "learning_rate": 4.153825089517886e-06, "loss": 0.0944, "reward": 2.2323150634765625, "reward_std": 0.5947590470314026, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6545454263687134, "rewards/wrapped_driving_reward": 0.0777696967124939, "rewards/wrapped_format_reward": 0.5, "step": 823 }, { "completion_length": 249.0, "epoch": 164.8, "grad_norm": 0.810918927192688, "kl": 1.1353868246078491, "learning_rate": 4.151096559997519e-06, "loss": 0.0454, "reward": 0.823899507522583, "reward_std": 2.566319465637207, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -1.0011005401611328, "rewards/wrapped_format_reward": 0.625, "step": 824 }, { "completion_length": 207.0, "epoch": 165.0, "grad_norm": 0.5611321330070496, "kl": 0.9290062189102173, "learning_rate": 4.1483645377501726e-06, "loss": 0.0372, "reward": 3.5698623657226562, "reward_std": 0.3293698728084564, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.8198621869087219, "rewards/wrapped_format_reward": 1.0, "step": 825 }, { "completion_length": 385.0, "epoch": 165.2, "grad_norm": 0.6278635859489441, "kl": 2.905214786529541, "learning_rate": 4.14562902855516e-06, "loss": 0.1162, "reward": 1.2987035512924194, "reward_std": 2.8734114170074463, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5416666865348816, "rewards/wrapped_driving_reward": -0.9929630756378174, "rewards/wrapped_format_reward": 1.0, "step": 826 }, { "completion_length": 500.0, "epoch": 165.4, "grad_norm": 0.42276179790496826, "kl": 1.4326372146606445, "learning_rate": 4.142890038199173e-06, "loss": 0.0573, "reward": 0.241288423538208, "reward_std": 4.038166522979736, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.571211576461792, "rewards/wrapped_format_reward": 0.875, "step": 827 }, { "completion_length": 500.0, "epoch": 165.6, "grad_norm": 0.5326957106590271, "kl": 1.077783465385437, "learning_rate": 4.140147572476269e-06, "loss": 0.0431, "reward": 2.8745408058166504, "reward_std": 0.3801617920398712, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6517857313156128, "rewards/wrapped_driving_reward": 0.597754955291748, "rewards/wrapped_format_reward": 0.625, "step": 828 }, { "completion_length": 500.0, "epoch": 165.8, "grad_norm": 0.6267257332801819, "kl": 2.0615594387054443, "learning_rate": 4.137401637187854e-06, "loss": 0.0825, "reward": 1.2131645679473877, "reward_std": 0.8687582612037659, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.36666664481163025, "rewards/wrapped_driving_reward": -0.9035020470619202, "rewards/wrapped_format_reward": 0.75, "step": 829 }, { "completion_length": 500.0, "epoch": 166.0, "grad_norm": 0.433156281709671, "kl": 2.3467886447906494, "learning_rate": 4.134652238142674e-06, "loss": 0.0939, "reward": 2.4150657653808594, "reward_std": 0.34335243701934814, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7708333730697632, "rewards/wrapped_driving_reward": -0.23076753318309784, "rewards/wrapped_format_reward": 0.875, "step": 830 }, { "completion_length": 499.0, "epoch": 166.2, "grad_norm": 0.6104046702384949, "kl": 1.47028386592865, "learning_rate": 4.1318993811568065e-06, "loss": 0.0588, "reward": -0.45583468675613403, "reward_std": 3.5396041870117188, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4583333134651184, "rewards/wrapped_driving_reward": -1.6641680002212524, "rewards/wrapped_format_reward": 0.25, "step": 831 }, { "completion_length": 500.0, "epoch": 166.4, "grad_norm": 0.5006852746009827, "kl": 1.100881576538086, "learning_rate": 4.129143072053639e-06, "loss": 0.044, "reward": 2.0321669578552246, "reward_std": 0.3274669349193573, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": 0.11966703832149506, "rewards/wrapped_format_reward": 0.25, "step": 832 }, { "completion_length": 500.0, "epoch": 166.6, "grad_norm": 0.4999527633190155, "kl": 2.64371395111084, "learning_rate": 4.126383316663862e-06, "loss": 0.1057, "reward": 1.147336721420288, "reward_std": 3.5284812450408936, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -1.071413278579712, "rewards/wrapped_format_reward": 0.75, "step": 833 }, { "completion_length": 500.0, "epoch": 166.8, "grad_norm": 0.44384869933128357, "kl": 2.825562000274658, "learning_rate": 4.123620120825459e-06, "loss": 0.113, "reward": 2.3765311241149902, "reward_std": 0.47541069984436035, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6484848260879517, "rewards/wrapped_driving_reward": -0.021953638643026352, "rewards/wrapped_format_reward": 0.75, "step": 834 }, { "completion_length": 500.0, "epoch": 167.0, "grad_norm": 0.47259521484375, "kl": 1.9621601104736328, "learning_rate": 4.120853490383691e-06, "loss": 0.0785, "reward": 2.1093008518218994, "reward_std": 0.5590493083000183, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7767857313156128, "rewards/wrapped_driving_reward": -0.04248480871319771, "rewards/wrapped_format_reward": 0.375, "step": 835 }, { "completion_length": 500.0, "epoch": 167.2, "grad_norm": 0.6543713212013245, "kl": 2.6922717094421387, "learning_rate": 4.1180834311910815e-06, "loss": 0.1077, "reward": 0.8647270202636719, "reward_std": 3.387563943862915, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.47857141494750977, "rewards/wrapped_driving_reward": -1.113844394683838, "rewards/wrapped_format_reward": 0.75, "step": 836 }, { "completion_length": 260.0, "epoch": 167.4, "grad_norm": 0.6397120356559753, "kl": 0.9813829660415649, "learning_rate": 4.11530994910741e-06, "loss": 0.0393, "reward": 2.3943681716918945, "reward_std": 0.2554885149002075, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.41969698667526245, "rewards/wrapped_driving_reward": -0.02532881312072277, "rewards/wrapped_format_reward": 1.0, "step": 837 }, { "completion_length": 500.0, "epoch": 167.6, "grad_norm": 0.3910243809223175, "kl": 1.3764312267303467, "learning_rate": 4.112533049999696e-06, "loss": 0.0551, "reward": 1.5576720237731934, "reward_std": 1.9142135381698608, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6619048118591309, "rewards/wrapped_driving_reward": -0.8542327880859375, "rewards/wrapped_format_reward": 0.75, "step": 838 }, { "completion_length": 500.0, "epoch": 167.8, "grad_norm": 0.9477003216743469, "kl": 2.6856141090393066, "learning_rate": 4.109752739742188e-06, "loss": 0.1074, "reward": 2.924741268157959, "reward_std": 0.6457185745239258, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.799741268157959, "rewards/wrapped_format_reward": 0.375, "step": 839 }, { "completion_length": 234.0, "epoch": 168.0, "grad_norm": 0.5520507097244263, "kl": 1.299153447151184, "learning_rate": 4.106969024216348e-06, "loss": 0.052, "reward": 2.7171425819396973, "reward_std": 0.24512527883052826, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5791666507720947, "rewards/wrapped_driving_reward": 0.13797593116760254, "rewards/wrapped_format_reward": 1.0, "step": 840 }, { "completion_length": 500.0, "epoch": 168.2, "grad_norm": 0.5527065396308899, "kl": 2.1630938053131104, "learning_rate": 4.104181909310846e-06, "loss": 0.0865, "reward": 2.0791397094726562, "reward_std": 0.4801023304462433, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": 0.004139885306358337, "rewards/wrapped_format_reward": 0.625, "step": 841 }, { "completion_length": 500.0, "epoch": 168.4, "grad_norm": 0.5730997920036316, "kl": 2.608327865600586, "learning_rate": 4.101391400921538e-06, "loss": 0.1043, "reward": 1.2084792852401733, "reward_std": 2.810657262802124, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.359375, "rewards/wrapped_driving_reward": -0.9008957147598267, "rewards/wrapped_format_reward": 1.0, "step": 842 }, { "completion_length": 500.0, "epoch": 168.6, "grad_norm": 0.5039235949516296, "kl": 1.0649555921554565, "learning_rate": 4.098597504951462e-06, "loss": 0.0426, "reward": 0.48669183254241943, "reward_std": 3.072101354598999, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5871212482452393, "rewards/wrapped_driving_reward": -1.4754294157028198, "rewards/wrapped_format_reward": 0.625, "step": 843 }, { "completion_length": 432.0, "epoch": 168.8, "grad_norm": 0.6145120859146118, "kl": 1.7667502164840698, "learning_rate": 4.095800227310821e-06, "loss": 0.0707, "reward": 1.667678952217102, "reward_std": 1.1833763122558594, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6693181991577148, "rewards/wrapped_driving_reward": -0.501639187335968, "rewards/wrapped_format_reward": 0.5, "step": 844 }, { "completion_length": 500.0, "epoch": 169.0, "grad_norm": 0.5540525317192078, "kl": 1.9023669958114624, "learning_rate": 4.092999573916971e-06, "loss": 0.0761, "reward": 1.3604118824005127, "reward_std": 3.586372137069702, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5229166746139526, "rewards/wrapped_driving_reward": -0.6625047326087952, "rewards/wrapped_format_reward": 0.75, "step": 845 }, { "completion_length": 500.0, "epoch": 169.2, "grad_norm": 0.4508492350578308, "kl": 3.246771812438965, "learning_rate": 4.09019555069441e-06, "loss": 0.1299, "reward": -0.16893959045410156, "reward_std": 3.86875057220459, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.18333333730697632, "rewards/wrapped_driving_reward": -1.6022729873657227, "rewards/wrapped_format_reward": 0.75, "step": 846 }, { "completion_length": 359.0, "epoch": 169.4, "grad_norm": 1.2723370790481567, "kl": 1.246498703956604, "learning_rate": 4.087388163574765e-06, "loss": 0.0499, "reward": 1.342596173286438, "reward_std": 3.5642547607421875, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46136364340782166, "rewards/wrapped_driving_reward": -0.618767499923706, "rewards/wrapped_format_reward": 0.75, "step": 847 }, { "completion_length": 500.0, "epoch": 169.6, "grad_norm": 36.44300842285156, "kl": 6.069018840789795, "learning_rate": 4.084577418496775e-06, "loss": 0.2428, "reward": 2.431246280670166, "reward_std": 0.3451830744743347, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.645684540271759, "rewards/wrapped_driving_reward": 0.035561904311180115, "rewards/wrapped_format_reward": 0.75, "step": 848 }, { "completion_length": 380.0, "epoch": 169.8, "grad_norm": 0.4584289491176605, "kl": 1.9627432823181152, "learning_rate": 4.081763321406291e-06, "loss": 0.0785, "reward": 1.7621755599975586, "reward_std": 0.7273334264755249, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": 0.012175636366009712, "rewards/wrapped_format_reward": 0.375, "step": 849 }, { "completion_length": 303.0, "epoch": 170.0, "grad_norm": 0.7688567638397217, "kl": 1.5286363363265991, "learning_rate": 4.078945878256244e-06, "loss": 0.0611, "reward": 2.059847831726074, "reward_std": 0.5189023017883301, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.726190447807312, "rewards/wrapped_driving_reward": -0.16634270548820496, "rewards/wrapped_format_reward": 0.5, "step": 850 }, { "completion_length": 500.0, "epoch": 170.2, "grad_norm": 0.5445908308029175, "kl": 1.359305739402771, "learning_rate": 4.0761250950066525e-06, "loss": 0.0544, "reward": 0.6495171189308167, "reward_std": 3.1488304138183594, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5982142686843872, "rewards/wrapped_driving_reward": -1.1986972093582153, "rewards/wrapped_format_reward": 0.5, "step": 851 }, { "completion_length": 492.0, "epoch": 170.4, "grad_norm": 0.3592756688594818, "kl": 2.042684316635132, "learning_rate": 4.073300977624594e-06, "loss": 0.0817, "reward": 2.8880326747894287, "reward_std": 0.537230372428894, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5555555820465088, "rewards/wrapped_driving_reward": 0.5824772119522095, "rewards/wrapped_format_reward": 0.75, "step": 852 }, { "completion_length": 320.0, "epoch": 170.6, "grad_norm": 0.5545300841331482, "kl": 1.5483572483062744, "learning_rate": 4.070473532084204e-06, "loss": 0.0619, "reward": 3.0650322437286377, "reward_std": 0.30363455414772034, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6674107313156128, "rewards/wrapped_driving_reward": 0.5226215124130249, "rewards/wrapped_format_reward": 0.875, "step": 853 }, { "completion_length": 500.0, "epoch": 170.8, "grad_norm": 0.5614524483680725, "kl": 1.9028587341308594, "learning_rate": 4.067642764366655e-06, "loss": 0.0761, "reward": -0.2645247280597687, "reward_std": 3.0570878982543945, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6071428656578064, "rewards/wrapped_driving_reward": -1.9966676235198975, "rewards/wrapped_format_reward": 0.375, "step": 854 }, { "completion_length": 500.0, "epoch": 171.0, "grad_norm": 0.6641601324081421, "kl": 2.537586212158203, "learning_rate": 4.064808680460149e-06, "loss": 0.1015, "reward": 2.8995206356048584, "reward_std": 0.9057855606079102, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5138888955116272, "rewards/wrapped_driving_reward": 0.510631799697876, "rewards/wrapped_format_reward": 0.875, "step": 855 }, { "completion_length": 211.0, "epoch": 171.2, "grad_norm": 0.7883555889129639, "kl": 1.2052825689315796, "learning_rate": 4.0619712863599005e-06, "loss": 0.0482, "reward": 2.6141700744628906, "reward_std": 0.26695874333381653, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.49166667461395264, "rewards/wrapped_driving_reward": 0.12250331789255142, "rewards/wrapped_format_reward": 1.0, "step": 856 }, { "completion_length": 500.0, "epoch": 171.4, "grad_norm": 0.6091546416282654, "kl": 0.2985217869281769, "learning_rate": 4.059130588068132e-06, "loss": 0.0119, "reward": 0.01799154281616211, "reward_std": 3.5884652137756348, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.7320085763931274, "rewards/wrapped_format_reward": 0.625, "step": 857 }, { "completion_length": 238.0, "epoch": 171.6, "grad_norm": 0.6881460547447205, "kl": 1.6285467147827148, "learning_rate": 4.056286591594049e-06, "loss": 0.0651, "reward": 1.3435238599777222, "reward_std": 2.921504020690918, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -0.8382943272590637, "rewards/wrapped_format_reward": 1.0, "step": 858 }, { "completion_length": 500.0, "epoch": 171.8, "grad_norm": 0.5135880708694458, "kl": 2.07365345954895, "learning_rate": 4.053439302953839e-06, "loss": 0.0829, "reward": 1.434957504272461, "reward_std": 3.6868643760681152, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6158841252326965, "rewards/wrapped_driving_reward": -0.4309265911579132, "rewards/wrapped_format_reward": 0.5, "step": 859 }, { "completion_length": 394.0, "epoch": 172.0, "grad_norm": 0.5846676826477051, "kl": 1.9917961359024048, "learning_rate": 4.0505887281706505e-06, "loss": 0.0797, "reward": 2.4975860118865967, "reward_std": 0.1576320081949234, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -0.10658073425292969, "rewards/wrapped_format_reward": 1.0, "step": 860 }, { "completion_length": 500.0, "epoch": 172.2, "grad_norm": 3.3948516845703125, "kl": 2.4768245220184326, "learning_rate": 4.047734873274586e-06, "loss": 0.0991, "reward": 0.4537258744239807, "reward_std": 3.1550240516662598, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5972222089767456, "rewards/wrapped_driving_reward": -1.6434962749481201, "rewards/wrapped_format_reward": 0.75, "step": 861 }, { "completion_length": 500.0, "epoch": 172.4, "grad_norm": 0.4853835105895996, "kl": 2.340878963470459, "learning_rate": 4.044877744302684e-06, "loss": 0.0936, "reward": 0.851525604724884, "reward_std": 2.948742151260376, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4943181872367859, "rewards/wrapped_driving_reward": -1.0177925825119019, "rewards/wrapped_format_reward": 0.625, "step": 862 }, { "completion_length": 500.0, "epoch": 172.6, "grad_norm": 0.4636750817298889, "kl": 1.7857810258865356, "learning_rate": 4.04201734729891e-06, "loss": 0.0714, "reward": 1.720499038696289, "reward_std": 0.7010266184806824, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5562499761581421, "rewards/wrapped_driving_reward": -0.2107509821653366, "rewards/wrapped_format_reward": 0.375, "step": 863 }, { "completion_length": 500.0, "epoch": 172.8, "grad_norm": 0.42203953862190247, "kl": 1.8687101602554321, "learning_rate": 4.039153688314146e-06, "loss": 0.0747, "reward": 2.4908885955810547, "reward_std": 0.43568116426467896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5531250238418579, "rewards/wrapped_driving_reward": -0.062236420810222626, "rewards/wrapped_format_reward": 1.0, "step": 864 }, { "completion_length": 500.0, "epoch": 173.0, "grad_norm": 0.6500958800315857, "kl": 1.269221305847168, "learning_rate": 4.036286773406169e-06, "loss": 0.0508, "reward": 0.6660364866256714, "reward_std": 3.2007014751434326, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.265625, "rewards/wrapped_driving_reward": -0.7245885133743286, "rewards/wrapped_format_reward": 0.375, "step": 865 }, { "completion_length": 500.0, "epoch": 173.2, "grad_norm": 0.44822511076927185, "kl": 2.7584421634674072, "learning_rate": 4.033416608639648e-06, "loss": 0.1103, "reward": 2.8062314987182617, "reward_std": 0.18861481547355652, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5249999761581421, "rewards/wrapped_driving_reward": 0.5312313437461853, "rewards/wrapped_format_reward": 0.75, "step": 866 }, { "completion_length": 413.0, "epoch": 173.4, "grad_norm": 0.778404176235199, "kl": 1.6933817863464355, "learning_rate": 4.0305432000861236e-06, "loss": 0.0677, "reward": 2.8611397743225098, "reward_std": 0.44354596734046936, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5255681872367859, "rewards/wrapped_driving_reward": 0.8355716466903687, "rewards/wrapped_format_reward": 0.5, "step": 867 }, { "completion_length": 500.0, "epoch": 173.6, "grad_norm": 0.6367045640945435, "kl": 2.054391622543335, "learning_rate": 4.027666553824e-06, "loss": 0.0822, "reward": 1.0274338722229004, "reward_std": 3.3570926189422607, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -1.1600661277770996, "rewards/wrapped_format_reward": 0.75, "step": 868 }, { "completion_length": 232.0, "epoch": 173.8, "grad_norm": 0.7129911780357361, "kl": 1.315772533416748, "learning_rate": 4.0247866759385295e-06, "loss": 0.0526, "reward": 2.3401172161102295, "reward_std": 0.4827757477760315, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4791666567325592, "rewards/wrapped_driving_reward": -0.13904941082000732, "rewards/wrapped_format_reward": 1.0, "step": 869 }, { "completion_length": 498.0, "epoch": 174.0, "grad_norm": 0.4014141261577606, "kl": 2.552555561065674, "learning_rate": 4.021903572521802e-06, "loss": 0.1021, "reward": 2.2962279319763184, "reward_std": 0.7203732132911682, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": 0.08789470791816711, "rewards/wrapped_format_reward": 0.625, "step": 870 }, { "completion_length": 500.0, "epoch": 174.2, "grad_norm": 0.4916898310184479, "kl": 3.051544189453125, "learning_rate": 4.019017249672729e-06, "loss": 0.1221, "reward": 2.731815814971924, "reward_std": 0.4801827073097229, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.2568158507347107, "rewards/wrapped_format_reward": 0.75, "step": 871 }, { "completion_length": 465.0, "epoch": 174.4, "grad_norm": 0.4961591362953186, "kl": 2.8904507160186768, "learning_rate": 4.016127713497034e-06, "loss": 0.1156, "reward": 2.294635772705078, "reward_std": 0.2517685890197754, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4437499940395355, "rewards/wrapped_driving_reward": -0.024114076048135757, "rewards/wrapped_format_reward": 0.875, "step": 872 }, { "completion_length": 500.0, "epoch": 174.6, "grad_norm": 0.40440821647644043, "kl": 2.878016233444214, "learning_rate": 4.013234970107236e-06, "loss": 0.1151, "reward": 2.4871575832366943, "reward_std": 0.2101297825574875, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.01284235343337059, "rewards/wrapped_format_reward": 0.75, "step": 873 }, { "completion_length": 209.0, "epoch": 174.8, "grad_norm": 0.6273557543754578, "kl": 1.5473604202270508, "learning_rate": 4.010339025622641e-06, "loss": 0.0619, "reward": 3.079251766204834, "reward_std": 0.11588083952665329, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5916666388511658, "rewards/wrapped_driving_reward": 0.4875849485397339, "rewards/wrapped_format_reward": 1.0, "step": 874 }, { "completion_length": 500.0, "epoch": 175.0, "grad_norm": 0.4993760883808136, "kl": 2.671588659286499, "learning_rate": 4.0074398861693244e-06, "loss": 0.1069, "reward": 0.2552154064178467, "reward_std": 3.624678373336792, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.5447847843170166, "rewards/wrapped_format_reward": 0.625, "step": 875 }, { "completion_length": 500.0, "epoch": 175.2, "grad_norm": 0.4493451416492462, "kl": 0.9401676058769226, "learning_rate": 4.0045375578801216e-06, "loss": 0.0376, "reward": 0.3250732421875, "reward_std": 2.5905001163482666, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.39642855525016785, "rewards/wrapped_driving_reward": -1.1963552236557007, "rewards/wrapped_format_reward": 0.375, "step": 876 }, { "completion_length": 500.0, "epoch": 175.4, "grad_norm": 0.5743693709373474, "kl": 1.0388669967651367, "learning_rate": 4.001632046894612e-06, "loss": 0.0416, "reward": 2.1007652282714844, "reward_std": 0.534542977809906, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.34166666865348816, "rewards/wrapped_driving_reward": 0.009098691865801811, "rewards/wrapped_format_reward": 0.75, "step": 877 }, { "completion_length": 500.0, "epoch": 175.6, "grad_norm": 0.5322843790054321, "kl": 1.8900378942489624, "learning_rate": 3.99872335935911e-06, "loss": 0.0756, "reward": 0.8245073556900024, "reward_std": 3.2326302528381348, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.32083335518836975, "rewards/wrapped_driving_reward": -0.9963259696960449, "rewards/wrapped_format_reward": 0.75, "step": 878 }, { "completion_length": 500.0, "epoch": 175.8, "grad_norm": 0.6121527552604675, "kl": 0.9317227005958557, "learning_rate": 3.995811501426648e-06, "loss": 0.0373, "reward": -0.6629834175109863, "reward_std": 3.575207471847534, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1979166716337204, "rewards/wrapped_driving_reward": -1.8609000444412231, "rewards/wrapped_format_reward": 0.5, "step": 879 }, { "completion_length": 500.0, "epoch": 176.0, "grad_norm": 0.4072819650173187, "kl": 2.1265110969543457, "learning_rate": 3.992896479256966e-06, "loss": 0.0851, "reward": 2.546074867248535, "reward_std": 0.40843427181243896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": -0.016425125300884247, "rewards/wrapped_format_reward": 0.75, "step": 880 }, { "completion_length": 316.0, "epoch": 176.2, "grad_norm": 0.7060668468475342, "kl": 1.781854510307312, "learning_rate": 3.989978299016497e-06, "loss": 0.0713, "reward": 1.5066014528274536, "reward_std": 3.10461688041687, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5416666865348816, "rewards/wrapped_driving_reward": -0.7850651741027832, "rewards/wrapped_format_reward": 1.0, "step": 881 }, { "completion_length": 500.0, "epoch": 176.4, "grad_norm": 0.42797377705574036, "kl": 2.0129175186157227, "learning_rate": 3.987056966878354e-06, "loss": 0.0805, "reward": -0.019154369831085205, "reward_std": 2.614680290222168, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3863636255264282, "rewards/wrapped_driving_reward": -2.030518054962158, "rewards/wrapped_format_reward": 0.875, "step": 882 }, { "completion_length": 500.0, "epoch": 176.6, "grad_norm": 0.49211832880973816, "kl": 2.391724109649658, "learning_rate": 3.984132489022319e-06, "loss": 0.0957, "reward": 2.3837080001831055, "reward_std": 0.7526604533195496, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": -0.04811014607548714, "rewards/wrapped_format_reward": 0.75, "step": 883 }, { "completion_length": 500.0, "epoch": 176.8, "grad_norm": 0.8800190091133118, "kl": 0.1896388977766037, "learning_rate": 3.981204871634827e-06, "loss": 0.0076, "reward": 1.9683669805526733, "reward_std": 0.4162609875202179, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8333333134651184, "rewards/wrapped_driving_reward": 0.01003369502723217, "rewards/wrapped_format_reward": 0.125, "step": 884 }, { "completion_length": 500.0, "epoch": 177.0, "grad_norm": 0.4547770619392395, "kl": 2.019606351852417, "learning_rate": 3.978274120908957e-06, "loss": 0.0808, "reward": 1.6338741779327393, "reward_std": 1.3036158084869385, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6260416507720947, "rewards/wrapped_driving_reward": -0.8671674132347107, "rewards/wrapped_format_reward": 0.875, "step": 885 }, { "completion_length": 500.0, "epoch": 177.2, "grad_norm": 0.4264660179615021, "kl": 1.7100956439971924, "learning_rate": 3.975340243044412e-06, "loss": 0.0684, "reward": 0.6114295125007629, "reward_std": 2.460634469985962, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6291666626930237, "rewards/wrapped_driving_reward": -1.2677372694015503, "rewards/wrapped_format_reward": 0.5, "step": 886 }, { "completion_length": 295.0, "epoch": 177.4, "grad_norm": 0.5942258834838867, "kl": 2.0637025833129883, "learning_rate": 3.972403244247512e-06, "loss": 0.0825, "reward": 1.650522232055664, "reward_std": 3.112292766571045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5249999761581421, "rewards/wrapped_driving_reward": -0.37447768449783325, "rewards/wrapped_format_reward": 0.75, "step": 887 }, { "completion_length": 298.0, "epoch": 177.6, "grad_norm": 0.7984853982925415, "kl": 1.0979580879211426, "learning_rate": 3.969463130731183e-06, "loss": 0.0439, "reward": 1.6391451358795166, "reward_std": 0.7815407514572144, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6482142806053162, "rewards/wrapped_driving_reward": -0.3840690851211548, "rewards/wrapped_format_reward": 0.375, "step": 888 }, { "completion_length": 223.0, "epoch": 177.8, "grad_norm": 0.7368780970573425, "kl": 1.0995688438415527, "learning_rate": 3.966519908714934e-06, "loss": 0.044, "reward": 2.920001983642578, "reward_std": 0.40887919068336487, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7986111044883728, "rewards/wrapped_driving_reward": 0.24639087915420532, "rewards/wrapped_format_reward": 0.875, "step": 889 }, { "completion_length": 500.0, "epoch": 178.0, "grad_norm": 0.6906342506408691, "kl": 0.8721696138381958, "learning_rate": 3.963573584424852e-06, "loss": 0.0349, "reward": -0.7715927958488464, "reward_std": 3.750873327255249, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666567325592, "rewards/wrapped_driving_reward": -2.0632596015930176, "rewards/wrapped_format_reward": 0.5, "step": 890 }, { "completion_length": 343.0, "epoch": 178.2, "grad_norm": 0.44748204946517944, "kl": 2.911588668823242, "learning_rate": 3.960624164093587e-06, "loss": 0.1165, "reward": 2.9007225036621094, "reward_std": 0.32139718532562256, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": 0.2382226139307022, "rewards/wrapped_format_reward": 1.0, "step": 891 }, { "completion_length": 500.0, "epoch": 178.4, "grad_norm": 0.4959274232387543, "kl": 1.8275189399719238, "learning_rate": 3.957671653960337e-06, "loss": 0.0731, "reward": 2.34122896194458, "reward_std": 0.6799303889274597, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9861111044883728, "rewards/wrapped_driving_reward": -0.1448819935321808, "rewards/wrapped_format_reward": 0.5, "step": 892 }, { "completion_length": 258.0, "epoch": 178.6, "grad_norm": 0.7540722489356995, "kl": 1.114927053451538, "learning_rate": 3.954716060270839e-06, "loss": 0.0446, "reward": 0.9169098138809204, "reward_std": 3.2809665203094482, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46666666865348816, "rewards/wrapped_driving_reward": -0.9247567653656006, "rewards/wrapped_format_reward": 0.625, "step": 893 }, { "completion_length": 500.0, "epoch": 178.8, "grad_norm": 0.3893278241157532, "kl": 1.8390357494354248, "learning_rate": 3.951757389277349e-06, "loss": 0.0736, "reward": 0.8127535581588745, "reward_std": 2.265406608581543, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6520562767982483, "rewards/wrapped_driving_reward": -1.5893027782440186, "rewards/wrapped_format_reward": 0.75, "step": 894 }, { "completion_length": 500.0, "epoch": 179.0, "grad_norm": 0.5656977891921997, "kl": 1.489612340927124, "learning_rate": 3.948795647238638e-06, "loss": 0.0596, "reward": 1.133161187171936, "reward_std": 3.497837543487549, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -0.4085054397583008, "rewards/wrapped_format_reward": 0.375, "step": 895 }, { "completion_length": 216.0, "epoch": 179.2, "grad_norm": 0.6591039299964905, "kl": 1.8445957899093628, "learning_rate": 3.945830840419966e-06, "loss": 0.0738, "reward": 3.370729923248291, "reward_std": 0.3368479609489441, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.543749988079071, "rewards/wrapped_driving_reward": 0.8269798755645752, "rewards/wrapped_format_reward": 1.0, "step": 896 }, { "completion_length": 443.0, "epoch": 179.4, "grad_norm": 0.5731416344642639, "kl": 2.922247886657715, "learning_rate": 3.942862975093085e-06, "loss": 0.1169, "reward": 2.6342549324035645, "reward_std": 0.6286062002182007, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7357142567634583, "rewards/wrapped_driving_reward": 0.14854073524475098, "rewards/wrapped_format_reward": 0.75, "step": 897 }, { "completion_length": 253.0, "epoch": 179.6, "grad_norm": 0.7713198065757751, "kl": 1.6429288387298584, "learning_rate": 3.939892057536209e-06, "loss": 0.0657, "reward": 1.960778832435608, "reward_std": 1.2942521572113037, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -0.6392212510108948, "rewards/wrapped_format_reward": 1.0, "step": 898 }, { "completion_length": 402.0, "epoch": 179.8, "grad_norm": 0.5697601437568665, "kl": 2.0517232418060303, "learning_rate": 3.936918094034014e-06, "loss": 0.0821, "reward": -2.063636302947998, "reward_std": 1.3159124851226807, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5613636374473572, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 899 }, { "completion_length": 500.0, "epoch": 180.0, "grad_norm": 0.6935559511184692, "kl": 1.2006996870040894, "learning_rate": 3.933941090877615e-06, "loss": 0.048, "reward": 0.6366249322891235, "reward_std": 3.113308906555176, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4895833134651184, "rewards/wrapped_driving_reward": -0.9779583811759949, "rewards/wrapped_format_reward": 0.375, "step": 900 }, { "completion_length": 500.0, "epoch": 180.2, "grad_norm": 0.42344924807548523, "kl": 2.85091495513916, "learning_rate": 3.9309610543645635e-06, "loss": 0.114, "reward": 3.433417320251465, "reward_std": 0.4497038722038269, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.8084172606468201, "rewards/wrapped_format_reward": 0.875, "step": 901 }, { "completion_length": 500.0, "epoch": 180.4, "grad_norm": 0.6061966419219971, "kl": 2.7855606079101562, "learning_rate": 3.927977990798822e-06, "loss": 0.1114, "reward": 2.628899574279785, "reward_std": 0.49769842624664307, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8229166865348816, "rewards/wrapped_driving_reward": 0.05598287284374237, "rewards/wrapped_format_reward": 0.75, "step": 902 }, { "completion_length": 340.0, "epoch": 180.6, "grad_norm": 0.69258052110672, "kl": 0.9422200322151184, "learning_rate": 3.924991906490758e-06, "loss": 0.0377, "reward": 2.399470329284668, "reward_std": 0.7950145602226257, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": 0.11822015047073364, "rewards/wrapped_format_reward": 0.75, "step": 903 }, { "completion_length": 346.0, "epoch": 180.8, "grad_norm": 0.5851873159408569, "kl": 1.3978681564331055, "learning_rate": 3.92200280775713e-06, "loss": 0.0559, "reward": 2.826590061187744, "reward_std": 0.40369948744773865, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7749999761581421, "rewards/wrapped_driving_reward": 0.1765901893377304, "rewards/wrapped_format_reward": 0.875, "step": 904 }, { "completion_length": 277.0, "epoch": 181.0, "grad_norm": 0.6823394894599915, "kl": 1.7282536029815674, "learning_rate": 3.9190107009210725e-06, "loss": 0.0691, "reward": 1.0783967971801758, "reward_std": 2.7210581302642822, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4285714328289032, "rewards/wrapped_driving_reward": -1.1001746654510498, "rewards/wrapped_format_reward": 1.0, "step": 905 }, { "completion_length": 500.0, "epoch": 181.2, "grad_norm": 0.5186096429824829, "kl": 3.4375360012054443, "learning_rate": 3.916015592312083e-06, "loss": 0.1375, "reward": 1.223285436630249, "reward_std": 3.5174005031585693, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5208333730697632, "rewards/wrapped_driving_reward": -0.7975478172302246, "rewards/wrapped_format_reward": 0.75, "step": 906 }, { "completion_length": 500.0, "epoch": 181.4, "grad_norm": 0.535533607006073, "kl": 1.992340087890625, "learning_rate": 3.9130174882660085e-06, "loss": 0.0797, "reward": 1.930053949356079, "reward_std": 1.0493069887161255, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5874999761581421, "rewards/wrapped_driving_reward": -0.28244608640670776, "rewards/wrapped_format_reward": 0.625, "step": 907 }, { "completion_length": 500.0, "epoch": 181.6, "grad_norm": 0.5773860216140747, "kl": 0.793572187423706, "learning_rate": 3.910016395125037e-06, "loss": 0.0317, "reward": -2.2496049404144287, "reward_std": 3.1762144565582275, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -3.041271448135376, "rewards/wrapped_format_reward": 0.375, "step": 908 }, { "completion_length": 500.0, "epoch": 181.8, "grad_norm": 0.5250527858734131, "kl": 1.8544220924377441, "learning_rate": 3.907012319237672e-06, "loss": 0.0742, "reward": -0.4366135001182556, "reward_std": 2.187870979309082, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -2.394946813583374, "rewards/wrapped_format_reward": 0.625, "step": 909 }, { "completion_length": 500.0, "epoch": 182.0, "grad_norm": 0.42340776324272156, "kl": 2.853215217590332, "learning_rate": 3.9040052669587325e-06, "loss": 0.1141, "reward": 2.987250328063965, "reward_std": 0.5010718703269958, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8298611640930176, "rewards/wrapped_driving_reward": 0.2823891043663025, "rewards/wrapped_format_reward": 0.875, "step": 910 }, { "completion_length": 278.0, "epoch": 182.2, "grad_norm": 0.6234366297721863, "kl": 1.1290289163589478, "learning_rate": 3.900995244649333e-06, "loss": 0.0452, "reward": 2.3610379695892334, "reward_std": 0.2784062922000885, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4124999940395355, "rewards/wrapped_driving_reward": 0.07353793829679489, "rewards/wrapped_format_reward": 0.875, "step": 911 }, { "completion_length": 500.0, "epoch": 182.4, "grad_norm": 0.3704410493373871, "kl": 2.557985782623291, "learning_rate": 3.897982258676867e-06, "loss": 0.1023, "reward": 0.8126872777938843, "reward_std": 2.915015935897827, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1873127222061157, "rewards/wrapped_format_reward": 0.625, "step": 912 }, { "completion_length": 500.0, "epoch": 182.6, "grad_norm": 0.6271265149116516, "kl": 2.3218307495117188, "learning_rate": 3.894966315415004e-06, "loss": 0.0929, "reward": 2.265043258666992, "reward_std": 0.6728046536445618, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4548611044883728, "rewards/wrapped_driving_reward": -0.06481779366731644, "rewards/wrapped_format_reward": 0.875, "step": 913 }, { "completion_length": 471.0, "epoch": 182.8, "grad_norm": 0.38925671577453613, "kl": 2.9206743240356445, "learning_rate": 3.891947421243662e-06, "loss": 0.1168, "reward": 0.8769209980964661, "reward_std": 2.990391731262207, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3333333432674408, "rewards/wrapped_driving_reward": -0.8314123153686523, "rewards/wrapped_format_reward": 0.625, "step": 914 }, { "completion_length": 500.0, "epoch": 183.0, "grad_norm": 0.3494448661804199, "kl": 3.355926036834717, "learning_rate": 3.888925582549006e-06, "loss": 0.1342, "reward": 1.738662838935852, "reward_std": 0.648934543132782, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4553571343421936, "rewards/wrapped_driving_reward": -0.5916943550109863, "rewards/wrapped_format_reward": 0.875, "step": 915 }, { "completion_length": 500.0, "epoch": 183.2, "grad_norm": 0.44800475239753723, "kl": 1.6594505310058594, "learning_rate": 3.8859008057234294e-06, "loss": 0.0664, "reward": 2.1790528297424316, "reward_std": 0.41116464138031006, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.07094721496105194, "rewards/wrapped_format_reward": 0.375, "step": 916 }, { "completion_length": 281.0, "epoch": 183.4, "grad_norm": 0.4399340748786926, "kl": 2.0123393535614014, "learning_rate": 3.882873097165539e-06, "loss": 0.0805, "reward": 2.4943761825561523, "reward_std": 0.4836216866970062, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5214285850524902, "rewards/wrapped_driving_reward": -0.027052275836467743, "rewards/wrapped_format_reward": 1.0, "step": 917 }, { "completion_length": 264.0, "epoch": 183.6, "grad_norm": 0.8163371682167053, "kl": 1.926215410232544, "learning_rate": 3.879842463280146e-06, "loss": 0.077, "reward": 1.1410983800888062, "reward_std": 3.4868903160095215, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": -0.754734992980957, "rewards/wrapped_format_reward": 0.5, "step": 918 }, { "completion_length": 232.0, "epoch": 183.8, "grad_norm": 0.6990157961845398, "kl": 1.1987215280532837, "learning_rate": 3.876808910478247e-06, "loss": 0.0479, "reward": 3.3785595893859863, "reward_std": 0.29756999015808105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5520833730697632, "rewards/wrapped_driving_reward": 0.8264761567115784, "rewards/wrapped_format_reward": 1.0, "step": 919 }, { "completion_length": 368.0, "epoch": 184.0, "grad_norm": 0.705010175704956, "kl": 2.2318997383117676, "learning_rate": 3.8737724451770155e-06, "loss": 0.0893, "reward": 2.699289321899414, "reward_std": 0.28938788175582886, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8333333134651184, "rewards/wrapped_driving_reward": 0.11595587432384491, "rewards/wrapped_format_reward": 0.75, "step": 920 }, { "completion_length": 500.0, "epoch": 184.2, "grad_norm": 1.0950113534927368, "kl": 2.1620304584503174, "learning_rate": 3.870733073799785e-06, "loss": 0.0865, "reward": 1.9017943143844604, "reward_std": 0.7144573926925659, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.16070565581321716, "rewards/wrapped_format_reward": 0.5, "step": 921 }, { "completion_length": 369.0, "epoch": 184.4, "grad_norm": 0.4460541903972626, "kl": 1.3944790363311768, "learning_rate": 3.867690802776036e-06, "loss": 0.0558, "reward": 0.7362405061721802, "reward_std": 2.3158774375915527, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5718749761581421, "rewards/wrapped_driving_reward": -1.835634469985962, "rewards/wrapped_format_reward": 1.0, "step": 922 }, { "completion_length": 351.0, "epoch": 184.6, "grad_norm": 0.5902615189552307, "kl": 2.4331822395324707, "learning_rate": 3.864645638541386e-06, "loss": 0.0973, "reward": 2.846731424331665, "reward_std": 0.6255345344543457, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": 0.17173147201538086, "rewards/wrapped_format_reward": 1.0, "step": 923 }, { "completion_length": 500.0, "epoch": 184.8, "grad_norm": 0.4837050437927246, "kl": 1.265053629875183, "learning_rate": 3.861597587537568e-06, "loss": 0.0506, "reward": 1.7583963871002197, "reward_std": 3.172339916229248, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -0.3978537321090698, "rewards/wrapped_format_reward": 1.0, "step": 924 }, { "completion_length": 500.0, "epoch": 185.0, "grad_norm": 0.5546427369117737, "kl": 2.9983720779418945, "learning_rate": 3.858546656212425e-06, "loss": 0.1199, "reward": 2.343202590942383, "reward_std": 0.794277012348175, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.84375, "rewards/wrapped_driving_reward": -0.2505473494529724, "rewards/wrapped_format_reward": 0.75, "step": 925 }, { "completion_length": 451.0, "epoch": 185.2, "grad_norm": 0.41086822748184204, "kl": 2.0693869590759277, "learning_rate": 3.855492851019893e-06, "loss": 0.0828, "reward": 3.010849952697754, "reward_std": 0.35766229033470154, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": 0.45403170585632324, "rewards/wrapped_format_reward": 0.875, "step": 926 }, { "completion_length": 500.0, "epoch": 185.4, "grad_norm": 0.47299084067344666, "kl": 1.6233148574829102, "learning_rate": 3.8524361784199855e-06, "loss": 0.0649, "reward": 1.0155279636383057, "reward_std": 0.24341019988059998, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4109848737716675, "rewards/wrapped_driving_reward": -0.8954569101333618, "rewards/wrapped_format_reward": 0.5, "step": 927 }, { "completion_length": 500.0, "epoch": 185.6, "grad_norm": 0.4359581470489502, "kl": 2.3563270568847656, "learning_rate": 3.849376644878783e-06, "loss": 0.0943, "reward": 0.35687488317489624, "reward_std": 1.495343565940857, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5772727727890015, "rewards/wrapped_driving_reward": -1.5953978300094604, "rewards/wrapped_format_reward": 0.375, "step": 928 }, { "completion_length": 342.0, "epoch": 185.8, "grad_norm": 0.5124377608299255, "kl": 1.685256838798523, "learning_rate": 3.846314256868418e-06, "loss": 0.0674, "reward": 2.3232834339141846, "reward_std": 0.3332556188106537, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6395833492279053, "rewards/wrapped_driving_reward": -0.3162999153137207, "rewards/wrapped_format_reward": 1.0, "step": 929 }, { "completion_length": 263.0, "epoch": 186.0, "grad_norm": 0.5631007552146912, "kl": 2.1246392726898193, "learning_rate": 3.8432490208670605e-06, "loss": 0.085, "reward": 2.90290904045105, "reward_std": 0.33712247014045715, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8767856955528259, "rewards/wrapped_driving_reward": 0.026123281568288803, "rewards/wrapped_format_reward": 1.0, "step": 930 }, { "completion_length": 266.0, "epoch": 186.2, "grad_norm": 1.3182933330535889, "kl": 2.2304015159606934, "learning_rate": 3.840180943358906e-06, "loss": 0.0892, "reward": 0.9916277527809143, "reward_std": 3.33302640914917, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": -1.03962242603302, "rewards/wrapped_format_reward": 0.75, "step": 931 }, { "completion_length": 438.0, "epoch": 186.4, "grad_norm": 0.6201478242874146, "kl": 3.557145595550537, "learning_rate": 3.837110030834162e-06, "loss": 0.1423, "reward": 2.540984630584717, "reward_std": 0.32874172925949097, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": 0.009734511375427246, "rewards/wrapped_format_reward": 1.0, "step": 932 }, { "completion_length": 325.0, "epoch": 186.6, "grad_norm": 0.608431339263916, "kl": 1.1576615571975708, "learning_rate": 3.83403628978903e-06, "loss": 0.0463, "reward": 3.2168986797332764, "reward_std": 0.348687082529068, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5874999761581421, "rewards/wrapped_driving_reward": 0.7543985843658447, "rewards/wrapped_format_reward": 0.875, "step": 933 }, { "completion_length": 344.0, "epoch": 186.8, "grad_norm": 0.528343141078949, "kl": 1.7108962535858154, "learning_rate": 3.830959726725697e-06, "loss": 0.0684, "reward": -0.9138116836547852, "reward_std": 1.629123568534851, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5568181872367859, "rewards/wrapped_driving_reward": -3.220629930496216, "rewards/wrapped_format_reward": 0.75, "step": 934 }, { "completion_length": 277.0, "epoch": 187.0, "grad_norm": 0.6971482038497925, "kl": 1.7196736335754395, "learning_rate": 3.827880348152321e-06, "loss": 0.0688, "reward": 2.0635178089141846, "reward_std": 1.3739427328109741, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5874999761581421, "rewards/wrapped_driving_reward": -0.2739821672439575, "rewards/wrapped_format_reward": 0.75, "step": 935 }, { "completion_length": 500.0, "epoch": 187.2, "grad_norm": 0.41752108931541443, "kl": 3.7903270721435547, "learning_rate": 3.824798160583012e-06, "loss": 0.1516, "reward": 3.360854148864746, "reward_std": 0.5319862961769104, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": 0.5483542680740356, "rewards/wrapped_format_reward": 1.0, "step": 936 }, { "completion_length": 293.0, "epoch": 187.4, "grad_norm": 1.1056814193725586, "kl": 1.8957011699676514, "learning_rate": 3.821713170537828e-06, "loss": 0.0758, "reward": 1.4631000757217407, "reward_std": 0.4590722322463989, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": -0.6931499242782593, "rewards/wrapped_format_reward": 0.5, "step": 937 }, { "completion_length": 500.0, "epoch": 187.6, "grad_norm": 0.5264467000961304, "kl": 2.6821203231811523, "learning_rate": 3.81862538454275e-06, "loss": 0.1073, "reward": 2.004483699798584, "reward_std": 0.8467980027198792, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.3080163300037384, "rewards/wrapped_format_reward": 0.625, "step": 938 }, { "completion_length": 500.0, "epoch": 187.8, "grad_norm": 3.0910918712615967, "kl": 2.3213939666748047, "learning_rate": 3.815534809129674e-06, "loss": 0.0929, "reward": 2.5399014949798584, "reward_std": 0.5887588262557983, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7251983880996704, "rewards/wrapped_driving_reward": 0.06470300257205963, "rewards/wrapped_format_reward": 0.75, "step": 939 }, { "completion_length": 500.0, "epoch": 188.0, "grad_norm": 0.6292603015899658, "kl": 2.379415988922119, "learning_rate": 3.8124414508364005e-06, "loss": 0.0952, "reward": -1.8796478509902954, "reward_std": 2.129520893096924, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22142857313156128, "rewards/wrapped_driving_reward": -3.351076364517212, "rewards/wrapped_format_reward": 0.75, "step": 940 }, { "completion_length": 267.0, "epoch": 188.2, "grad_norm": 0.6127882599830627, "kl": 1.479856252670288, "learning_rate": 3.809345316206614e-06, "loss": 0.0592, "reward": 2.3314530849456787, "reward_std": 1.0573755502700806, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5952381491661072, "rewards/wrapped_driving_reward": -0.2637850046157837, "rewards/wrapped_format_reward": 1.0, "step": 941 }, { "completion_length": 245.0, "epoch": 188.4, "grad_norm": 0.8831590414047241, "kl": 1.7615694999694824, "learning_rate": 3.806246411789872e-06, "loss": 0.0705, "reward": 3.193218469619751, "reward_std": 0.6387902498245239, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.8182185888290405, "rewards/wrapped_format_reward": 0.75, "step": 942 }, { "completion_length": 500.0, "epoch": 188.6, "grad_norm": 0.5580199360847473, "kl": 0.9022277593612671, "learning_rate": 3.8031447441415936e-06, "loss": 0.0361, "reward": -0.2560221552848816, "reward_std": 2.3979086875915527, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9090908765792847, "rewards/wrapped_driving_reward": -2.9151129722595215, "rewards/wrapped_format_reward": 0.75, "step": 943 }, { "completion_length": 500.0, "epoch": 188.8, "grad_norm": 0.5519108772277832, "kl": 2.4725968837738037, "learning_rate": 3.8000403198230385e-06, "loss": 0.0989, "reward": 2.5940396785736084, "reward_std": 0.7400142550468445, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7875000238418579, "rewards/wrapped_driving_reward": 0.056539591401815414, "rewards/wrapped_format_reward": 0.75, "step": 944 }, { "completion_length": 500.0, "epoch": 189.0, "grad_norm": 0.5508544445037842, "kl": 1.890506386756897, "learning_rate": 3.796933145401304e-06, "loss": 0.0756, "reward": 1.9368696212768555, "reward_std": 0.7139579057693481, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8020833134651184, "rewards/wrapped_driving_reward": -0.24021360278129578, "rewards/wrapped_format_reward": 0.375, "step": 945 }, { "completion_length": 500.0, "epoch": 189.2, "grad_norm": 0.5794646739959717, "kl": 2.5017786026000977, "learning_rate": 3.7938232274493002e-06, "loss": 0.1001, "reward": 2.999016284942627, "reward_std": 0.4313461482524872, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6166666746139526, "rewards/wrapped_driving_reward": 0.3823496103286743, "rewards/wrapped_format_reward": 1.0, "step": 946 }, { "completion_length": 500.0, "epoch": 189.4, "grad_norm": 4.063485145568848, "kl": 3.1036813259124756, "learning_rate": 3.7907105725457414e-06, "loss": 0.1241, "reward": -0.2984890043735504, "reward_std": 1.8190838098526, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5340908765792847, "rewards/wrapped_driving_reward": -2.3325798511505127, "rewards/wrapped_format_reward": 0.75, "step": 947 }, { "completion_length": 500.0, "epoch": 189.6, "grad_norm": 0.4874606430530548, "kl": 1.7708829641342163, "learning_rate": 3.787595187275136e-06, "loss": 0.0708, "reward": -1.6385157108306885, "reward_std": 3.3215858936309814, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -2.988515853881836, "rewards/wrapped_format_reward": 0.5, "step": 948 }, { "completion_length": 350.0, "epoch": 189.8, "grad_norm": 0.7013482451438904, "kl": 2.308487892150879, "learning_rate": 3.7844770782277625e-06, "loss": 0.0923, "reward": 2.7643275260925293, "reward_std": 0.7384212613105774, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": 0.4830774962902069, "rewards/wrapped_format_reward": 0.75, "step": 949 }, { "completion_length": 429.0, "epoch": 190.0, "grad_norm": 0.5328139066696167, "kl": 2.9645802974700928, "learning_rate": 3.7813562519996633e-06, "loss": 0.1186, "reward": 2.4175798892974854, "reward_std": 0.2546192407608032, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6937500238418579, "rewards/wrapped_driving_reward": -0.1511700600385666, "rewards/wrapped_format_reward": 0.875, "step": 950 }, { "completion_length": 391.0, "epoch": 190.2, "grad_norm": 0.593440592288971, "kl": 2.426928997039795, "learning_rate": 3.77823271519263e-06, "loss": 0.0971, "reward": 2.1927688121795654, "reward_std": 0.30556195974349976, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6964285373687744, "rewards/wrapped_driving_reward": -0.25365975499153137, "rewards/wrapped_format_reward": 0.75, "step": 951 }, { "completion_length": 245.0, "epoch": 190.4, "grad_norm": 1.080994963645935, "kl": 0.6552043557167053, "learning_rate": 3.7751064744141886e-06, "loss": 0.0262, "reward": 1.770737886428833, "reward_std": 3.188565969467163, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.538690447807312, "rewards/wrapped_driving_reward": -0.392952561378479, "rewards/wrapped_format_reward": 0.875, "step": 952 }, { "completion_length": 415.0, "epoch": 190.6, "grad_norm": 0.6117026209831238, "kl": 1.9140716791152954, "learning_rate": 3.771977536277581e-06, "loss": 0.0766, "reward": 0.20093318819999695, "reward_std": 1.6952910423278809, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6339285373687744, "rewards/wrapped_driving_reward": -2.182995319366455, "rewards/wrapped_format_reward": 0.75, "step": 953 }, { "completion_length": 330.0, "epoch": 190.8, "grad_norm": 0.5234684348106384, "kl": 1.726454734802246, "learning_rate": 3.768845907401761e-06, "loss": 0.0691, "reward": 2.6395888328552246, "reward_std": 0.21844245493412018, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.13958871364593506, "rewards/wrapped_format_reward": 0.875, "step": 954 }, { "completion_length": 310.0, "epoch": 191.0, "grad_norm": 3.978804349899292, "kl": 1.9915828704833984, "learning_rate": 3.765711594411369e-06, "loss": 0.0797, "reward": 1.3190569877624512, "reward_std": 1.5399754047393799, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5364583730697632, "rewards/wrapped_driving_reward": -1.2174015045166016, "rewards/wrapped_format_reward": 1.0, "step": 955 }, { "completion_length": 500.0, "epoch": 191.2, "grad_norm": 1.074221134185791, "kl": 2.539937973022461, "learning_rate": 3.7625746039367258e-06, "loss": 0.1016, "reward": 0.7857856750488281, "reward_std": 2.531182050704956, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5069444179534912, "rewards/wrapped_driving_reward": -1.3461588621139526, "rewards/wrapped_format_reward": 0.875, "step": 956 }, { "completion_length": 500.0, "epoch": 191.4, "grad_norm": 0.5890676975250244, "kl": 2.2778594493865967, "learning_rate": 3.759434942613816e-06, "loss": 0.0911, "reward": 1.2759552001953125, "reward_std": 3.559316635131836, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.5573782920837402, "rewards/wrapped_format_reward": 0.5, "step": 957 }, { "completion_length": 500.0, "epoch": 191.6, "grad_norm": 1.5262166261672974, "kl": 1.0246572494506836, "learning_rate": 3.7562926170842753e-06, "loss": 0.041, "reward": 2.7088077068328857, "reward_std": 0.544691801071167, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4523809552192688, "rewards/wrapped_driving_reward": 0.7564265727996826, "rewards/wrapped_format_reward": 0.5, "step": 958 }, { "completion_length": 445.0, "epoch": 191.8, "grad_norm": 0.5555316805839539, "kl": 1.5433566570281982, "learning_rate": 3.753147633995372e-06, "loss": 0.0617, "reward": 0.8373291492462158, "reward_std": 1.813393235206604, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8020833134651184, "rewards/wrapped_driving_reward": -1.3397541046142578, "rewards/wrapped_format_reward": 0.375, "step": 959 }, { "completion_length": 310.0, "epoch": 192.0, "grad_norm": 0.5657883286476135, "kl": 1.1397076845169067, "learning_rate": 3.7500000000000005e-06, "loss": 0.0456, "reward": 2.39677357673645, "reward_std": 0.9404283165931702, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -0.3282264471054077, "rewards/wrapped_format_reward": 1.0, "step": 960 }, { "completion_length": 253.0, "epoch": 192.2, "grad_norm": 0.6523939371109009, "kl": 0.9725794196128845, "learning_rate": 3.7468497217566585e-06, "loss": 0.0389, "reward": 2.332803249359131, "reward_std": 0.6193321943283081, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7583333253860474, "rewards/wrapped_driving_reward": -0.42553025484085083, "rewards/wrapped_format_reward": 1.0, "step": 961 }, { "completion_length": 357.0, "epoch": 192.4, "grad_norm": 0.5648502111434937, "kl": 2.2727229595184326, "learning_rate": 3.7436968059294416e-06, "loss": 0.0909, "reward": 2.81382417678833, "reward_std": 0.4169542193412781, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": 0.13882412016391754, "rewards/wrapped_format_reward": 1.0, "step": 962 }, { "completion_length": 500.0, "epoch": 192.6, "grad_norm": 0.5214493274688721, "kl": 2.491422414779663, "learning_rate": 3.7405412591880213e-06, "loss": 0.0997, "reward": 0.8314995765686035, "reward_std": 2.937877655029297, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.19374999403953552, "rewards/wrapped_driving_reward": -0.9872503280639648, "rewards/wrapped_format_reward": 0.875, "step": 963 }, { "completion_length": 500.0, "epoch": 192.8, "grad_norm": 0.5463294386863708, "kl": 1.8412586450576782, "learning_rate": 3.7373830882076358e-06, "loss": 0.0737, "reward": 0.23144245147705078, "reward_std": 1.7444252967834473, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7041396498680115, "rewards/wrapped_driving_reward": -1.972697138786316, "rewards/wrapped_format_reward": 0.5, "step": 964 }, { "completion_length": 389.0, "epoch": 193.0, "grad_norm": 0.4172411561012268, "kl": 2.8092048168182373, "learning_rate": 3.734222299669076e-06, "loss": 0.1124, "reward": 1.2366937398910522, "reward_std": 1.9508404731750488, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -0.9299729466438293, "rewards/wrapped_format_reward": 0.75, "step": 965 }, { "completion_length": 248.0, "epoch": 193.2, "grad_norm": 0.7303145527839661, "kl": 1.1455670595169067, "learning_rate": 3.7310589002586683e-06, "loss": 0.0458, "reward": 0.860407829284668, "reward_std": 2.5853140354156494, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -0.8583422303199768, "rewards/wrapped_format_reward": 0.75, "step": 966 }, { "completion_length": 500.0, "epoch": 193.4, "grad_norm": 0.4069046974182129, "kl": 1.850386619567871, "learning_rate": 3.7278928966682624e-06, "loss": 0.074, "reward": 2.3188323974609375, "reward_std": 0.481251060962677, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.24366779625415802, "rewards/wrapped_format_reward": 0.875, "step": 967 }, { "completion_length": 357.0, "epoch": 193.6, "grad_norm": 0.46468204259872437, "kl": 1.970192551612854, "learning_rate": 3.724724295595218e-06, "loss": 0.0788, "reward": 1.748946189880371, "reward_std": 3.184161424636841, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": -0.4760538339614868, "rewards/wrapped_format_reward": 1.0, "step": 968 }, { "completion_length": 500.0, "epoch": 193.8, "grad_norm": 0.7338618636131287, "kl": 2.264711380004883, "learning_rate": 3.721553103742388e-06, "loss": 0.0906, "reward": 0.8214377164840698, "reward_std": 2.5559587478637695, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4821428656578064, "rewards/wrapped_driving_reward": -1.1607050895690918, "rewards/wrapped_format_reward": 0.75, "step": 969 }, { "completion_length": 500.0, "epoch": 194.0, "grad_norm": 2.0772714614868164, "kl": 1.8392152786254883, "learning_rate": 3.7183793278181063e-06, "loss": 0.0736, "reward": -0.2766129672527313, "reward_std": 2.930736780166626, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": -1.751612901687622, "rewards/wrapped_format_reward": 0.25, "step": 970 }, { "completion_length": 500.0, "epoch": 194.2, "grad_norm": 0.558027982711792, "kl": 1.4134743213653564, "learning_rate": 3.715202974536174e-06, "loss": 0.0565, "reward": 2.145724058151245, "reward_std": 0.4964456260204315, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": 0.3019741177558899, "rewards/wrapped_format_reward": 0.375, "step": 971 }, { "completion_length": 500.0, "epoch": 194.4, "grad_norm": 0.7686465978622437, "kl": 1.3111281394958496, "learning_rate": 3.7120240506158433e-06, "loss": 0.0524, "reward": 2.0317254066467285, "reward_std": 0.9409922361373901, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.030774464830756187, "rewards/wrapped_format_reward": 0.5, "step": 972 }, { "completion_length": 479.0, "epoch": 194.6, "grad_norm": 0.45619165897369385, "kl": 2.4493980407714844, "learning_rate": 3.708842562781804e-06, "loss": 0.098, "reward": 1.4903662204742432, "reward_std": 1.293304681777954, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.46666663885116577, "rewards/wrapped_driving_reward": -0.47630050778388977, "rewards/wrapped_format_reward": 0.5, "step": 973 }, { "completion_length": 500.0, "epoch": 194.8, "grad_norm": 0.5224854350090027, "kl": 1.571644902229309, "learning_rate": 3.7056585177641725e-06, "loss": 0.0629, "reward": -0.27295541763305664, "reward_std": 4.306777477264404, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4583333134651184, "rewards/wrapped_driving_reward": -1.7312886714935303, "rewards/wrapped_format_reward": 0.5, "step": 974 }, { "completion_length": 500.0, "epoch": 195.0, "grad_norm": 0.4753471314907074, "kl": 0.8306117653846741, "learning_rate": 3.7024719222984696e-06, "loss": 0.0332, "reward": 0.02014315128326416, "reward_std": 2.3500962257385254, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.48124998807907104, "rewards/wrapped_driving_reward": -1.711106777191162, "rewards/wrapped_format_reward": 0.25, "step": 975 }, { "completion_length": 354.0, "epoch": 195.2, "grad_norm": 0.4571085572242737, "kl": 1.9256433248519897, "learning_rate": 3.699282783125616e-06, "loss": 0.077, "reward": 2.5264735221862793, "reward_std": 0.08371148258447647, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": 0.10980676114559174, "rewards/wrapped_format_reward": 0.75, "step": 976 }, { "completion_length": 500.0, "epoch": 195.4, "grad_norm": 0.710415244102478, "kl": 2.092388868331909, "learning_rate": 3.696091106991911e-06, "loss": 0.0837, "reward": -1.0792343616485596, "reward_std": 1.617916464805603, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5338068008422852, "rewards/wrapped_driving_reward": -3.2380411624908447, "rewards/wrapped_format_reward": 0.625, "step": 977 }, { "completion_length": 359.0, "epoch": 195.6, "grad_norm": 0.5495086312294006, "kl": 2.092905044555664, "learning_rate": 3.6928969006490212e-06, "loss": 0.0837, "reward": 2.933638334274292, "reward_std": 0.4126608669757843, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": 0.24613827466964722, "rewards/wrapped_format_reward": 1.0, "step": 978 }, { "completion_length": 500.0, "epoch": 195.8, "grad_norm": 0.49977123737335205, "kl": 2.3313608169555664, "learning_rate": 3.689700170853966e-06, "loss": 0.0933, "reward": 0.5711701512336731, "reward_std": 3.1036572456359863, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -0.9704965353012085, "rewards/wrapped_format_reward": 0.375, "step": 979 }, { "completion_length": 385.0, "epoch": 196.0, "grad_norm": 0.8438147306442261, "kl": 1.3914647102355957, "learning_rate": 3.6865009243691015e-06, "loss": 0.0557, "reward": 0.7538583874702454, "reward_std": 2.5134408473968506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4869047701358795, "rewards/wrapped_driving_reward": -0.858046293258667, "rewards/wrapped_format_reward": 0.375, "step": 980 }, { "completion_length": 447.25, "epoch": 196.2, "grad_norm": 0.7130087018013, "kl": 2.2208592891693115, "learning_rate": 3.6832991679621087e-06, "loss": 0.0888, "reward": 0.9296894073486328, "reward_std": 3.295621156692505, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1953105926513672, "rewards/wrapped_format_reward": 0.75, "step": 981 }, { "completion_length": 500.0, "epoch": 196.4, "grad_norm": 0.5033509135246277, "kl": 2.3666887283325195, "learning_rate": 3.6800949084059785e-06, "loss": 0.0947, "reward": 1.163835048675537, "reward_std": 3.20870304107666, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3936507999897003, "rewards/wrapped_driving_reward": -0.8548157215118408, "rewards/wrapped_format_reward": 0.875, "step": 982 }, { "completion_length": 470.0, "epoch": 196.6, "grad_norm": 0.5835220217704773, "kl": 1.1331169605255127, "learning_rate": 3.6768881524789956e-06, "loss": 0.0453, "reward": 1.6293542385101318, "reward_std": 1.1588283777236938, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5666666626930237, "rewards/wrapped_driving_reward": -0.5623123645782471, "rewards/wrapped_format_reward": 0.625, "step": 983 }, { "completion_length": 351.0, "epoch": 196.8, "grad_norm": 0.5420541167259216, "kl": 1.2264989614486694, "learning_rate": 3.6736789069647273e-06, "loss": 0.0491, "reward": 2.2600245475769043, "reward_std": 0.2575899362564087, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": -0.5524753928184509, "rewards/wrapped_format_reward": 1.0, "step": 984 }, { "completion_length": 500.0, "epoch": 197.0, "grad_norm": 0.4675966799259186, "kl": 2.334993839263916, "learning_rate": 3.6704671786520053e-06, "loss": 0.0934, "reward": 2.6576266288757324, "reward_std": 0.4121764302253723, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6458333730697632, "rewards/wrapped_driving_reward": 0.01179332286119461, "rewards/wrapped_format_reward": 1.0, "step": 985 }, { "completion_length": 435.0, "epoch": 197.2, "grad_norm": 0.37134113907814026, "kl": 1.8131732940673828, "learning_rate": 3.667252974334915e-06, "loss": 0.0725, "reward": 1.0285606384277344, "reward_std": 1.6500244140625, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8782467246055603, "rewards/wrapped_driving_reward": -1.5996861457824707, "rewards/wrapped_format_reward": 0.75, "step": 986 }, { "completion_length": 500.0, "epoch": 197.4, "grad_norm": 0.6712488532066345, "kl": 0.7635093331336975, "learning_rate": 3.664036300812779e-06, "loss": 0.0305, "reward": 0.8483560085296631, "reward_std": 3.2776992321014404, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.444444477558136, "rewards/wrapped_driving_reward": -0.7210884094238281, "rewards/wrapped_format_reward": 0.375, "step": 987 }, { "completion_length": 500.0, "epoch": 197.6, "grad_norm": 0.4790368378162384, "kl": 2.8666765689849854, "learning_rate": 3.660817164890143e-06, "loss": 0.1147, "reward": 2.4303479194641113, "reward_std": 0.905510425567627, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6053571701049805, "rewards/wrapped_driving_reward": 0.32499074935913086, "rewards/wrapped_format_reward": 0.5, "step": 988 }, { "completion_length": 500.0, "epoch": 197.8, "grad_norm": 0.8172152042388916, "kl": 1.768971562385559, "learning_rate": 3.6575955733767614e-06, "loss": 0.0708, "reward": -0.7181567549705505, "reward_std": 3.2575185298919678, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.0098233222961426, "rewards/wrapped_format_reward": 0.375, "step": 989 }, { "completion_length": 500.0, "epoch": 198.0, "grad_norm": 0.6419240236282349, "kl": 1.6300965547561646, "learning_rate": 3.654371533087586e-06, "loss": 0.0652, "reward": 0.22748053073883057, "reward_std": 2.975813865661621, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.37857145071029663, "rewards/wrapped_driving_reward": -1.1510907411575317, "rewards/wrapped_format_reward": 0.25, "step": 990 }, { "completion_length": 500.0, "epoch": 198.2, "grad_norm": 0.5838403701782227, "kl": 1.6610620021820068, "learning_rate": 3.6511450508427425e-06, "loss": 0.0664, "reward": 0.8827900886535645, "reward_std": 2.7138733863830566, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4583333134651184, "rewards/wrapped_driving_reward": -1.2005434036254883, "rewards/wrapped_format_reward": 0.875, "step": 991 }, { "completion_length": 352.0, "epoch": 198.4, "grad_norm": 0.5065628290176392, "kl": 1.4801349639892578, "learning_rate": 3.6479161334675294e-06, "loss": 0.0592, "reward": 1.1989092826843262, "reward_std": 2.159355640411377, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": -1.3260908126831055, "rewards/wrapped_format_reward": 0.875, "step": 992 }, { "completion_length": 323.0, "epoch": 198.6, "grad_norm": 0.9948468208312988, "kl": 2.3398282527923584, "learning_rate": 3.6446847877923917e-06, "loss": 0.0936, "reward": 1.2031556367874146, "reward_std": 3.4791674613952637, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.512499988079071, "rewards/wrapped_driving_reward": -0.8093443512916565, "rewards/wrapped_format_reward": 0.75, "step": 993 }, { "completion_length": 500.0, "epoch": 198.8, "grad_norm": 0.472013384103775, "kl": 2.1987760066986084, "learning_rate": 3.641451020652914e-06, "loss": 0.088, "reward": 2.839162826538086, "reward_std": 0.6518869996070862, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8472222089767456, "rewards/wrapped_driving_reward": 0.24194061756134033, "rewards/wrapped_format_reward": 0.75, "step": 994 }, { "completion_length": 203.0, "epoch": 199.0, "grad_norm": 1.2131339311599731, "kl": 1.4023782014846802, "learning_rate": 3.6382148388898013e-06, "loss": 0.0561, "reward": 1.610437273979187, "reward_std": 0.582275927066803, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5375000238418579, "rewards/wrapped_driving_reward": -0.5520627498626709, "rewards/wrapped_format_reward": 0.625, "step": 995 }, { "completion_length": 360.0, "epoch": 199.2, "grad_norm": 0.4115167558193207, "kl": 1.6350958347320557, "learning_rate": 3.634976249348867e-06, "loss": 0.0654, "reward": 2.6167430877685547, "reward_std": 0.34499531984329224, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.824999988079071, "rewards/wrapped_driving_reward": -0.08325683325529099, "rewards/wrapped_format_reward": 0.875, "step": 996 }, { "completion_length": 398.0, "epoch": 199.4, "grad_norm": 0.9497320055961609, "kl": 1.2184975147247314, "learning_rate": 3.631735258881019e-06, "loss": 0.0487, "reward": -0.7009193301200867, "reward_std": 1.8839086294174194, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4225524365901947, "rewards/wrapped_driving_reward": -2.623471736907959, "rewards/wrapped_format_reward": 0.75, "step": 997 }, { "completion_length": 500.0, "epoch": 199.6, "grad_norm": 0.5658312439918518, "kl": 2.5670063495635986, "learning_rate": 3.6284918743422424e-06, "loss": 0.1027, "reward": 3.029527187347412, "reward_std": 0.5694175958633423, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7777777910232544, "rewards/wrapped_driving_reward": 0.25174954533576965, "rewards/wrapped_format_reward": 1.0, "step": 998 }, { "completion_length": 500.0, "epoch": 199.8, "grad_norm": 0.5055840611457825, "kl": 2.8639843463897705, "learning_rate": 3.625246102593588e-06, "loss": 0.1146, "reward": -0.350710391998291, "reward_std": 3.675887107849121, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2666666805744171, "rewards/wrapped_driving_reward": -1.7423770427703857, "rewards/wrapped_format_reward": 0.625, "step": 999 }, { "completion_length": 500.0, "epoch": 200.0, "grad_norm": 0.4764675796031952, "kl": 2.366607189178467, "learning_rate": 3.621997950501156e-06, "loss": 0.0947, "reward": 2.3120839595794678, "reward_std": 0.9182683229446411, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.31291601061820984, "rewards/wrapped_format_reward": 0.75, "step": 1000 }, { "completion_length": 305.0, "epoch": 200.2, "grad_norm": 0.5498916506767273, "kl": 2.5197534561157227, "learning_rate": 3.618747424936082e-06, "loss": 0.1008, "reward": 2.472724199295044, "reward_std": 0.10819098353385925, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8371212482452393, "rewards/wrapped_driving_reward": -0.36439698934555054, "rewards/wrapped_format_reward": 1.0, "step": 1001 }, { "completion_length": 500.0, "epoch": 200.4, "grad_norm": 0.4154340624809265, "kl": 1.6854757070541382, "learning_rate": 3.6154945327745223e-06, "loss": 0.0674, "reward": 2.4991588592529297, "reward_std": 0.5005276799201965, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.516964316368103, "rewards/wrapped_driving_reward": 0.23219454288482666, "rewards/wrapped_format_reward": 0.75, "step": 1002 }, { "completion_length": 500.0, "epoch": 200.6, "grad_norm": 0.43667981028556824, "kl": 2.499875068664551, "learning_rate": 3.6122392808976403e-06, "loss": 0.1, "reward": 1.626745343208313, "reward_std": 1.0123025178909302, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.3732547163963318, "rewards/wrapped_format_reward": 0.5, "step": 1003 }, { "completion_length": 422.0, "epoch": 200.8, "grad_norm": 0.5336111187934875, "kl": 2.150123357772827, "learning_rate": 3.608981676191591e-06, "loss": 0.086, "reward": 2.3110246658325195, "reward_std": 0.6167364716529846, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6770833134651184, "rewards/wrapped_driving_reward": -0.24105864763259888, "rewards/wrapped_format_reward": 0.875, "step": 1004 }, { "completion_length": 500.0, "epoch": 201.0, "grad_norm": 0.6543328762054443, "kl": 2.747828245162964, "learning_rate": 3.6057217255475034e-06, "loss": 0.1099, "reward": 2.973466634750366, "reward_std": 0.430820494890213, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.5984667539596558, "rewards/wrapped_format_reward": 0.75, "step": 1005 }, { "completion_length": 231.0, "epoch": 201.2, "grad_norm": 0.6388723850250244, "kl": 1.3368468284606934, "learning_rate": 3.602459435861475e-06, "loss": 0.0535, "reward": 3.277226686477661, "reward_std": 0.6171747446060181, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.5272266864776611, "rewards/wrapped_format_reward": 1.0, "step": 1006 }, { "completion_length": 259.0, "epoch": 201.4, "grad_norm": 0.7153186798095703, "kl": 1.3544353246688843, "learning_rate": 3.599194814034546e-06, "loss": 0.0542, "reward": 2.317756175994873, "reward_std": 0.2976848781108856, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5083333253860474, "rewards/wrapped_driving_reward": -0.19057707488536835, "rewards/wrapped_format_reward": 1.0, "step": 1007 }, { "completion_length": 464.0, "epoch": 201.6, "grad_norm": 0.5300239324569702, "kl": 2.9981822967529297, "learning_rate": 3.595927866972694e-06, "loss": 0.1199, "reward": 2.63146710395813, "reward_std": 0.8353642821311951, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": 0.17313367128372192, "rewards/wrapped_format_reward": 0.75, "step": 1008 }, { "completion_length": 500.0, "epoch": 201.8, "grad_norm": 0.5743690729141235, "kl": 1.3806171417236328, "learning_rate": 3.5926586015868113e-06, "loss": 0.0552, "reward": 1.4746780395507812, "reward_std": 0.49996283650398254, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -0.8169887065887451, "rewards/wrapped_format_reward": 0.625, "step": 1009 }, { "completion_length": 395.0, "epoch": 202.0, "grad_norm": 0.5966888070106506, "kl": 2.2912120819091797, "learning_rate": 3.5893870247926986e-06, "loss": 0.0916, "reward": 2.3945603370666504, "reward_std": 0.1004558652639389, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7166666984558105, "rewards/wrapped_driving_reward": -0.32210636138916016, "rewards/wrapped_format_reward": 1.0, "step": 1010 }, { "completion_length": 281.0, "epoch": 202.2, "grad_norm": 0.5715641379356384, "kl": 0.5928288698196411, "learning_rate": 3.586113143511043e-06, "loss": 0.0237, "reward": 0.7174718379974365, "reward_std": 2.775279998779297, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.606249988079071, "rewards/wrapped_driving_reward": -1.3887779712677002, "rewards/wrapped_format_reward": 0.75, "step": 1011 }, { "completion_length": 500.0, "epoch": 202.4, "grad_norm": 0.5384933948516846, "kl": 2.648242950439453, "learning_rate": 3.582836964667408e-06, "loss": 0.1059, "reward": 1.1033844947814941, "reward_std": 3.072701930999756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4077380895614624, "rewards/wrapped_driving_reward": -0.8043535351753235, "rewards/wrapped_format_reward": 0.75, "step": 1012 }, { "completion_length": 500.0, "epoch": 202.6, "grad_norm": 0.4972879886627197, "kl": 2.7536582946777344, "learning_rate": 3.5795584951922162e-06, "loss": 0.1101, "reward": 1.9187871217727661, "reward_std": 0.7690529227256775, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -0.0812128409743309, "rewards/wrapped_format_reward": 0.75, "step": 1013 }, { "completion_length": 500.0, "epoch": 202.8, "grad_norm": 0.4397309124469757, "kl": 1.8329887390136719, "learning_rate": 3.5762777420207382e-06, "loss": 0.0733, "reward": -0.024667024612426758, "reward_std": 4.044902801513672, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.5871670246124268, "rewards/wrapped_format_reward": 0.625, "step": 1014 }, { "completion_length": 500.0, "epoch": 203.0, "grad_norm": 0.5517240762710571, "kl": 2.2935612201690674, "learning_rate": 3.572994712093073e-06, "loss": 0.0917, "reward": 2.5487308502197266, "reward_std": 0.6641695499420166, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6704545617103577, "rewards/wrapped_driving_reward": 0.12827637791633606, "rewards/wrapped_format_reward": 0.75, "step": 1015 }, { "completion_length": 500.0, "epoch": 203.2, "grad_norm": 0.544585108757019, "kl": 1.0313493013381958, "learning_rate": 3.5697094123541357e-06, "loss": 0.0413, "reward": 2.259660005569458, "reward_std": 0.6325886249542236, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8055555820465088, "rewards/wrapped_driving_reward": 0.07910440862178802, "rewards/wrapped_format_reward": 0.375, "step": 1016 }, { "completion_length": 500.0, "epoch": 203.4, "grad_norm": 0.5948061943054199, "kl": 2.8631389141082764, "learning_rate": 3.566421849753646e-06, "loss": 0.1145, "reward": 1.2803781032562256, "reward_std": 2.872330665588379, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6145833134651184, "rewards/wrapped_driving_reward": -0.834205150604248, "rewards/wrapped_format_reward": 0.75, "step": 1017 }, { "completion_length": 328.0, "epoch": 203.6, "grad_norm": 0.5911474823951721, "kl": 1.5049463510513306, "learning_rate": 3.563132031246108e-06, "loss": 0.0602, "reward": 2.733525276184082, "reward_std": 0.6236773729324341, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5639880895614624, "rewards/wrapped_driving_reward": 0.29453718662261963, "rewards/wrapped_format_reward": 0.875, "step": 1018 }, { "completion_length": 500.0, "epoch": 203.8, "grad_norm": 0.45593199133872986, "kl": 2.6516401767730713, "learning_rate": 3.559839963790797e-06, "loss": 0.1061, "reward": 2.222066879272461, "reward_std": 0.5652444362640381, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7916666865348816, "rewards/wrapped_driving_reward": -0.44459983706474304, "rewards/wrapped_format_reward": 0.875, "step": 1019 }, { "completion_length": 243.0, "epoch": 204.0, "grad_norm": 0.9958624839782715, "kl": 1.805721402168274, "learning_rate": 3.556545654351749e-06, "loss": 0.0722, "reward": 1.0059916973114014, "reward_std": 3.3596315383911133, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45625001192092896, "rewards/wrapped_driving_reward": -0.9502584338188171, "rewards/wrapped_format_reward": 0.75, "step": 1020 }, { "completion_length": 293.0, "epoch": 204.2, "grad_norm": 0.8352221846580505, "kl": 1.2511953115463257, "learning_rate": 3.55324910989774e-06, "loss": 0.05, "reward": 1.6019468307495117, "reward_std": 3.7398316860198975, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.3980531692504883, "rewards/wrapped_format_reward": 0.75, "step": 1021 }, { "completion_length": 406.0, "epoch": 204.4, "grad_norm": 0.5668026804924011, "kl": 2.387995481491089, "learning_rate": 3.549950337402274e-06, "loss": 0.0955, "reward": 2.5478453636169434, "reward_std": 0.23899130523204803, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8583332896232605, "rewards/wrapped_driving_reward": -0.3104879856109619, "rewards/wrapped_format_reward": 1.0, "step": 1022 }, { "completion_length": 500.0, "epoch": 204.6, "grad_norm": 0.32792842388153076, "kl": 2.5008537769317627, "learning_rate": 3.5466493438435707e-06, "loss": 0.1, "reward": 0.9252380132675171, "reward_std": 2.6378915309906006, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.574999988079071, "rewards/wrapped_driving_reward": -1.1497619152069092, "rewards/wrapped_format_reward": 0.75, "step": 1023 }, { "completion_length": 500.0, "epoch": 204.8, "grad_norm": 0.6833317279815674, "kl": 2.799790143966675, "learning_rate": 3.543346136204545e-06, "loss": 0.112, "reward": 2.671267032623291, "reward_std": 0.6479148864746094, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7628968954086304, "rewards/wrapped_driving_reward": 0.15837009251117706, "rewards/wrapped_format_reward": 0.75, "step": 1024 }, { "completion_length": 500.0, "epoch": 205.0, "grad_norm": 0.6691253185272217, "kl": 1.3431271314620972, "learning_rate": 3.5400407214727983e-06, "loss": 0.0537, "reward": -0.398193359375, "reward_std": 2.553177833557129, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3590909242630005, "rewards/wrapped_driving_reward": -2.00728440284729, "rewards/wrapped_format_reward": 0.5, "step": 1025 }, { "completion_length": 410.0, "epoch": 205.2, "grad_norm": 0.5734631419181824, "kl": 2.421994924545288, "learning_rate": 3.536733106640598e-06, "loss": 0.0969, "reward": 2.102944850921631, "reward_std": 0.7066345810890198, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7842261791229248, "rewards/wrapped_driving_reward": -0.3062814176082611, "rewards/wrapped_format_reward": 0.625, "step": 1026 }, { "completion_length": 500.0, "epoch": 205.4, "grad_norm": 0.5254228115081787, "kl": 2.199488401412964, "learning_rate": 3.5334232987048677e-06, "loss": 0.088, "reward": 0.8647441267967224, "reward_std": 3.3761532306671143, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.48750001192092896, "rewards/wrapped_driving_reward": -0.8727558851242065, "rewards/wrapped_format_reward": 0.5, "step": 1027 }, { "completion_length": 278.0, "epoch": 205.6, "grad_norm": 0.6191807985305786, "kl": 2.1929683685302734, "learning_rate": 3.5301113046671717e-06, "loss": 0.0877, "reward": 3.5454652309417725, "reward_std": 0.32205861806869507, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.8204653263092041, "rewards/wrapped_format_reward": 1.0, "step": 1028 }, { "completion_length": 500.0, "epoch": 205.8, "grad_norm": 0.557322084903717, "kl": 2.0848100185394287, "learning_rate": 3.5267971315336936e-06, "loss": 0.0834, "reward": 2.1687307357788086, "reward_std": 0.3615880012512207, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3745535612106323, "rewards/wrapped_driving_reward": 0.1691771000623703, "rewards/wrapped_format_reward": 0.625, "step": 1029 }, { "completion_length": 500.0, "epoch": 206.0, "grad_norm": 0.6741705536842346, "kl": 2.3363914489746094, "learning_rate": 3.5234807863152316e-06, "loss": 0.0935, "reward": -0.48754245042800903, "reward_std": 2.5961735248565674, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5714285373687744, "rewards/wrapped_driving_reward": -2.5589711666107178, "rewards/wrapped_format_reward": 0.75, "step": 1030 }, { "completion_length": 500.0, "epoch": 206.2, "grad_norm": 14.736827850341797, "kl": 2.974747657775879, "learning_rate": 3.5201622760271768e-06, "loss": 0.119, "reward": 2.5500786304473877, "reward_std": 0.35873469710350037, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9583333134651184, "rewards/wrapped_driving_reward": 0.09174539148807526, "rewards/wrapped_format_reward": 0.5, "step": 1031 }, { "completion_length": 316.0, "epoch": 206.4, "grad_norm": 0.7986094951629639, "kl": 2.1568026542663574, "learning_rate": 3.516841607689501e-06, "loss": 0.0863, "reward": 2.485926389694214, "reward_std": 0.4706442654132843, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5909721851348877, "rewards/wrapped_driving_reward": 0.14495417475700378, "rewards/wrapped_format_reward": 0.75, "step": 1032 }, { "completion_length": 500.0, "epoch": 206.6, "grad_norm": 0.4788648784160614, "kl": 1.3308790922164917, "learning_rate": 3.5135187883267394e-06, "loss": 0.0532, "reward": 2.1260054111480713, "reward_std": 0.7040643095970154, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6194444894790649, "rewards/wrapped_driving_reward": -0.2434389740228653, "rewards/wrapped_format_reward": 0.75, "step": 1033 }, { "completion_length": 474.0, "epoch": 206.8, "grad_norm": 0.5434154868125916, "kl": 1.9080770015716553, "learning_rate": 3.5101938249679794e-06, "loss": 0.0763, "reward": 2.568082571029663, "reward_std": 0.8549544811248779, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.056917455047369, "rewards/wrapped_format_reward": 0.75, "step": 1034 }, { "completion_length": 500.0, "epoch": 207.0, "grad_norm": 0.5829246044158936, "kl": 1.8311023712158203, "learning_rate": 3.5068667246468437e-06, "loss": 0.0732, "reward": 3.3914947509765625, "reward_std": 0.6190906763076782, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8229166865348816, "rewards/wrapped_driving_reward": 0.8185781240463257, "rewards/wrapped_format_reward": 0.75, "step": 1035 }, { "completion_length": 375.0, "epoch": 207.2, "grad_norm": 0.7728428840637207, "kl": 2.3411638736724854, "learning_rate": 3.503537494401473e-06, "loss": 0.0936, "reward": 2.491985321044922, "reward_std": 0.6704630851745605, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": 0.023235168308019638, "rewards/wrapped_format_reward": 0.75, "step": 1036 }, { "completion_length": 500.0, "epoch": 207.4, "grad_norm": 0.5153346657752991, "kl": 0.7789537310600281, "learning_rate": 3.500206141274518e-06, "loss": 0.0312, "reward": 0.6475763320922852, "reward_std": 3.1472392082214355, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.643750011920929, "rewards/wrapped_driving_reward": -1.121173620223999, "rewards/wrapped_format_reward": 0.375, "step": 1037 }, { "completion_length": 500.0, "epoch": 207.6, "grad_norm": 0.5977131128311157, "kl": 0.8053792119026184, "learning_rate": 3.496872672313116e-06, "loss": 0.0322, "reward": 0.3766722083091736, "reward_std": 2.9514622688293457, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.47853535413742065, "rewards/wrapped_driving_reward": -1.101863145828247, "rewards/wrapped_format_reward": 0.25, "step": 1038 }, { "completion_length": 435.0, "epoch": 207.8, "grad_norm": 0.5188892483711243, "kl": 1.6353070735931396, "learning_rate": 3.4935370945688823e-06, "loss": 0.0654, "reward": 3.2971596717834473, "reward_std": 0.7382159233093262, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8035714626312256, "rewards/wrapped_driving_reward": 0.7435882687568665, "rewards/wrapped_format_reward": 0.75, "step": 1039 }, { "completion_length": 500.0, "epoch": 208.0, "grad_norm": 0.5447797775268555, "kl": 1.4209266901016235, "learning_rate": 3.4901994150978926e-06, "loss": 0.0568, "reward": 1.1221299171447754, "reward_std": 3.5077431201934814, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.0028700828552246, "rewards/wrapped_format_reward": 0.75, "step": 1040 }, { "completion_length": 500.0, "epoch": 208.2, "grad_norm": 0.5244053602218628, "kl": 1.9224073886871338, "learning_rate": 3.486859640960668e-06, "loss": 0.0769, "reward": -2.001260757446289, "reward_std": 3.0251517295837402, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.001260757446289, "rewards/wrapped_format_reward": 0.5, "step": 1041 }, { "completion_length": 500.0, "epoch": 208.4, "grad_norm": 0.4434574842453003, "kl": 2.8597450256347656, "learning_rate": 3.483517779222163e-06, "loss": 0.1144, "reward": 2.362471103668213, "reward_std": 0.4425557851791382, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7916666865348816, "rewards/wrapped_driving_reward": -0.17919573187828064, "rewards/wrapped_format_reward": 0.75, "step": 1042 }, { "completion_length": 500.0, "epoch": 208.6, "grad_norm": 0.2913912832736969, "kl": 3.6161417961120605, "learning_rate": 3.480173836951746e-06, "loss": 0.1446, "reward": 3.4008288383483887, "reward_std": 0.289678692817688, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": 0.817495584487915, "rewards/wrapped_format_reward": 1.0, "step": 1043 }, { "completion_length": 462.0, "epoch": 208.8, "grad_norm": 0.4888365566730499, "kl": 2.206451177597046, "learning_rate": 3.476827821223184e-06, "loss": 0.0883, "reward": 1.409515619277954, "reward_std": 2.959552049636841, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5359848737716675, "rewards/wrapped_driving_reward": -0.8764691948890686, "rewards/wrapped_format_reward": 1.0, "step": 1044 }, { "completion_length": 500.0, "epoch": 209.0, "grad_norm": 0.6058388352394104, "kl": 1.874618411064148, "learning_rate": 3.4734797391146384e-06, "loss": 0.075, "reward": 0.9009166955947876, "reward_std": 3.2750093936920166, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3583333492279053, "rewards/wrapped_driving_reward": -0.8324167132377625, "rewards/wrapped_format_reward": 0.625, "step": 1045 }, { "completion_length": 215.0, "epoch": 209.2, "grad_norm": 0.8203223943710327, "kl": 1.0305882692337036, "learning_rate": 3.4701295977086326e-06, "loss": 0.0412, "reward": 2.5865678787231445, "reward_std": 0.26674625277519226, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7017857432365417, "rewards/wrapped_driving_reward": 0.009782092645764351, "rewards/wrapped_format_reward": 0.875, "step": 1046 }, { "completion_length": 403.0, "epoch": 209.4, "grad_norm": 0.623798668384552, "kl": 2.437943696975708, "learning_rate": 3.466777404092052e-06, "loss": 0.0975, "reward": 2.3178341388702393, "reward_std": 0.7038171887397766, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7916666865348816, "rewards/wrapped_driving_reward": -0.09883256256580353, "rewards/wrapped_format_reward": 0.625, "step": 1047 }, { "completion_length": 473.0, "epoch": 209.6, "grad_norm": 0.500562846660614, "kl": 1.3325062990188599, "learning_rate": 3.4634231653561213e-06, "loss": 0.0533, "reward": 1.639136552810669, "reward_std": 0.8360922336578369, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.706250011920929, "rewards/wrapped_driving_reward": -0.31711357831954956, "rewards/wrapped_format_reward": 0.25, "step": 1048 }, { "completion_length": 500.0, "epoch": 209.8, "grad_norm": 0.7569959759712219, "kl": 2.9517664909362793, "learning_rate": 3.460066888596391e-06, "loss": 0.1181, "reward": 1.878430962562561, "reward_std": 3.2616074085235596, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.3715689778327942, "rewards/wrapped_format_reward": 1.0, "step": 1049 }, { "completion_length": 500.0, "epoch": 210.0, "grad_norm": 0.7393296360969543, "kl": 2.2780613899230957, "learning_rate": 3.4567085809127247e-06, "loss": 0.0911, "reward": 0.7985653281211853, "reward_std": 3.2347371578216553, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -1.0556013584136963, "rewards/wrapped_format_reward": 0.75, "step": 1050 }, { "completion_length": 500.0, "epoch": 210.2, "grad_norm": 0.35780608654022217, "kl": 2.8597917556762695, "learning_rate": 3.453348249409281e-06, "loss": 0.1144, "reward": 0.971619725227356, "reward_std": 2.6560258865356445, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.840880274772644, "rewards/wrapped_format_reward": 0.5, "step": 1051 }, { "completion_length": 377.0, "epoch": 210.4, "grad_norm": 0.526102602481842, "kl": 1.6192888021469116, "learning_rate": 3.4499859011944982e-06, "loss": 0.0648, "reward": 2.530740261077881, "reward_std": 0.609768807888031, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.699999988079071, "rewards/wrapped_driving_reward": -0.04425982013344765, "rewards/wrapped_format_reward": 0.875, "step": 1052 }, { "completion_length": 500.0, "epoch": 210.6, "grad_norm": 0.41008394956588745, "kl": 2.1589150428771973, "learning_rate": 3.4466215433810827e-06, "loss": 0.0864, "reward": 0.914588987827301, "reward_std": 2.711768865585327, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4791666865348816, "rewards/wrapped_driving_reward": -1.064577579498291, "rewards/wrapped_format_reward": 0.75, "step": 1053 }, { "completion_length": 500.0, "epoch": 210.8, "grad_norm": 0.6899422407150269, "kl": 0.5674111247062683, "learning_rate": 3.4432551830859928e-06, "loss": 0.0227, "reward": 2.3185198307037354, "reward_std": 0.7490935921669006, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7291666269302368, "rewards/wrapped_driving_reward": 0.46435317397117615, "rewards/wrapped_format_reward": 0.125, "step": 1054 }, { "completion_length": 304.0, "epoch": 211.0, "grad_norm": 0.7471367716789246, "kl": 1.7172483205795288, "learning_rate": 3.4398868274304203e-06, "loss": 0.0687, "reward": 2.7969202995300293, "reward_std": 0.25754040479660034, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7875000238418579, "rewards/wrapped_driving_reward": 0.2594204843044281, "rewards/wrapped_format_reward": 0.75, "step": 1055 }, { "completion_length": 500.0, "epoch": 211.2, "grad_norm": 0.46859121322631836, "kl": 2.476396322250366, "learning_rate": 3.436516483539781e-06, "loss": 0.0991, "reward": 2.941046714782715, "reward_std": 0.7439018487930298, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6114583015441895, "rewards/wrapped_driving_reward": 0.32958826422691345, "rewards/wrapped_format_reward": 1.0, "step": 1056 }, { "completion_length": 500.0, "epoch": 211.4, "grad_norm": 0.6604138016700745, "kl": 1.533609390258789, "learning_rate": 3.433144158543692e-06, "loss": 0.0613, "reward": 0.7513000965118408, "reward_std": 3.1933209896087646, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4035714268684387, "rewards/wrapped_driving_reward": -1.0272712707519531, "rewards/wrapped_format_reward": 0.625, "step": 1057 }, { "completion_length": 500.0, "epoch": 211.6, "grad_norm": 0.44344985485076904, "kl": 1.624708890914917, "learning_rate": 3.4297698595759665e-06, "loss": 0.065, "reward": 1.8965282440185547, "reward_std": 0.48195382952690125, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": -0.40347182750701904, "rewards/wrapped_format_reward": 0.625, "step": 1058 }, { "completion_length": 500.0, "epoch": 211.8, "grad_norm": 0.842307984828949, "kl": 2.328880548477173, "learning_rate": 3.426393593774591e-06, "loss": 0.0932, "reward": 2.489910364151001, "reward_std": 0.5331727862358093, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6520562767982483, "rewards/wrapped_driving_reward": 0.08785391598939896, "rewards/wrapped_format_reward": 0.75, "step": 1059 }, { "completion_length": 297.0, "epoch": 212.0, "grad_norm": 0.8044981360435486, "kl": 0.5240013003349304, "learning_rate": 3.4230153682817112e-06, "loss": 0.021, "reward": 1.2353980541229248, "reward_std": 2.903379201889038, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5568181872367859, "rewards/wrapped_driving_reward": -0.9464200735092163, "rewards/wrapped_format_reward": 0.875, "step": 1060 }, { "completion_length": 500.0, "epoch": 212.2, "grad_norm": 0.5891744494438171, "kl": 1.850303292274475, "learning_rate": 3.4196351902436213e-06, "loss": 0.074, "reward": 2.277845621109009, "reward_std": 0.6284061074256897, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6343749761581421, "rewards/wrapped_driving_reward": 0.01847068779170513, "rewards/wrapped_format_reward": 0.625, "step": 1061 }, { "completion_length": 500.0, "epoch": 212.4, "grad_norm": 0.7233597636222839, "kl": 2.9670701026916504, "learning_rate": 3.4162530668107435e-06, "loss": 0.1187, "reward": 2.2532575130462646, "reward_std": 0.9563757181167603, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": 0.09700753539800644, "rewards/wrapped_format_reward": 0.5, "step": 1062 }, { "completion_length": 287.0, "epoch": 212.6, "grad_norm": 0.7498565316200256, "kl": 1.2739527225494385, "learning_rate": 3.4128690051376167e-06, "loss": 0.051, "reward": 2.430758237838745, "reward_std": 0.6825680732727051, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5437500476837158, "rewards/wrapped_driving_reward": 0.13700829446315765, "rewards/wrapped_format_reward": 0.75, "step": 1063 }, { "completion_length": 500.0, "epoch": 212.8, "grad_norm": 0.6844552159309387, "kl": 1.6506372690200806, "learning_rate": 3.409483012382879e-06, "loss": 0.066, "reward": 0.3284950256347656, "reward_std": 2.604048728942871, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5258021354675293, "rewards/wrapped_driving_reward": -1.8223071098327637, "rewards/wrapped_format_reward": 0.875, "step": 1064 }, { "completion_length": 423.0, "epoch": 213.0, "grad_norm": 1.679571509361267, "kl": 2.1014058589935303, "learning_rate": 3.406095095709254e-06, "loss": 0.0841, "reward": 0.9278544783592224, "reward_std": 2.7194809913635254, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46562498807907104, "rewards/wrapped_driving_reward": -0.9127705693244934, "rewards/wrapped_format_reward": 0.625, "step": 1065 }, { "completion_length": 240.0, "epoch": 213.2, "grad_norm": 0.6710036396980286, "kl": 1.5790921449661255, "learning_rate": 3.4027052622835365e-06, "loss": 0.0632, "reward": 1.5165295600891113, "reward_std": 3.016507148742676, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6071428656578064, "rewards/wrapped_driving_reward": -0.8406133651733398, "rewards/wrapped_format_reward": 1.0, "step": 1066 }, { "completion_length": 370.0, "epoch": 213.4, "grad_norm": 0.7575778961181641, "kl": 1.0548183917999268, "learning_rate": 3.3993135192765726e-06, "loss": 0.0422, "reward": 0.9999492168426514, "reward_std": 3.340517997741699, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.39375001192092896, "rewards/wrapped_driving_reward": -0.8938007354736328, "rewards/wrapped_format_reward": 0.75, "step": 1067 }, { "completion_length": 248.0, "epoch": 213.6, "grad_norm": 0.9177556037902832, "kl": 0.8549500703811646, "learning_rate": 3.39591987386325e-06, "loss": 0.0342, "reward": 3.167234420776367, "reward_std": 0.2603113055229187, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.598557710647583, "rewards/wrapped_driving_reward": 0.5686768293380737, "rewards/wrapped_format_reward": 1.0, "step": 1068 }, { "completion_length": 500.0, "epoch": 213.8, "grad_norm": 0.4174642860889435, "kl": 1.9004242420196533, "learning_rate": 3.392524333222484e-06, "loss": 0.076, "reward": 1.6922800540924072, "reward_std": 0.5809571146965027, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7104166746139526, "rewards/wrapped_driving_reward": -0.6431365609169006, "rewards/wrapped_format_reward": 0.625, "step": 1069 }, { "completion_length": 500.0, "epoch": 214.0, "grad_norm": 0.5569381713867188, "kl": 2.7133734226226807, "learning_rate": 3.389126904537192e-06, "loss": 0.1085, "reward": 0.9743454456329346, "reward_std": 3.318183422088623, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -1.1298210620880127, "rewards/wrapped_format_reward": 0.75, "step": 1070 }, { "completion_length": 500.0, "epoch": 214.2, "grad_norm": 0.5514374375343323, "kl": 1.5457887649536133, "learning_rate": 3.3857275949942896e-06, "loss": 0.0618, "reward": 3.0973973274230957, "reward_std": 0.33971819281578064, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7653409242630005, "rewards/wrapped_driving_reward": 0.7070566415786743, "rewards/wrapped_format_reward": 0.625, "step": 1071 }, { "completion_length": 500.0, "epoch": 214.4, "grad_norm": 0.4922221004962921, "kl": 2.018758535385132, "learning_rate": 3.3823264117846722e-06, "loss": 0.0808, "reward": 1.13262939453125, "reward_std": 3.1093437671661377, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4611110985279083, "rewards/wrapped_driving_reward": -0.8284817337989807, "rewards/wrapped_format_reward": 0.75, "step": 1072 }, { "completion_length": 491.0, "epoch": 214.6, "grad_norm": 0.5854265689849854, "kl": 2.7815158367156982, "learning_rate": 3.3789233621031976e-06, "loss": 0.1113, "reward": 2.0108320713043213, "reward_std": 0.5877364873886108, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5416666865348816, "rewards/wrapped_driving_reward": -0.15583454072475433, "rewards/wrapped_format_reward": 0.625, "step": 1073 }, { "completion_length": 500.0, "epoch": 214.8, "grad_norm": 0.5536097884178162, "kl": 1.9036718606948853, "learning_rate": 3.375518453148669e-06, "loss": 0.0761, "reward": 0.8932580947875977, "reward_std": 3.3474950790405273, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5763888955116272, "rewards/wrapped_driving_reward": -0.8081308603286743, "rewards/wrapped_format_reward": 0.375, "step": 1074 }, { "completion_length": 500.0, "epoch": 215.0, "grad_norm": 0.8610581159591675, "kl": 1.8210433721542358, "learning_rate": 3.3721116921238273e-06, "loss": 0.0728, "reward": 0.4815085530281067, "reward_std": 2.6779685020446777, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.574999988079071, "rewards/wrapped_driving_reward": -1.4684913158416748, "rewards/wrapped_format_reward": 0.625, "step": 1075 }, { "completion_length": 500.0, "epoch": 215.2, "grad_norm": 0.512554943561554, "kl": 1.6055521965026855, "learning_rate": 3.3687030862353286e-06, "loss": 0.0642, "reward": -0.5987721085548401, "reward_std": 3.9364922046661377, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -1.8279387950897217, "rewards/wrapped_format_reward": 0.375, "step": 1076 }, { "completion_length": 500.0, "epoch": 215.4, "grad_norm": 0.4070813059806824, "kl": 2.273082733154297, "learning_rate": 3.3652926426937327e-06, "loss": 0.0909, "reward": 0.7826076745986938, "reward_std": 2.5797641277313232, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3995535671710968, "rewards/wrapped_driving_reward": -0.9919459223747253, "rewards/wrapped_format_reward": 0.625, "step": 1077 }, { "completion_length": 434.0, "epoch": 215.6, "grad_norm": 0.6026346683502197, "kl": 2.101696729660034, "learning_rate": 3.361880368713486e-06, "loss": 0.0841, "reward": 0.5555827021598816, "reward_std": 3.045008897781372, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": -1.6194173097610474, "rewards/wrapped_format_reward": 0.75, "step": 1078 }, { "completion_length": 500.0, "epoch": 215.8, "grad_norm": 0.5466208457946777, "kl": 2.214514970779419, "learning_rate": 3.3584662715129067e-06, "loss": 0.0886, "reward": 0.738072395324707, "reward_std": 3.174536943435669, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -1.1994274854660034, "rewards/wrapped_format_reward": 0.625, "step": 1079 }, { "completion_length": 238.0, "epoch": 216.0, "grad_norm": 0.643779993057251, "kl": 1.95009446144104, "learning_rate": 3.3550503583141726e-06, "loss": 0.078, "reward": 1.7485800981521606, "reward_std": 3.168154001235962, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4229166805744171, "rewards/wrapped_driving_reward": -0.42433667182922363, "rewards/wrapped_format_reward": 1.0, "step": 1080 }, { "completion_length": 500.0, "epoch": 216.2, "grad_norm": 0.4136994481086731, "kl": 2.495558738708496, "learning_rate": 3.3516326363432983e-06, "loss": 0.0998, "reward": 2.4836232662200928, "reward_std": 0.06431353092193604, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": 0.014873265288770199, "rewards/wrapped_format_reward": 0.875, "step": 1081 }, { "completion_length": 500.0, "epoch": 216.4, "grad_norm": 0.6237711310386658, "kl": 2.377938747406006, "learning_rate": 3.348213112830128e-06, "loss": 0.0951, "reward": 2.8220295906066895, "reward_std": 0.46597862243652344, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6708333492279053, "rewards/wrapped_driving_reward": 0.15119624137878418, "rewards/wrapped_format_reward": 1.0, "step": 1082 }, { "completion_length": 500.0, "epoch": 216.6, "grad_norm": 0.5950479507446289, "kl": 2.0295755863189697, "learning_rate": 3.344791795008318e-06, "loss": 0.0812, "reward": -0.26134294271469116, "reward_std": 3.7655556201934814, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.380952388048172, "rewards/wrapped_driving_reward": -1.7672953605651855, "rewards/wrapped_format_reward": 0.625, "step": 1083 }, { "completion_length": 335.0, "epoch": 216.8, "grad_norm": 0.6945561170578003, "kl": 2.158395290374756, "learning_rate": 3.3413686901153164e-06, "loss": 0.0863, "reward": 2.4333994388580322, "reward_std": 0.28242501616477966, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7008928656578064, "rewards/wrapped_driving_reward": -0.14249330759048462, "rewards/wrapped_format_reward": 0.875, "step": 1084 }, { "completion_length": 246.0, "epoch": 217.0, "grad_norm": 0.8249087333679199, "kl": 1.0083154439926147, "learning_rate": 3.337943805392354e-06, "loss": 0.0403, "reward": 3.6922755241394043, "reward_std": 0.1821533590555191, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8385416865348816, "rewards/wrapped_driving_reward": 0.8537338972091675, "rewards/wrapped_format_reward": 1.0, "step": 1085 }, { "completion_length": 351.0, "epoch": 217.2, "grad_norm": 1.7654740810394287, "kl": 1.004237413406372, "learning_rate": 3.3345171480844275e-06, "loss": 0.0402, "reward": -0.8608919978141785, "reward_std": 3.675323009490967, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.048391819000244, "rewards/wrapped_format_reward": 0.25, "step": 1086 }, { "completion_length": 500.0, "epoch": 217.4, "grad_norm": 0.6707190275192261, "kl": 0.3917839527130127, "learning_rate": 3.3310887254402816e-06, "loss": 0.0157, "reward": 1.1094677448272705, "reward_std": 3.447364568710327, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.7655322551727295, "rewards/wrapped_format_reward": 0.5, "step": 1087 }, { "completion_length": 432.0, "epoch": 217.6, "grad_norm": 0.43202340602874756, "kl": 2.4471139907836914, "learning_rate": 3.3276585447123957e-06, "loss": 0.0979, "reward": 2.983943223953247, "reward_std": 0.3320766091346741, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5568181872367859, "rewards/wrapped_driving_reward": 0.8021249771118164, "rewards/wrapped_format_reward": 0.625, "step": 1088 }, { "completion_length": 500.0, "epoch": 217.8, "grad_norm": 0.33002638816833496, "kl": 2.312364339828491, "learning_rate": 3.3242266131569685e-06, "loss": 0.0925, "reward": 1.0883586406707764, "reward_std": 2.746081829071045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5064103007316589, "rewards/wrapped_driving_reward": -0.9180518388748169, "rewards/wrapped_format_reward": 0.75, "step": 1089 }, { "completion_length": 500.0, "epoch": 218.0, "grad_norm": 0.26137205958366394, "kl": 2.3020501136779785, "learning_rate": 3.3207929380339034e-06, "loss": 0.0921, "reward": 2.5714211463928223, "reward_std": 0.22478428483009338, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": -0.2410787045955658, "rewards/wrapped_format_reward": 1.0, "step": 1090 }, { "completion_length": 356.0, "epoch": 218.2, "grad_norm": 0.41147956252098083, "kl": 2.56846022605896, "learning_rate": 3.31735752660679e-06, "loss": 0.1027, "reward": 2.705118179321289, "reward_std": 0.36867934465408325, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7750000357627869, "rewards/wrapped_driving_reward": -0.06988176703453064, "rewards/wrapped_format_reward": 1.0, "step": 1091 }, { "completion_length": 500.0, "epoch": 218.4, "grad_norm": 0.35654011368751526, "kl": 2.531102418899536, "learning_rate": 3.313920386142892e-06, "loss": 0.1012, "reward": 1.0521018505096436, "reward_std": 2.1977527141571045, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.890625, "rewards/wrapped_driving_reward": -1.7135231494903564, "rewards/wrapped_format_reward": 0.875, "step": 1092 }, { "completion_length": 311.0, "epoch": 218.6, "grad_norm": 0.5261901617050171, "kl": 2.2694334983825684, "learning_rate": 3.3104815239131315e-06, "loss": 0.0908, "reward": 2.8872218132019043, "reward_std": 0.1508374661207199, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": 0.01222193893045187, "rewards/wrapped_format_reward": 1.0, "step": 1093 }, { "completion_length": 500.0, "epoch": 218.8, "grad_norm": 0.5256309509277344, "kl": 2.2685375213623047, "learning_rate": 3.3070409471920726e-06, "loss": 0.0907, "reward": 1.2930638790130615, "reward_std": 3.546462059020996, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45681819319725037, "rewards/wrapped_driving_reward": -0.6637543439865112, "rewards/wrapped_format_reward": 0.75, "step": 1094 }, { "completion_length": 209.0, "epoch": 219.0, "grad_norm": 0.8069453835487366, "kl": 0.9102882146835327, "learning_rate": 3.303598663257904e-06, "loss": 0.0364, "reward": 2.114650011062622, "reward_std": 0.3360171318054199, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.47291669249534607, "rewards/wrapped_driving_reward": -0.23326672613620758, "rewards/wrapped_format_reward": 0.875, "step": 1095 }, { "completion_length": 500.0, "epoch": 219.2, "grad_norm": 0.4966677725315094, "kl": 1.5107797384262085, "learning_rate": 3.300154679392429e-06, "loss": 0.0604, "reward": 3.4166953563690186, "reward_std": 0.13869397342205048, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6194444298744202, "rewards/wrapped_driving_reward": 0.7972507476806641, "rewards/wrapped_format_reward": 1.0, "step": 1096 }, { "completion_length": 285.0, "epoch": 219.4, "grad_norm": 0.670080840587616, "kl": 1.9777253866195679, "learning_rate": 3.2967090028810455e-06, "loss": 0.0791, "reward": 0.8169577717781067, "reward_std": 2.669731616973877, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": -1.3288755416870117, "rewards/wrapped_format_reward": 0.75, "step": 1097 }, { "completion_length": 488.0, "epoch": 219.6, "grad_norm": 0.4334641396999359, "kl": 1.8001562356948853, "learning_rate": 3.293261641012731e-06, "loss": 0.072, "reward": 2.4559478759765625, "reward_std": 0.5525669455528259, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7537878751754761, "rewards/wrapped_driving_reward": -0.17284002900123596, "rewards/wrapped_format_reward": 0.875, "step": 1098 }, { "completion_length": 459.0, "epoch": 219.8, "grad_norm": 0.46266570687294006, "kl": 2.7362425327301025, "learning_rate": 3.289812601080029e-06, "loss": 0.1094, "reward": 2.8137500286102295, "reward_std": 0.48922643065452576, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6166666746139526, "rewards/wrapped_driving_reward": 0.19708330929279327, "rewards/wrapped_format_reward": 1.0, "step": 1099 }, { "completion_length": 371.0, "epoch": 220.0, "grad_norm": 0.6102287769317627, "kl": 1.650485634803772, "learning_rate": 3.2863618903790346e-06, "loss": 0.066, "reward": 0.9878777265548706, "reward_std": 3.3531715869903564, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6583333015441895, "rewards/wrapped_driving_reward": -0.9204555153846741, "rewards/wrapped_format_reward": 0.5, "step": 1100 }, { "completion_length": 241.0, "epoch": 220.2, "grad_norm": 0.7990890741348267, "kl": 0.8699340224266052, "learning_rate": 3.282909516209374e-06, "loss": 0.0348, "reward": 2.3980917930603027, "reward_std": 0.32275962829589844, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4702381193637848, "rewards/wrapped_driving_reward": 0.052853815257549286, "rewards/wrapped_format_reward": 0.875, "step": 1101 }, { "completion_length": 402.0, "epoch": 220.4, "grad_norm": 0.4910404682159424, "kl": 2.570774793624878, "learning_rate": 3.279455485874195e-06, "loss": 0.1028, "reward": 1.999456524848938, "reward_std": 0.8922026753425598, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.765625, "rewards/wrapped_driving_reward": -0.641168475151062, "rewards/wrapped_format_reward": 0.875, "step": 1102 }, { "completion_length": 500.0, "epoch": 220.6, "grad_norm": 0.5606198906898499, "kl": 1.29267418384552, "learning_rate": 3.2759998066801475e-06, "loss": 0.0517, "reward": 0.22187107801437378, "reward_std": 2.864499092102051, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -1.4968788623809814, "rewards/wrapped_format_reward": 0.5, "step": 1103 }, { "completion_length": 500.0, "epoch": 220.8, "grad_norm": 0.44967177510261536, "kl": 2.5338563919067383, "learning_rate": 3.272542485937369e-06, "loss": 0.1014, "reward": 2.190580368041992, "reward_std": 0.5135902762413025, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7916666865348816, "rewards/wrapped_driving_reward": -0.22608643770217896, "rewards/wrapped_format_reward": 0.625, "step": 1104 }, { "completion_length": 403.0, "epoch": 221.0, "grad_norm": 0.507538378238678, "kl": 2.479355573654175, "learning_rate": 3.269083530959471e-06, "loss": 0.0992, "reward": 2.641688346862793, "reward_std": 1.0113797187805176, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": 0.22502171993255615, "rewards/wrapped_format_reward": 0.75, "step": 1105 }, { "completion_length": 500.0, "epoch": 221.2, "grad_norm": 0.5035107731819153, "kl": 2.3069989681243896, "learning_rate": 3.2656229490635205e-06, "loss": 0.0923, "reward": 2.155073642730713, "reward_std": 0.7193397283554077, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6354166865348816, "rewards/wrapped_driving_reward": 0.01965704932808876, "rewards/wrapped_format_reward": 0.5, "step": 1106 }, { "completion_length": 500.0, "epoch": 221.4, "grad_norm": 0.6406556367874146, "kl": 0.8870555758476257, "learning_rate": 3.2621607475700272e-06, "loss": 0.0355, "reward": -0.8762617707252502, "reward_std": 3.06488299369812, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -2.230428457260132, "rewards/wrapped_format_reward": 0.5, "step": 1107 }, { "completion_length": 500.0, "epoch": 221.6, "grad_norm": 0.6188851594924927, "kl": 1.7303296327590942, "learning_rate": 3.258696933802927e-06, "loss": 0.0692, "reward": 1.3352479934692383, "reward_std": 2.9083144664764404, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.538095235824585, "rewards/wrapped_driving_reward": -0.9528472423553467, "rewards/wrapped_format_reward": 1.0, "step": 1108 }, { "completion_length": 500.0, "epoch": 221.8, "grad_norm": 0.4737381637096405, "kl": 2.4472341537475586, "learning_rate": 3.255231515089565e-06, "loss": 0.0979, "reward": 2.456052303314209, "reward_std": 0.3459714949131012, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.3356144428253174, "rewards/wrapped_format_reward": 0.875, "step": 1109 }, { "completion_length": 500.0, "epoch": 222.0, "grad_norm": 0.48130306601524353, "kl": 2.732863664627075, "learning_rate": 3.2517644987606827e-06, "loss": 0.1093, "reward": -0.381786972284317, "reward_std": 2.440122604370117, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5707070827484131, "rewards/wrapped_driving_reward": -2.577493906021118, "rewards/wrapped_format_reward": 0.875, "step": 1110 }, { "completion_length": 500.0, "epoch": 222.2, "grad_norm": 0.49456945061683655, "kl": 1.7387335300445557, "learning_rate": 3.248295892150402e-06, "loss": 0.0695, "reward": 2.4018759727478027, "reward_std": 0.8662802577018738, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6318181753158569, "rewards/wrapped_driving_reward": 0.14505766332149506, "rewards/wrapped_format_reward": 0.625, "step": 1111 }, { "completion_length": 500.0, "epoch": 222.4, "grad_norm": 0.8731140494346619, "kl": 1.7846884727478027, "learning_rate": 3.244825702596205e-06, "loss": 0.0714, "reward": 0.8994615077972412, "reward_std": 3.2763547897338867, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.398809552192688, "rewards/wrapped_driving_reward": -0.7493481040000916, "rewards/wrapped_format_reward": 0.5, "step": 1112 }, { "completion_length": 500.0, "epoch": 222.6, "grad_norm": 0.5572091937065125, "kl": 2.502410411834717, "learning_rate": 3.2413539374389275e-06, "loss": 0.1001, "reward": 2.637514591217041, "reward_std": 1.0844190120697021, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7175324559211731, "rewards/wrapped_driving_reward": 0.16998234391212463, "rewards/wrapped_format_reward": 0.75, "step": 1113 }, { "completion_length": 500.0, "epoch": 222.8, "grad_norm": 0.7900421023368835, "kl": 0.35668110847473145, "learning_rate": 3.237880604022735e-06, "loss": 0.0143, "reward": 0.4556227922439575, "reward_std": 2.9754996299743652, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6193181872367859, "rewards/wrapped_driving_reward": -1.0386954545974731, "rewards/wrapped_format_reward": 0.125, "step": 1114 }, { "completion_length": 500.0, "epoch": 223.0, "grad_norm": 0.8804983496665955, "kl": 1.2278152704238892, "learning_rate": 3.234405709695111e-06, "loss": 0.0491, "reward": 0.814409077167511, "reward_std": 3.2423410415649414, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3511904776096344, "rewards/wrapped_driving_reward": -0.7867814302444458, "rewards/wrapped_format_reward": 0.5, "step": 1115 }, { "completion_length": 443.0, "epoch": 223.2, "grad_norm": 0.5172404646873474, "kl": 2.488184690475464, "learning_rate": 3.230929261806842e-06, "loss": 0.0995, "reward": 2.3262295722961426, "reward_std": 0.6750156879425049, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8133116960525513, "rewards/wrapped_driving_reward": -0.23708215355873108, "rewards/wrapped_format_reward": 0.75, "step": 1116 }, { "completion_length": 500.0, "epoch": 223.4, "grad_norm": 1.899024248123169, "kl": 2.307103157043457, "learning_rate": 3.227451267712e-06, "loss": 0.0923, "reward": 2.5990545749664307, "reward_std": 0.41813117265701294, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5416666269302368, "rewards/wrapped_driving_reward": 0.3073878288269043, "rewards/wrapped_format_reward": 0.75, "step": 1117 }, { "completion_length": 500.0, "epoch": 223.6, "grad_norm": 0.6529691815376282, "kl": 0.7595447301864624, "learning_rate": 3.223971734767928e-06, "loss": 0.0304, "reward": 2.2365458011627197, "reward_std": 0.18140019476413727, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.4865458011627197, "rewards/wrapped_format_reward": 0.0, "step": 1118 }, { "completion_length": 384.0, "epoch": 223.8, "grad_norm": 0.397745817899704, "kl": 2.2953081130981445, "learning_rate": 3.2204906703352236e-06, "loss": 0.0918, "reward": 2.6576931476593018, "reward_std": 0.7317430377006531, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7997158765792847, "rewards/wrapped_driving_reward": -0.017022810876369476, "rewards/wrapped_format_reward": 0.875, "step": 1119 }, { "completion_length": 378.0, "epoch": 224.0, "grad_norm": 0.7474454641342163, "kl": 1.953171968460083, "learning_rate": 3.217008081777726e-06, "loss": 0.0781, "reward": 2.7320427894592285, "reward_std": 0.6689425110816956, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.925000011920929, "rewards/wrapped_driving_reward": 0.05704260617494583, "rewards/wrapped_format_reward": 0.75, "step": 1120 }, { "completion_length": 500.0, "epoch": 224.2, "grad_norm": 0.5669078826904297, "kl": 1.9643305540084839, "learning_rate": 3.213523976462497e-06, "loss": 0.0786, "reward": 2.506957769393921, "reward_std": 0.4910891652107239, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": 0.09445776790380478, "rewards/wrapped_format_reward": 0.75, "step": 1121 }, { "completion_length": 500.0, "epoch": 224.4, "grad_norm": 0.7585324048995972, "kl": 2.899036407470703, "learning_rate": 3.2100383617598075e-06, "loss": 0.116, "reward": 0.7978399991989136, "reward_std": 2.549865484237671, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.3271600008010864, "rewards/wrapped_format_reward": 0.75, "step": 1122 }, { "completion_length": 500.0, "epoch": 224.6, "grad_norm": 0.44917452335357666, "kl": 2.065988063812256, "learning_rate": 3.2065512450431203e-06, "loss": 0.0826, "reward": 1.36043119430542, "reward_std": 3.6185102462768555, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -0.48956888914108276, "rewards/wrapped_format_reward": 0.5, "step": 1123 }, { "completion_length": 312.0, "epoch": 224.8, "grad_norm": 0.8163612484931946, "kl": 2.0297625064849854, "learning_rate": 3.2030626336890767e-06, "loss": 0.0812, "reward": 3.209805965423584, "reward_std": 0.09874077886343002, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": 0.29313918948173523, "rewards/wrapped_format_reward": 1.0, "step": 1124 }, { "completion_length": 489.0, "epoch": 225.0, "grad_norm": 0.6252363920211792, "kl": 2.283188819885254, "learning_rate": 3.199572535077481e-06, "loss": 0.0913, "reward": 1.458830714225769, "reward_std": 3.028761386871338, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5565475821495056, "rewards/wrapped_driving_reward": -0.5977168679237366, "rewards/wrapped_format_reward": 0.75, "step": 1125 }, { "completion_length": 278.0, "epoch": 225.2, "grad_norm": 0.778930127620697, "kl": 0.6062681674957275, "learning_rate": 3.19608095659128e-06, "loss": 0.0243, "reward": 2.692627429962158, "reward_std": 0.7534105777740479, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7750000357627869, "rewards/wrapped_driving_reward": 0.04262733459472656, "rewards/wrapped_format_reward": 0.875, "step": 1126 }, { "completion_length": 428.0, "epoch": 225.4, "grad_norm": 0.7403745651245117, "kl": 1.9297510385513306, "learning_rate": 3.1925879056165542e-06, "loss": 0.0772, "reward": 2.582500696182251, "reward_std": 0.46472248435020447, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.737500011920929, "rewards/wrapped_driving_reward": 0.09500071406364441, "rewards/wrapped_format_reward": 0.75, "step": 1127 }, { "completion_length": 450.0, "epoch": 225.6, "grad_norm": 0.7498205304145813, "kl": 1.9860079288482666, "learning_rate": 3.189093389542498e-06, "loss": 0.0794, "reward": 0.5490126609802246, "reward_std": 3.077941656112671, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5255681872367859, "rewards/wrapped_driving_reward": -1.476555585861206, "rewards/wrapped_format_reward": 0.75, "step": 1128 }, { "completion_length": 500.0, "epoch": 225.8, "grad_norm": 0.5413967967033386, "kl": 1.4809303283691406, "learning_rate": 3.1855974157614056e-06, "loss": 0.0592, "reward": 2.4868316650390625, "reward_std": 0.6567498445510864, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.949999988079071, "rewards/wrapped_driving_reward": -0.088168203830719, "rewards/wrapped_format_reward": 0.625, "step": 1129 }, { "completion_length": 500.0, "epoch": 226.0, "grad_norm": 0.4866003394126892, "kl": 2.129143238067627, "learning_rate": 3.182099991668653e-06, "loss": 0.0852, "reward": 2.8081109523773193, "reward_std": 0.7812449336051941, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.761904776096344, "rewards/wrapped_driving_reward": 0.42120617628097534, "rewards/wrapped_format_reward": 0.625, "step": 1130 }, { "completion_length": 366.0, "epoch": 226.2, "grad_norm": 0.44595396518707275, "kl": 1.4577531814575195, "learning_rate": 3.1786011246626858e-06, "loss": 0.0583, "reward": 2.980329751968384, "reward_std": 0.39713937044143677, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.78125, "rewards/wrapped_driving_reward": 0.449079692363739, "rewards/wrapped_format_reward": 0.75, "step": 1131 }, { "completion_length": 254.0, "epoch": 226.4, "grad_norm": 0.6714006066322327, "kl": 1.2142536640167236, "learning_rate": 3.1751008221450024e-06, "loss": 0.0486, "reward": 0.5025764107704163, "reward_std": 2.82546067237854, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6214286088943481, "rewards/wrapped_driving_reward": -1.868852138519287, "rewards/wrapped_format_reward": 1.0, "step": 1132 }, { "completion_length": 500.0, "epoch": 226.6, "grad_norm": 0.5622838735580444, "kl": 1.9434720277786255, "learning_rate": 3.1715990915201363e-06, "loss": 0.0777, "reward": 0.48643213510513306, "reward_std": 2.180483818054199, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6977272629737854, "rewards/wrapped_driving_reward": -2.2112951278686523, "rewards/wrapped_format_reward": 1.0, "step": 1133 }, { "completion_length": 500.0, "epoch": 226.8, "grad_norm": 0.7641708254814148, "kl": 1.0789122581481934, "learning_rate": 3.1680959401956425e-06, "loss": 0.0432, "reward": 1.8419464826583862, "reward_std": 0.3493276536464691, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.09194645285606384, "rewards/wrapped_format_reward": 0.0, "step": 1134 }, { "completion_length": 377.0, "epoch": 227.0, "grad_norm": 0.8626437187194824, "kl": 1.7295544147491455, "learning_rate": 3.164591375582082e-06, "loss": 0.0692, "reward": 2.162261724472046, "reward_std": 0.709762454032898, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7396634817123413, "rewards/wrapped_driving_reward": -0.32740166783332825, "rewards/wrapped_format_reward": 0.75, "step": 1135 }, { "completion_length": 389.0, "epoch": 227.2, "grad_norm": 0.5872017741203308, "kl": 1.9370197057724, "learning_rate": 3.1610854050930063e-06, "loss": 0.0775, "reward": 3.3996024131774902, "reward_std": 0.2868571877479553, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.7996025681495667, "rewards/wrapped_format_reward": 0.875, "step": 1136 }, { "completion_length": 500.0, "epoch": 227.4, "grad_norm": 0.7077613472938538, "kl": 2.923098087310791, "learning_rate": 3.157578036144937e-06, "loss": 0.1169, "reward": 2.0499467849731445, "reward_std": 0.5497013330459595, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53818678855896, "rewards/wrapped_driving_reward": -0.23824018239974976, "rewards/wrapped_format_reward": 0.75, "step": 1137 }, { "completion_length": 500.0, "epoch": 227.6, "grad_norm": 1.610383152961731, "kl": 2.595532178878784, "learning_rate": 3.1540692761573588e-06, "loss": 0.1038, "reward": 2.710698366165161, "reward_std": 0.7003037333488464, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 1.0, "rewards/wrapped_driving_reward": 0.2106982171535492, "rewards/wrapped_format_reward": 0.5, "step": 1138 }, { "completion_length": 500.0, "epoch": 227.8, "grad_norm": 2.498199939727783, "kl": 2.2451419830322266, "learning_rate": 3.1505591325526974e-06, "loss": 0.0898, "reward": 0.6672154664993286, "reward_std": 2.449706554412842, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.43560606241226196, "rewards/wrapped_driving_reward": -1.0183905363082886, "rewards/wrapped_format_reward": 0.5, "step": 1139 }, { "completion_length": 343.0, "epoch": 228.0, "grad_norm": 0.6862303018569946, "kl": 2.998260259628296, "learning_rate": 3.147047612756302e-06, "loss": 0.1199, "reward": 1.565877914428711, "reward_std": 3.044724225997925, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.8091220259666443, "rewards/wrapped_format_reward": 1.0, "step": 1140 }, { "completion_length": 500.0, "epoch": 228.2, "grad_norm": 0.5895454287528992, "kl": 1.188035011291504, "learning_rate": 3.1435347241964386e-06, "loss": 0.0475, "reward": 1.230802059173584, "reward_std": 3.494351863861084, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7124999761581421, "rewards/wrapped_driving_reward": -0.8566978573799133, "rewards/wrapped_format_reward": 0.625, "step": 1141 }, { "completion_length": 500.0, "epoch": 228.4, "grad_norm": 0.5685290098190308, "kl": 2.118563413619995, "learning_rate": 3.140020474304265e-06, "loss": 0.0847, "reward": 0.9381794929504395, "reward_std": 3.3283822536468506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.7291666865348816, "rewards/wrapped_driving_reward": -1.040987253189087, "rewards/wrapped_format_reward": 0.5, "step": 1142 }, { "completion_length": 500.0, "epoch": 228.6, "grad_norm": 0.3604917526245117, "kl": 2.8074140548706055, "learning_rate": 3.136504870513819e-06, "loss": 0.1123, "reward": 1.2526460886001587, "reward_std": 2.856135368347168, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.613194465637207, "rewards/wrapped_driving_reward": -0.8605483770370483, "rewards/wrapped_format_reward": 0.75, "step": 1143 }, { "completion_length": 386.0, "epoch": 228.8, "grad_norm": 47.2328987121582, "kl": 6.184914588928223, "learning_rate": 3.132987920262005e-06, "loss": 0.2474, "reward": 1.2696081399917603, "reward_std": 3.5769197940826416, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -0.44914188981056213, "rewards/wrapped_format_reward": 0.5, "step": 1144 }, { "completion_length": 359.0, "epoch": 229.0, "grad_norm": 0.545904815196991, "kl": 1.799540400505066, "learning_rate": 3.1294696309885717e-06, "loss": 0.072, "reward": 2.541309118270874, "reward_std": 0.6157224774360657, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8500000238418579, "rewards/wrapped_driving_reward": 0.06630919873714447, "rewards/wrapped_format_reward": 0.625, "step": 1145 }, { "completion_length": 500.0, "epoch": 229.2, "grad_norm": 0.5170572996139526, "kl": 1.115472674369812, "learning_rate": 3.125950010136104e-06, "loss": 0.0446, "reward": 1.0069831609725952, "reward_std": 3.351008176803589, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1180167198181152, "rewards/wrapped_format_reward": 0.625, "step": 1146 }, { "completion_length": 500.0, "epoch": 229.4, "grad_norm": 0.7276570796966553, "kl": 1.101094126701355, "learning_rate": 3.1224290651500017e-06, "loss": 0.044, "reward": 2.017418384552002, "reward_std": 1.0247039794921875, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": 0.09241843223571777, "rewards/wrapped_format_reward": 0.25, "step": 1147 }, { "completion_length": 328.0, "epoch": 229.6, "grad_norm": 0.7038210034370422, "kl": 1.8290187120437622, "learning_rate": 3.1189068034784653e-06, "loss": 0.0732, "reward": 2.6151554584503174, "reward_std": 0.46249744296073914, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.015155537985265255, "rewards/wrapped_format_reward": 0.875, "step": 1148 }, { "completion_length": 500.0, "epoch": 229.8, "grad_norm": 0.5270535945892334, "kl": 1.3238747119903564, "learning_rate": 3.115383232572483e-06, "loss": 0.053, "reward": 3.190889835357666, "reward_std": 0.4365469813346863, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6312500238418579, "rewards/wrapped_driving_reward": 0.8096399307250977, "rewards/wrapped_format_reward": 0.75, "step": 1149 }, { "completion_length": 280.0, "epoch": 230.0, "grad_norm": 0.710309624671936, "kl": 1.1019251346588135, "learning_rate": 3.1118583598858097e-06, "loss": 0.0441, "reward": 2.3788347244262695, "reward_std": 0.33022433519363403, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8806818127632141, "rewards/wrapped_driving_reward": -0.37684714794158936, "rewards/wrapped_format_reward": 0.875, "step": 1150 }, { "completion_length": 500.0, "epoch": 230.2, "grad_norm": 0.3681263327598572, "kl": 2.2811102867126465, "learning_rate": 3.108332192874956e-06, "loss": 0.0912, "reward": 0.9493532180786133, "reward_std": 3.0929114818573, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": -1.1006468534469604, "rewards/wrapped_format_reward": 0.625, "step": 1151 }, { "completion_length": 500.0, "epoch": 230.4, "grad_norm": 0.333864688873291, "kl": 3.1689260005950928, "learning_rate": 3.1048047389991693e-06, "loss": 0.1268, "reward": 1.1928478479385376, "reward_std": 2.8281567096710205, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.9946521520614624, "rewards/wrapped_format_reward": 0.75, "step": 1152 }, { "completion_length": 393.0, "epoch": 230.6, "grad_norm": 0.7062379121780396, "kl": 0.7364031672477722, "learning_rate": 3.101276005720421e-06, "loss": 0.0295, "reward": 1.7746498584747314, "reward_std": 0.7216954827308655, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6909722089767456, "rewards/wrapped_driving_reward": -0.5413224697113037, "rewards/wrapped_format_reward": 0.625, "step": 1153 }, { "completion_length": 500.0, "epoch": 230.8, "grad_norm": 0.5441510081291199, "kl": 1.5412652492523193, "learning_rate": 3.097746000503386e-06, "loss": 0.0617, "reward": 3.2702202796936035, "reward_std": 0.2382946014404297, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9281250238418579, "rewards/wrapped_driving_reward": 0.5920952558517456, "rewards/wrapped_format_reward": 0.75, "step": 1154 }, { "completion_length": 500.0, "epoch": 231.0, "grad_norm": 0.5519305467605591, "kl": 1.6986979246139526, "learning_rate": 3.094214730815433e-06, "loss": 0.0679, "reward": 1.980459213256836, "reward_std": 0.6980248689651489, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6833333373069763, "rewards/wrapped_driving_reward": -0.07787404209375381, "rewards/wrapped_format_reward": 0.375, "step": 1155 }, { "completion_length": 500.0, "epoch": 231.2, "grad_norm": 0.5493302345275879, "kl": 1.9409642219543457, "learning_rate": 3.0906822041266045e-06, "loss": 0.0776, "reward": 0.6890133619308472, "reward_std": 2.8132059574127197, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5758928656578064, "rewards/wrapped_driving_reward": -1.136879563331604, "rewards/wrapped_format_reward": 0.5, "step": 1156 }, { "completion_length": 500.0, "epoch": 231.4, "grad_norm": 0.5624310374259949, "kl": 2.409280776977539, "learning_rate": 3.087148427909599e-06, "loss": 0.0964, "reward": 1.1970300674438477, "reward_std": 2.80200457572937, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -0.4904700517654419, "rewards/wrapped_format_reward": 0.5, "step": 1157 }, { "completion_length": 500.0, "epoch": 231.6, "grad_norm": 0.6499096751213074, "kl": 1.6109482049942017, "learning_rate": 3.0836134096397642e-06, "loss": 0.0644, "reward": 1.087529182434082, "reward_std": 3.407144784927368, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.037470817565918, "rewards/wrapped_format_reward": 0.625, "step": 1158 }, { "completion_length": 500.0, "epoch": 231.8, "grad_norm": 0.4589069187641144, "kl": 2.6704835891723633, "learning_rate": 3.08007715679507e-06, "loss": 0.1068, "reward": -0.354661226272583, "reward_std": 3.3552398681640625, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.104660987854004, "rewards/wrapped_format_reward": 0.75, "step": 1159 }, { "completion_length": 421.0, "epoch": 232.0, "grad_norm": 0.6029008626937866, "kl": 2.1493005752563477, "learning_rate": 3.0765396768561005e-06, "loss": 0.086, "reward": 2.5536091327667236, "reward_std": 0.5019896030426025, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": 0.22027575969696045, "rewards/wrapped_format_reward": 0.625, "step": 1160 }, { "completion_length": 500.0, "epoch": 232.2, "grad_norm": 0.6164078712463379, "kl": 2.0120291709899902, "learning_rate": 3.073000977306036e-06, "loss": 0.0805, "reward": 2.199471950531006, "reward_std": 0.4356992244720459, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.17552806437015533, "rewards/wrapped_format_reward": 0.625, "step": 1161 }, { "completion_length": 500.0, "epoch": 232.4, "grad_norm": 0.6032366156578064, "kl": 2.4116945266723633, "learning_rate": 3.069461065630634e-06, "loss": 0.0965, "reward": 0.8309696316719055, "reward_std": 3.039715051651001, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5694444179534912, "rewards/wrapped_driving_reward": -1.2384748458862305, "rewards/wrapped_format_reward": 0.75, "step": 1162 }, { "completion_length": 471.0, "epoch": 232.6, "grad_norm": 0.5067011117935181, "kl": 2.722092866897583, "learning_rate": 3.0659199493182197e-06, "loss": 0.1089, "reward": 2.81124210357666, "reward_std": 0.5590693354606628, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6922348737716675, "rewards/wrapped_driving_reward": 0.11900745332241058, "rewards/wrapped_format_reward": 1.0, "step": 1163 }, { "completion_length": 500.0, "epoch": 232.8, "grad_norm": 0.44970521330833435, "kl": 1.905672311782837, "learning_rate": 3.062377635859663e-06, "loss": 0.0762, "reward": 3.408777952194214, "reward_std": 0.13981571793556213, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.78125, "rewards/wrapped_driving_reward": 0.7525278925895691, "rewards/wrapped_format_reward": 0.875, "step": 1164 }, { "completion_length": 353.0, "epoch": 233.0, "grad_norm": 2.3353593349456787, "kl": 2.2877211570739746, "learning_rate": 3.058834132748369e-06, "loss": 0.0915, "reward": 2.1397366523742676, "reward_std": 0.6332308053970337, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": -0.016513491049408913, "rewards/wrapped_format_reward": 0.5, "step": 1165 }, { "completion_length": 500.0, "epoch": 233.2, "grad_norm": 0.529943585395813, "kl": 1.6956918239593506, "learning_rate": 3.0552894474802585e-06, "loss": 0.0678, "reward": 2.3963022232055664, "reward_std": 0.8782609105110168, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.688095211982727, "rewards/wrapped_driving_reward": 0.2082068920135498, "rewards/wrapped_format_reward": 0.5, "step": 1166 }, { "completion_length": 245.0, "epoch": 233.4, "grad_norm": 0.7266473770141602, "kl": 0.6849232316017151, "learning_rate": 3.051743587553754e-06, "loss": 0.0274, "reward": 2.6962366104125977, "reward_std": 0.4670329988002777, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7278409004211426, "rewards/wrapped_driving_reward": -0.031604327261447906, "rewards/wrapped_format_reward": 1.0, "step": 1167 }, { "completion_length": 500.0, "epoch": 233.6, "grad_norm": 1.9038420915603638, "kl": 2.0398311614990234, "learning_rate": 3.0481965604697582e-06, "loss": 0.0816, "reward": 2.6148018836975098, "reward_std": 0.41098082065582275, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8402777314186096, "rewards/wrapped_driving_reward": -0.1004759669303894, "rewards/wrapped_format_reward": 0.875, "step": 1168 }, { "completion_length": 500.0, "epoch": 233.8, "grad_norm": 0.44645681977272034, "kl": 1.9117319583892822, "learning_rate": 3.0446483737316506e-06, "loss": 0.0765, "reward": 2.3172287940979004, "reward_std": 0.5684508681297302, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7890625, "rewards/wrapped_driving_reward": -0.09683382511138916, "rewards/wrapped_format_reward": 0.625, "step": 1169 }, { "completion_length": 500.0, "epoch": 234.0, "grad_norm": 0.4773977994918823, "kl": 1.4317177534103394, "learning_rate": 3.0410990348452572e-06, "loss": 0.0573, "reward": 2.1967670917510986, "reward_std": 0.7857086658477783, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": 0.25926706194877625, "rewards/wrapped_format_reward": 0.25, "step": 1170 }, { "completion_length": 500.0, "epoch": 234.2, "grad_norm": 0.5660207271575928, "kl": 1.2617156505584717, "learning_rate": 3.037548551318845e-06, "loss": 0.0505, "reward": 2.4822895526885986, "reward_std": 0.6116423606872559, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7107142806053162, "rewards/wrapped_driving_reward": 0.14657525718212128, "rewards/wrapped_format_reward": 0.625, "step": 1171 }, { "completion_length": 500.0, "epoch": 234.4, "grad_norm": 0.9286490082740784, "kl": 1.951711893081665, "learning_rate": 3.0339969306631008e-06, "loss": 0.0781, "reward": 1.744084119796753, "reward_std": 2.5072267055511475, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.606249988079071, "rewards/wrapped_driving_reward": -0.7371658086776733, "rewards/wrapped_format_reward": 0.875, "step": 1172 }, { "completion_length": 432.0, "epoch": 234.6, "grad_norm": 0.5660628080368042, "kl": 1.7724957466125488, "learning_rate": 3.030444180391116e-06, "loss": 0.0709, "reward": -1.4140844345092773, "reward_std": 3.211521625518799, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2750000059604645, "rewards/wrapped_driving_reward": -2.689084529876709, "rewards/wrapped_format_reward": 0.5, "step": 1173 }, { "completion_length": 500.0, "epoch": 234.8, "grad_norm": 0.672362744808197, "kl": 1.663142442703247, "learning_rate": 3.0268903080183744e-06, "loss": 0.0665, "reward": 1.67949378490448, "reward_std": 3.7935738563537598, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.38300615549087524, "rewards/wrapped_format_reward": 0.75, "step": 1174 }, { "completion_length": 281.0, "epoch": 235.0, "grad_norm": 1.239585041999817, "kl": 1.3690040111541748, "learning_rate": 3.0233353210627305e-06, "loss": 0.0548, "reward": 0.8162803649902344, "reward_std": 3.222710132598877, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5211039185523987, "rewards/wrapped_driving_reward": -1.0798234939575195, "rewards/wrapped_format_reward": 0.625, "step": 1175 }, { "completion_length": 500.0, "epoch": 235.2, "grad_norm": 0.6818597316741943, "kl": 3.026345729827881, "learning_rate": 3.019779227044398e-06, "loss": 0.1211, "reward": 2.812359571456909, "reward_std": 0.6443459391593933, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.675000011920929, "rewards/wrapped_driving_reward": 0.512359619140625, "rewards/wrapped_format_reward": 0.625, "step": 1176 }, { "completion_length": 500.0, "epoch": 235.4, "grad_norm": 0.5823447108268738, "kl": 2.818692207336426, "learning_rate": 3.016222033485935e-06, "loss": 0.1127, "reward": 0.8440968990325928, "reward_std": 3.2312660217285156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": -1.3059030771255493, "rewards/wrapped_format_reward": 0.75, "step": 1177 }, { "completion_length": 500.0, "epoch": 235.6, "grad_norm": 0.597113847732544, "kl": 3.0461084842681885, "learning_rate": 3.0126637479122196e-06, "loss": 0.1218, "reward": 0.7975152730941772, "reward_std": 3.2184927463531494, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -0.9774847030639648, "rewards/wrapped_format_reward": 0.625, "step": 1178 }, { "completion_length": 500.0, "epoch": 235.8, "grad_norm": 0.5691481828689575, "kl": 1.2775683403015137, "learning_rate": 3.0091043778504438e-06, "loss": 0.0511, "reward": 1.559517741203308, "reward_std": 1.1475749015808105, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.6904822587966919, "rewards/wrapped_format_reward": 0.375, "step": 1179 }, { "completion_length": 391.0, "epoch": 236.0, "grad_norm": 0.4509887099266052, "kl": 2.163684129714966, "learning_rate": 3.0055439308300954e-06, "loss": 0.0865, "reward": 2.042484760284424, "reward_std": 1.392350196838379, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.925000011920929, "rewards/wrapped_driving_reward": -0.8825151324272156, "rewards/wrapped_format_reward": 1.0, "step": 1180 }, { "completion_length": 366.0, "epoch": 236.2, "grad_norm": 0.6021866798400879, "kl": 1.381424903869629, "learning_rate": 3.001982414382936e-06, "loss": 0.0553, "reward": 2.278049945831299, "reward_std": 0.3396390378475189, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.90625, "rewards/wrapped_driving_reward": -0.12819984555244446, "rewards/wrapped_format_reward": 0.5, "step": 1181 }, { "completion_length": 500.0, "epoch": 236.4, "grad_norm": 0.566749095916748, "kl": 2.7605605125427246, "learning_rate": 2.998419836042993e-06, "loss": 0.1104, "reward": 2.184630870819092, "reward_std": 0.42303529381752014, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": 0.02838093787431717, "rewards/wrapped_format_reward": 0.5, "step": 1182 }, { "completion_length": 422.0, "epoch": 236.6, "grad_norm": 0.6822918057441711, "kl": 1.2048547267913818, "learning_rate": 2.994856203346539e-06, "loss": 0.0482, "reward": 2.3428688049316406, "reward_std": 0.5582606792449951, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9097222089767456, "rewards/wrapped_driving_reward": -0.06685321033000946, "rewards/wrapped_format_reward": 0.5, "step": 1183 }, { "completion_length": 440.0, "epoch": 236.8, "grad_norm": 0.3997076451778412, "kl": 2.8531198501586914, "learning_rate": 2.9912915238320755e-06, "loss": 0.1141, "reward": 3.394401788711548, "reward_std": 0.11493463069200516, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6845238208770752, "rewards/wrapped_driving_reward": 0.8348779678344727, "rewards/wrapped_format_reward": 0.875, "step": 1184 }, { "completion_length": 500.0, "epoch": 237.0, "grad_norm": 0.4889514148235321, "kl": 1.5936460494995117, "learning_rate": 2.9877258050403214e-06, "loss": 0.0637, "reward": 0.0724869966506958, "reward_std": 2.9861128330230713, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5916666984558105, "rewards/wrapped_driving_reward": -2.019179582595825, "rewards/wrapped_format_reward": 0.75, "step": 1185 }, { "completion_length": 429.0, "epoch": 237.2, "grad_norm": 0.5517227649688721, "kl": 1.9644097089767456, "learning_rate": 2.9841590545141906e-06, "loss": 0.0786, "reward": 2.304136037826538, "reward_std": 0.3130222260951996, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.824999988079071, "rewards/wrapped_driving_reward": -0.1458640694618225, "rewards/wrapped_format_reward": 0.625, "step": 1186 }, { "completion_length": 500.0, "epoch": 237.4, "grad_norm": 0.5474483370780945, "kl": 1.9161665439605713, "learning_rate": 2.980591279798783e-06, "loss": 0.0766, "reward": 1.6800909042358398, "reward_std": 3.4796338081359863, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.38240915536880493, "rewards/wrapped_format_reward": 0.625, "step": 1187 }, { "completion_length": 500.0, "epoch": 237.6, "grad_norm": 0.4424038231372833, "kl": 3.048851728439331, "learning_rate": 2.9770224884413625e-06, "loss": 0.122, "reward": 2.4612011909484863, "reward_std": 0.7139378190040588, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5062500238418579, "rewards/wrapped_driving_reward": 0.20495104789733887, "rewards/wrapped_format_reward": 0.75, "step": 1188 }, { "completion_length": 500.0, "epoch": 237.8, "grad_norm": 1.0547658205032349, "kl": 2.931641101837158, "learning_rate": 2.973452687991345e-06, "loss": 0.1173, "reward": 0.9793931245803833, "reward_std": 3.330230474472046, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -0.9893567562103271, "rewards/wrapped_format_reward": 0.75, "step": 1189 }, { "completion_length": 262.0, "epoch": 238.0, "grad_norm": 1.0644350051879883, "kl": 0.8790358901023865, "learning_rate": 2.96988188600028e-06, "loss": 0.0352, "reward": 2.7329859733581543, "reward_std": 0.5028561353683472, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7604166865348816, "rewards/wrapped_driving_reward": -0.02743079513311386, "rewards/wrapped_format_reward": 1.0, "step": 1190 }, { "completion_length": 418.0, "epoch": 238.2, "grad_norm": 0.5356208086013794, "kl": 2.0399181842803955, "learning_rate": 2.966310090021837e-06, "loss": 0.0816, "reward": 2.227649688720703, "reward_std": 0.40488219261169434, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6820707321166992, "rewards/wrapped_driving_reward": -0.07942090928554535, "rewards/wrapped_format_reward": 0.625, "step": 1191 }, { "completion_length": 500.0, "epoch": 238.4, "grad_norm": 0.6586440801620483, "kl": 2.1276180744171143, "learning_rate": 2.9627373076117864e-06, "loss": 0.0851, "reward": 0.4574768543243408, "reward_std": 2.9957144260406494, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.48750001192092896, "rewards/wrapped_driving_reward": -1.405023217201233, "rewards/wrapped_format_reward": 0.625, "step": 1192 }, { "completion_length": 500.0, "epoch": 238.6, "grad_norm": 0.7815991640090942, "kl": 2.0300703048706055, "learning_rate": 2.9591635463279878e-06, "loss": 0.0812, "reward": -1.172812581062317, "reward_std": 2.385099172592163, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5953282713890076, "rewards/wrapped_driving_reward": -3.0181407928466797, "rewards/wrapped_format_reward": 0.5, "step": 1193 }, { "completion_length": 500.0, "epoch": 238.8, "grad_norm": 0.5488854050636292, "kl": 1.3850175142288208, "learning_rate": 2.9555888137303695e-06, "loss": 0.0554, "reward": 1.7122122049331665, "reward_std": 3.810119390487671, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6931818127632141, "rewards/wrapped_driving_reward": -0.48096954822540283, "rewards/wrapped_format_reward": 0.75, "step": 1194 }, { "completion_length": 297.0, "epoch": 239.0, "grad_norm": 0.6302858591079712, "kl": 1.5946235656738281, "learning_rate": 2.9520131173809136e-06, "loss": 0.0638, "reward": 2.8317596912384033, "reward_std": 0.38432830572128296, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8166666626930237, "rewards/wrapped_driving_reward": 0.14009301364421844, "rewards/wrapped_format_reward": 0.875, "step": 1195 }, { "completion_length": 500.0, "epoch": 239.2, "grad_norm": 0.7266413569450378, "kl": 2.6884946823120117, "learning_rate": 2.9484364648436437e-06, "loss": 0.1075, "reward": 2.127082347869873, "reward_std": 0.5652997493743896, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.731696367263794, "rewards/wrapped_driving_reward": -0.22961410880088806, "rewards/wrapped_format_reward": 0.625, "step": 1196 }, { "completion_length": 299.0, "epoch": 239.4, "grad_norm": 1.6480594873428345, "kl": 1.8642939329147339, "learning_rate": 2.944858863684605e-06, "loss": 0.0746, "reward": 1.0557785034179688, "reward_std": 2.7793121337890625, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4437499940395355, "rewards/wrapped_driving_reward": -1.0129714012145996, "rewards/wrapped_format_reward": 0.875, "step": 1197 }, { "completion_length": 281.0, "epoch": 239.6, "grad_norm": 0.5891710519790649, "kl": 1.1138620376586914, "learning_rate": 2.9412803214718484e-06, "loss": 0.0446, "reward": 2.076864719390869, "reward_std": 0.7732005715370178, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8415403962135315, "rewards/wrapped_driving_reward": -0.7646756768226624, "rewards/wrapped_format_reward": 1.0, "step": 1198 }, { "completion_length": 344.0, "epoch": 239.8, "grad_norm": 1.1018381118774414, "kl": 2.6638591289520264, "learning_rate": 2.9377008457754166e-06, "loss": 0.1066, "reward": 1.4926388263702393, "reward_std": 3.335418939590454, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -0.38236111402511597, "rewards/wrapped_format_reward": 0.875, "step": 1199 }, { "completion_length": 500.0, "epoch": 240.0, "grad_norm": 0.5162401795387268, "kl": 2.9488978385925293, "learning_rate": 2.9341204441673267e-06, "loss": 0.118, "reward": 1.0723812580108643, "reward_std": 3.0534160137176514, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5375000238418579, "rewards/wrapped_driving_reward": -0.9651187062263489, "rewards/wrapped_format_reward": 0.75, "step": 1200 }, { "completion_length": 316.0, "epoch": 240.2, "grad_norm": 0.9419980049133301, "kl": 1.4795646667480469, "learning_rate": 2.9305391242215544e-06, "loss": 0.0592, "reward": 2.434497833251953, "reward_std": 0.44978222250938416, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": -0.040502071380615234, "rewards/wrapped_format_reward": 0.75, "step": 1201 }, { "completion_length": 500.0, "epoch": 240.4, "grad_norm": 0.6440792679786682, "kl": 1.114894986152649, "learning_rate": 2.9269568935140176e-06, "loss": 0.0446, "reward": 2.006610870361328, "reward_std": 0.37925803661346436, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5255681872367859, "rewards/wrapped_driving_reward": -0.3939574360847473, "rewards/wrapped_format_reward": 0.875, "step": 1202 }, { "completion_length": 345.0, "epoch": 240.6, "grad_norm": 0.7772359251976013, "kl": 1.1790874004364014, "learning_rate": 2.9233737596225616e-06, "loss": 0.0472, "reward": 0.6748437881469727, "reward_std": 3.1834704875946045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5375000238418579, "rewards/wrapped_driving_reward": -0.8626561760902405, "rewards/wrapped_format_reward": 0.25, "step": 1203 }, { "completion_length": 500.0, "epoch": 240.8, "grad_norm": 0.5890513062477112, "kl": 2.168215274810791, "learning_rate": 2.9197897301269433e-06, "loss": 0.0867, "reward": 2.322695255279541, "reward_std": 0.617214024066925, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5416666865348816, "rewards/wrapped_driving_reward": 0.0310283824801445, "rewards/wrapped_format_reward": 0.75, "step": 1204 }, { "completion_length": 500.0, "epoch": 241.0, "grad_norm": 0.6161360740661621, "kl": 2.0666635036468506, "learning_rate": 2.9162048126088115e-06, "loss": 0.0827, "reward": 3.004972457885742, "reward_std": 0.5857500433921814, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": 0.7966392040252686, "rewards/wrapped_format_reward": 0.625, "step": 1205 }, { "completion_length": 500.0, "epoch": 241.2, "grad_norm": 0.46667274832725525, "kl": 2.702096462249756, "learning_rate": 2.912619014651694e-06, "loss": 0.1081, "reward": 1.2773984670639038, "reward_std": 2.861306667327881, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4479166865348816, "rewards/wrapped_driving_reward": -0.9205181002616882, "rewards/wrapped_format_reward": 1.0, "step": 1206 }, { "completion_length": 500.0, "epoch": 241.4, "grad_norm": 0.5721752643585205, "kl": 1.5363929271697998, "learning_rate": 2.9090323438409844e-06, "loss": 0.0615, "reward": 2.0471699237823486, "reward_std": 0.43281105160713196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7291666865348816, "rewards/wrapped_driving_reward": -0.18199677765369415, "rewards/wrapped_format_reward": 0.5, "step": 1207 }, { "completion_length": 303.0, "epoch": 241.6, "grad_norm": 0.7896791696548462, "kl": 1.1830860376358032, "learning_rate": 2.9054448077639193e-06, "loss": 0.0473, "reward": 1.2794041633605957, "reward_std": 3.5387539863586426, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.574999988079071, "rewards/wrapped_driving_reward": -0.5455958843231201, "rewards/wrapped_format_reward": 0.5, "step": 1208 }, { "completion_length": 500.0, "epoch": 241.8, "grad_norm": 0.5778261423110962, "kl": 1.7935805320739746, "learning_rate": 2.9018564140095657e-06, "loss": 0.0717, "reward": 0.5931916236877441, "reward_std": 3.0699262619018555, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5062500238418579, "rewards/wrapped_driving_reward": -1.2880582809448242, "rewards/wrapped_format_reward": 0.625, "step": 1209 }, { "completion_length": 500.0, "epoch": 242.0, "grad_norm": 0.5919026136398315, "kl": 2.3783318996429443, "learning_rate": 2.898267170168807e-06, "loss": 0.0951, "reward": 2.0943000316619873, "reward_std": 0.8427504301071167, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": -0.005700083449482918, "rewards/wrapped_format_reward": 0.625, "step": 1210 }, { "completion_length": 500.0, "epoch": 242.2, "grad_norm": 0.4520552158355713, "kl": 1.74555504322052, "learning_rate": 2.894677083834323e-06, "loss": 0.0698, "reward": -2.620941638946533, "reward_std": 2.7581167221069336, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -3.308441638946533, "rewards/wrapped_format_reward": 0.25, "step": 1211 }, { "completion_length": 500.0, "epoch": 242.4, "grad_norm": 6.109090328216553, "kl": 2.2895333766937256, "learning_rate": 2.8910861626005774e-06, "loss": 0.0916, "reward": 1.918097972869873, "reward_std": 0.840246856212616, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6909722089767456, "rewards/wrapped_driving_reward": -0.39787420630455017, "rewards/wrapped_format_reward": 0.625, "step": 1212 }, { "completion_length": 428.0, "epoch": 242.6, "grad_norm": 0.6188477277755737, "kl": 2.177752733230591, "learning_rate": 2.887494414063799e-06, "loss": 0.0871, "reward": 0.880556583404541, "reward_std": 2.9453325271606445, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3958333134651184, "rewards/wrapped_driving_reward": -1.1402766704559326, "rewards/wrapped_format_reward": 0.875, "step": 1213 }, { "completion_length": 389.0, "epoch": 242.8, "grad_norm": 0.536531388759613, "kl": 2.127166509628296, "learning_rate": 2.8839018458219653e-06, "loss": 0.0851, "reward": 3.3626787662506104, "reward_std": 0.3718360364437103, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8333333730697632, "rewards/wrapped_driving_reward": 0.7793453931808472, "rewards/wrapped_format_reward": 0.75, "step": 1214 }, { "completion_length": 500.0, "epoch": 243.0, "grad_norm": 0.6899824142456055, "kl": 2.00929856300354, "learning_rate": 2.880308465474792e-06, "loss": 0.0804, "reward": 1.306373119354248, "reward_std": 2.8757214546203613, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5208333134651184, "rewards/wrapped_driving_reward": -0.9644601941108704, "rewards/wrapped_format_reward": 1.0, "step": 1215 }, { "completion_length": 500.0, "epoch": 243.2, "grad_norm": 0.6023064851760864, "kl": 3.2132043838500977, "learning_rate": 2.876714280623708e-06, "loss": 0.1285, "reward": 2.8999481201171875, "reward_std": 0.6026341915130615, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8833333253860474, "rewards/wrapped_driving_reward": 0.2666148245334625, "rewards/wrapped_format_reward": 0.75, "step": 1216 }, { "completion_length": 500.0, "epoch": 243.4, "grad_norm": 0.5641474723815918, "kl": 2.9786226749420166, "learning_rate": 2.8731192988718463e-06, "loss": 0.1191, "reward": 0.8588714599609375, "reward_std": 2.9822001457214355, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.141128420829773, "rewards/wrapped_format_reward": 0.625, "step": 1217 }, { "completion_length": 500.0, "epoch": 243.6, "grad_norm": 0.7416784167289734, "kl": 0.9438762664794922, "learning_rate": 2.8695235278240272e-06, "loss": 0.0378, "reward": -1.0216190814971924, "reward_std": 2.409459352493286, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5729166865348816, "rewards/wrapped_driving_reward": -2.7195358276367188, "rewards/wrapped_format_reward": 0.375, "step": 1218 }, { "completion_length": 188.0, "epoch": 243.8, "grad_norm": 1.3310928344726562, "kl": 0.9168747663497925, "learning_rate": 2.865926975086737e-06, "loss": 0.0367, "reward": 1.8423606157302856, "reward_std": 3.2376551628112793, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -0.37638944387435913, "rewards/wrapped_format_reward": 1.0, "step": 1219 }, { "completion_length": 376.0, "epoch": 244.0, "grad_norm": 0.6404255628585815, "kl": 2.304161787033081, "learning_rate": 2.862329648268117e-06, "loss": 0.0922, "reward": 2.867375135421753, "reward_std": 0.456234872341156, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5422348380088806, "rewards/wrapped_driving_reward": 0.32514017820358276, "rewards/wrapped_format_reward": 1.0, "step": 1220 }, { "completion_length": 500.0, "epoch": 244.2, "grad_norm": 0.6682083010673523, "kl": 2.4416470527648926, "learning_rate": 2.858731554977948e-06, "loss": 0.0977, "reward": 1.6295703649520874, "reward_std": 3.7603485584259033, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.3704296052455902, "rewards/wrapped_format_reward": 0.75, "step": 1221 }, { "completion_length": 500.0, "epoch": 244.4, "grad_norm": 0.44403013586997986, "kl": 3.011091947555542, "learning_rate": 2.8551327028276315e-06, "loss": 0.1204, "reward": 2.6442835330963135, "reward_std": 0.2714155614376068, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.019283462315797806, "rewards/wrapped_format_reward": 0.875, "step": 1222 }, { "completion_length": 500.0, "epoch": 244.6, "grad_norm": 0.6316026449203491, "kl": 1.1169650554656982, "learning_rate": 2.8515330994301716e-06, "loss": 0.0447, "reward": 0.5459051728248596, "reward_std": 3.0529112815856934, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3472222089767456, "rewards/wrapped_driving_reward": -1.0513170957565308, "rewards/wrapped_format_reward": 0.5, "step": 1223 }, { "completion_length": 395.0, "epoch": 244.8, "grad_norm": 0.6866368651390076, "kl": 2.462233304977417, "learning_rate": 2.847932752400164e-06, "loss": 0.0985, "reward": 2.089472770690918, "reward_std": 0.9030892252922058, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.11886070668697357, "rewards/wrapped_format_reward": 0.625, "step": 1224 }, { "completion_length": 500.0, "epoch": 245.0, "grad_norm": 0.6900458335876465, "kl": 0.7952966094017029, "learning_rate": 2.844331669353777e-06, "loss": 0.0318, "reward": -0.9828380942344666, "reward_std": 3.48494291305542, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.1703381538391113, "rewards/wrapped_format_reward": 0.25, "step": 1225 }, { "completion_length": 157.0, "epoch": 245.2, "grad_norm": 0.8836446404457092, "kl": 1.09471595287323, "learning_rate": 2.8407298579087367e-06, "loss": 0.0438, "reward": 3.0691778659820557, "reward_std": 0.4199413061141968, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7749999761581421, "rewards/wrapped_driving_reward": 0.29417791962623596, "rewards/wrapped_format_reward": 1.0, "step": 1226 }, { "completion_length": 424.0, "epoch": 245.4, "grad_norm": 0.5774113535881042, "kl": 2.5503952503204346, "learning_rate": 2.837127325684308e-06, "loss": 0.102, "reward": 2.2825865745544434, "reward_std": 0.38892245292663574, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9636363983154297, "rewards/wrapped_driving_reward": -0.5560497045516968, "rewards/wrapped_format_reward": 0.875, "step": 1227 }, { "completion_length": 500.0, "epoch": 245.6, "grad_norm": 0.5768952965736389, "kl": 1.4787319898605347, "learning_rate": 2.833524080301282e-06, "loss": 0.0591, "reward": 3.0736498832702637, "reward_std": 0.5016334056854248, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6679487228393555, "rewards/wrapped_driving_reward": 0.7807011604309082, "rewards/wrapped_format_reward": 0.625, "step": 1228 }, { "completion_length": 500.0, "epoch": 245.8, "grad_norm": 0.7406091094017029, "kl": 2.2784879207611084, "learning_rate": 2.8299201293819588e-06, "loss": 0.0911, "reward": 1.104966402053833, "reward_std": 3.078457832336426, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -1.113783597946167, "rewards/wrapped_format_reward": 0.75, "step": 1229 }, { "completion_length": 500.0, "epoch": 246.0, "grad_norm": 0.7869997024536133, "kl": 1.5223968029022217, "learning_rate": 2.82631548055013e-06, "loss": 0.0609, "reward": 0.5786565542221069, "reward_std": 2.739652395248413, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5664335489273071, "rewards/wrapped_driving_reward": -1.2377769947052002, "rewards/wrapped_format_reward": 0.5, "step": 1230 }, { "completion_length": 429.0, "epoch": 246.2, "grad_norm": 0.4810180366039276, "kl": 2.6067757606506348, "learning_rate": 2.822710141431062e-06, "loss": 0.1043, "reward": 2.258387565612793, "reward_std": 0.4629867672920227, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.008387496694922447, "rewards/wrapped_format_reward": 0.625, "step": 1231 }, { "completion_length": 500.0, "epoch": 246.4, "grad_norm": 0.7077089548110962, "kl": 1.5288805961608887, "learning_rate": 2.8191041196514874e-06, "loss": 0.0612, "reward": 0.5051733255386353, "reward_std": 3.057506799697876, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.1614934206008911, "rewards/wrapped_format_reward": 0.5, "step": 1232 }, { "completion_length": 362.0, "epoch": 246.6, "grad_norm": 0.6028400659561157, "kl": 3.094609498977661, "learning_rate": 2.815497422839575e-06, "loss": 0.1238, "reward": 1.513184666633606, "reward_std": 1.685122013092041, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8500000238418579, "rewards/wrapped_driving_reward": -0.961815357208252, "rewards/wrapped_format_reward": 0.625, "step": 1233 }, { "completion_length": 308.0, "epoch": 246.8, "grad_norm": 0.7054983973503113, "kl": 1.2821030616760254, "learning_rate": 2.8118900586249264e-06, "loss": 0.0513, "reward": 2.4657773971557617, "reward_std": 0.5821136832237244, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9375, "rewards/wrapped_driving_reward": -0.47172245383262634, "rewards/wrapped_format_reward": 1.0, "step": 1234 }, { "completion_length": 279.0, "epoch": 247.0, "grad_norm": 0.8858932256698608, "kl": 1.3778883218765259, "learning_rate": 2.8082820346385554e-06, "loss": 0.0551, "reward": 3.3071165084838867, "reward_std": 0.5519371628761292, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.8071165084838867, "rewards/wrapped_format_reward": 0.75, "step": 1235 }, { "completion_length": 500.0, "epoch": 247.2, "grad_norm": 0.5988675355911255, "kl": 1.3694862127304077, "learning_rate": 2.804673358512869e-06, "loss": 0.0548, "reward": 1.9713540077209473, "reward_std": 0.5131980180740356, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7083333134651184, "rewards/wrapped_driving_reward": -0.23697930574417114, "rewards/wrapped_format_reward": 0.5, "step": 1236 }, { "completion_length": 500.0, "epoch": 247.4, "grad_norm": 0.645806074142456, "kl": 2.3348429203033447, "learning_rate": 2.8010640378816546e-06, "loss": 0.0934, "reward": 1.2530263662338257, "reward_std": 3.1767733097076416, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.9969736337661743, "rewards/wrapped_format_reward": 0.875, "step": 1237 }, { "completion_length": 500.0, "epoch": 247.6, "grad_norm": 0.8527547121047974, "kl": 2.4006378650665283, "learning_rate": 2.797454080380064e-06, "loss": 0.096, "reward": 2.581483840942383, "reward_std": 0.7856062054634094, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.703125, "rewards/wrapped_driving_reward": 0.12835893034934998, "rewards/wrapped_format_reward": 0.75, "step": 1238 }, { "completion_length": 500.0, "epoch": 247.8, "grad_norm": 0.5346929430961609, "kl": 1.9728622436523438, "learning_rate": 2.7938434936445946e-06, "loss": 0.0789, "reward": -1.9234638214111328, "reward_std": 3.179391860961914, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.798463821411133, "rewards/wrapped_format_reward": 0.5, "step": 1239 }, { "completion_length": 500.0, "epoch": 248.0, "grad_norm": 0.4159456789493561, "kl": 2.495544672012329, "learning_rate": 2.7902322853130758e-06, "loss": 0.0998, "reward": 1.989149570465088, "reward_std": 1.0644927024841309, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.574999988079071, "rewards/wrapped_driving_reward": -0.21085043251514435, "rewards/wrapped_format_reward": 0.625, "step": 1240 }, { "completion_length": 500.0, "epoch": 248.2, "grad_norm": 0.6543341875076294, "kl": 0.6809861063957214, "learning_rate": 2.7866204630246524e-06, "loss": 0.0272, "reward": -0.7314274311065674, "reward_std": 3.5070412158966064, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.0230941772460938, "rewards/wrapped_format_reward": 0.375, "step": 1241 }, { "completion_length": 500.0, "epoch": 248.4, "grad_norm": 0.8649194836616516, "kl": 1.7983460426330566, "learning_rate": 2.7830080344197675e-06, "loss": 0.0719, "reward": 2.045046091079712, "reward_std": 0.8419520854949951, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": -0.1049538254737854, "rewards/wrapped_format_reward": 0.5, "step": 1242 }, { "completion_length": 496.0, "epoch": 248.6, "grad_norm": 0.5479925274848938, "kl": 2.59714674949646, "learning_rate": 2.7793950071401456e-06, "loss": 0.1039, "reward": 2.37050199508667, "reward_std": 0.6726978421211243, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.637499988079071, "rewards/wrapped_driving_reward": 0.10800187289714813, "rewards/wrapped_format_reward": 0.625, "step": 1243 }, { "completion_length": 500.0, "epoch": 248.8, "grad_norm": 0.6005021333694458, "kl": 1.8280941247940063, "learning_rate": 2.77578138882878e-06, "loss": 0.0731, "reward": 1.4184322357177734, "reward_std": 0.5856792330741882, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.9565678238868713, "rewards/wrapped_format_reward": 0.5, "step": 1244 }, { "completion_length": 500.0, "epoch": 249.0, "grad_norm": 1.4269777536392212, "kl": 0.5068494081497192, "learning_rate": 2.7721671871299115e-06, "loss": 0.0203, "reward": 0.6254051923751831, "reward_std": 3.092291831970215, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46666669845581055, "rewards/wrapped_driving_reward": -0.8412615060806274, "rewards/wrapped_format_reward": 0.25, "step": 1245 }, { "completion_length": 500.0, "epoch": 249.2, "grad_norm": 0.7743470072746277, "kl": 1.7520393133163452, "learning_rate": 2.7685524096890186e-06, "loss": 0.0701, "reward": 0.7374759912490845, "reward_std": 3.1639585494995117, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5583333373069763, "rewards/wrapped_driving_reward": -1.320857286453247, "rewards/wrapped_format_reward": 0.75, "step": 1246 }, { "completion_length": 500.0, "epoch": 249.4, "grad_norm": 0.4360426366329193, "kl": 1.9110513925552368, "learning_rate": 2.7649370641527935e-06, "loss": 0.0764, "reward": 2.1534876823425293, "reward_std": 0.3662029504776001, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7222222089767456, "rewards/wrapped_driving_reward": -0.19373437762260437, "rewards/wrapped_format_reward": 0.625, "step": 1247 }, { "completion_length": 500.0, "epoch": 249.6, "grad_norm": 0.6733804941177368, "kl": 3.088292360305786, "learning_rate": 2.761321158169134e-06, "loss": 0.1235, "reward": 2.572394847869873, "reward_std": 0.6688610315322876, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7130681872367859, "rewards/wrapped_driving_reward": 0.1093265637755394, "rewards/wrapped_format_reward": 0.75, "step": 1248 }, { "completion_length": 500.0, "epoch": 249.8, "grad_norm": 0.6770443916320801, "kl": 1.8496917486190796, "learning_rate": 2.7577046993871204e-06, "loss": 0.074, "reward": -0.5244669914245605, "reward_std": 3.7341253757476807, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3035714328289032, "rewards/wrapped_driving_reward": -1.9530383348464966, "rewards/wrapped_format_reward": 0.625, "step": 1249 }, { "completion_length": 500.0, "epoch": 250.0, "grad_norm": 0.7306457161903381, "kl": 1.9875150918960571, "learning_rate": 2.754087695457005e-06, "loss": 0.0795, "reward": 2.6576943397521973, "reward_std": 0.7947729825973511, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.768750011920929, "rewards/wrapped_driving_reward": -0.11105579137802124, "rewards/wrapped_format_reward": 1.0, "step": 1250 } ], "logging_steps": 1, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 480, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }