{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 500.0, "epoch": 0.2, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0833333333333335e-08, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 1 }, { "completion_length": 500.0, "epoch": 0.4, "grad_norm": 0.4331946074962616, "kl": 0.0, "learning_rate": 4.166666666666667e-08, "loss": -0.0, "reward": -2.195732355117798, "reward_std": 2.577455759048462, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -3.258232355117798, "rewards/wrapped_format_reward": 0.125, "step": 2 }, { "completion_length": 500.0, "epoch": 0.6, "grad_norm": 0.43272849917411804, "kl": 0.0009923786856234074, "learning_rate": 6.250000000000001e-08, "loss": 0.0, "reward": -1.0750184059143066, "reward_std": 3.4081573486328125, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2232142835855484, "rewards/wrapped_driving_reward": -1.9232327938079834, "rewards/wrapped_format_reward": 0.125, "step": 3 }, { "completion_length": 500.0, "epoch": 0.8, "grad_norm": 10.146150588989258, "kl": 0.0021735988557338715, "learning_rate": 8.333333333333334e-08, "loss": 0.0001, "reward": 1.4993090629577637, "reward_std": 3.6849262714385986, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": -0.3444410264492035, "rewards/wrapped_format_reward": 0.5, "step": 4 }, { "completion_length": 500.0, "epoch": 1.0, "grad_norm": 3.436992883682251, "kl": 0.001074329949915409, "learning_rate": 1.0416666666666667e-07, "loss": 0.0, "reward": -1.1928160190582275, "reward_std": 3.2491295337677, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3055555522441864, "rewards/wrapped_driving_reward": -1.9983716011047363, "rewards/wrapped_format_reward": 0.0, "step": 5 }, { "completion_length": 500.0, "epoch": 1.2, "grad_norm": 0.0012362411944195628, "kl": 0.0008123984443955123, "learning_rate": 1.2500000000000002e-07, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 6 }, { "completion_length": 500.0, "epoch": 1.4, "grad_norm": 3.972846031188965, "kl": 0.0018862163415178657, "learning_rate": 1.4583333333333335e-07, "loss": 0.0001, "reward": -1.2093137502670288, "reward_std": 3.227802038192749, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3901515007019043, "rewards/wrapped_driving_reward": -2.0994651317596436, "rewards/wrapped_format_reward": 0.0, "step": 7 }, { "completion_length": 500.0, "epoch": 1.6, "grad_norm": 1.9137581586837769, "kl": 0.0010808327933773398, "learning_rate": 1.6666666666666668e-07, "loss": 0.0, "reward": -1.0708822011947632, "reward_std": 3.423583745956421, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666567325592, "rewards/wrapped_driving_reward": -2.112548828125, "rewards/wrapped_format_reward": 0.25, "step": 8 }, { "completion_length": 500.0, "epoch": 1.8, "grad_norm": 8.79112720489502, "kl": 0.0014885602286085486, "learning_rate": 1.875e-07, "loss": 0.0001, "reward": -2.4617133140563965, "reward_std": 3.076573371887207, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.8367133140563965, "rewards/wrapped_format_reward": 0.0, "step": 9 }, { "completion_length": 500.0, "epoch": 2.0, "grad_norm": 0.0010773384710773826, "kl": 0.0006336356163956225, "learning_rate": 2.0833333333333333e-07, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 10 }, { "completion_length": 500.0, "epoch": 2.2, "grad_norm": 2.8661322593688965, "kl": 0.001619816990569234, "learning_rate": 2.2916666666666666e-07, "loss": 0.0001, "reward": 0.19541072845458984, "reward_std": 2.79884672164917, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4732142686843872, "rewards/wrapped_driving_reward": -1.0278035402297974, "rewards/wrapped_format_reward": 0.0, "step": 11 }, { "completion_length": 500.0, "epoch": 2.4, "grad_norm": 3.63022518157959, "kl": 0.0011069196043536067, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": -0.7504351139068604, "reward_std": 3.4696340560913086, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1770833432674408, "rewards/wrapped_driving_reward": -1.9275184869766235, "rewards/wrapped_format_reward": 0.5, "step": 12 }, { "completion_length": 500.0, "epoch": 2.6, "grad_norm": 1.3208059072494507, "kl": 0.0007781968452036381, "learning_rate": 2.7083333333333337e-07, "loss": 0.0, "reward": -0.5810263752937317, "reward_std": 3.9478907585144043, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5810264348983765, "rewards/wrapped_format_reward": 0.125, "step": 13 }, { "completion_length": 500.0, "epoch": 2.8, "grad_norm": 4.958929061889648, "kl": 0.0007627051090821624, "learning_rate": 2.916666666666667e-07, "loss": 0.0, "reward": 0.09956195950508118, "reward_std": 2.970421552658081, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4571428596973419, "rewards/wrapped_driving_reward": -1.3575809001922607, "rewards/wrapped_format_reward": 0.25, "step": 14 }, { "completion_length": 500.0, "epoch": 3.0, "grad_norm": 9.031028747558594, "kl": 0.0017744852229952812, "learning_rate": 3.125e-07, "loss": 0.0001, "reward": -0.7453500032424927, "reward_std": 3.2129933834075928, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9953501224517822, "rewards/wrapped_format_reward": 0.375, "step": 15 }, { "completion_length": 500.0, "epoch": 3.2, "grad_norm": 4.5779242515563965, "kl": 0.0011605366598814726, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": -0.6313304901123047, "reward_std": 3.0313284397125244, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.0063304901123047, "rewards/wrapped_format_reward": 0.5, "step": 16 }, { "completion_length": 500.0, "epoch": 3.4, "grad_norm": 4.1963629722595215, "kl": 0.0015463099116459489, "learning_rate": 3.541666666666667e-07, "loss": 0.0001, "reward": -1.0474066734313965, "reward_std": 3.410123825073242, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.089073419570923, "rewards/wrapped_format_reward": 0.125, "step": 17 }, { "completion_length": 500.0, "epoch": 3.6, "grad_norm": 2.828728675842285, "kl": 0.0015656519681215286, "learning_rate": 3.75e-07, "loss": 0.0001, "reward": -0.7301186323165894, "reward_std": 3.2286598682403564, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.980118751525879, "rewards/wrapped_format_reward": 0.25, "step": 18 }, { "completion_length": 500.0, "epoch": 3.8, "grad_norm": 2.103822946548462, "kl": 0.0015016624238342047, "learning_rate": 3.9583333333333334e-07, "loss": 0.0001, "reward": 0.5382012724876404, "reward_std": 2.7029671669006348, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.9201321005821228, "rewards/wrapped_format_reward": 0.125, "step": 19 }, { "completion_length": 500.0, "epoch": 4.0, "grad_norm": 1.4944398403167725, "kl": 0.0011990186758339405, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "reward": -2.170128107070923, "reward_std": 3.0023434162139893, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.795128107070923, "rewards/wrapped_format_reward": 0.25, "step": 20 }, { "completion_length": 500.0, "epoch": 4.2, "grad_norm": 0.00705456268042326, "kl": 0.0014940756373107433, "learning_rate": 4.375e-07, "loss": 0.0001, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 21 }, { "completion_length": 500.0, "epoch": 4.4, "grad_norm": 0.3451469838619232, "kl": 0.0007462741923518479, "learning_rate": 4.583333333333333e-07, "loss": 0.0, "reward": -1.0851657390594482, "reward_std": 2.5890932083129883, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6679292917251587, "rewards/wrapped_driving_reward": -2.6280951499938965, "rewards/wrapped_format_reward": 0.125, "step": 22 }, { "completion_length": 500.0, "epoch": 4.6, "grad_norm": 50.23689651489258, "kl": 0.0034247653093189, "learning_rate": 4.791666666666667e-07, "loss": 0.0001, "reward": -0.5867406129837036, "reward_std": 3.6583194732666016, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3901515007019043, "rewards/wrapped_driving_reward": -1.601892113685608, "rewards/wrapped_format_reward": 0.125, "step": 23 }, { "completion_length": 500.0, "epoch": 4.8, "grad_norm": 0.6233353614807129, "kl": 0.0010215980000793934, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": -1.15825355052948, "reward_std": 3.295187473297119, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2857142686843872, "rewards/wrapped_driving_reward": -2.068967819213867, "rewards/wrapped_format_reward": 0.125, "step": 24 }, { "completion_length": 500.0, "epoch": 5.0, "grad_norm": 0.3912212550640106, "kl": 0.000855346501339227, "learning_rate": 5.208333333333334e-07, "loss": 0.0, "reward": 0.6220214366912842, "reward_std": 2.7480878829956055, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.734375, "rewards/wrapped_driving_reward": -0.9873536229133606, "rewards/wrapped_format_reward": 0.125, "step": 25 }, { "completion_length": 500.0, "epoch": 5.2, "grad_norm": 0.9482459425926208, "kl": 0.0009874895913526416, "learning_rate": 5.416666666666667e-07, "loss": 0.0, "reward": -2.19559645652771, "reward_std": 2.9796664714813232, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0535714291036129, "rewards/wrapped_driving_reward": -2.9991679191589355, "rewards/wrapped_format_reward": 0.5, "step": 26 }, { "completion_length": 500.0, "epoch": 5.4, "grad_norm": 4.4480109214782715, "kl": 0.0021480382420122623, "learning_rate": 5.625e-07, "loss": 0.0001, "reward": 0.7437606453895569, "reward_std": 2.506401300430298, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6515151262283325, "rewards/wrapped_driving_reward": -0.9077544212341309, "rewards/wrapped_format_reward": 0.25, "step": 27 }, { "completion_length": 500.0, "epoch": 5.6, "grad_norm": 2.110179901123047, "kl": 0.0010772650130093098, "learning_rate": 5.833333333333334e-07, "loss": 0.0, "reward": -2.052107810974121, "reward_std": 3.263345241546631, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.802107810974121, "rewards/wrapped_format_reward": 0.25, "step": 28 }, { "completion_length": 500.0, "epoch": 5.8, "grad_norm": 0.4768023192882538, "kl": 0.0009159984765574336, "learning_rate": 6.041666666666667e-07, "loss": 0.0, "reward": 0.34745848178863525, "reward_std": 2.903594493865967, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5719696879386902, "rewards/wrapped_driving_reward": -0.9745111465454102, "rewards/wrapped_format_reward": 0.0, "step": 29 }, { "completion_length": 500.0, "epoch": 6.0, "grad_norm": 0.3444426357746124, "kl": 0.000862763321492821, "learning_rate": 6.25e-07, "loss": 0.0, "reward": -0.7680141925811768, "reward_std": 3.4674270153045654, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -1.9555140733718872, "rewards/wrapped_format_reward": 0.375, "step": 30 }, { "completion_length": 500.0, "epoch": 6.2, "grad_norm": 1.7202939987182617, "kl": 0.0010583762777969241, "learning_rate": 6.458333333333334e-07, "loss": 0.0, "reward": -1.404773235321045, "reward_std": 3.095156192779541, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.467273235321045, "rewards/wrapped_format_reward": 0.125, "step": 31 }, { "completion_length": 500.0, "epoch": 6.4, "grad_norm": 0.7919694781303406, "kl": 0.0008548864279873669, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": 0.7130440473556519, "reward_std": 3.1504263877868652, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6428571343421936, "rewards/wrapped_driving_reward": -0.804813027381897, "rewards/wrapped_format_reward": 0.125, "step": 32 }, { "completion_length": 500.0, "epoch": 6.6, "grad_norm": 3.1069979667663574, "kl": 0.0012187480460852385, "learning_rate": 6.875000000000001e-07, "loss": 0.0, "reward": -2.399667739868164, "reward_std": 2.877002477645874, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.06818182021379471, "rewards/wrapped_driving_reward": -2.9678494930267334, "rewards/wrapped_format_reward": 0.25, "step": 33 }, { "completion_length": 500.0, "epoch": 6.8, "grad_norm": 0.730955183506012, "kl": 0.0008993800729513168, "learning_rate": 7.083333333333334e-07, "loss": 0.0, "reward": -2.6763646602630615, "reward_std": 2.647270679473877, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -3.108182907104492, "rewards/wrapped_format_reward": 0.0, "step": 34 }, { "completion_length": 500.0, "epoch": 7.0, "grad_norm": 5.383649826049805, "kl": 0.006175986025482416, "learning_rate": 7.291666666666667e-07, "loss": 0.0002, "reward": 3.2691361904144287, "reward_std": 0.3647014796733856, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9017857313156128, "rewards/wrapped_driving_reward": 0.8673505187034607, "rewards/wrapped_format_reward": 0.5, "step": 35 }, { "completion_length": 500.0, "epoch": 7.2, "grad_norm": 0.41836127638816833, "kl": 0.0009351923363283277, "learning_rate": 7.5e-07, "loss": 0.0, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 36 }, { "completion_length": 500.0, "epoch": 7.4, "grad_norm": 1.9916703701019287, "kl": 0.0017392473528161645, "learning_rate": 7.708333333333334e-07, "loss": 0.0001, "reward": -2.597564935684204, "reward_std": 2.804870128631592, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.097564935684204, "rewards/wrapped_format_reward": 0.0, "step": 37 }, { "completion_length": 500.0, "epoch": 7.6, "grad_norm": 0.433552622795105, "kl": 0.0009353554341942072, "learning_rate": 7.916666666666667e-07, "loss": 0.0, "reward": -2.5462069511413574, "reward_std": 2.5850212574005127, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.0462067127227783, "rewards/wrapped_format_reward": 0.125, "step": 38 }, { "completion_length": 500.0, "epoch": 7.8, "grad_norm": 1.2196906805038452, "kl": 0.0008541917777620256, "learning_rate": 8.125000000000001e-07, "loss": 0.0, "reward": -1.038360834121704, "reward_std": 3.1846678256988525, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2083333283662796, "rewards/wrapped_driving_reward": -1.9966940879821777, "rewards/wrapped_format_reward": 0.25, "step": 39 }, { "completion_length": 500.0, "epoch": 8.0, "grad_norm": 1.281370997428894, "kl": 0.0012979316525161266, "learning_rate": 8.333333333333333e-07, "loss": 0.0001, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 40 }, { "completion_length": 500.0, "epoch": 8.2, "grad_norm": 0.6021063923835754, "kl": 0.0010175962233915925, "learning_rate": 8.541666666666667e-07, "loss": 0.0, "reward": 0.5651559829711914, "reward_std": 3.080660820007324, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.9348440170288086, "rewards/wrapped_format_reward": 0.25, "step": 41 }, { "completion_length": 500.0, "epoch": 8.4, "grad_norm": 0.3452380895614624, "kl": 0.0009200552594847977, "learning_rate": 8.75e-07, "loss": 0.0, "reward": -2.3739185333251953, "reward_std": 3.2521629333496094, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -2.7905852794647217, "rewards/wrapped_format_reward": 0.0, "step": 42 }, { "completion_length": 500.0, "epoch": 8.6, "grad_norm": 2.9192402362823486, "kl": 0.0010521383956074715, "learning_rate": 8.958333333333334e-07, "loss": 0.0, "reward": -2.741067886352539, "reward_std": 2.197209596633911, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.015625, "rewards/wrapped_driving_reward": -3.131692886352539, "rewards/wrapped_format_reward": 0.125, "step": 43 }, { "completion_length": 500.0, "epoch": 8.8, "grad_norm": 1.652388572692871, "kl": 0.001037920475937426, "learning_rate": 9.166666666666666e-07, "loss": 0.0, "reward": -0.9824157357215881, "reward_std": 3.4965455532073975, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.914233922958374, "rewards/wrapped_format_reward": 0.0, "step": 44 }, { "completion_length": 500.0, "epoch": 9.0, "grad_norm": 1.7152076959609985, "kl": 0.0010146588319912553, "learning_rate": 9.375000000000001e-07, "loss": 0.0, "reward": 1.4902138710021973, "reward_std": 0.6985129117965698, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.3847861886024475, "rewards/wrapped_format_reward": 0.125, "step": 45 }, { "completion_length": 500.0, "epoch": 9.2, "grad_norm": 0.43010541796684265, "kl": 0.0009054208057932556, "learning_rate": 9.583333333333334e-07, "loss": 0.0, "reward": 0.844261884689331, "reward_std": 2.5686442852020264, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3854166567325592, "rewards/wrapped_driving_reward": -1.0411547422409058, "rewards/wrapped_format_reward": 0.75, "step": 46 }, { "completion_length": 500.0, "epoch": 9.4, "grad_norm": 1.6765927076339722, "kl": 0.0007561460370197892, "learning_rate": 9.791666666666667e-07, "loss": 0.0, "reward": -0.8482348918914795, "reward_std": 3.6395058631896973, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.9107348918914795, "rewards/wrapped_format_reward": 0.125, "step": 47 }, { "completion_length": 500.0, "epoch": 9.6, "grad_norm": 0.39042791724205017, "kl": 0.0008848680299706757, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": -1.178413987159729, "reward_std": 3.0193841457366943, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2770833373069763, "rewards/wrapped_driving_reward": -2.2054972648620605, "rewards/wrapped_format_reward": 0.25, "step": 48 }, { "completion_length": 500.0, "epoch": 9.8, "grad_norm": 0.8740907907485962, "kl": 0.0009530234383419156, "learning_rate": 1.0208333333333334e-06, "loss": 0.0, "reward": -1.9666993618011475, "reward_std": 2.7736213207244873, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9666993618011475, "rewards/wrapped_format_reward": 0.5, "step": 49 }, { "completion_length": 500.0, "epoch": 10.0, "grad_norm": 0.3344237208366394, "kl": 0.0007590156164951622, "learning_rate": 1.0416666666666667e-06, "loss": 0.0, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 50 }, { "completion_length": 500.0, "epoch": 10.2, "grad_norm": 0.32544267177581787, "kl": 0.0008944781147874892, "learning_rate": 1.0625e-06, "loss": 0.0, "reward": 0.4576635956764221, "reward_std": 2.982259511947632, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5952380895614624, "rewards/wrapped_driving_reward": -1.012574553489685, "rewards/wrapped_format_reward": 0.125, "step": 51 }, { "completion_length": 500.0, "epoch": 10.4, "grad_norm": 5.682590007781982, "kl": 0.0026439097709953785, "learning_rate": 1.0833333333333335e-06, "loss": 0.0001, "reward": -1.0434162616729736, "reward_std": 3.424217462539673, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9184162616729736, "rewards/wrapped_format_reward": 0.0, "step": 52 }, { "completion_length": 500.0, "epoch": 10.6, "grad_norm": 0.5044906735420227, "kl": 0.0010058499174192548, "learning_rate": 1.1041666666666668e-06, "loss": 0.0, "reward": 1.684746265411377, "reward_std": 0.37403419613838196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8125, "rewards/wrapped_driving_reward": -0.12775368988513947, "rewards/wrapped_format_reward": 0.0, "step": 53 }, { "completion_length": 500.0, "epoch": 10.8, "grad_norm": 3.0662996768951416, "kl": 0.0013428920647129416, "learning_rate": 1.125e-06, "loss": 0.0001, "reward": -0.6375584602355957, "reward_std": 3.885220527648926, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.6943767070770264, "rewards/wrapped_format_reward": 0.125, "step": 54 }, { "completion_length": 500.0, "epoch": 11.0, "grad_norm": 8.792601585388184, "kl": 0.002546559553593397, "learning_rate": 1.1458333333333333e-06, "loss": 0.0001, "reward": -1.0800423622131348, "reward_std": 3.3800711631774902, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.9300422668457031, "rewards/wrapped_format_reward": 0.0, "step": 55 }, { "completion_length": 500.0, "epoch": 11.2, "grad_norm": 0.4684543013572693, "kl": 0.0008864883566275239, "learning_rate": 1.1666666666666668e-06, "loss": 0.0, "reward": 0.3871076703071594, "reward_std": 3.0099942684173584, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.2128922939300537, "rewards/wrapped_format_reward": 0.25, "step": 56 }, { "completion_length": 500.0, "epoch": 11.4, "grad_norm": 3.0129141807556152, "kl": 0.001087621902115643, "learning_rate": 1.1875e-06, "loss": 0.0, "reward": -0.7230278253555298, "reward_std": 3.5053629875183105, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.898027777671814, "rewards/wrapped_format_reward": 0.25, "step": 57 }, { "completion_length": 500.0, "epoch": 11.6, "grad_norm": 4.786574363708496, "kl": 0.001360047492198646, "learning_rate": 1.2083333333333333e-06, "loss": 0.0001, "reward": -1.245678186416626, "reward_std": 3.180633544921875, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.995678186416626, "rewards/wrapped_format_reward": 0.0, "step": 58 }, { "completion_length": 500.0, "epoch": 11.8, "grad_norm": 17.00609588623047, "kl": 0.0034802549052983522, "learning_rate": 1.2291666666666666e-06, "loss": 0.0001, "reward": 1.0649046897888184, "reward_std": 3.3774664402008057, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.37259531021118164, "rewards/wrapped_format_reward": 0.125, "step": 59 }, { "completion_length": 500.0, "epoch": 12.0, "grad_norm": 3.239806652069092, "kl": 0.003311566775664687, "learning_rate": 1.25e-06, "loss": 0.0001, "reward": 0.46374011039733887, "reward_std": 3.0523617267608643, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": -1.1195932626724243, "rewards/wrapped_format_reward": 0.25, "step": 60 }, { "completion_length": 500.0, "epoch": 12.2, "grad_norm": 0.33646300435066223, "kl": 0.000738381699193269, "learning_rate": 1.2708333333333334e-06, "loss": 0.0, "reward": -1.9281842708587646, "reward_std": 2.4151525497436523, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -3.0031843185424805, "rewards/wrapped_format_reward": 0.375, "step": 61 }, { "completion_length": 500.0, "epoch": 12.4, "grad_norm": 3.0721795558929443, "kl": 0.00422726571559906, "learning_rate": 1.2916666666666669e-06, "loss": 0.0002, "reward": -2.1507418155670166, "reward_std": 2.7292230129241943, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.0257418155670166, "rewards/wrapped_format_reward": 0.375, "step": 62 }, { "completion_length": 500.0, "epoch": 12.6, "grad_norm": 0.3383236527442932, "kl": 0.0008720280602574348, "learning_rate": 1.3125000000000001e-06, "loss": 0.0, "reward": 0.4363464117050171, "reward_std": 2.9959378242492676, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5255681872367859, "rewards/wrapped_driving_reward": -0.964221715927124, "rewards/wrapped_format_reward": 0.125, "step": 63 }, { "completion_length": 500.0, "epoch": 12.8, "grad_norm": 1.1513729095458984, "kl": 0.0009778901003301144, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": -1.2415308952331543, "reward_std": 3.18522047996521, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -2.1790308952331543, "rewards/wrapped_format_reward": 0.0, "step": 64 }, { "completion_length": 500.0, "epoch": 13.0, "grad_norm": 0.3557533025741577, "kl": 0.0007779670413583517, "learning_rate": 1.3541666666666667e-06, "loss": 0.0, "reward": -2.398324966430664, "reward_std": 2.8796792030334473, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -3.0649914741516113, "rewards/wrapped_format_reward": 0.25, "step": 65 }, { "completion_length": 500.0, "epoch": 13.2, "grad_norm": 3.0979981422424316, "kl": 0.002764773555099964, "learning_rate": 1.3750000000000002e-06, "loss": 0.0001, "reward": 0.25424015522003174, "reward_std": 2.8697030544281006, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.870759904384613, "rewards/wrapped_format_reward": 0.0, "step": 66 }, { "completion_length": 500.0, "epoch": 13.4, "grad_norm": 0.0013586197746917605, "kl": 0.0009537562145851552, "learning_rate": 1.3958333333333335e-06, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 67 }, { "completion_length": 500.0, "epoch": 13.6, "grad_norm": 2.009368419647217, "kl": 0.002123823156580329, "learning_rate": 1.4166666666666667e-06, "loss": 0.0001, "reward": -3.625, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 68 }, { "completion_length": 500.0, "epoch": 13.8, "grad_norm": 4.284774303436279, "kl": 0.0029185714665800333, "learning_rate": 1.4375e-06, "loss": 0.0001, "reward": 0.39181816577911377, "reward_std": 3.028280019760132, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5583333373069763, "rewards/wrapped_driving_reward": -1.1665152311325073, "rewards/wrapped_format_reward": 0.25, "step": 69 }, { "completion_length": 500.0, "epoch": 14.0, "grad_norm": 8.606492042541504, "kl": 0.0037718252278864384, "learning_rate": 1.4583333333333335e-06, "loss": 0.0002, "reward": -1.2599546909332275, "reward_std": 3.1716644763946533, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2818181812763214, "rewards/wrapped_driving_reward": -2.0417728424072266, "rewards/wrapped_format_reward": 0.0, "step": 70 }, { "completion_length": 500.0, "epoch": 14.2, "grad_norm": 1.687968373298645, "kl": 0.0014080241089686751, "learning_rate": 1.4791666666666668e-06, "loss": 0.0001, "reward": 1.1474496126174927, "reward_std": 3.4946651458740234, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.390625, "rewards/wrapped_driving_reward": -0.3681753873825073, "rewards/wrapped_format_reward": 0.375, "step": 71 }, { "completion_length": 500.0, "epoch": 14.4, "grad_norm": 2.8704042434692383, "kl": 0.0016583104152232409, "learning_rate": 1.5e-06, "loss": 0.0001, "reward": 0.42686039209365845, "reward_std": 2.952040910720825, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3083333373069763, "rewards/wrapped_driving_reward": -0.8814729452133179, "rewards/wrapped_format_reward": 0.25, "step": 72 }, { "completion_length": 500.0, "epoch": 14.6, "grad_norm": 0.32328060269355774, "kl": 0.0007918566698208451, "learning_rate": 1.5208333333333333e-06, "loss": 0.0, "reward": -0.9926960468292236, "reward_std": 3.4759135246276855, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -2.0495142936706543, "rewards/wrapped_format_reward": 0.25, "step": 73 }, { "completion_length": 500.0, "epoch": 14.8, "grad_norm": 33.80134201049805, "kl": 0.0036835242062807083, "learning_rate": 1.5416666666666668e-06, "loss": 0.0001, "reward": -0.7441283464431763, "reward_std": 3.4832684993743896, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9941283464431763, "rewards/wrapped_format_reward": 0.5, "step": 74 }, { "completion_length": 500.0, "epoch": 15.0, "grad_norm": 2.6355581283569336, "kl": 0.003368059406057, "learning_rate": 1.5625e-06, "loss": 0.0001, "reward": -2.332406997680664, "reward_std": 3.335186243057251, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.957406997680664, "rewards/wrapped_format_reward": 0.125, "step": 75 }, { "completion_length": 500.0, "epoch": 15.2, "grad_norm": 0.3515971302986145, "kl": 0.0008471008623018861, "learning_rate": 1.5833333333333333e-06, "loss": 0.0, "reward": 1.0511770248413086, "reward_std": 3.3706252574920654, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.550000011920929, "rewards/wrapped_driving_reward": -0.3738229274749756, "rewards/wrapped_format_reward": 0.125, "step": 76 }, { "completion_length": 500.0, "epoch": 15.4, "grad_norm": 2.122354745864868, "kl": 0.002714228816330433, "learning_rate": 1.6041666666666668e-06, "loss": 0.0001, "reward": 1.80057692527771, "reward_std": 0.6607468128204346, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.9166666865348816, "rewards/wrapped_driving_reward": -0.24108988046646118, "rewards/wrapped_format_reward": 0.125, "step": 77 }, { "completion_length": 500.0, "epoch": 15.6, "grad_norm": 13.85558032989502, "kl": 0.006086647976189852, "learning_rate": 1.6250000000000001e-06, "loss": 0.0002, "reward": -0.6382275819778442, "reward_std": 3.3426644802093506, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -1.9819775819778442, "rewards/wrapped_format_reward": 0.375, "step": 78 }, { "completion_length": 500.0, "epoch": 15.8, "grad_norm": 2.019861936569214, "kl": 0.0009517069556750357, "learning_rate": 1.6458333333333334e-06, "loss": 0.0, "reward": -1.128483533859253, "reward_std": 3.035717010498047, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.1853017807006836, "rewards/wrapped_format_reward": 0.125, "step": 79 }, { "completion_length": 500.0, "epoch": 16.0, "grad_norm": 2.5614662170410156, "kl": 0.0013113848399370909, "learning_rate": 1.6666666666666667e-06, "loss": 0.0001, "reward": -2.397922992706299, "reward_std": 3.2041540145874023, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.17499999701976776, "rewards/wrapped_driving_reward": -2.947923183441162, "rewards/wrapped_format_reward": 0.125, "step": 80 }, { "completion_length": 500.0, "epoch": 16.2, "grad_norm": 0.3375092148780823, "kl": 0.0006573036080226302, "learning_rate": 1.6875000000000001e-06, "loss": 0.0, "reward": 1.6859958171844482, "reward_std": 0.27753275632858276, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": 0.0041775694116950035, "rewards/wrapped_format_reward": 0.0, "step": 81 }, { "completion_length": 500.0, "epoch": 16.4, "grad_norm": 0.3876532316207886, "kl": 0.000868563074618578, "learning_rate": 1.7083333333333334e-06, "loss": 0.0, "reward": -0.7118014097213745, "reward_std": 3.8014400005340576, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5868014097213745, "rewards/wrapped_format_reward": 0.0, "step": 82 }, { "completion_length": 500.0, "epoch": 16.6, "grad_norm": 0.4464600682258606, "kl": 0.0009050997905433178, "learning_rate": 1.7291666666666667e-06, "loss": 0.0, "reward": -2.545438289642334, "reward_std": 2.909123182296753, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.045438289642334, "rewards/wrapped_format_reward": 0.0, "step": 83 }, { "completion_length": 500.0, "epoch": 16.8, "grad_norm": 7.183985710144043, "kl": 0.006316404789686203, "learning_rate": 1.75e-06, "loss": 0.0003, "reward": -1.0390408039093018, "reward_std": 3.426302433013916, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -1.8307075500488281, "rewards/wrapped_format_reward": 0.0, "step": 84 }, { "completion_length": 500.0, "epoch": 17.0, "grad_norm": 4.626237869262695, "kl": 0.0024934238754212856, "learning_rate": 1.7708333333333337e-06, "loss": 0.0001, "reward": -0.7597904205322266, "reward_std": 3.164623260498047, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.8847904205322266, "rewards/wrapped_format_reward": 0.375, "step": 85 }, { "completion_length": 500.0, "epoch": 17.2, "grad_norm": 3.646864175796509, "kl": 0.006598799955099821, "learning_rate": 1.7916666666666667e-06, "loss": 0.0003, "reward": -0.8650339841842651, "reward_std": 3.405834436416626, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.1150341033935547, "rewards/wrapped_format_reward": 0.375, "step": 86 }, { "completion_length": 500.0, "epoch": 17.4, "grad_norm": 0.38579967617988586, "kl": 0.0009482253226451576, "learning_rate": 1.8125e-06, "loss": 0.0, "reward": -1.179398536682129, "reward_std": 2.9798529148101807, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3333333134651184, "rewards/wrapped_driving_reward": -2.1377317905426025, "rewards/wrapped_format_reward": 0.125, "step": 87 }, { "completion_length": 500.0, "epoch": 17.6, "grad_norm": 5.8937668800354, "kl": 0.0027326152194291353, "learning_rate": 1.8333333333333333e-06, "loss": 0.0001, "reward": 2.8879857063293457, "reward_std": 0.8419812321662903, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.898809552192688, "rewards/wrapped_driving_reward": 0.48917609453201294, "rewards/wrapped_format_reward": 0.5, "step": 88 }, { "completion_length": 500.0, "epoch": 17.8, "grad_norm": 0.4111790359020233, "kl": 0.0008414680487476289, "learning_rate": 1.854166666666667e-06, "loss": 0.0, "reward": -2.695969581604004, "reward_std": 2.608060836791992, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -3.0459694862365723, "rewards/wrapped_format_reward": 0.0, "step": 89 }, { "completion_length": 500.0, "epoch": 18.0, "grad_norm": 1.19579017162323, "kl": 0.0015203645452857018, "learning_rate": 1.8750000000000003e-06, "loss": 0.0001, "reward": -2.6511423587799072, "reward_std": 2.6977152824401855, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.9948923587799072, "rewards/wrapped_format_reward": 0.0, "step": 90 }, { "completion_length": 500.0, "epoch": 18.2, "grad_norm": 3.6108834743499756, "kl": 0.019146539270877838, "learning_rate": 1.8958333333333333e-06, "loss": 0.0008, "reward": -0.8020716905593872, "reward_std": 2.8371593952178955, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3031249940395355, "rewards/wrapped_driving_reward": -1.9801967144012451, "rewards/wrapped_format_reward": 0.375, "step": 91 }, { "completion_length": 500.0, "epoch": 18.4, "grad_norm": 1.2253782749176025, "kl": 0.0009205341921187937, "learning_rate": 1.916666666666667e-06, "loss": 0.0, "reward": -0.9882822036743164, "reward_std": 3.4778919219970703, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.28928571939468384, "rewards/wrapped_driving_reward": -1.9025678634643555, "rewards/wrapped_format_reward": 0.125, "step": 92 }, { "completion_length": 500.0, "epoch": 18.6, "grad_norm": 0.396138995885849, "kl": 0.0009751519537530839, "learning_rate": 1.9375e-06, "loss": 0.0, "reward": -1.189629077911377, "reward_std": 3.282799243927002, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1805555522441864, "rewards/wrapped_driving_reward": -1.9951846599578857, "rewards/wrapped_format_reward": 0.125, "step": 93 }, { "completion_length": 500.0, "epoch": 18.8, "grad_norm": 5.001523971557617, "kl": 0.0025945594534277916, "learning_rate": 1.9583333333333334e-06, "loss": 0.0001, "reward": -0.38306254148483276, "reward_std": 4.17663049697876, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.34375, "rewards/wrapped_driving_reward": -1.6018126010894775, "rewards/wrapped_format_reward": 0.375, "step": 94 }, { "completion_length": 500.0, "epoch": 19.0, "grad_norm": 2.349184274673462, "kl": 0.009762048721313477, "learning_rate": 1.9791666666666666e-06, "loss": 0.0004, "reward": -2.3886988162994385, "reward_std": 2.599043846130371, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.20000000298023224, "rewards/wrapped_driving_reward": -3.3386988639831543, "rewards/wrapped_format_reward": 0.5, "step": 95 }, { "completion_length": 500.0, "epoch": 19.2, "grad_norm": 2.266725778579712, "kl": 0.0014983770670369267, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "reward": -2.6680169105529785, "reward_std": 2.663965940475464, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -3.018017053604126, "rewards/wrapped_format_reward": 0.0, "step": 96 }, { "completion_length": 500.0, "epoch": 19.4, "grad_norm": 1.2776412963867188, "kl": 0.0011785050155594945, "learning_rate": 2.0208333333333336e-06, "loss": 0.0, "reward": -2.4784326553344727, "reward_std": 2.42277193069458, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -2.9784326553344727, "rewards/wrapped_format_reward": 0.25, "step": 97 }, { "completion_length": 500.0, "epoch": 19.6, "grad_norm": 5.367588996887207, "kl": 0.0031056797597557306, "learning_rate": 2.041666666666667e-06, "loss": 0.0001, "reward": -2.46954345703125, "reward_std": 3.0609130859375, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.203125, "rewards/wrapped_driving_reward": -3.04766845703125, "rewards/wrapped_format_reward": 0.125, "step": 98 }, { "completion_length": 500.0, "epoch": 19.8, "grad_norm": 0.42096036672592163, "kl": 0.0008003456750884652, "learning_rate": 2.0625e-06, "loss": 0.0, "reward": -2.5779201984405518, "reward_std": 2.521865129470825, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0833333358168602, "rewards/wrapped_driving_reward": -3.0362536907196045, "rewards/wrapped_format_reward": 0.125, "step": 99 }, { "completion_length": 500.0, "epoch": 20.0, "grad_norm": 2.5479648113250732, "kl": 0.0011869773734360933, "learning_rate": 2.0833333333333334e-06, "loss": 0.0, "reward": 0.8239460587501526, "reward_std": 3.235414743423462, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5308441519737244, "rewards/wrapped_driving_reward": -0.7068981528282166, "rewards/wrapped_format_reward": 0.25, "step": 100 }, { "completion_length": 500.0, "epoch": 20.2, "grad_norm": 0.386593222618103, "kl": 0.0008260260801762342, "learning_rate": 2.1041666666666667e-06, "loss": 0.0, "reward": -2.655496120452881, "reward_std": 2.689007520675659, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1428571492433548, "rewards/wrapped_driving_reward": -3.048353433609009, "rewards/wrapped_format_reward": 0.0, "step": 101 }, { "completion_length": 500.0, "epoch": 20.4, "grad_norm": 1.7718898057937622, "kl": 0.03829975798726082, "learning_rate": 2.125e-06, "loss": 0.0015, "reward": -2.821061134338379, "reward_std": 1.417920708656311, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.17307692766189575, "rewards/wrapped_driving_reward": -3.61913800239563, "rewards/wrapped_format_reward": 0.375, "step": 102 }, { "completion_length": 500.0, "epoch": 20.6, "grad_norm": 0.3890434205532074, "kl": 0.0009301622048951685, "learning_rate": 2.1458333333333333e-06, "loss": 0.0, "reward": -1.0485785007476807, "reward_std": 3.4387564659118652, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -2.223578453063965, "rewards/wrapped_format_reward": 0.25, "step": 103 }, { "completion_length": 500.0, "epoch": 20.8, "grad_norm": 1.8730741739273071, "kl": 0.003595958696678281, "learning_rate": 2.166666666666667e-06, "loss": 0.0001, "reward": -1.01571524143219, "reward_std": 3.184199333190918, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2708333134651184, "rewards/wrapped_driving_reward": -1.9115486145019531, "rewards/wrapped_format_reward": 0.125, "step": 104 }, { "completion_length": 500.0, "epoch": 21.0, "grad_norm": 9.170056343078613, "kl": 0.01782212406396866, "learning_rate": 2.1875000000000002e-06, "loss": 0.0007, "reward": -0.8671887516975403, "reward_std": 3.62764048576355, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.992188811302185, "rewards/wrapped_format_reward": 0.375, "step": 105 }, { "completion_length": 500.0, "epoch": 21.2, "grad_norm": 0.5751691460609436, "kl": 0.0012470635119825602, "learning_rate": 2.2083333333333335e-06, "loss": 0.0, "reward": 2.0378382205963135, "reward_std": 0.5405812859535217, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.781818151473999, "rewards/wrapped_driving_reward": 0.006020138971507549, "rewards/wrapped_format_reward": 0.25, "step": 106 }, { "completion_length": 500.0, "epoch": 21.4, "grad_norm": 0.3796798288822174, "kl": 0.0008278587483800948, "learning_rate": 2.2291666666666668e-06, "loss": 0.0, "reward": -0.9299300909042358, "reward_std": 3.0426578521728516, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -2.111748456954956, "rewards/wrapped_format_reward": 0.5, "step": 107 }, { "completion_length": 500.0, "epoch": 21.6, "grad_norm": 3.6200075149536133, "kl": 0.00310003524646163, "learning_rate": 2.25e-06, "loss": 0.0001, "reward": -0.7615724205970764, "reward_std": 3.7413580417633057, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.6365723609924316, "rewards/wrapped_format_reward": 0.0, "step": 108 }, { "completion_length": 500.0, "epoch": 21.8, "grad_norm": 5.526390075683594, "kl": 0.0022852052934467793, "learning_rate": 2.2708333333333333e-06, "loss": 0.0001, "reward": -2.3737564086914062, "reward_std": 2.928654193878174, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9987564086914062, "rewards/wrapped_format_reward": 0.25, "step": 109 }, { "completion_length": 500.0, "epoch": 22.0, "grad_norm": 3.324976921081543, "kl": 0.027398547157645226, "learning_rate": 2.2916666666666666e-06, "loss": 0.0011, "reward": -1.607568621635437, "reward_std": 2.372925281524658, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4427083432674408, "rewards/wrapped_driving_reward": -2.9252769947052, "rewards/wrapped_format_reward": 0.125, "step": 110 }, { "completion_length": 500.0, "epoch": 22.2, "grad_norm": 37.2985954284668, "kl": 0.02997448667883873, "learning_rate": 2.3125000000000003e-06, "loss": 0.0012, "reward": -3.625, "reward_std": 0.4787135720252991, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.375, "step": 111 }, { "completion_length": 500.0, "epoch": 22.4, "grad_norm": 1.5794304609298706, "kl": 0.003044416196644306, "learning_rate": 2.3333333333333336e-06, "loss": 0.0001, "reward": -2.8739120960235596, "reward_std": 2.252175807952881, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -3.290578842163086, "rewards/wrapped_format_reward": 0.0, "step": 112 }, { "completion_length": 500.0, "epoch": 22.6, "grad_norm": 4.685129642486572, "kl": 0.003984578885138035, "learning_rate": 2.354166666666667e-06, "loss": 0.0002, "reward": -0.772267758846283, "reward_std": 3.1760997772216797, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -2.0222678184509277, "rewards/wrapped_format_reward": 0.25, "step": 113 }, { "completion_length": 500.0, "epoch": 22.8, "grad_norm": 12.547566413879395, "kl": 0.030495142564177513, "learning_rate": 2.375e-06, "loss": 0.0012, "reward": 1.7730681896209717, "reward_std": 0.5380395650863647, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.02306819148361683, "rewards/wrapped_format_reward": 0.0, "step": 114 }, { "completion_length": 500.0, "epoch": 23.0, "grad_norm": 2.9405810832977295, "kl": 0.001207878114655614, "learning_rate": 2.395833333333334e-06, "loss": 0.0, "reward": -1.1156203746795654, "reward_std": 3.3368895053863525, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9906203746795654, "rewards/wrapped_format_reward": 0.0, "step": 115 }, { "completion_length": 500.0, "epoch": 23.2, "grad_norm": 1.6627004146575928, "kl": 0.0010354293044656515, "learning_rate": 2.4166666666666667e-06, "loss": 0.0, "reward": -0.9815444350242615, "reward_std": 3.485572576522827, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -2.012794256210327, "rewards/wrapped_format_reward": 0.125, "step": 116 }, { "completion_length": 500.0, "epoch": 23.4, "grad_norm": 15.359031677246094, "kl": 0.01042311079800129, "learning_rate": 2.4375e-06, "loss": 0.0004, "reward": -2.5663912296295166, "reward_std": 2.867217540740967, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9413912296295166, "rewards/wrapped_format_reward": 0.0, "step": 117 }, { "completion_length": 500.0, "epoch": 23.6, "grad_norm": 1.0322840213775635, "kl": 0.0010455718729645014, "learning_rate": 2.4583333333333332e-06, "loss": 0.0, "reward": -0.7912392616271973, "reward_std": 3.706042528152466, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -1.973057508468628, "rewards/wrapped_format_reward": 0.375, "step": 118 }, { "completion_length": 500.0, "epoch": 23.8, "grad_norm": 2.897580862045288, "kl": 0.0041581131517887115, "learning_rate": 2.479166666666667e-06, "loss": 0.0002, "reward": -0.5823937058448792, "reward_std": 3.6703875064849854, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.5823936462402344, "rewards/wrapped_format_reward": 0.25, "step": 119 }, { "completion_length": 500.0, "epoch": 24.0, "grad_norm": 0.8678112030029297, "kl": 0.005057378206402063, "learning_rate": 2.5e-06, "loss": 0.0002, "reward": 1.8686788082122803, "reward_std": 0.3760830760002136, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": -0.06313925981521606, "rewards/wrapped_format_reward": 0.25, "step": 120 }, { "completion_length": 500.0, "epoch": 24.2, "grad_norm": 0.45261016488075256, "kl": 0.0010572966421023011, "learning_rate": 2.5208333333333335e-06, "loss": 0.0, "reward": -2.406386137008667, "reward_std": 3.187227725982666, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -2.968886137008667, "rewards/wrapped_format_reward": 0.125, "step": 121 }, { "completion_length": 500.0, "epoch": 24.4, "grad_norm": 1.2322733402252197, "kl": 0.001089164288714528, "learning_rate": 2.5416666666666668e-06, "loss": 0.0, "reward": -1.1155859231948853, "reward_std": 3.3368074893951416, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9905859231948853, "rewards/wrapped_format_reward": 0.0, "step": 122 }, { "completion_length": 500.0, "epoch": 24.6, "grad_norm": 0.3946419060230255, "kl": 0.0009771620389074087, "learning_rate": 2.5625e-06, "loss": 0.0, "reward": -2.3373544216156006, "reward_std": 2.7000937461853027, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.9311044216156006, "rewards/wrapped_format_reward": 0.25, "step": 123 }, { "completion_length": 500.0, "epoch": 24.8, "grad_norm": 0.416446328163147, "kl": 0.0009063539328053594, "learning_rate": 2.5833333333333337e-06, "loss": 0.0, "reward": -2.6977710723876953, "reward_std": 2.6044580936431885, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0625, "rewards/wrapped_driving_reward": -3.0102710723876953, "rewards/wrapped_format_reward": 0.0, "step": 124 }, { "completion_length": 500.0, "epoch": 25.0, "grad_norm": 0.37381142377853394, "kl": 0.0009684949764050543, "learning_rate": 2.604166666666667e-06, "loss": 0.0, "reward": -0.5735440254211426, "reward_std": 3.6742913722991943, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3020833134651184, "rewards/wrapped_driving_reward": -1.6256272792816162, "rewards/wrapped_format_reward": 0.25, "step": 125 }, { "completion_length": 500.0, "epoch": 25.2, "grad_norm": 0.3716869354248047, "kl": 0.0008447925210930407, "learning_rate": 2.6250000000000003e-06, "loss": 0.0, "reward": -0.8787834644317627, "reward_std": 3.3303563594818115, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -1.920450210571289, "rewards/wrapped_format_reward": 0.25, "step": 126 }, { "completion_length": 500.0, "epoch": 25.4, "grad_norm": 1.7830382585525513, "kl": 0.0016754590906202793, "learning_rate": 2.6458333333333336e-06, "loss": 0.0001, "reward": 2.153042793273926, "reward_std": 0.3702898919582367, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": 0.15304280817508698, "rewards/wrapped_format_reward": 0.375, "step": 127 }, { "completion_length": 500.0, "epoch": 25.6, "grad_norm": 0.40261051058769226, "kl": 0.0009311916655860841, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "reward": -2.4677088260650635, "reward_std": 2.7414004802703857, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -2.942708730697632, "rewards/wrapped_format_reward": 0.125, "step": 128 }, { "completion_length": 500.0, "epoch": 25.8, "grad_norm": 6.308819770812988, "kl": 0.00869703572243452, "learning_rate": 2.6875e-06, "loss": 0.0003, "reward": -0.47751596570014954, "reward_std": 3.236863136291504, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.41818180680274963, "rewards/wrapped_driving_reward": -2.020697832107544, "rewards/wrapped_format_reward": 0.625, "step": 129 }, { "completion_length": 500.0, "epoch": 26.0, "grad_norm": 3.463308572769165, "kl": 0.03951391950249672, "learning_rate": 2.7083333333333334e-06, "loss": 0.0016, "reward": -2.465014934539795, "reward_std": 2.109844207763672, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0416666679084301, "rewards/wrapped_driving_reward": -3.1316816806793213, "rewards/wrapped_format_reward": 0.375, "step": 130 }, { "completion_length": 500.0, "epoch": 26.2, "grad_norm": 0.37268656492233276, "kl": 0.0010275749955326319, "learning_rate": 2.7291666666666667e-06, "loss": 0.0, "reward": -2.0108275413513184, "reward_std": 3.3450615406036377, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.15625, "rewards/wrapped_driving_reward": -2.7920775413513184, "rewards/wrapped_format_reward": 0.375, "step": 131 }, { "completion_length": 500.0, "epoch": 26.4, "grad_norm": 0.3964381217956543, "kl": 0.001012228662148118, "learning_rate": 2.7500000000000004e-06, "loss": 0.0, "reward": -1.2674376964569092, "reward_std": 3.008662223815918, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.267437696456909, "rewards/wrapped_format_reward": 0.125, "step": 132 }, { "completion_length": 500.0, "epoch": 26.6, "grad_norm": 10.317098617553711, "kl": 0.03428129479289055, "learning_rate": 2.7708333333333336e-06, "loss": 0.0014, "reward": -1.227315902709961, "reward_std": 2.922797679901123, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.26704543828964233, "rewards/wrapped_driving_reward": -2.119361162185669, "rewards/wrapped_format_reward": 0.125, "step": 133 }, { "completion_length": 500.0, "epoch": 26.8, "grad_norm": 1.919240951538086, "kl": 0.00681189214810729, "learning_rate": 2.791666666666667e-06, "loss": 0.0003, "reward": 0.37134385108947754, "reward_std": 2.954814910888672, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.003656268119812, "rewards/wrapped_format_reward": 0.125, "step": 134 }, { "completion_length": 500.0, "epoch": 27.0, "grad_norm": 4.358583927154541, "kl": 0.003853685688227415, "learning_rate": 2.8125e-06, "loss": 0.0002, "reward": -3.180288314819336, "reward_std": 0.9465946555137634, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3197115361690521, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 135 }, { "completion_length": 500.0, "epoch": 27.2, "grad_norm": 4.834251880645752, "kl": 0.01798548549413681, "learning_rate": 2.8333333333333335e-06, "loss": 0.0007, "reward": -0.00926351547241211, "reward_std": 2.662809133529663, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4791666865348816, "rewards/wrapped_driving_reward": -1.238430142402649, "rewards/wrapped_format_reward": 0.0, "step": 136 }, { "completion_length": 500.0, "epoch": 27.4, "grad_norm": 1.3516547679901123, "kl": 0.0019082785584032536, "learning_rate": 2.8541666666666667e-06, "loss": 0.0001, "reward": -0.8441787958145142, "reward_std": 3.6675186157226562, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.0259971618652344, "rewards/wrapped_format_reward": 0.25, "step": 137 }, { "completion_length": 500.0, "epoch": 27.6, "grad_norm": 0.3614887595176697, "kl": 0.0007737466366961598, "learning_rate": 2.875e-06, "loss": 0.0, "reward": -1.5659279823303223, "reward_std": 2.8618316650390625, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1354166716337204, "rewards/wrapped_driving_reward": -2.3263447284698486, "rewards/wrapped_format_reward": 0.125, "step": 138 }, { "completion_length": 500.0, "epoch": 27.8, "grad_norm": 0.44359442591667175, "kl": 0.0009050779044628143, "learning_rate": 2.8958333333333337e-06, "loss": 0.0, "reward": 0.5474408864974976, "reward_std": 2.7063920497894287, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5249999761581421, "rewards/wrapped_driving_reward": -0.9775590300559998, "rewards/wrapped_format_reward": 0.25, "step": 139 }, { "completion_length": 500.0, "epoch": 28.0, "grad_norm": 1.6172740459442139, "kl": 0.001949971541762352, "learning_rate": 2.916666666666667e-06, "loss": 0.0001, "reward": -2.165811061859131, "reward_std": 3.6683778762817383, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.790811061859131, "rewards/wrapped_format_reward": 0.125, "step": 140 }, { "completion_length": 500.0, "epoch": 28.2, "grad_norm": 1.963468074798584, "kl": 0.004007345996797085, "learning_rate": 2.9375000000000003e-06, "loss": 0.0002, "reward": -2.3711447715759277, "reward_std": 3.2577102184295654, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9961447715759277, "rewards/wrapped_format_reward": 0.25, "step": 141 }, { "completion_length": 500.0, "epoch": 28.4, "grad_norm": 2.7539641857147217, "kl": 0.00553749967366457, "learning_rate": 2.9583333333333335e-06, "loss": 0.0002, "reward": -0.709455668926239, "reward_std": 3.2490553855895996, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4129464328289032, "rewards/wrapped_driving_reward": -1.8724020719528198, "rewards/wrapped_format_reward": 0.25, "step": 142 }, { "completion_length": 500.0, "epoch": 28.6, "grad_norm": 3.479469060897827, "kl": 0.033329278230667114, "learning_rate": 2.979166666666667e-06, "loss": 0.0013, "reward": -0.8142069578170776, "reward_std": 3.702267646789551, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3482142686843872, "rewards/wrapped_driving_reward": -1.9124212265014648, "rewards/wrapped_format_reward": 0.25, "step": 143 }, { "completion_length": 500.0, "epoch": 28.8, "grad_norm": 3.352234363555908, "kl": 0.010930047370493412, "learning_rate": 3e-06, "loss": 0.0004, "reward": 0.9485020637512207, "reward_std": 3.3182427883148193, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3333333432674408, "rewards/wrapped_driving_reward": -0.38483119010925293, "rewards/wrapped_format_reward": 0.25, "step": 144 }, { "completion_length": 500.0, "epoch": 29.0, "grad_norm": 3.932000160217285, "kl": 0.03878607600927353, "learning_rate": 3.0208333333333334e-06, "loss": 0.0016, "reward": 0.1870807409286499, "reward_std": 2.7980854511260986, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.104585886001587, "rewards/wrapped_format_reward": 0.125, "step": 145 }, { "completion_length": 500.0, "epoch": 29.2, "grad_norm": 0.0015932704554870725, "kl": 0.001104721101000905, "learning_rate": 3.0416666666666666e-06, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 146 }, { "completion_length": 500.0, "epoch": 29.4, "grad_norm": 7.533337593078613, "kl": 0.04573941230773926, "learning_rate": 3.0625000000000003e-06, "loss": 0.0018, "reward": 0.9483587741851807, "reward_std": 3.3045196533203125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.8016412258148193, "rewards/wrapped_format_reward": 0.5, "step": 147 }, { "completion_length": 500.0, "epoch": 29.6, "grad_norm": 2.8840458393096924, "kl": 0.03999151289463043, "learning_rate": 3.0833333333333336e-06, "loss": 0.0016, "reward": 0.3635343313217163, "reward_std": 2.5867059230804443, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1364656686782837, "rewards/wrapped_format_reward": 0.125, "step": 148 }, { "completion_length": 500.0, "epoch": 29.8, "grad_norm": 3.9054391384124756, "kl": 0.038462840020656586, "learning_rate": 3.104166666666667e-06, "loss": 0.0015, "reward": 0.5946906208992004, "reward_std": 3.0916733741760254, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46590909361839294, "rewards/wrapped_driving_reward": -0.8712185025215149, "rewards/wrapped_format_reward": 0.25, "step": 149 }, { "completion_length": 500.0, "epoch": 30.0, "grad_norm": 1.3047268390655518, "kl": 0.003437787527218461, "learning_rate": 3.125e-06, "loss": 0.0001, "reward": -1.262099027633667, "reward_std": 3.1614561080932617, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -1.9808489084243774, "rewards/wrapped_format_reward": 0.0, "step": 150 }, { "completion_length": 500.0, "epoch": 30.2, "grad_norm": 19.819337844848633, "kl": 0.02368580363690853, "learning_rate": 3.1458333333333334e-06, "loss": 0.0009, "reward": -0.593837320804596, "reward_std": 3.3858869075775146, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9688372611999512, "rewards/wrapped_format_reward": 0.5, "step": 151 }, { "completion_length": 500.0, "epoch": 30.4, "grad_norm": 0.36471259593963623, "kl": 0.0008758799522183836, "learning_rate": 3.1666666666666667e-06, "loss": 0.0, "reward": 0.32638394832611084, "reward_std": 2.8899521827697754, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6636363863945007, "rewards/wrapped_driving_reward": -1.0872524976730347, "rewards/wrapped_format_reward": 0.0, "step": 152 }, { "completion_length": 500.0, "epoch": 30.6, "grad_norm": 3.6062331199645996, "kl": 0.010750241577625275, "learning_rate": 3.1875e-06, "loss": 0.0004, "reward": 0.9008145332336426, "reward_std": 3.3484742641448975, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3392857313156128, "rewards/wrapped_driving_reward": -0.8134711980819702, "rewards/wrapped_format_reward": 0.625, "step": 153 }, { "completion_length": 500.0, "epoch": 30.8, "grad_norm": 3.008584976196289, "kl": 0.004083903506398201, "learning_rate": 3.2083333333333337e-06, "loss": 0.0002, "reward": 0.6495532393455505, "reward_std": 3.1026666164398193, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.8504468202590942, "rewards/wrapped_format_reward": 0.0, "step": 154 }, { "completion_length": 500.0, "epoch": 31.0, "grad_norm": 4.257124423980713, "kl": 0.007256774697452784, "learning_rate": 3.229166666666667e-06, "loss": 0.0003, "reward": 1.5835230350494385, "reward_std": 0.16466718912124634, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5773809552192688, "rewards/wrapped_driving_reward": 0.006142044439911842, "rewards/wrapped_format_reward": 0.0, "step": 155 }, { "completion_length": 500.0, "epoch": 31.2, "grad_norm": 2.010960578918457, "kl": 0.02292311191558838, "learning_rate": 3.2500000000000002e-06, "loss": 0.0009, "reward": 1.8099894523620605, "reward_std": 0.18248680233955383, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7178758978843689, "rewards/wrapped_driving_reward": 0.09211361408233643, "rewards/wrapped_format_reward": 0.0, "step": 156 }, { "completion_length": 500.0, "epoch": 31.4, "grad_norm": 0.4033740758895874, "kl": 0.0010355673730373383, "learning_rate": 3.2708333333333335e-06, "loss": 0.0, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 157 }, { "completion_length": 500.0, "epoch": 31.6, "grad_norm": 17.625986099243164, "kl": 0.04252302274107933, "learning_rate": 3.2916666666666668e-06, "loss": 0.0017, "reward": -2.468630075454712, "reward_std": 2.739564895629883, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.2142857164144516, "rewards/wrapped_driving_reward": -3.0579159259796143, "rewards/wrapped_format_reward": 0.125, "step": 158 }, { "completion_length": 500.0, "epoch": 31.8, "grad_norm": 6.371521949768066, "kl": 0.02408897504210472, "learning_rate": 3.3125e-06, "loss": 0.001, "reward": -0.5439435243606567, "reward_std": 4.002199172973633, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -1.6064435243606567, "rewards/wrapped_format_reward": 0.25, "step": 159 }, { "completion_length": 500.0, "epoch": 32.0, "grad_norm": 0.8184000253677368, "kl": 0.027131706476211548, "learning_rate": 3.3333333333333333e-06, "loss": 0.0011, "reward": 1.6387406587600708, "reward_std": 0.7142547965049744, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8103895783424377, "rewards/wrapped_driving_reward": -0.17164897918701172, "rewards/wrapped_format_reward": 0.0, "step": 160 }, { "completion_length": 500.0, "epoch": 32.2, "grad_norm": 0.3655173182487488, "kl": 0.0010108003625646234, "learning_rate": 3.3541666666666666e-06, "loss": 0.0, "reward": -0.9831902980804443, "reward_std": 3.484320640563965, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3166666626930237, "rewards/wrapped_driving_reward": -2.0498569011688232, "rewards/wrapped_format_reward": 0.25, "step": 161 }, { "completion_length": 500.0, "epoch": 32.4, "grad_norm": 2.8332362174987793, "kl": 0.025723226368427277, "learning_rate": 3.3750000000000003e-06, "loss": 0.001, "reward": -0.8011678457260132, "reward_std": 3.6942384243011475, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2857142686843872, "rewards/wrapped_driving_reward": -1.5868821144104004, "rewards/wrapped_format_reward": 0.0, "step": 162 }, { "completion_length": 500.0, "epoch": 32.6, "grad_norm": 0.0015628942055627704, "kl": 0.0009882381418719888, "learning_rate": 3.3958333333333336e-06, "loss": 0.0, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 163 }, { "completion_length": 500.0, "epoch": 32.8, "grad_norm": 0.22536346316337585, "kl": 0.014067083597183228, "learning_rate": 3.416666666666667e-06, "loss": 0.0006, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 164 }, { "completion_length": 500.0, "epoch": 33.0, "grad_norm": 2.8652682304382324, "kl": 0.0301588773727417, "learning_rate": 3.4375e-06, "loss": 0.0012, "reward": 0.6044188737869263, "reward_std": 3.0828073024749756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.48571425676345825, "rewards/wrapped_driving_reward": -1.0062953233718872, "rewards/wrapped_format_reward": 0.375, "step": 165 }, { "completion_length": 500.0, "epoch": 33.2, "grad_norm": 14.549288749694824, "kl": 0.03827816992998123, "learning_rate": 3.4583333333333334e-06, "loss": 0.0015, "reward": 0.20543813705444336, "reward_std": 2.8183021545410156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.0195618867874146, "rewards/wrapped_format_reward": 0.125, "step": 166 }, { "completion_length": 500.0, "epoch": 33.4, "grad_norm": 23.31963539123535, "kl": 0.0236306581646204, "learning_rate": 3.4791666666666667e-06, "loss": 0.0009, "reward": -1.3284555673599243, "reward_std": 2.110996961593628, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.4534554481506348, "rewards/wrapped_format_reward": 0.0, "step": 167 }, { "completion_length": 500.0, "epoch": 33.6, "grad_norm": 0.4104408323764801, "kl": 0.0013704978628084064, "learning_rate": 3.5e-06, "loss": 0.0001, "reward": -2.5163447856903076, "reward_std": 2.9673104286193848, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.0163450241088867, "rewards/wrapped_format_reward": 0.125, "step": 168 }, { "completion_length": 500.0, "epoch": 33.8, "grad_norm": 7.642683982849121, "kl": 0.03837261348962784, "learning_rate": 3.520833333333334e-06, "loss": 0.0015, "reward": 0.8579701781272888, "reward_std": 3.2719130516052246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.36666667461395264, "rewards/wrapped_driving_reward": -0.508696436882019, "rewards/wrapped_format_reward": 0.25, "step": 169 }, { "completion_length": 500.0, "epoch": 34.0, "grad_norm": 8.982998847961426, "kl": 0.06479807198047638, "learning_rate": 3.5416666666666673e-06, "loss": 0.0026, "reward": 1.5301133394241333, "reward_std": 1.1181119680404663, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.65625, "rewards/wrapped_driving_reward": -0.5011366009712219, "rewards/wrapped_format_reward": 0.375, "step": 170 }, { "completion_length": 500.0, "epoch": 34.2, "grad_norm": 2.2494914531707764, "kl": 0.0033163258340209723, "learning_rate": 3.5625e-06, "loss": 0.0001, "reward": -0.6762468814849854, "reward_std": 3.5722789764404297, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3020833134651184, "rewards/wrapped_driving_reward": -1.853330135345459, "rewards/wrapped_format_reward": 0.375, "step": 171 }, { "completion_length": 500.0, "epoch": 34.4, "grad_norm": 0.90104740858078, "kl": 0.006812056060880423, "learning_rate": 3.5833333333333335e-06, "loss": 0.0003, "reward": -2.3616232872009277, "reward_std": 3.2767534255981445, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.9866232872009277, "rewards/wrapped_format_reward": 0.125, "step": 172 }, { "completion_length": 500.0, "epoch": 34.6, "grad_norm": 0.3947916030883789, "kl": 0.0011263922788202763, "learning_rate": 3.6041666666666667e-06, "loss": 0.0, "reward": -0.5480427742004395, "reward_std": 4.001402854919434, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.6105427742004395, "rewards/wrapped_format_reward": 0.125, "step": 173 }, { "completion_length": 500.0, "epoch": 34.8, "grad_norm": 0.6156601309776306, "kl": 0.01154815312474966, "learning_rate": 3.625e-06, "loss": 0.0005, "reward": -2.329235315322876, "reward_std": 3.017416000366211, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.079235315322876, "rewards/wrapped_format_reward": 0.25, "step": 174 }, { "completion_length": 500.0, "epoch": 35.0, "grad_norm": 0.4099337160587311, "kl": 0.0011381495278328657, "learning_rate": 3.6458333333333333e-06, "loss": 0.0, "reward": -2.365032434463501, "reward_std": 2.083897829055786, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.33181819319725037, "rewards/wrapped_driving_reward": -3.1968507766723633, "rewards/wrapped_format_reward": 0.0, "step": 175 }, { "completion_length": 500.0, "epoch": 35.2, "grad_norm": 4.004872798919678, "kl": 0.029580948874354362, "learning_rate": 3.6666666666666666e-06, "loss": 0.0012, "reward": -1.2987949848175049, "reward_std": 3.119826316833496, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3125, "rewards/wrapped_driving_reward": -2.111294746398926, "rewards/wrapped_format_reward": 0.0, "step": 176 }, { "completion_length": 500.0, "epoch": 35.4, "grad_norm": 1.671083927154541, "kl": 0.014694800600409508, "learning_rate": 3.6875000000000007e-06, "loss": 0.0006, "reward": -1.2172389030456543, "reward_std": 3.2450075149536133, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9672389030456543, "rewards/wrapped_format_reward": 0.0, "step": 177 }, { "completion_length": 500.0, "epoch": 35.6, "grad_norm": 0.4049564301967621, "kl": 0.0011514219222590327, "learning_rate": 3.708333333333334e-06, "loss": 0.0, "reward": 0.8965679407119751, "reward_std": 3.293184757232666, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": -0.3846820592880249, "rewards/wrapped_format_reward": 0.0, "step": 178 }, { "completion_length": 500.0, "epoch": 35.8, "grad_norm": 0.0426897257566452, "kl": 0.011578786186873913, "learning_rate": 3.7291666666666672e-06, "loss": 0.0005, "reward": -4.0, "reward_std": 0.0, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.0, "step": 179 }, { "completion_length": 500.0, "epoch": 36.0, "grad_norm": 2.753361701965332, "kl": 0.03497646003961563, "learning_rate": 3.7500000000000005e-06, "loss": 0.0014, "reward": -0.8905968070030212, "reward_std": 3.5913023948669434, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.953096866607666, "rewards/wrapped_format_reward": 0.125, "step": 180 }, { "completion_length": 500.0, "epoch": 36.2, "grad_norm": 4.126710891723633, "kl": 0.023542242124676704, "learning_rate": 3.7708333333333334e-06, "loss": 0.0009, "reward": -2.637735366821289, "reward_std": 2.724529266357422, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.981485366821289, "rewards/wrapped_format_reward": 0.0, "step": 181 }, { "completion_length": 500.0, "epoch": 36.4, "grad_norm": 2.1164910793304443, "kl": 0.03186760097742081, "learning_rate": 3.7916666666666666e-06, "loss": 0.0013, "reward": -0.6355093121528625, "reward_std": 3.886032819747925, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.32499998807907104, "rewards/wrapped_driving_reward": -1.5855093002319336, "rewards/wrapped_format_reward": 0.125, "step": 182 }, { "completion_length": 500.0, "epoch": 36.6, "grad_norm": 0.9481387734413147, "kl": 0.02689046412706375, "learning_rate": 3.8125e-06, "loss": 0.0011, "reward": 0.9345278143882751, "reward_std": 3.2953097820281982, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5880681872367859, "rewards/wrapped_driving_reward": -0.778540313243866, "rewards/wrapped_format_reward": 0.375, "step": 183 }, { "completion_length": 500.0, "epoch": 36.8, "grad_norm": 0.3303874135017395, "kl": 0.0011166415642946959, "learning_rate": 3.833333333333334e-06, "loss": 0.0, "reward": -2.608830213546753, "reward_std": 2.782339572906494, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -3.046330213546753, "rewards/wrapped_format_reward": 0.0, "step": 184 }, { "completion_length": 500.0, "epoch": 37.0, "grad_norm": 0.5033643245697021, "kl": 0.001327235484495759, "learning_rate": 3.854166666666667e-06, "loss": 0.0001, "reward": -1.2191828489303589, "reward_std": 3.2157087326049805, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.23863635957241058, "rewards/wrapped_driving_reward": -1.9578192234039307, "rewards/wrapped_format_reward": 0.0, "step": 185 }, { "completion_length": 500.0, "epoch": 37.2, "grad_norm": 0.3546835780143738, "kl": 0.0012730626622214913, "learning_rate": 3.875e-06, "loss": 0.0001, "reward": -2.6489737033843994, "reward_std": 2.702052593231201, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -2.8989737033843994, "rewards/wrapped_format_reward": 0.0, "step": 186 }, { "completion_length": 500.0, "epoch": 37.4, "grad_norm": 0.3939490020275116, "kl": 0.0011839700164273381, "learning_rate": 3.8958333333333334e-06, "loss": 0.0, "reward": -2.425374746322632, "reward_std": 2.5269436836242676, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -3.050374746322632, "rewards/wrapped_format_reward": 0.25, "step": 187 }, { "completion_length": 500.0, "epoch": 37.6, "grad_norm": 4.388774394989014, "kl": 0.03779164329171181, "learning_rate": 3.916666666666667e-06, "loss": 0.0015, "reward": -2.095153331756592, "reward_std": 3.8096938133239746, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.2083333283662796, "rewards/wrapped_driving_reward": -2.8034865856170654, "rewards/wrapped_format_reward": 0.25, "step": 188 }, { "completion_length": 500.0, "epoch": 37.8, "grad_norm": 0.3407905697822571, "kl": 0.0012132242554798722, "learning_rate": 3.9375e-06, "loss": 0.0, "reward": 0.9215935468673706, "reward_std": 3.2830312252044678, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": -0.849239706993103, "rewards/wrapped_format_reward": 0.375, "step": 189 }, { "completion_length": 500.0, "epoch": 38.0, "grad_norm": 12.550337791442871, "kl": 0.13451190292835236, "learning_rate": 3.958333333333333e-06, "loss": 0.0054, "reward": 1.0308945178985596, "reward_std": 2.702942371368408, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5249999761581421, "rewards/wrapped_driving_reward": -0.9941054582595825, "rewards/wrapped_format_reward": 0.75, "step": 190 }, { "completion_length": 500.0, "epoch": 38.2, "grad_norm": 13.734210968017578, "kl": 0.07660804688930511, "learning_rate": 3.9791666666666665e-06, "loss": 0.0031, "reward": -2.5775606632232666, "reward_std": 2.844878673553467, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.21590909361839294, "rewards/wrapped_driving_reward": -3.0434696674346924, "rewards/wrapped_format_reward": 0.0, "step": 191 }, { "completion_length": 500.0, "epoch": 38.4, "grad_norm": 0.39941278100013733, "kl": 0.0012940344167873263, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "reward": 1.0290822982788086, "reward_std": 3.354139566421509, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42811357975006104, "rewards/wrapped_driving_reward": -0.3990311622619629, "rewards/wrapped_format_reward": 0.25, "step": 192 }, { "completion_length": 500.0, "epoch": 38.6, "grad_norm": 5.0886054039001465, "kl": 0.1496005654335022, "learning_rate": 4.020833333333334e-06, "loss": 0.006, "reward": 0.23665398359298706, "reward_std": 2.8346645832061768, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4821428656578064, "rewards/wrapped_driving_reward": -0.9954888820648193, "rewards/wrapped_format_reward": 0.0, "step": 193 }, { "completion_length": 500.0, "epoch": 38.8, "grad_norm": 1.0010530948638916, "kl": 0.01472307275980711, "learning_rate": 4.041666666666667e-06, "loss": 0.0006, "reward": -0.31997019052505493, "reward_std": 2.5833802223205566, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4886363744735718, "rewards/wrapped_driving_reward": -1.8086066246032715, "rewards/wrapped_format_reward": 0.25, "step": 194 }, { "completion_length": 500.0, "epoch": 39.0, "grad_norm": 4.434115886688232, "kl": 0.07414662837982178, "learning_rate": 4.0625000000000005e-06, "loss": 0.003, "reward": 1.8734700679779053, "reward_std": 0.7573453783988953, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -0.16819655895233154, "rewards/wrapped_format_reward": 0.375, "step": 195 }, { "completion_length": 500.0, "epoch": 39.2, "grad_norm": 0.35328763723373413, "kl": 0.0011651457753032446, "learning_rate": 4.083333333333334e-06, "loss": 0.0, "reward": -0.9308372139930725, "reward_std": 3.265139579772949, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -2.030837297439575, "rewards/wrapped_format_reward": 0.25, "step": 196 }, { "completion_length": 500.0, "epoch": 39.4, "grad_norm": 2.841628313064575, "kl": 0.1545742154121399, "learning_rate": 4.104166666666667e-06, "loss": 0.0062, "reward": 0.7136911153793335, "reward_std": 3.221494436264038, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4709596037864685, "rewards/wrapped_driving_reward": -0.8822685480117798, "rewards/wrapped_format_reward": 0.375, "step": 197 }, { "completion_length": 500.0, "epoch": 39.6, "grad_norm": 0.3438974618911743, "kl": 0.001065694261342287, "learning_rate": 4.125e-06, "loss": 0.0, "reward": -0.33991003036499023, "reward_std": 4.232908725738525, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.36250001192092896, "rewards/wrapped_driving_reward": -1.5774099826812744, "rewards/wrapped_format_reward": 0.375, "step": 198 }, { "completion_length": 500.0, "epoch": 39.8, "grad_norm": 5.850861549377441, "kl": 0.08895232528448105, "learning_rate": 4.145833333333334e-06, "loss": 0.0036, "reward": -1.0433223247528076, "reward_std": 3.4140782356262207, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -2.0683224201202393, "rewards/wrapped_format_reward": 0.125, "step": 199 }, { "completion_length": 500.0, "epoch": 40.0, "grad_norm": 1.8783408403396606, "kl": 0.06220545247197151, "learning_rate": 4.166666666666667e-06, "loss": 0.0025, "reward": -2.5082778930664062, "reward_std": 2.9834442138671875, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.203125, "rewards/wrapped_driving_reward": -2.9614028930664062, "rewards/wrapped_format_reward": 0.0, "step": 200 }, { "completion_length": 500.0, "epoch": 40.2, "grad_norm": 0.3625878691673279, "kl": 0.0015527592040598392, "learning_rate": 4.1875e-06, "loss": 0.0001, "reward": -2.2801547050476074, "reward_std": 2.7830231189727783, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1944444477558136, "rewards/wrapped_driving_reward": -2.9745991230010986, "rewards/wrapped_format_reward": 0.25, "step": 201 }, { "completion_length": 500.0, "epoch": 40.4, "grad_norm": 1.3591216802597046, "kl": 0.031130777671933174, "learning_rate": 4.208333333333333e-06, "loss": 0.0012, "reward": -0.9100172519683838, "reward_std": 3.5681514739990234, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.432692289352417, "rewards/wrapped_driving_reward": -1.9677093029022217, "rewards/wrapped_format_reward": 0.125, "step": 202 }, { "completion_length": 500.0, "epoch": 40.6, "grad_norm": 0.35409578680992126, "kl": 0.0012434074888005853, "learning_rate": 4.229166666666667e-06, "loss": 0.0, "reward": -2.404740571975708, "reward_std": 2.534834384918213, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.998490571975708, "rewards/wrapped_format_reward": 0.25, "step": 203 }, { "completion_length": 500.0, "epoch": 40.8, "grad_norm": 0.9982916712760925, "kl": 0.05762110650539398, "learning_rate": 4.25e-06, "loss": 0.0023, "reward": -2.0678892135620117, "reward_std": 3.5387465953826904, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.10000000149011612, "rewards/wrapped_driving_reward": -2.79288911819458, "rewards/wrapped_format_reward": 0.375, "step": 204 }, { "completion_length": 500.0, "epoch": 41.0, "grad_norm": 0.5429545640945435, "kl": 0.010603474453091621, "learning_rate": 4.270833333333333e-06, "loss": 0.0004, "reward": -1.1565592288970947, "reward_std": 3.0510413646698, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3214285671710968, "rewards/wrapped_driving_reward": -2.227987766265869, "rewards/wrapped_format_reward": 0.25, "step": 205 }, { "completion_length": 500.0, "epoch": 41.2, "grad_norm": 0.692771315574646, "kl": 0.033038631081581116, "learning_rate": 4.2916666666666665e-06, "loss": 0.0013, "reward": 0.6099777817726135, "reward_std": 3.07470703125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6875, "rewards/wrapped_driving_reward": -0.9525222778320312, "rewards/wrapped_format_reward": 0.125, "step": 206 }, { "completion_length": 500.0, "epoch": 41.4, "grad_norm": 0.5571784973144531, "kl": 0.006391022354364395, "learning_rate": 4.312500000000001e-06, "loss": 0.0003, "reward": -2.5556817054748535, "reward_std": 2.888636350631714, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -2.9931819438934326, "rewards/wrapped_format_reward": 0.0, "step": 207 }, { "completion_length": 500.0, "epoch": 41.6, "grad_norm": 2.603093147277832, "kl": 0.08236520737409592, "learning_rate": 4.333333333333334e-06, "loss": 0.0033, "reward": 0.7872141003608704, "reward_std": 3.2228293418884277, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -0.3877858519554138, "rewards/wrapped_format_reward": 0.0, "step": 208 }, { "completion_length": 500.0, "epoch": 41.8, "grad_norm": 2.8598968982696533, "kl": 0.03701096028089523, "learning_rate": 4.354166666666667e-06, "loss": 0.0015, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 209 }, { "completion_length": 500.0, "epoch": 42.0, "grad_norm": 0.5654446482658386, "kl": 0.016315069049596786, "learning_rate": 4.3750000000000005e-06, "loss": 0.0007, "reward": -1.0942189693450928, "reward_std": 3.3554024696350098, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -2.019218921661377, "rewards/wrapped_format_reward": 0.0, "step": 210 }, { "completion_length": 500.0, "epoch": 42.2, "grad_norm": 1.1546794176101685, "kl": 0.1304369419813156, "learning_rate": 4.395833333333334e-06, "loss": 0.0052, "reward": -2.067227363586426, "reward_std": 2.8944802284240723, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -3.004727363586426, "rewards/wrapped_format_reward": 0.5, "step": 211 }, { "completion_length": 500.0, "epoch": 42.4, "grad_norm": 0.4224312901496887, "kl": 0.001306671998463571, "learning_rate": 4.416666666666667e-06, "loss": 0.0001, "reward": -1.3893475532531738, "reward_std": 3.059168577194214, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.3211658000946045, "rewards/wrapped_format_reward": 0.0, "step": 212 }, { "completion_length": 500.0, "epoch": 42.6, "grad_norm": 14.517871856689453, "kl": 0.15590544044971466, "learning_rate": 4.4375e-06, "loss": 0.0062, "reward": -0.8690509796142578, "reward_std": 3.619702100753784, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.28125, "rewards/wrapped_driving_reward": -2.025300979614258, "rewards/wrapped_format_reward": 0.375, "step": 213 }, { "completion_length": 500.0, "epoch": 42.8, "grad_norm": 0.42172175645828247, "kl": 0.0013596608769148588, "learning_rate": 4.4583333333333336e-06, "loss": 0.0001, "reward": -0.6579513549804688, "reward_std": 3.57773756980896, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -1.5897696018218994, "rewards/wrapped_format_reward": 0.125, "step": 214 }, { "completion_length": 500.0, "epoch": 43.0, "grad_norm": 1.4108898639678955, "kl": 0.05196976661682129, "learning_rate": 4.479166666666667e-06, "loss": 0.0021, "reward": 1.4945721626281738, "reward_std": 0.5106267333030701, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3854166865348816, "rewards/wrapped_driving_reward": -0.14084455370903015, "rewards/wrapped_format_reward": 0.25, "step": 215 }, { "completion_length": 500.0, "epoch": 43.2, "grad_norm": 0.38824036717414856, "kl": 0.0016642104601487517, "learning_rate": 4.5e-06, "loss": 0.0001, "reward": -0.8108178377151489, "reward_std": 3.692883253097534, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3194444477558136, "rewards/wrapped_driving_reward": -2.1302623748779297, "rewards/wrapped_format_reward": 0.5, "step": 216 }, { "completion_length": 500.0, "epoch": 43.4, "grad_norm": 14.23416519165039, "kl": 0.13234129548072815, "learning_rate": 4.520833333333333e-06, "loss": 0.0053, "reward": -2.400029420852661, "reward_std": 2.876281499862671, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.025029182434082, "rewards/wrapped_format_reward": 0.125, "step": 217 }, { "completion_length": 500.0, "epoch": 43.6, "grad_norm": 4.683525562286377, "kl": 0.1954280287027359, "learning_rate": 4.541666666666667e-06, "loss": 0.0078, "reward": -1.0995738506317139, "reward_std": 3.3534717559814453, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2986111044883728, "rewards/wrapped_driving_reward": -1.8981850147247314, "rewards/wrapped_format_reward": 0.0, "step": 218 }, { "completion_length": 500.0, "epoch": 43.8, "grad_norm": 0.3793950080871582, "kl": 0.001844916958361864, "learning_rate": 4.5625e-06, "loss": 0.0001, "reward": -0.7520895004272461, "reward_std": 3.759147882461548, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.34375, "rewards/wrapped_driving_reward": -1.595839500427246, "rewards/wrapped_format_reward": 0.0, "step": 219 }, { "completion_length": 500.0, "epoch": 44.0, "grad_norm": 0.4106435477733612, "kl": 0.0017210771329700947, "learning_rate": 4.583333333333333e-06, "loss": 0.0001, "reward": 1.8025760650634766, "reward_std": 0.23965147137641907, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.49166664481163025, "rewards/wrapped_driving_reward": 0.06090930849313736, "rewards/wrapped_format_reward": 0.25, "step": 220 }, { "completion_length": 500.0, "epoch": 44.2, "grad_norm": 4.90147066116333, "kl": 0.293916791677475, "learning_rate": 4.6041666666666665e-06, "loss": 0.0118, "reward": 1.2585606575012207, "reward_std": 3.5554494857788086, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.3664393424987793, "rewards/wrapped_format_reward": 0.375, "step": 221 }, { "completion_length": 500.0, "epoch": 44.4, "grad_norm": 2.4181244373321533, "kl": 0.09946326911449432, "learning_rate": 4.625000000000001e-06, "loss": 0.004, "reward": -0.7496248483657837, "reward_std": 3.2129709720611572, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3214285671710968, "rewards/wrapped_driving_reward": -1.946053385734558, "rewards/wrapped_format_reward": 0.375, "step": 222 }, { "completion_length": 500.0, "epoch": 44.6, "grad_norm": 0.3938154876232147, "kl": 0.0027729361318051815, "learning_rate": 4.645833333333334e-06, "loss": 0.0001, "reward": 1.8255128860473633, "reward_std": 0.3188542425632477, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6972222328186035, "rewards/wrapped_driving_reward": -0.12170933187007904, "rewards/wrapped_format_reward": 0.25, "step": 223 }, { "completion_length": 500.0, "epoch": 44.8, "grad_norm": 1.139906883239746, "kl": 0.07463247328996658, "learning_rate": 4.666666666666667e-06, "loss": 0.003, "reward": -1.154573917388916, "reward_std": 3.352905511856079, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -2.3363921642303467, "rewards/wrapped_format_reward": 0.25, "step": 224 }, { "completion_length": 500.0, "epoch": 45.0, "grad_norm": 0.37643423676490784, "kl": 0.0015645886305719614, "learning_rate": 4.6875000000000004e-06, "loss": 0.0001, "reward": -0.12135392427444458, "reward_std": 2.5997867584228516, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.2880206108093262, "rewards/wrapped_format_reward": 0.0, "step": 225 }, { "completion_length": 500.0, "epoch": 45.2, "grad_norm": 0.4397854506969452, "kl": 0.0018841986311599612, "learning_rate": 4.708333333333334e-06, "loss": 0.0001, "reward": -1.1124364137649536, "reward_std": 3.4070234298706055, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -2.0291032791137695, "rewards/wrapped_format_reward": 0.125, "step": 226 }, { "completion_length": 500.0, "epoch": 45.4, "grad_norm": 0.603453516960144, "kl": 0.02336149290204048, "learning_rate": 4.729166666666667e-06, "loss": 0.0009, "reward": 1.599435806274414, "reward_std": 0.5891650915145874, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6500000357627869, "rewards/wrapped_driving_reward": -0.17556418478488922, "rewards/wrapped_format_reward": 0.125, "step": 227 }, { "completion_length": 500.0, "epoch": 45.6, "grad_norm": 2.822200059890747, "kl": 0.33935749530792236, "learning_rate": 4.75e-06, "loss": 0.0136, "reward": -2.294963836669922, "reward_std": 3.08575439453125, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.11363636702299118, "rewards/wrapped_driving_reward": -2.783600091934204, "rewards/wrapped_format_reward": 0.125, "step": 228 }, { "completion_length": 500.0, "epoch": 45.8, "grad_norm": 0.5101578235626221, "kl": 0.013524656184017658, "learning_rate": 4.770833333333334e-06, "loss": 0.0005, "reward": -1.138512134552002, "reward_std": 3.435119390487671, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3636363744735718, "rewards/wrapped_driving_reward": -2.127148389816284, "rewards/wrapped_format_reward": 0.125, "step": 229 }, { "completion_length": 500.0, "epoch": 46.0, "grad_norm": 19.814189910888672, "kl": 0.25205615162849426, "learning_rate": 4.791666666666668e-06, "loss": 0.0101, "reward": 0.36332249641418457, "reward_std": 2.917628049850464, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.9491775631904602, "rewards/wrapped_format_reward": 0.0, "step": 230 }, { "completion_length": 500.0, "epoch": 46.2, "grad_norm": 1.2908904552459717, "kl": 0.10442691296339035, "learning_rate": 4.8125e-06, "loss": 0.0042, "reward": 0.2518876791000366, "reward_std": 2.5249195098876953, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3693181872367859, "rewards/wrapped_driving_reward": -0.992430567741394, "rewards/wrapped_format_reward": 0.125, "step": 231 }, { "completion_length": 500.0, "epoch": 46.4, "grad_norm": 0.37878403067588806, "kl": 0.0021237193141132593, "learning_rate": 4.833333333333333e-06, "loss": 0.0001, "reward": 1.22785484790802, "reward_std": 0.8809930682182312, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8541666865348816, "rewards/wrapped_driving_reward": -0.8763118982315063, "rewards/wrapped_format_reward": 0.25, "step": 232 }, { "completion_length": 500.0, "epoch": 46.6, "grad_norm": 36.95953369140625, "kl": 0.40437987446784973, "learning_rate": 4.854166666666667e-06, "loss": 0.0162, "reward": -1.0619306564331055, "reward_std": 3.3970491886138916, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0619306564331055, "rewards/wrapped_format_reward": 0.25, "step": 233 }, { "completion_length": 500.0, "epoch": 46.8, "grad_norm": 47.118255615234375, "kl": 0.134329155087471, "learning_rate": 4.875e-06, "loss": 0.0054, "reward": -0.5947959423065186, "reward_std": 3.65548038482666, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.8447959423065186, "rewards/wrapped_format_reward": 0.5, "step": 234 }, { "completion_length": 500.0, "epoch": 47.0, "grad_norm": 0.6403725743293762, "kl": 0.023185797035694122, "learning_rate": 4.895833333333333e-06, "loss": 0.0009, "reward": 1.51215398311615, "reward_std": 0.33645421266555786, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6640625, "rewards/wrapped_driving_reward": -0.1519085168838501, "rewards/wrapped_format_reward": 0.0, "step": 235 }, { "completion_length": 500.0, "epoch": 47.2, "grad_norm": 1.1939831972122192, "kl": 0.07250034809112549, "learning_rate": 4.9166666666666665e-06, "loss": 0.0029, "reward": -2.619947910308838, "reward_std": 2.438190460205078, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.171875, "rewards/wrapped_driving_reward": -3.166822910308838, "rewards/wrapped_format_reward": 0.125, "step": 236 }, { "completion_length": 500.0, "epoch": 47.4, "grad_norm": 2.614314556121826, "kl": 0.2160005122423172, "learning_rate": 4.937500000000001e-06, "loss": 0.0086, "reward": -1.6490904092788696, "reward_std": 2.716189384460449, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2929292917251587, "rewards/wrapped_driving_reward": -2.4420197010040283, "rewards/wrapped_format_reward": 0.0, "step": 237 }, { "completion_length": 500.0, "epoch": 47.6, "grad_norm": 78.35983276367188, "kl": 0.09474217891693115, "learning_rate": 4.958333333333334e-06, "loss": 0.0038, "reward": 0.05065804719924927, "reward_std": 2.749250650405884, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45625001192092896, "rewards/wrapped_driving_reward": -1.2805919647216797, "rewards/wrapped_format_reward": 0.125, "step": 238 }, { "completion_length": 500.0, "epoch": 47.8, "grad_norm": 0.3830593526363373, "kl": 0.0018576495349407196, "learning_rate": 4.979166666666667e-06, "loss": 0.0001, "reward": -2.5445353984832764, "reward_std": 2.9109292030334473, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9195353984832764, "rewards/wrapped_format_reward": 0.0, "step": 239 }, { "completion_length": 500.0, "epoch": 48.0, "grad_norm": 3.0679774284362793, "kl": 0.2315346747636795, "learning_rate": 5e-06, "loss": 0.0093, "reward": -0.948199987411499, "reward_std": 3.5248489379882812, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.9982000589370728, "rewards/wrapped_format_reward": 0.125, "step": 240 }, { "completion_length": 500.0, "epoch": 48.2, "grad_norm": 3.1146676540374756, "kl": 0.1782953292131424, "learning_rate": 4.999997355752031e-06, "loss": 0.0071, "reward": -0.7975939512252808, "reward_std": 3.513246774673462, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1875, "rewards/wrapped_driving_reward": -1.8600939512252808, "rewards/wrapped_format_reward": 0.375, "step": 241 }, { "completion_length": 500.0, "epoch": 48.4, "grad_norm": 4.299013137817383, "kl": 0.19927579164505005, "learning_rate": 4.999989423013716e-06, "loss": 0.008, "reward": -0.9984800815582275, "reward_std": 3.1900885105133057, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.998479962348938, "rewards/wrapped_format_reward": 0.125, "step": 242 }, { "completion_length": 500.0, "epoch": 48.6, "grad_norm": 0.3299877643585205, "kl": 0.0019166120328009129, "learning_rate": 4.999976201801837e-06, "loss": 0.0001, "reward": -2.3678572177886963, "reward_std": 2.9404144287109375, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -3.049675464630127, "rewards/wrapped_format_reward": 0.25, "step": 243 }, { "completion_length": 500.0, "epoch": 48.8, "grad_norm": 1.1573808193206787, "kl": 0.1410599946975708, "learning_rate": 4.999957692144361e-06, "loss": 0.0056, "reward": 0.8284584283828735, "reward_std": 3.2214972972869873, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.484375, "rewards/wrapped_driving_reward": -0.40591663122177124, "rewards/wrapped_format_reward": 0.0, "step": 244 }, { "completion_length": 500.0, "epoch": 49.0, "grad_norm": 0.47671303153038025, "kl": 0.06269445270299911, "learning_rate": 4.999933894080444e-06, "loss": 0.0025, "reward": 0.38734376430511475, "reward_std": 2.944716215133667, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6477272510528564, "rewards/wrapped_driving_reward": -1.0103834867477417, "rewards/wrapped_format_reward": 0.0, "step": 245 }, { "completion_length": 500.0, "epoch": 49.2, "grad_norm": 0.41526517271995544, "kl": 0.0028381492011249065, "learning_rate": 4.9999048076604286e-06, "loss": 0.0001, "reward": 0.23170125484466553, "reward_std": 2.49680233001709, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2202381044626236, "rewards/wrapped_driving_reward": -0.9885368347167969, "rewards/wrapped_format_reward": 0.25, "step": 246 }, { "completion_length": 500.0, "epoch": 49.4, "grad_norm": 2.5495622158050537, "kl": 0.2154771089553833, "learning_rate": 4.999870432945843e-06, "loss": 0.0086, "reward": -2.2984023094177246, "reward_std": 3.403195381164551, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.7984023094177246, "rewards/wrapped_format_reward": 0.0, "step": 247 }, { "completion_length": 500.0, "epoch": 49.6, "grad_norm": 5.153905868530273, "kl": 0.3080277740955353, "learning_rate": 4.999830770009406e-06, "loss": 0.0123, "reward": 0.2953673005104065, "reward_std": 2.8969929218292236, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4092261791229248, "rewards/wrapped_driving_reward": -0.9888589382171631, "rewards/wrapped_format_reward": 0.125, "step": 248 }, { "completion_length": 500.0, "epoch": 49.8, "grad_norm": 0.5580005049705505, "kl": 0.043262895196676254, "learning_rate": 4.999785818935018e-06, "loss": 0.0017, "reward": -2.4887704849243164, "reward_std": 2.6994357109069824, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -3.045588493347168, "rewards/wrapped_format_reward": 0.125, "step": 249 }, { "completion_length": 500.0, "epoch": 50.0, "grad_norm": 2.3369898796081543, "kl": 0.2971566617488861, "learning_rate": 4.999735579817769e-06, "loss": 0.0119, "reward": -1.1610161066055298, "reward_std": 3.2840569019317627, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -1.9110161066055298, "rewards/wrapped_format_reward": 0.0, "step": 250 }, { "completion_length": 500.0, "epoch": 50.2, "grad_norm": 0.4173508882522583, "kl": 0.003112471429631114, "learning_rate": 4.9996800527639354e-06, "loss": 0.0001, "reward": -1.1911332607269287, "reward_std": 3.311530590057373, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.0661332607269287, "rewards/wrapped_format_reward": 0.0, "step": 251 }, { "completion_length": 500.0, "epoch": 50.4, "grad_norm": 1.8770030736923218, "kl": 0.1331283450126648, "learning_rate": 4.9996192378909785e-06, "loss": 0.0053, "reward": -1.1617484092712402, "reward_std": 2.9961960315704346, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -2.0117483139038086, "rewards/wrapped_format_reward": 0.125, "step": 252 }, { "completion_length": 500.0, "epoch": 50.6, "grad_norm": 5.769409656524658, "kl": 0.2073991745710373, "learning_rate": 4.999553135327546e-06, "loss": 0.0083, "reward": -3.75, "reward_std": 0.5, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.25, "step": 253 }, { "completion_length": 500.0, "epoch": 50.8, "grad_norm": 0.416715532541275, "kl": 0.003338834270834923, "learning_rate": 4.999481745213471e-06, "loss": 0.0001, "reward": 0.39701712131500244, "reward_std": 2.943756341934204, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4970238208770752, "rewards/wrapped_driving_reward": -1.1000065803527832, "rewards/wrapped_format_reward": 0.25, "step": 254 }, { "completion_length": 500.0, "epoch": 51.0, "grad_norm": 0.4497571587562561, "kl": 0.0044916169717907906, "learning_rate": 4.999405067699773e-06, "loss": 0.0002, "reward": 0.5951623916625977, "reward_std": 3.1178059577941895, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333730697632, "rewards/wrapped_driving_reward": -0.8631709814071655, "rewards/wrapped_format_reward": 0.125, "step": 255 }, { "completion_length": 500.0, "epoch": 51.2, "grad_norm": 2.1747357845306396, "kl": 0.4025267958641052, "learning_rate": 4.999323102948655e-06, "loss": 0.0161, "reward": -1.957139492034912, "reward_std": 3.11260986328125, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.957139492034912, "rewards/wrapped_format_reward": 0.5, "step": 256 }, { "completion_length": 500.0, "epoch": 51.4, "grad_norm": 1.526734709739685, "kl": 0.15158718824386597, "learning_rate": 4.9992358511335035e-06, "loss": 0.0061, "reward": -1.0233807563781738, "reward_std": 3.175232172012329, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.32499998807907104, "rewards/wrapped_driving_reward": -1.9733808040618896, "rewards/wrapped_format_reward": 0.125, "step": 257 }, { "completion_length": 500.0, "epoch": 51.6, "grad_norm": 2.924349546432495, "kl": 0.13476546108722687, "learning_rate": 4.999143312438893e-06, "loss": 0.0054, "reward": 2.7032151222229004, "reward_std": 0.22770251333713531, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6499999761581421, "rewards/wrapped_driving_reward": 0.6782150864601135, "rewards/wrapped_format_reward": 0.375, "step": 258 }, { "completion_length": 500.0, "epoch": 51.8, "grad_norm": 3.879314422607422, "kl": 0.5085331201553345, "learning_rate": 4.99904548706058e-06, "loss": 0.0203, "reward": 2.018110513687134, "reward_std": 0.8205149173736572, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": 0.018110457807779312, "rewards/wrapped_format_reward": 0.5, "step": 259 }, { "completion_length": 500.0, "epoch": 52.0, "grad_norm": 1.5741368532180786, "kl": 0.17756682634353638, "learning_rate": 4.998942375205502e-06, "loss": 0.0071, "reward": 0.5688350200653076, "reward_std": 3.046790599822998, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -1.0144983530044556, "rewards/wrapped_format_reward": 0.25, "step": 260 }, { "completion_length": 500.0, "epoch": 52.2, "grad_norm": 6.336862564086914, "kl": 0.19336561858654022, "learning_rate": 4.998833977091783e-06, "loss": 0.0077, "reward": -0.49781960248947144, "reward_std": 4.047021865844727, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.5978196859359741, "rewards/wrapped_format_reward": 0.375, "step": 261 }, { "completion_length": 500.0, "epoch": 52.4, "grad_norm": 0.774104118347168, "kl": 0.1010366752743721, "learning_rate": 4.998720292948727e-06, "loss": 0.004, "reward": -0.8497058153152466, "reward_std": 3.3681893348693848, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9747058153152466, "rewards/wrapped_format_reward": 0.25, "step": 262 }, { "completion_length": 500.0, "epoch": 52.6, "grad_norm": 0.36933571100234985, "kl": 0.06732728332281113, "learning_rate": 4.998601323016824e-06, "loss": 0.0027, "reward": 0.40385496616363525, "reward_std": 2.9700136184692383, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -0.9961450695991516, "rewards/wrapped_format_reward": 0.25, "step": 263 }, { "completion_length": 500.0, "epoch": 52.8, "grad_norm": 0.39391028881073, "kl": 0.003927405923604965, "learning_rate": 4.99847706754774e-06, "loss": 0.0002, "reward": -0.6202123165130615, "reward_std": 3.9359829425811768, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.8702123165130615, "rewards/wrapped_format_reward": 0.375, "step": 264 }, { "completion_length": 500.0, "epoch": 53.0, "grad_norm": 5.415218830108643, "kl": 0.17419731616973877, "learning_rate": 4.9983475268043254e-06, "loss": 0.007, "reward": -1.1574738025665283, "reward_std": 3.333486557006836, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.1574738025665283, "rewards/wrapped_format_reward": 0.125, "step": 265 }, { "completion_length": 500.0, "epoch": 53.2, "grad_norm": 1.1539275646209717, "kl": 0.3938099145889282, "learning_rate": 4.998212701060612e-06, "loss": 0.0158, "reward": 1.269005298614502, "reward_std": 2.8699021339416504, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -0.38724470138549805, "rewards/wrapped_format_reward": 0.5, "step": 266 }, { "completion_length": 500.0, "epoch": 53.4, "grad_norm": 11.512609481811523, "kl": 0.2840820550918579, "learning_rate": 4.998072590601808e-06, "loss": 0.0114, "reward": 0.8071303963661194, "reward_std": 3.2237534523010254, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5791666507720947, "rewards/wrapped_driving_reward": -1.0220361948013306, "rewards/wrapped_format_reward": 0.5, "step": 267 }, { "completion_length": 500.0, "epoch": 53.6, "grad_norm": 0.42872354388237, "kl": 0.004085747059434652, "learning_rate": 4.9979271957243035e-06, "loss": 0.0002, "reward": -2.604086399078369, "reward_std": 2.7918272018432617, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1428571492433548, "rewards/wrapped_driving_reward": -2.996943473815918, "rewards/wrapped_format_reward": 0.0, "step": 268 }, { "completion_length": 500.0, "epoch": 53.8, "grad_norm": 1.7167718410491943, "kl": 0.2890765964984894, "learning_rate": 4.997776516735667e-06, "loss": 0.0116, "reward": -1.7580829858779907, "reward_std": 2.456212282180786, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4901515245437622, "rewards/wrapped_driving_reward": -3.248234510421753, "rewards/wrapped_format_reward": 0.25, "step": 269 }, { "completion_length": 500.0, "epoch": 54.0, "grad_norm": 0.5007515549659729, "kl": 0.06032608821988106, "learning_rate": 4.997620553954645e-06, "loss": 0.0024, "reward": 0.5694142580032349, "reward_std": 3.0647597312927246, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45454543828964233, "rewards/wrapped_driving_reward": -1.0101312398910522, "rewards/wrapped_format_reward": 0.375, "step": 270 }, { "completion_length": 500.0, "epoch": 54.2, "grad_norm": 0.42161646485328674, "kl": 0.003139057895168662, "learning_rate": 4.99745930771116e-06, "loss": 0.0001, "reward": 0.9141009449958801, "reward_std": 2.9500954151153564, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5813717842102051, "rewards/wrapped_driving_reward": -0.7922708988189697, "rewards/wrapped_format_reward": 0.375, "step": 271 }, { "completion_length": 500.0, "epoch": 54.4, "grad_norm": 0.41687673330307007, "kl": 0.00326509028673172, "learning_rate": 4.997292778346312e-06, "loss": 0.0001, "reward": -2.5603370666503906, "reward_std": 2.8793258666992188, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.0603370666503906, "rewards/wrapped_format_reward": 0.0, "step": 272 }, { "completion_length": 500.0, "epoch": 54.6, "grad_norm": 0.3959712088108063, "kl": 0.14926669001579285, "learning_rate": 4.9971209662123774e-06, "loss": 0.006, "reward": 0.6993837952613831, "reward_std": 3.2752537727355957, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5623737573623657, "rewards/wrapped_driving_reward": -0.8629899621009827, "rewards/wrapped_format_reward": 0.25, "step": 273 }, { "completion_length": 500.0, "epoch": 54.8, "grad_norm": 0.3514886498451233, "kl": 0.0030937029514461756, "learning_rate": 4.996943871672807e-06, "loss": 0.0001, "reward": -0.6663916110992432, "reward_std": 3.8655571937561035, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4000000059604645, "rewards/wrapped_driving_reward": -1.8163914680480957, "rewards/wrapped_format_reward": 0.25, "step": 274 }, { "completion_length": 500.0, "epoch": 55.0, "grad_norm": 0.3679739832878113, "kl": 0.0026038195937871933, "learning_rate": 4.996761495102227e-06, "loss": 0.0001, "reward": -2.3016748428344727, "reward_std": 3.396650552749634, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -2.8016748428344727, "rewards/wrapped_format_reward": 0.25, "step": 275 }, { "completion_length": 500.0, "epoch": 55.2, "grad_norm": 1.3922810554504395, "kl": 0.21797779202461243, "learning_rate": 4.9965738368864345e-06, "loss": 0.0087, "reward": 0.0015410780906677246, "reward_std": 2.685657024383545, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -1.2276256084442139, "rewards/wrapped_format_reward": 0.125, "step": 276 }, { "completion_length": 500.0, "epoch": 55.4, "grad_norm": 1.249841570854187, "kl": 0.2836349904537201, "learning_rate": 4.996380897422405e-06, "loss": 0.0113, "reward": 0.4582923650741577, "reward_std": 2.975569725036621, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.0417077541351318, "rewards/wrapped_format_reward": 0.25, "step": 277 }, { "completion_length": 500.0, "epoch": 55.6, "grad_norm": 4.349127769470215, "kl": 0.6486947536468506, "learning_rate": 4.996182677118278e-06, "loss": 0.0259, "reward": -0.5984437465667725, "reward_std": 3.405778408050537, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3901515007019043, "rewards/wrapped_driving_reward": -1.9885952472686768, "rewards/wrapped_format_reward": 0.5, "step": 278 }, { "completion_length": 467.0, "epoch": 55.8, "grad_norm": 2.509852170944214, "kl": 0.6388497948646545, "learning_rate": 4.995979176393372e-06, "loss": 0.0256, "reward": 2.645427942276001, "reward_std": 0.5950685143470764, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": 0.6454278826713562, "rewards/wrapped_format_reward": 0.25, "step": 279 }, { "completion_length": 500.0, "epoch": 56.0, "grad_norm": 2.851917028427124, "kl": 0.19445133209228516, "learning_rate": 4.995770395678171e-06, "loss": 0.0078, "reward": 0.3463352918624878, "reward_std": 2.9164505004882812, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.1536647081375122, "rewards/wrapped_format_reward": 0.125, "step": 280 }, { "completion_length": 500.0, "epoch": 56.2, "grad_norm": 0.42350760102272034, "kl": 0.04766889289021492, "learning_rate": 4.9955563354143285e-06, "loss": 0.0019, "reward": 1.01877760887146, "reward_std": 3.0202696323394775, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -0.39788898825645447, "rewards/wrapped_format_reward": 0.25, "step": 281 }, { "completion_length": 500.0, "epoch": 56.4, "grad_norm": 1.5306605100631714, "kl": 0.2947113513946533, "learning_rate": 4.995336996054668e-06, "loss": 0.0118, "reward": -0.1445428729057312, "reward_std": 2.5796546936035156, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.3945426940917969, "rewards/wrapped_format_reward": 0.125, "step": 282 }, { "completion_length": 500.0, "epoch": 56.6, "grad_norm": 0.38275909423828125, "kl": 0.003526146523654461, "learning_rate": 4.99511237806318e-06, "loss": 0.0001, "reward": -1.1158170700073242, "reward_std": 3.336292028427124, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9908171892166138, "rewards/wrapped_format_reward": 0.0, "step": 283 }, { "completion_length": 500.0, "epoch": 56.8, "grad_norm": 0.45490142703056335, "kl": 0.09113866090774536, "learning_rate": 4.994882481915019e-06, "loss": 0.0036, "reward": -0.9650794267654419, "reward_std": 2.967007875442505, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2767857313156128, "rewards/wrapped_driving_reward": -1.9918651580810547, "rewards/wrapped_format_reward": 0.25, "step": 284 }, { "completion_length": 500.0, "epoch": 57.0, "grad_norm": 0.6580032110214233, "kl": 0.20218665897846222, "learning_rate": 4.994647308096509e-06, "loss": 0.0081, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 285 }, { "completion_length": 500.0, "epoch": 57.2, "grad_norm": 8.386362075805664, "kl": 0.4461314380168915, "learning_rate": 4.994406857105136e-06, "loss": 0.0178, "reward": 0.5840720534324646, "reward_std": 3.0974204540252686, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.484375, "rewards/wrapped_driving_reward": -0.9003030061721802, "rewards/wrapped_format_reward": 0.25, "step": 286 }, { "completion_length": 500.0, "epoch": 57.4, "grad_norm": 0.448266863822937, "kl": 0.0026026677805930376, "learning_rate": 4.9941611294495495e-06, "loss": 0.0001, "reward": -2.178774833679199, "reward_std": 3.6424503326416016, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1964285671710968, "rewards/wrapped_driving_reward": -2.8752033710479736, "rewards/wrapped_format_reward": 0.25, "step": 287 }, { "completion_length": 500.0, "epoch": 57.6, "grad_norm": 1.679646372795105, "kl": 0.20971831679344177, "learning_rate": 4.993910125649561e-06, "loss": 0.0084, "reward": 1.969580054283142, "reward_std": 0.40913528203964233, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5220588445663452, "rewards/wrapped_driving_reward": 0.07252118736505508, "rewards/wrapped_format_reward": 0.375, "step": 288 }, { "completion_length": 500.0, "epoch": 57.8, "grad_norm": 0.2821173071861267, "kl": 0.0028246240690350533, "learning_rate": 4.993653846236144e-06, "loss": 0.0001, "reward": 0.9076694250106812, "reward_std": 2.957883358001709, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -0.5298306941986084, "rewards/wrapped_format_reward": 0.25, "step": 289 }, { "completion_length": 500.0, "epoch": 58.0, "grad_norm": 1.2047299146652222, "kl": 0.29308056831359863, "learning_rate": 4.993392291751431e-06, "loss": 0.0117, "reward": -0.6820242404937744, "reward_std": 3.285200834274292, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.0570242404937744, "rewards/wrapped_format_reward": 0.5, "step": 290 }, { "completion_length": 500.0, "epoch": 58.2, "grad_norm": 0.38363662362098694, "kl": 0.0025257065426558256, "learning_rate": 4.993125462748714e-06, "loss": 0.0001, "reward": -0.6149693727493286, "reward_std": 3.9089224338531494, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.589969277381897, "rewards/wrapped_format_reward": 0.25, "step": 291 }, { "completion_length": 500.0, "epoch": 58.4, "grad_norm": 0.8080898523330688, "kl": 0.20920060575008392, "learning_rate": 4.992853359792444e-06, "loss": 0.0084, "reward": -2.0424954891204834, "reward_std": 2.943457841873169, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.1818181872367859, "rewards/wrapped_driving_reward": -2.849313735961914, "rewards/wrapped_format_reward": 0.375, "step": 292 }, { "completion_length": 500.0, "epoch": 58.6, "grad_norm": 0.4658648371696472, "kl": 0.07586698979139328, "learning_rate": 4.9925759834582254e-06, "loss": 0.003, "reward": 0.4167907238006592, "reward_std": 2.305239677429199, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4068181812763214, "rewards/wrapped_driving_reward": -1.1150274276733398, "rewards/wrapped_format_reward": 0.375, "step": 293 }, { "completion_length": 500.0, "epoch": 58.8, "grad_norm": 8.748403549194336, "kl": 0.2903243601322174, "learning_rate": 4.992293334332821e-06, "loss": 0.0116, "reward": 0.17369729280471802, "reward_std": 2.7883899211883545, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -1.2013027667999268, "rewards/wrapped_format_reward": 0.0, "step": 294 }, { "completion_length": 500.0, "epoch": 59.0, "grad_norm": 0.3755791187286377, "kl": 0.004894225392490625, "learning_rate": 4.9920054130141445e-06, "loss": 0.0002, "reward": 0.47168922424316406, "reward_std": 2.994149684906006, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.512499988079071, "rewards/wrapped_driving_reward": -0.915810763835907, "rewards/wrapped_format_reward": 0.125, "step": 295 }, { "completion_length": 500.0, "epoch": 59.2, "grad_norm": 0.3929916322231293, "kl": 0.004553962033241987, "learning_rate": 4.991712220111265e-06, "loss": 0.0002, "reward": 0.22208917140960693, "reward_std": 2.8153343200683594, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5341880321502686, "rewards/wrapped_driving_reward": -1.062098741531372, "rewards/wrapped_format_reward": 0.0, "step": 296 }, { "completion_length": 500.0, "epoch": 59.4, "grad_norm": 0.41047510504722595, "kl": 0.00469350116327405, "learning_rate": 4.991413756244404e-06, "loss": 0.0002, "reward": -1.1782411336898804, "reward_std": 3.260157346725464, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3035714328289032, "rewards/wrapped_driving_reward": -1.9818124771118164, "rewards/wrapped_format_reward": 0.0, "step": 297 }, { "completion_length": 500.0, "epoch": 59.6, "grad_norm": 3.261988401412964, "kl": 0.3944035470485687, "learning_rate": 4.99111002204493e-06, "loss": 0.0158, "reward": 2.40362548828125, "reward_std": 0.452992707490921, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7250000238418579, "rewards/wrapped_driving_reward": 0.5536257028579712, "rewards/wrapped_format_reward": 0.125, "step": 298 }, { "completion_length": 500.0, "epoch": 59.8, "grad_norm": 0.4695773422718048, "kl": 0.004432830028235912, "learning_rate": 4.990801018155361e-06, "loss": 0.0002, "reward": -0.06609618663787842, "reward_std": 2.646121025085449, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2818181812763214, "rewards/wrapped_driving_reward": -1.0979143381118774, "rewards/wrapped_format_reward": 0.0, "step": 299 }, { "completion_length": 500.0, "epoch": 60.0, "grad_norm": 6.382628917694092, "kl": 0.2269033044576645, "learning_rate": 4.990486745229364e-06, "loss": 0.0091, "reward": 0.12916043400764465, "reward_std": 3.019442558288574, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5519230365753174, "rewards/wrapped_driving_reward": -1.4227626323699951, "rewards/wrapped_format_reward": 0.25, "step": 300 }, { "completion_length": 500.0, "epoch": 60.2, "grad_norm": 2.289363384246826, "kl": 0.4308004081249237, "learning_rate": 4.990167203931753e-06, "loss": 0.0172, "reward": 0.4380396008491516, "reward_std": 3.004404067993164, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.6869603395462036, "rewards/wrapped_format_reward": 0.0, "step": 301 }, { "completion_length": 500.0, "epoch": 60.4, "grad_norm": 0.34541359543800354, "kl": 0.0026338391471654177, "learning_rate": 4.989842394938482e-06, "loss": 0.0001, "reward": 0.008644580841064453, "reward_std": 2.6733663082122803, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -1.1580220460891724, "rewards/wrapped_format_reward": 0.0, "step": 302 }, { "completion_length": 500.0, "epoch": 60.6, "grad_norm": 0.40867820382118225, "kl": 0.0034318449907004833, "learning_rate": 4.989512318936654e-06, "loss": 0.0001, "reward": -0.49820494651794434, "reward_std": 3.4901528358459473, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.8732049465179443, "rewards/wrapped_format_reward": 0.375, "step": 303 }, { "completion_length": 500.0, "epoch": 60.8, "grad_norm": 2.397559404373169, "kl": 0.26956814527511597, "learning_rate": 4.989176976624511e-06, "loss": 0.0108, "reward": 0.6745624542236328, "reward_std": 3.1912028789520264, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -0.9254375696182251, "rewards/wrapped_format_reward": 0.25, "step": 304 }, { "completion_length": 500.0, "epoch": 61.0, "grad_norm": 2.3476884365081787, "kl": 0.31312525272369385, "learning_rate": 4.988836368711435e-06, "loss": 0.0125, "reward": 1.7296613454818726, "reward_std": 0.7248514890670776, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.27033859491348267, "rewards/wrapped_format_reward": 0.25, "step": 305 }, { "completion_length": 500.0, "epoch": 61.2, "grad_norm": 0.4137451946735382, "kl": 0.05325795337557793, "learning_rate": 4.988490495917948e-06, "loss": 0.0021, "reward": 0.3951526880264282, "reward_std": 2.953357458114624, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.9798471927642822, "rewards/wrapped_format_reward": 0.25, "step": 306 }, { "completion_length": 500.0, "epoch": 61.4, "grad_norm": 0.6206872463226318, "kl": 0.056155622005462646, "learning_rate": 4.988139358975707e-06, "loss": 0.0022, "reward": -0.9187849164009094, "reward_std": 3.277235507965088, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -2.093784809112549, "rewards/wrapped_format_reward": 0.375, "step": 307 }, { "completion_length": 500.0, "epoch": 61.6, "grad_norm": 1.5620518922805786, "kl": 0.2103491574525833, "learning_rate": 4.987782958627508e-06, "loss": 0.0084, "reward": 1.7064940929412842, "reward_std": 0.3729006052017212, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.47430557012557983, "rewards/wrapped_driving_reward": -0.017811477184295654, "rewards/wrapped_format_reward": 0.25, "step": 308 }, { "completion_length": 500.0, "epoch": 61.8, "grad_norm": 0.45032092928886414, "kl": 0.1164650246500969, "learning_rate": 4.987421295627279e-06, "loss": 0.0047, "reward": 0.4541400074958801, "reward_std": 2.9789493083953857, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -1.1708600521087646, "rewards/wrapped_format_reward": 0.125, "step": 309 }, { "completion_length": 500.0, "epoch": 62.0, "grad_norm": 2.286036252975464, "kl": 0.37701109051704407, "learning_rate": 4.9870543707400835e-06, "loss": 0.0151, "reward": 0.9700980186462402, "reward_std": 3.3365931510925293, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3080357313156128, "rewards/wrapped_driving_reward": -0.462937593460083, "rewards/wrapped_format_reward": 0.375, "step": 310 }, { "completion_length": 500.0, "epoch": 62.2, "grad_norm": 0.9412902593612671, "kl": 0.4719122350215912, "learning_rate": 4.986682184742111e-06, "loss": 0.0189, "reward": -1.1503251791000366, "reward_std": 3.0121943950653076, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3484848737716675, "rewards/wrapped_driving_reward": -2.248810052871704, "rewards/wrapped_format_reward": 0.25, "step": 311 }, { "completion_length": 500.0, "epoch": 62.4, "grad_norm": 0.40759605169296265, "kl": 0.004896071273833513, "learning_rate": 4.986304738420684e-06, "loss": 0.0002, "reward": 2.4090352058410645, "reward_std": 0.401497483253479, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6696428656578064, "rewards/wrapped_driving_reward": -0.010607685893774033, "rewards/wrapped_format_reward": 0.75, "step": 312 }, { "completion_length": 500.0, "epoch": 62.6, "grad_norm": 0.3452640771865845, "kl": 0.003664538264274597, "learning_rate": 4.985922032574252e-06, "loss": 0.0001, "reward": -1.0280461311340332, "reward_std": 3.437983751296997, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.442307710647583, "rewards/wrapped_driving_reward": -2.095353841781616, "rewards/wrapped_format_reward": 0.125, "step": 313 }, { "completion_length": 500.0, "epoch": 62.8, "grad_norm": 33.71268081665039, "kl": 0.42201700806617737, "learning_rate": 4.985534068012391e-06, "loss": 0.0169, "reward": 1.1327743530273438, "reward_std": 3.4218854904174805, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.36722564697265625, "rewards/wrapped_format_reward": 0.0, "step": 314 }, { "completion_length": 500.0, "epoch": 63.0, "grad_norm": 13.974825859069824, "kl": 0.2791546583175659, "learning_rate": 4.985140845555799e-06, "loss": 0.0112, "reward": -1.1279728412628174, "reward_std": 3.347849130630493, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3187499940395355, "rewards/wrapped_driving_reward": -2.071722984313965, "rewards/wrapped_format_reward": 0.125, "step": 315 }, { "completion_length": 500.0, "epoch": 63.2, "grad_norm": 2.090256929397583, "kl": 0.22176861763000488, "learning_rate": 4.9847423660363e-06, "loss": 0.0089, "reward": 1.3426780700683594, "reward_std": 0.3560336232185364, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5166667103767395, "rewards/wrapped_driving_reward": -0.2989885210990906, "rewards/wrapped_format_reward": 0.125, "step": 316 }, { "completion_length": 500.0, "epoch": 63.4, "grad_norm": 0.377633273601532, "kl": 0.005016393028199673, "learning_rate": 4.984338630296836e-06, "loss": 0.0002, "reward": 0.21509039402008057, "reward_std": 2.826265573501587, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.1349096298217773, "rewards/wrapped_format_reward": 0.25, "step": 317 }, { "completion_length": 500.0, "epoch": 63.6, "grad_norm": 6.6817498207092285, "kl": 0.33422529697418213, "learning_rate": 4.9839296391914696e-06, "loss": 0.0134, "reward": 1.0928232669830322, "reward_std": 3.0657143592834473, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5982142686843872, "rewards/wrapped_driving_reward": -0.3803909420967102, "rewards/wrapped_format_reward": 0.125, "step": 318 }, { "completion_length": 500.0, "epoch": 63.8, "grad_norm": 0.9221271872520447, "kl": 0.44944262504577637, "learning_rate": 4.983515393585379e-06, "loss": 0.018, "reward": 1.702825903892517, "reward_std": 0.3978630602359772, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6068181991577148, "rewards/wrapped_driving_reward": -0.028992218896746635, "rewards/wrapped_format_reward": 0.125, "step": 319 }, { "completion_length": 500.0, "epoch": 64.0, "grad_norm": 6.990262985229492, "kl": 0.22952702641487122, "learning_rate": 4.983095894354858e-06, "loss": 0.0092, "reward": 0.6090406775474548, "reward_std": 3.087157964706421, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375000298023224, "rewards/wrapped_driving_reward": -0.9534592628479004, "rewards/wrapped_format_reward": 0.375, "step": 320 }, { "completion_length": 500.0, "epoch": 64.2, "grad_norm": 4.146872520446777, "kl": 0.3527936637401581, "learning_rate": 4.982671142387316e-06, "loss": 0.0141, "reward": -0.8656878471374512, "reward_std": 3.6415810585021973, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.38474026322364807, "rewards/wrapped_driving_reward": -2.0004281997680664, "rewards/wrapped_format_reward": 0.25, "step": 321 }, { "completion_length": 500.0, "epoch": 64.4, "grad_norm": 0.37092164158821106, "kl": 0.004467155784368515, "learning_rate": 4.982241138581273e-06, "loss": 0.0002, "reward": -2.2984180450439453, "reward_std": 3.4031639099121094, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.7984180450439453, "rewards/wrapped_format_reward": 0.125, "step": 322 }, { "completion_length": 500.0, "epoch": 64.6, "grad_norm": 0.6264036893844604, "kl": 0.08939827233552933, "learning_rate": 4.981805883846357e-06, "loss": 0.0036, "reward": -2.4163126945495605, "reward_std": 2.511791467666626, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.0833333358168602, "rewards/wrapped_driving_reward": -2.9996461868286133, "rewards/wrapped_format_reward": 0.25, "step": 323 }, { "completion_length": 500.0, "epoch": 64.8, "grad_norm": 0.29659271240234375, "kl": 0.009662508033216, "learning_rate": 4.981365379103306e-06, "loss": 0.0004, "reward": 0.39709973335266113, "reward_std": 2.9523847103118896, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5409451723098755, "rewards/wrapped_driving_reward": -1.0188454389572144, "rewards/wrapped_format_reward": 0.125, "step": 324 }, { "completion_length": 500.0, "epoch": 65.0, "grad_norm": 7.854650020599365, "kl": 0.38552114367485046, "learning_rate": 4.980919625283962e-06, "loss": 0.0154, "reward": -0.9804279804229736, "reward_std": 3.520909547805786, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.2304279804229736, "rewards/wrapped_format_reward": 0.375, "step": 325 }, { "completion_length": 500.0, "epoch": 65.2, "grad_norm": 0.44117897748947144, "kl": 0.006218797527253628, "learning_rate": 4.980468623331273e-06, "loss": 0.0002, "reward": 1.3627103567123413, "reward_std": 0.3762909770011902, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5910714268684387, "rewards/wrapped_driving_reward": -0.353361040353775, "rewards/wrapped_format_reward": 0.125, "step": 326 }, { "completion_length": 500.0, "epoch": 65.4, "grad_norm": 1.4037021398544312, "kl": 0.38747310638427734, "learning_rate": 4.980012374199288e-06, "loss": 0.0155, "reward": -0.6231173276901245, "reward_std": 3.8994014263153076, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.6231173276901245, "rewards/wrapped_format_reward": 0.125, "step": 327 }, { "completion_length": 500.0, "epoch": 65.6, "grad_norm": 2.3190011978149414, "kl": 0.25002360343933105, "learning_rate": 4.979550878853154e-06, "loss": 0.01, "reward": -0.6996442079544067, "reward_std": 3.268649101257324, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2874999940395355, "rewards/wrapped_driving_reward": -1.9871442317962646, "rewards/wrapped_format_reward": 0.5, "step": 328 }, { "completion_length": 500.0, "epoch": 65.8, "grad_norm": 0.37686654925346375, "kl": 0.006784873083233833, "learning_rate": 4.97908413826912e-06, "loss": 0.0003, "reward": -1.1586905717849731, "reward_std": 3.2879345417022705, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0336904525756836, "rewards/wrapped_format_reward": 0.125, "step": 329 }, { "completion_length": 500.0, "epoch": 66.0, "grad_norm": 1.805623173713684, "kl": 0.40524211525917053, "learning_rate": 4.978612153434527e-06, "loss": 0.0162, "reward": -0.08653664588928223, "reward_std": 2.7601754665374756, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3380681872367859, "rewards/wrapped_driving_reward": -1.674604892730713, "rewards/wrapped_format_reward": 0.5, "step": 330 }, { "completion_length": 500.0, "epoch": 66.2, "grad_norm": 0.38753998279571533, "kl": 0.005745263770222664, "learning_rate": 4.97813492534781e-06, "loss": 0.0002, "reward": 0.27020639181137085, "reward_std": 2.9552714824676514, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.32499998807907104, "rewards/wrapped_driving_reward": -0.9297935366630554, "rewards/wrapped_format_reward": 0.125, "step": 331 }, { "completion_length": 500.0, "epoch": 66.4, "grad_norm": 0.9115275144577026, "kl": 0.3008367717266083, "learning_rate": 4.9776524550184965e-06, "loss": 0.012, "reward": 0.6117656826972961, "reward_std": 3.0815846920013428, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4444444477558136, "rewards/wrapped_driving_reward": -0.7076786756515503, "rewards/wrapped_format_reward": 0.125, "step": 332 }, { "completion_length": 500.0, "epoch": 66.6, "grad_norm": 0.408721387386322, "kl": 0.006442686542868614, "learning_rate": 4.977164743467206e-06, "loss": 0.0003, "reward": 2.024855852127075, "reward_std": 0.7860668897628784, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.810606062412262, "rewards/wrapped_driving_reward": -0.28575026988983154, "rewards/wrapped_format_reward": 0.5, "step": 333 }, { "completion_length": 500.0, "epoch": 66.8, "grad_norm": 0.4926323890686035, "kl": 0.20976316928863525, "learning_rate": 4.97667179172564e-06, "loss": 0.0084, "reward": 0.6569395065307617, "reward_std": 3.1219942569732666, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4124999940395355, "rewards/wrapped_driving_reward": -0.8805604577064514, "rewards/wrapped_format_reward": 0.375, "step": 334 }, { "completion_length": 500.0, "epoch": 67.0, "grad_norm": 0.5138577222824097, "kl": 0.1235075369477272, "learning_rate": 4.9761736008365906e-06, "loss": 0.0049, "reward": -0.7582014799118042, "reward_std": 3.743921995162964, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3425324559211731, "rewards/wrapped_driving_reward": -1.600733995437622, "rewards/wrapped_format_reward": 0.0, "step": 335 }, { "completion_length": 500.0, "epoch": 67.2, "grad_norm": 0.3995298743247986, "kl": 0.005109015386551619, "learning_rate": 4.975670171853926e-06, "loss": 0.0002, "reward": -1.069366455078125, "reward_std": 3.387301206588745, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -2.019366502761841, "rewards/wrapped_format_reward": 0.0, "step": 336 }, { "completion_length": 500.0, "epoch": 67.4, "grad_norm": 3.9272031784057617, "kl": 0.3972117304801941, "learning_rate": 4.975161505842603e-06, "loss": 0.0159, "reward": 0.659011960029602, "reward_std": 3.177152156829834, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4097222089767456, "rewards/wrapped_driving_reward": -1.0007102489471436, "rewards/wrapped_format_reward": 0.5, "step": 337 }, { "completion_length": 500.0, "epoch": 67.6, "grad_norm": 1.9709585905075073, "kl": 0.34756138920783997, "learning_rate": 4.97464760387865e-06, "loss": 0.0139, "reward": -1.0175073146820068, "reward_std": 3.2108185291290283, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -2.298757314682007, "rewards/wrapped_format_reward": 0.375, "step": 338 }, { "completion_length": 500.0, "epoch": 67.8, "grad_norm": 1.3277071714401245, "kl": 0.7304494380950928, "learning_rate": 4.974128467049177e-06, "loss": 0.0292, "reward": 1.208156704902649, "reward_std": 3.5106117725372314, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3392857313156128, "rewards/wrapped_driving_reward": -0.3811289668083191, "rewards/wrapped_format_reward": 0.5, "step": 339 }, { "completion_length": 500.0, "epoch": 68.0, "grad_norm": 0.5980082154273987, "kl": 0.1545403152704239, "learning_rate": 4.973604096452361e-06, "loss": 0.0062, "reward": 1.781707525253296, "reward_std": 0.38252297043800354, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7374999523162842, "rewards/wrapped_driving_reward": 0.04420744255185127, "rewards/wrapped_format_reward": 0.0, "step": 340 }, { "completion_length": 500.0, "epoch": 68.2, "grad_norm": 5.6277008056640625, "kl": 0.6528012156486511, "learning_rate": 4.97307449319746e-06, "loss": 0.0261, "reward": -2.5417990684509277, "reward_std": 2.9164016246795654, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -3.041799306869507, "rewards/wrapped_format_reward": 0.0, "step": 341 }, { "completion_length": 500.0, "epoch": 68.4, "grad_norm": 1.4556660652160645, "kl": 0.22342568635940552, "learning_rate": 4.972539658404793e-06, "loss": 0.0089, "reward": 1.7447742223739624, "reward_std": 0.46966177225112915, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6625000238418579, "rewards/wrapped_driving_reward": -0.1677258014678955, "rewards/wrapped_format_reward": 0.25, "step": 342 }, { "completion_length": 500.0, "epoch": 68.6, "grad_norm": 0.32919180393218994, "kl": 0.004485347308218479, "learning_rate": 4.971999593205748e-06, "loss": 0.0002, "reward": -0.8774416446685791, "reward_std": 3.6358015537261963, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.28125, "rewards/wrapped_driving_reward": -1.908691644668579, "rewards/wrapped_format_reward": 0.25, "step": 343 }, { "completion_length": 500.0, "epoch": 68.8, "grad_norm": 0.4438071846961975, "kl": 0.022456657141447067, "learning_rate": 4.971454298742779e-06, "loss": 0.0009, "reward": 1.4681932926177979, "reward_std": 3.670335054397583, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6666666865348816, "rewards/wrapped_driving_reward": -0.4484734535217285, "rewards/wrapped_format_reward": 0.5, "step": 344 }, { "completion_length": 500.0, "epoch": 69.0, "grad_norm": 0.5931668877601624, "kl": 0.21155548095703125, "learning_rate": 4.970903776169403e-06, "loss": 0.0085, "reward": 1.7453134059906006, "reward_std": 0.27375465631484985, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5980769395828247, "rewards/wrapped_driving_reward": 0.1472364068031311, "rewards/wrapped_format_reward": 0.0, "step": 345 }, { "completion_length": 500.0, "epoch": 69.2, "grad_norm": 1.203963041305542, "kl": 0.2919057607650757, "learning_rate": 4.97034802665019e-06, "loss": 0.0117, "reward": -1.129254937171936, "reward_std": 3.315586566925049, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4166666865348816, "rewards/wrapped_driving_reward": -2.170921802520752, "rewards/wrapped_format_reward": 0.125, "step": 346 }, { "completion_length": 500.0, "epoch": 69.4, "grad_norm": 0.7416115403175354, "kl": 0.22743701934814453, "learning_rate": 4.969787051360776e-06, "loss": 0.0091, "reward": 0.7157018184661865, "reward_std": 3.1705451011657715, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.578125, "rewards/wrapped_driving_reward": -0.8624231815338135, "rewards/wrapped_format_reward": 0.25, "step": 347 }, { "completion_length": 500.0, "epoch": 69.6, "grad_norm": 0.4830350875854492, "kl": 0.17163938283920288, "learning_rate": 4.9692208514878445e-06, "loss": 0.0069, "reward": -0.09030771255493164, "reward_std": 2.9694387912750244, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -1.8194743394851685, "rewards/wrapped_format_reward": 0.375, "step": 348 }, { "completion_length": 500.0, "epoch": 69.8, "grad_norm": 0.2983047366142273, "kl": 0.004596756771206856, "learning_rate": 4.9686494282291354e-06, "loss": 0.0002, "reward": -2.3006930351257324, "reward_std": 2.772320032119751, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.125, "rewards/wrapped_driving_reward": -2.9256930351257324, "rewards/wrapped_format_reward": 0.25, "step": 349 }, { "completion_length": 500.0, "epoch": 70.0, "grad_norm": 0.9741338491439819, "kl": 0.2408900409936905, "learning_rate": 4.968072782793436e-06, "loss": 0.0096, "reward": 2.6951208114624023, "reward_std": 0.5698649883270264, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.421875, "rewards/wrapped_driving_reward": 0.7732457518577576, "rewards/wrapped_format_reward": 0.5, "step": 350 }, { "completion_length": 500.0, "epoch": 70.2, "grad_norm": 4.554775238037109, "kl": 0.4316965341567993, "learning_rate": 4.9674909164005805e-06, "loss": 0.0173, "reward": 1.6274943351745605, "reward_std": 0.7876996397972107, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5681818127632141, "rewards/wrapped_driving_reward": -0.06568745523691177, "rewards/wrapped_format_reward": 0.125, "step": 351 }, { "completion_length": 500.0, "epoch": 70.4, "grad_norm": 2.6662397384643555, "kl": 0.7358002066612244, "learning_rate": 4.966903830281449e-06, "loss": 0.0294, "reward": 0.41893279552459717, "reward_std": 2.621450662612915, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -1.0185672044754028, "rewards/wrapped_format_reward": 0.125, "step": 352 }, { "completion_length": 500.0, "epoch": 70.6, "grad_norm": 5.053070545196533, "kl": 0.6792211532592773, "learning_rate": 4.966311525677961e-06, "loss": 0.0272, "reward": -0.49536943435668945, "reward_std": 4.055595397949219, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3333333134651184, "rewards/wrapped_driving_reward": -1.5787028074264526, "rewards/wrapped_format_reward": 0.25, "step": 353 }, { "completion_length": 500.0, "epoch": 70.8, "grad_norm": 0.6820557117462158, "kl": 0.6210047602653503, "learning_rate": 4.965714003843079e-06, "loss": 0.0248, "reward": 0.41455984115600586, "reward_std": 3.0012035369873047, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3738636374473572, "rewards/wrapped_driving_reward": -0.9593039155006409, "rewards/wrapped_format_reward": 0.25, "step": 354 }, { "completion_length": 500.0, "epoch": 71.0, "grad_norm": 4.2020440101623535, "kl": 0.7750915288925171, "learning_rate": 4.965111266040798e-06, "loss": 0.031, "reward": 2.4024367332458496, "reward_std": 0.6636844277381897, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6281249523162842, "rewards/wrapped_driving_reward": 0.1493116319179535, "rewards/wrapped_format_reward": 0.625, "step": 355 }, { "completion_length": 500.0, "epoch": 71.2, "grad_norm": 0.6401116251945496, "kl": 0.3446768820285797, "learning_rate": 4.964503313546149e-06, "loss": 0.0138, "reward": 0.8104474544525146, "reward_std": 3.2581946849823, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3187499940395355, "rewards/wrapped_driving_reward": -0.5083025097846985, "rewards/wrapped_format_reward": 0.25, "step": 356 }, { "completion_length": 500.0, "epoch": 71.4, "grad_norm": 8.429099082946777, "kl": 0.9006807208061218, "learning_rate": 4.963890147645195e-06, "loss": 0.036, "reward": 1.889892578125, "reward_std": 0.5496275424957275, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4734848439693451, "rewards/wrapped_driving_reward": 0.16640779376029968, "rewards/wrapped_format_reward": 0.25, "step": 357 }, { "completion_length": 500.0, "epoch": 71.6, "grad_norm": 4.372265815734863, "kl": 1.383968710899353, "learning_rate": 4.963271769635024e-06, "loss": 0.0554, "reward": 2.0750181674957275, "reward_std": 0.5676727890968323, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4749999940395355, "rewards/wrapped_driving_reward": 0.22501814365386963, "rewards/wrapped_format_reward": 0.375, "step": 358 }, { "completion_length": 500.0, "epoch": 71.8, "grad_norm": 2.1324520111083984, "kl": 0.41738006472587585, "learning_rate": 4.962648180823753e-06, "loss": 0.0167, "reward": -0.9892675876617432, "reward_std": 3.4826595783233643, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3083333373069763, "rewards/wrapped_driving_reward": -1.7976008653640747, "rewards/wrapped_format_reward": 0.0, "step": 359 }, { "completion_length": 500.0, "epoch": 72.0, "grad_norm": 0.6744612455368042, "kl": 0.17285731434822083, "learning_rate": 4.962019382530521e-06, "loss": 0.0069, "reward": 0.3177332878112793, "reward_std": 2.5485317707061768, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5868055820465088, "rewards/wrapped_driving_reward": -1.1440722942352295, "rewards/wrapped_format_reward": 0.125, "step": 360 }, { "completion_length": 500.0, "epoch": 72.2, "grad_norm": 0.6285593509674072, "kl": 0.33941590785980225, "learning_rate": 4.961385376085486e-06, "loss": 0.0136, "reward": 0.2698374390602112, "reward_std": 2.8500523567199707, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5714285373687744, "rewards/wrapped_driving_reward": -1.176591157913208, "rewards/wrapped_format_reward": 0.125, "step": 361 }, { "completion_length": 500.0, "epoch": 72.4, "grad_norm": 0.458164244890213, "kl": 0.09140162914991379, "learning_rate": 4.960746162829825e-06, "loss": 0.0037, "reward": 0.39255642890930176, "reward_std": 2.957616090774536, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5381944179534912, "rewards/wrapped_driving_reward": -0.8956379890441895, "rewards/wrapped_format_reward": 0.0, "step": 362 }, { "completion_length": 500.0, "epoch": 72.6, "grad_norm": 0.8604902029037476, "kl": 0.5563132762908936, "learning_rate": 4.960101744115727e-06, "loss": 0.0223, "reward": 0.682576596736908, "reward_std": 3.148731231689453, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6041666865348816, "rewards/wrapped_driving_reward": -0.6715900897979736, "rewards/wrapped_format_reward": 0.0, "step": 363 }, { "completion_length": 500.0, "epoch": 72.8, "grad_norm": 0.3864164352416992, "kl": 0.010636747814714909, "learning_rate": 4.959452121306397e-06, "loss": 0.0004, "reward": 2.0540273189544678, "reward_std": 0.3745618760585785, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8484848737716675, "rewards/wrapped_driving_reward": 0.2055424451828003, "rewards/wrapped_format_reward": 0.0, "step": 364 }, { "completion_length": 500.0, "epoch": 73.0, "grad_norm": 0.6751132011413574, "kl": 0.3509099781513214, "learning_rate": 4.958797295776045e-06, "loss": 0.014, "reward": -0.012695908546447754, "reward_std": 2.71622371673584, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6401515007019043, "rewards/wrapped_driving_reward": -1.527847409248352, "rewards/wrapped_format_reward": 0.125, "step": 365 }, { "completion_length": 500.0, "epoch": 73.2, "grad_norm": 0.6991415619850159, "kl": 0.26292094588279724, "learning_rate": 4.958137268909887e-06, "loss": 0.0105, "reward": -0.9175713062286377, "reward_std": 3.619429349899292, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -1.9675713777542114, "rewards/wrapped_format_reward": 0.25, "step": 366 }, { "completion_length": 500.0, "epoch": 73.4, "grad_norm": 0.4259271025657654, "kl": 0.006191871128976345, "learning_rate": 4.957472042104143e-06, "loss": 0.0002, "reward": -1.5161292552947998, "reward_std": 2.4123644828796387, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.578125, "rewards/wrapped_driving_reward": -2.9692542552948, "rewards/wrapped_format_reward": 0.125, "step": 367 }, { "completion_length": 500.0, "epoch": 73.6, "grad_norm": 0.41756653785705566, "kl": 0.190852090716362, "learning_rate": 4.956801616766033e-06, "loss": 0.0076, "reward": 2.1905903816223145, "reward_std": 0.8349334001541138, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": 0.09684042632579803, "rewards/wrapped_format_reward": 0.5, "step": 368 }, { "completion_length": 500.0, "epoch": 73.8, "grad_norm": 0.9244159460067749, "kl": 0.4430541396141052, "learning_rate": 4.956125994313775e-06, "loss": 0.0177, "reward": 1.5870847702026367, "reward_std": 0.30261340737342834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6979166865348816, "rewards/wrapped_driving_reward": -0.11083187907934189, "rewards/wrapped_format_reward": 0.0, "step": 369 }, { "completion_length": 500.0, "epoch": 74.0, "grad_norm": 0.7584779858589172, "kl": 0.37870338559150696, "learning_rate": 4.955445176176577e-06, "loss": 0.0151, "reward": -0.4603646993637085, "reward_std": 4.091524124145508, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.5853646993637085, "rewards/wrapped_format_reward": 0.25, "step": 370 }, { "completion_length": 500.0, "epoch": 74.2, "grad_norm": 0.5257065296173096, "kl": 0.14303787052631378, "learning_rate": 4.954759163794642e-06, "loss": 0.0057, "reward": -0.9337491989135742, "reward_std": 3.272832155227661, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3068181872367859, "rewards/wrapped_driving_reward": -1.9905673265457153, "rewards/wrapped_format_reward": 0.25, "step": 371 }, { "completion_length": 500.0, "epoch": 74.4, "grad_norm": 2.895681858062744, "kl": 0.6397900581359863, "learning_rate": 4.9540679586191605e-06, "loss": 0.0256, "reward": 0.5877273082733154, "reward_std": 3.06107759475708, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.75, "rewards/wrapped_driving_reward": -0.9122726917266846, "rewards/wrapped_format_reward": 0.0, "step": 372 }, { "completion_length": 500.0, "epoch": 74.6, "grad_norm": 0.4324706792831421, "kl": 0.1139565035700798, "learning_rate": 4.9533715621123046e-06, "loss": 0.0046, "reward": 0.969906210899353, "reward_std": 3.3792827129364014, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.45681819319725037, "rewards/wrapped_driving_reward": -0.611911952495575, "rewards/wrapped_format_reward": 0.375, "step": 373 }, { "completion_length": 500.0, "epoch": 74.8, "grad_norm": 0.48021069169044495, "kl": 0.010700889863073826, "learning_rate": 4.952669975747232e-06, "loss": 0.0004, "reward": -3.875, "reward_std": 0.25, "rewards/mpc_param_extraction_reward": 0.0, "rewards/mpc_param_name_reward": 0.0, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.125, "step": 374 }, { "completion_length": 500.0, "epoch": 75.0, "grad_norm": 0.43843770027160645, "kl": 0.12145346403121948, "learning_rate": 4.9519632010080765e-06, "loss": 0.0049, "reward": 0.22283601760864258, "reward_std": 2.8182504177093506, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.41874998807907104, "rewards/wrapped_driving_reward": -1.1959140300750732, "rewards/wrapped_format_reward": 0.25, "step": 375 }, { "completion_length": 500.0, "epoch": 75.2, "grad_norm": 0.3983944356441498, "kl": 0.2168252021074295, "learning_rate": 4.951251239389949e-06, "loss": 0.0087, "reward": 0.22715109586715698, "reward_std": 2.8382680416107178, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.3353488445281982, "rewards/wrapped_format_reward": 0.375, "step": 376 }, { "completion_length": 500.0, "epoch": 75.4, "grad_norm": 1.3143914937973022, "kl": 0.49245813488960266, "learning_rate": 4.950534092398931e-06, "loss": 0.0197, "reward": -0.5440161228179932, "reward_std": 3.7116076946258545, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.42045456171035767, "rewards/wrapped_driving_reward": -1.589470624923706, "rewards/wrapped_format_reward": 0.125, "step": 377 }, { "completion_length": 500.0, "epoch": 75.6, "grad_norm": 0.7405281662940979, "kl": 0.4652176797389984, "learning_rate": 4.949811761552074e-06, "loss": 0.0186, "reward": -0.9247722625732422, "reward_std": 3.602470874786377, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3888888955116272, "rewards/wrapped_driving_reward": -2.0636610984802246, "rewards/wrapped_format_reward": 0.25, "step": 378 }, { "completion_length": 500.0, "epoch": 75.8, "grad_norm": 1.0121004581451416, "kl": 0.9068050980567932, "learning_rate": 4.9490842483773974e-06, "loss": 0.0363, "reward": 0.36130303144454956, "reward_std": 2.9178526401519775, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.2916666567325592, "rewards/wrapped_driving_reward": -0.805363655090332, "rewards/wrapped_format_reward": 0.125, "step": 379 }, { "completion_length": 500.0, "epoch": 76.0, "grad_norm": 0.3849989175796509, "kl": 0.008540408685803413, "learning_rate": 4.948351554413879e-06, "loss": 0.0003, "reward": -0.9468990564346313, "reward_std": 3.264946699142456, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3386363685131073, "rewards/wrapped_driving_reward": -1.9105353355407715, "rewards/wrapped_format_reward": 0.125, "step": 380 }, { "completion_length": 500.0, "epoch": 76.2, "grad_norm": 0.6559692621231079, "kl": 0.7614105343818665, "learning_rate": 4.94761368121146e-06, "loss": 0.0305, "reward": 0.15053200721740723, "reward_std": 2.780266761779785, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.46875, "rewards/wrapped_driving_reward": -1.1932181119918823, "rewards/wrapped_format_reward": 0.125, "step": 381 }, { "completion_length": 500.0, "epoch": 76.4, "grad_norm": 0.3702482283115387, "kl": 0.20814312994480133, "learning_rate": 4.946870630331035e-06, "loss": 0.0083, "reward": 0.8605266809463501, "reward_std": 3.243834972381592, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.546875, "rewards/wrapped_driving_reward": -0.6863483190536499, "rewards/wrapped_format_reward": 0.25, "step": 382 }, { "completion_length": 500.0, "epoch": 76.6, "grad_norm": 0.7226535081863403, "kl": 0.9034067988395691, "learning_rate": 4.9461224033444544e-06, "loss": 0.0361, "reward": 0.27967095375061035, "reward_std": 2.896880626678467, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5083333253860474, "rewards/wrapped_driving_reward": -1.6036624908447266, "rewards/wrapped_format_reward": 0.625, "step": 383 }, { "completion_length": 500.0, "epoch": 76.8, "grad_norm": 0.6067720651626587, "kl": 0.12315172702074051, "learning_rate": 4.9453690018345144e-06, "loss": 0.0049, "reward": -0.4893726110458374, "reward_std": 3.798448085784912, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.22499999403953552, "rewards/wrapped_driving_reward": -1.5893726348876953, "rewards/wrapped_format_reward": 0.375, "step": 384 }, { "completion_length": 500.0, "epoch": 77.0, "grad_norm": 0.4272429943084717, "kl": 0.00867474265396595, "learning_rate": 4.94461042739496e-06, "loss": 0.0003, "reward": 0.48846930265426636, "reward_std": 3.02353572845459, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3883928656578064, "rewards/wrapped_driving_reward": -0.89992356300354, "rewards/wrapped_format_reward": 0.25, "step": 385 }, { "completion_length": 477.0, "epoch": 77.2, "grad_norm": 0.476276695728302, "kl": 0.3388914167881012, "learning_rate": 4.943846681630479e-06, "loss": 0.0136, "reward": 0.190776526927948, "reward_std": 2.4944610595703125, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -1.1592234373092651, "rewards/wrapped_format_reward": 0.25, "step": 386 }, { "completion_length": 500.0, "epoch": 77.4, "grad_norm": 0.53319251537323, "kl": 0.5233060717582703, "learning_rate": 4.943077766156698e-06, "loss": 0.0209, "reward": 2.4598963260650635, "reward_std": 0.17958295345306396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.671875, "rewards/wrapped_driving_reward": 0.5380213856697083, "rewards/wrapped_format_reward": 0.25, "step": 387 }, { "completion_length": 500.0, "epoch": 77.6, "grad_norm": 0.4270462393760681, "kl": 0.007158294320106506, "learning_rate": 4.942303682600178e-06, "loss": 0.0003, "reward": 0.5006191730499268, "reward_std": 3.0305392742156982, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4558081030845642, "rewards/wrapped_driving_reward": -0.9551889896392822, "rewards/wrapped_format_reward": 0.25, "step": 388 }, { "completion_length": 500.0, "epoch": 77.8, "grad_norm": 0.6703900098800659, "kl": 0.6842387318611145, "learning_rate": 4.941524432598415e-06, "loss": 0.0274, "reward": -1.2612661123275757, "reward_std": 3.1640005111694336, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.0112662315368652, "rewards/wrapped_format_reward": 0.0, "step": 389 }, { "completion_length": 500.0, "epoch": 78.0, "grad_norm": 1.440481185913086, "kl": 0.5535719990730286, "learning_rate": 4.9407400177998335e-06, "loss": 0.0221, "reward": -0.8308386206626892, "reward_std": 3.6611149311065674, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2083333432674408, "rewards/wrapped_driving_reward": -1.9141719341278076, "rewards/wrapped_format_reward": 0.375, "step": 390 }, { "completion_length": 500.0, "epoch": 78.2, "grad_norm": 0.46088123321533203, "kl": 0.4104781746864319, "learning_rate": 4.9399504398637835e-06, "loss": 0.0164, "reward": 2.22438645362854, "reward_std": 0.4959229826927185, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8401514887809753, "rewards/wrapped_driving_reward": 0.2592349946498871, "rewards/wrapped_format_reward": 0.125, "step": 391 }, { "completion_length": 500.0, "epoch": 78.4, "grad_norm": 0.41232195496559143, "kl": 0.27239614725112915, "learning_rate": 4.939155700460536e-06, "loss": 0.0109, "reward": 1.3294200897216797, "reward_std": 3.238236904144287, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5874999761581421, "rewards/wrapped_driving_reward": -0.3830798864364624, "rewards/wrapped_format_reward": 0.375, "step": 392 }, { "completion_length": 500.0, "epoch": 78.6, "grad_norm": 79.61272430419922, "kl": 1.242353916168213, "learning_rate": 4.938355801271282e-06, "loss": 0.0497, "reward": 0.6597337126731873, "reward_std": 3.137744665145874, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6931818127632141, "rewards/wrapped_driving_reward": -1.0334481000900269, "rewards/wrapped_format_reward": 0.25, "step": 393 }, { "completion_length": 500.0, "epoch": 78.8, "grad_norm": 0.6254439353942871, "kl": 0.32317063212394714, "learning_rate": 4.937550743988127e-06, "loss": 0.0129, "reward": 1.7920137643814087, "reward_std": 0.5354746580123901, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.625568151473999, "rewards/wrapped_driving_reward": -0.4585544764995575, "rewards/wrapped_format_reward": 0.625, "step": 394 }, { "completion_length": 500.0, "epoch": 79.0, "grad_norm": 0.38457387685775757, "kl": 0.5373590588569641, "learning_rate": 4.936740530314087e-06, "loss": 0.0215, "reward": 0.1730683445930481, "reward_std": 2.877331495285034, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.7587498426437378, "rewards/wrapped_format_reward": 0.75, "step": 395 }, { "completion_length": 500.0, "epoch": 79.2, "grad_norm": 0.6702293753623962, "kl": 0.557269275188446, "learning_rate": 4.935925161963089e-06, "loss": 0.0223, "reward": 1.288943886756897, "reward_std": 0.6816022396087646, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5984848737716675, "rewards/wrapped_driving_reward": -0.4345410466194153, "rewards/wrapped_format_reward": 0.125, "step": 396 }, { "completion_length": 500.0, "epoch": 79.4, "grad_norm": 8.266891479492188, "kl": 1.4165321588516235, "learning_rate": 4.935104640659959e-06, "loss": 0.0567, "reward": 0.45650458335876465, "reward_std": 3.0426013469696045, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -1.012245535850525, "rewards/wrapped_format_reward": 0.5, "step": 397 }, { "completion_length": 500.0, "epoch": 79.6, "grad_norm": 0.65053790807724, "kl": 0.45794495940208435, "learning_rate": 4.934278968140428e-06, "loss": 0.0183, "reward": -0.4596288204193115, "reward_std": 4.093783855438232, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.5846288204193115, "rewards/wrapped_format_reward": 0.125, "step": 398 }, { "completion_length": 500.0, "epoch": 79.8, "grad_norm": 0.46314430236816406, "kl": 0.5075499415397644, "learning_rate": 4.933448146151122e-06, "loss": 0.0203, "reward": 1.7097347974777222, "reward_std": 0.5429360270500183, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.13401514291763306, "rewards/wrapped_format_reward": 0.125, "step": 399 }, { "completion_length": 500.0, "epoch": 80.0, "grad_norm": 0.6950944066047668, "kl": 0.3538559675216675, "learning_rate": 4.93261217644956e-06, "loss": 0.0142, "reward": 0.39778077602386475, "reward_std": 2.652611255645752, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5687500238418579, "rewards/wrapped_driving_reward": -1.4209691286087036, "rewards/wrapped_format_reward": 0.5, "step": 400 }, { "completion_length": 417.0, "epoch": 80.2, "grad_norm": 0.6640767455101013, "kl": 1.0406416654586792, "learning_rate": 4.931771060804152e-06, "loss": 0.0416, "reward": 2.95090651512146, "reward_std": 0.567731499671936, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.786237359046936, "rewards/wrapped_driving_reward": 0.5396692156791687, "rewards/wrapped_format_reward": 0.625, "step": 401 }, { "completion_length": 500.0, "epoch": 80.4, "grad_norm": 0.7255659103393555, "kl": 0.7492356896400452, "learning_rate": 4.930924800994192e-06, "loss": 0.03, "reward": 0.889411211013794, "reward_std": 3.33335280418396, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5833333134651184, "rewards/wrapped_driving_reward": -0.9439221620559692, "rewards/wrapped_format_reward": 0.5, "step": 402 }, { "completion_length": 500.0, "epoch": 80.6, "grad_norm": 55.34680938720703, "kl": 0.841439425945282, "learning_rate": 4.930073398809857e-06, "loss": 0.0337, "reward": -2.488555908203125, "reward_std": 2.6998631954193115, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.09375, "rewards/wrapped_driving_reward": -2.957305908203125, "rewards/wrapped_format_reward": 0.125, "step": 403 }, { "completion_length": 500.0, "epoch": 80.8, "grad_norm": 0.4021887183189392, "kl": 0.009278661571443081, "learning_rate": 4.929216856052201e-06, "loss": 0.0004, "reward": -0.5718256235122681, "reward_std": 3.405724048614502, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -1.821825623512268, "rewards/wrapped_format_reward": 0.25, "step": 404 }, { "completion_length": 500.0, "epoch": 81.0, "grad_norm": 3.827162981033325, "kl": 0.7532582879066467, "learning_rate": 4.928355174533153e-06, "loss": 0.0301, "reward": 0.27101099491119385, "reward_std": 2.883357048034668, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -1.1352390050888062, "rewards/wrapped_format_reward": 0.25, "step": 405 }, { "completion_length": 500.0, "epoch": 81.2, "grad_norm": 0.6682533025741577, "kl": 0.9012861847877502, "learning_rate": 4.927488356075515e-06, "loss": 0.0361, "reward": 0.09313106536865234, "reward_std": 2.730103015899658, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.0943689346313477, "rewards/wrapped_format_reward": 0.0, "step": 406 }, { "completion_length": 500.0, "epoch": 81.4, "grad_norm": 0.5417198538780212, "kl": 0.41213977336883545, "learning_rate": 4.926616402512952e-06, "loss": 0.0165, "reward": -1.3022674322128296, "reward_std": 2.9187700748443604, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.552267551422119, "rewards/wrapped_format_reward": 0.375, "step": 407 }, { "completion_length": 500.0, "epoch": 81.6, "grad_norm": 1.2818468809127808, "kl": 0.6974972486495972, "learning_rate": 4.925739315689991e-06, "loss": 0.0279, "reward": 2.659097671508789, "reward_std": 0.6468947529792786, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5107142925262451, "rewards/wrapped_driving_reward": 0.39838337898254395, "rewards/wrapped_format_reward": 0.75, "step": 408 }, { "completion_length": 500.0, "epoch": 81.8, "grad_norm": 0.4666033089160919, "kl": 0.2668006122112274, "learning_rate": 4.924857097462023e-06, "loss": 0.0107, "reward": -0.5389512777328491, "reward_std": 4.004843711853027, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4318181872367859, "rewards/wrapped_driving_reward": -1.5957695245742798, "rewards/wrapped_format_reward": 0.125, "step": 409 }, { "completion_length": 500.0, "epoch": 82.0, "grad_norm": 1.2812401056289673, "kl": 0.9626360535621643, "learning_rate": 4.9239697496952904e-06, "loss": 0.0385, "reward": -2.234349489212036, "reward_std": 2.903162717819214, "rewards/mpc_param_extraction_reward": 0.25, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -2.984349489212036, "rewards/wrapped_format_reward": 0.25, "step": 410 }, { "completion_length": 500.0, "epoch": 82.2, "grad_norm": 0.644499659538269, "kl": 0.8354526162147522, "learning_rate": 4.923077274266886e-06, "loss": 0.0334, "reward": 0.48837804794311523, "reward_std": 3.006782293319702, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -0.8866219520568848, "rewards/wrapped_format_reward": 0.25, "step": 411 }, { "completion_length": 500.0, "epoch": 82.4, "grad_norm": 2.690115451812744, "kl": 0.7986117005348206, "learning_rate": 4.922179673064752e-06, "loss": 0.0319, "reward": -0.877657413482666, "reward_std": 3.3253700733184814, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.1666666716337204, "rewards/wrapped_driving_reward": -2.0443239212036133, "rewards/wrapped_format_reward": 0.5, "step": 412 }, { "completion_length": 500.0, "epoch": 82.6, "grad_norm": 0.4054378867149353, "kl": 0.013790788128972054, "learning_rate": 4.921276947987672e-06, "loss": 0.0006, "reward": -0.5940237045288086, "reward_std": 3.6547868251800537, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4375, "rewards/wrapped_driving_reward": -1.6565237045288086, "rewards/wrapped_format_reward": 0.125, "step": 413 }, { "completion_length": 500.0, "epoch": 82.8, "grad_norm": 3.897165060043335, "kl": 1.4948945045471191, "learning_rate": 4.92036910094527e-06, "loss": 0.0598, "reward": 2.222888469696045, "reward_std": 0.5692758560180664, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": 0.0978885143995285, "rewards/wrapped_format_reward": 0.625, "step": 414 }, { "completion_length": 500.0, "epoch": 83.0, "grad_norm": 0.6666826605796814, "kl": 0.5109995603561401, "learning_rate": 4.919456133858003e-06, "loss": 0.0204, "reward": -0.8746355772018433, "reward_std": 3.3271138668060303, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2916666865348816, "rewards/wrapped_driving_reward": -1.7913023233413696, "rewards/wrapped_format_reward": 0.125, "step": 415 }, { "completion_length": 500.0, "epoch": 83.2, "grad_norm": 1.92827570438385, "kl": 1.322735071182251, "learning_rate": 4.91853804865716e-06, "loss": 0.0529, "reward": -0.5941190719604492, "reward_std": 3.9339089393615723, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2291666567325592, "rewards/wrapped_driving_reward": -1.573285698890686, "rewards/wrapped_format_reward": 0.25, "step": 416 }, { "completion_length": 500.0, "epoch": 83.4, "grad_norm": 0.6745986342430115, "kl": 0.505329430103302, "learning_rate": 4.917614847284858e-06, "loss": 0.0202, "reward": -0.9166472554206848, "reward_std": 3.3589961528778076, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3541666865348816, "rewards/wrapped_driving_reward": -2.1458141803741455, "rewards/wrapped_format_reward": 0.375, "step": 417 }, { "completion_length": 500.0, "epoch": 83.6, "grad_norm": 4.199557781219482, "kl": 1.2294397354125977, "learning_rate": 4.916686531694035e-06, "loss": 0.0492, "reward": 1.6677517890930176, "reward_std": 0.404989629983902, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6770833134651184, "rewards/wrapped_driving_reward": -0.009331552311778069, "rewards/wrapped_format_reward": 0.0, "step": 418 }, { "completion_length": 500.0, "epoch": 83.8, "grad_norm": 0.5093013644218445, "kl": 0.5093794465065002, "learning_rate": 4.9157531038484494e-06, "loss": 0.0204, "reward": 0.7348726987838745, "reward_std": 2.8353636264801025, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.59375, "rewards/wrapped_driving_reward": -0.9838773608207703, "rewards/wrapped_format_reward": 0.375, "step": 419 }, { "completion_length": 500.0, "epoch": 84.0, "grad_norm": 0.4214366376399994, "kl": 0.12739227712154388, "learning_rate": 4.914814565722671e-06, "loss": 0.0051, "reward": 0.06370812654495239, "reward_std": 2.8763251304626465, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3166666626930237, "rewards/wrapped_driving_reward": -1.2529585361480713, "rewards/wrapped_format_reward": 0.25, "step": 420 }, { "completion_length": 500.0, "epoch": 84.2, "grad_norm": 0.6959801316261292, "kl": 1.1503578424453735, "learning_rate": 4.913870919302083e-06, "loss": 0.046, "reward": 1.276258945465088, "reward_std": 3.544874668121338, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.5987411737442017, "rewards/wrapped_format_reward": 0.5, "step": 421 }, { "completion_length": 500.0, "epoch": 84.4, "grad_norm": 0.6742568612098694, "kl": 0.5141565799713135, "learning_rate": 4.912922166582874e-06, "loss": 0.0206, "reward": -1.0406144857406616, "reward_std": 3.4172041416168213, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.40625, "rewards/wrapped_driving_reward": -1.946864366531372, "rewards/wrapped_format_reward": 0.0, "step": 422 }, { "completion_length": 500.0, "epoch": 84.6, "grad_norm": 0.6518729329109192, "kl": 0.7472659349441528, "learning_rate": 4.9119683095720325e-06, "loss": 0.0299, "reward": 1.693802833557129, "reward_std": 0.29680135846138, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.45681819319725037, "rewards/wrapped_driving_reward": -0.013015310280025005, "rewards/wrapped_format_reward": 0.25, "step": 423 }, { "completion_length": 500.0, "epoch": 84.8, "grad_norm": 0.7331550121307373, "kl": 0.6410544514656067, "learning_rate": 4.911009350287348e-06, "loss": 0.0256, "reward": 1.389469861984253, "reward_std": 0.43002644181251526, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6306818127632141, "rewards/wrapped_driving_reward": -0.24121199548244476, "rewards/wrapped_format_reward": 0.0, "step": 424 }, { "completion_length": 500.0, "epoch": 85.0, "grad_norm": 0.5071043968200684, "kl": 0.2673094868659973, "learning_rate": 4.910045290757399e-06, "loss": 0.0107, "reward": 1.6472147703170776, "reward_std": 0.5979868173599243, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5372024178504944, "rewards/wrapped_driving_reward": -0.01498757116496563, "rewards/wrapped_format_reward": 0.125, "step": 425 }, { "completion_length": 500.0, "epoch": 85.2, "grad_norm": 0.4037346839904785, "kl": 0.2433103322982788, "learning_rate": 4.909076133021558e-06, "loss": 0.0097, "reward": 1.7711191177368164, "reward_std": 0.41978758573532104, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5687500238418579, "rewards/wrapped_driving_reward": -0.04763093218207359, "rewards/wrapped_format_reward": 0.25, "step": 426 }, { "completion_length": 500.0, "epoch": 85.4, "grad_norm": 0.3942338824272156, "kl": 0.014928624033927917, "learning_rate": 4.908101879129977e-06, "loss": 0.0006, "reward": -2.022028923034668, "reward_std": 1.4208987951278687, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.47904038429260254, "rewards/wrapped_driving_reward": -3.6260693073272705, "rewards/wrapped_format_reward": 0.375, "step": 427 }, { "completion_length": 500.0, "epoch": 85.6, "grad_norm": 0.5229224562644958, "kl": 1.1797394752502441, "learning_rate": 4.907122531143595e-06, "loss": 0.0472, "reward": 2.057708740234375, "reward_std": 0.6911286115646362, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5538690090179443, "rewards/wrapped_driving_reward": 0.25383973121643066, "rewards/wrapped_format_reward": 0.25, "step": 428 }, { "completion_length": 500.0, "epoch": 85.8, "grad_norm": 0.4138140380382538, "kl": 0.8060752749443054, "learning_rate": 4.906138091134118e-06, "loss": 0.0322, "reward": 2.941300868988037, "reward_std": 0.7528426051139832, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6656249761581421, "rewards/wrapped_driving_reward": 0.775675892829895, "rewards/wrapped_format_reward": 0.5, "step": 429 }, { "completion_length": 500.0, "epoch": 86.0, "grad_norm": 1.6084057092666626, "kl": 1.5639433860778809, "learning_rate": 4.905148561184033e-06, "loss": 0.0626, "reward": 0.7152248620986938, "reward_std": 2.8380167484283447, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.543749988079071, "rewards/wrapped_driving_reward": -1.0785250663757324, "rewards/wrapped_format_reward": 0.5, "step": 430 }, { "completion_length": 500.0, "epoch": 86.2, "grad_norm": 0.793965756893158, "kl": 1.6652806997299194, "learning_rate": 4.904153943386588e-06, "loss": 0.0666, "reward": 1.4550268650054932, "reward_std": 3.6481876373291016, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.531818151473999, "rewards/wrapped_driving_reward": -0.5767912268638611, "rewards/wrapped_format_reward": 0.75, "step": 431 }, { "completion_length": 500.0, "epoch": 86.4, "grad_norm": 0.4197085499763489, "kl": 0.388515830039978, "learning_rate": 4.903154239845798e-06, "loss": 0.0155, "reward": 2.14121413230896, "reward_std": 0.5500617027282715, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": 0.42246413230895996, "rewards/wrapped_format_reward": 0.0, "step": 432 }, { "completion_length": 500.0, "epoch": 86.6, "grad_norm": 0.8733661770820618, "kl": 1.6929181814193726, "learning_rate": 4.9021494526764315e-06, "loss": 0.0677, "reward": -1.1920154094696045, "reward_std": 3.250979423522949, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -2.1920154094696045, "rewards/wrapped_format_reward": 0.125, "step": 433 }, { "completion_length": 500.0, "epoch": 86.8, "grad_norm": 0.4056279957294464, "kl": 0.4129205346107483, "learning_rate": 4.901139584004014e-06, "loss": 0.0165, "reward": 0.33355867862701416, "reward_std": 2.93294358253479, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.550000011920929, "rewards/wrapped_driving_reward": -1.09144127368927, "rewards/wrapped_format_reward": 0.125, "step": 434 }, { "completion_length": 500.0, "epoch": 87.0, "grad_norm": 1.0974806547164917, "kl": 1.1918952465057373, "learning_rate": 4.900124635964823e-06, "loss": 0.0477, "reward": -0.3588448762893677, "reward_std": 2.147490978240967, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.45972222089767456, "rewards/wrapped_driving_reward": -1.943567156791687, "rewards/wrapped_format_reward": 0.125, "step": 435 }, { "completion_length": 500.0, "epoch": 87.2, "grad_norm": 0.8155910968780518, "kl": 0.01346185989677906, "learning_rate": 4.899104610705874e-06, "loss": 0.0005, "reward": 0.33636826276779175, "reward_std": 2.900613307952881, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5511363744735718, "rewards/wrapped_driving_reward": -1.0897680521011353, "rewards/wrapped_format_reward": 0.125, "step": 436 }, { "completion_length": 500.0, "epoch": 87.4, "grad_norm": 0.7519292831420898, "kl": 0.420356810092926, "learning_rate": 4.898079510384929e-06, "loss": 0.0168, "reward": 1.5522735118865967, "reward_std": 0.47781333327293396, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6753472089767456, "rewards/wrapped_driving_reward": -0.37307366728782654, "rewards/wrapped_format_reward": 0.25, "step": 437 }, { "completion_length": 500.0, "epoch": 87.6, "grad_norm": 2.0137500762939453, "kl": 1.7654343843460083, "learning_rate": 4.897049337170483e-06, "loss": 0.0706, "reward": 0.5818377137184143, "reward_std": 2.7649831771850586, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.30000001192092896, "rewards/wrapped_driving_reward": -0.8431622385978699, "rewards/wrapped_format_reward": 0.375, "step": 438 }, { "completion_length": 500.0, "epoch": 87.8, "grad_norm": 0.48328566551208496, "kl": 0.9588396549224854, "learning_rate": 4.896014093241763e-06, "loss": 0.0384, "reward": 1.3982526063919067, "reward_std": 0.5576097965240479, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4005681872367859, "rewards/wrapped_driving_reward": -0.0023155699018388987, "rewards/wrapped_format_reward": 0.0, "step": 439 }, { "completion_length": 372.0, "epoch": 88.0, "grad_norm": 0.7031254172325134, "kl": 1.1994409561157227, "learning_rate": 4.894973780788722e-06, "loss": 0.048, "reward": 0.5088822245597839, "reward_std": 3.0980026721954346, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.44999998807907104, "rewards/wrapped_driving_reward": -0.9411178827285767, "rewards/wrapped_format_reward": 0.25, "step": 440 }, { "completion_length": 500.0, "epoch": 88.2, "grad_norm": 2.8440561294555664, "kl": 0.9831880331039429, "learning_rate": 4.8939284020120365e-06, "loss": 0.0393, "reward": 0.980705738067627, "reward_std": 2.253283977508545, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -1.119294285774231, "rewards/wrapped_format_reward": 0.5, "step": 441 }, { "completion_length": 500.0, "epoch": 88.4, "grad_norm": 0.4284060299396515, "kl": 0.8977882862091064, "learning_rate": 4.892877959123097e-06, "loss": 0.0359, "reward": 0.3894255757331848, "reward_std": 2.9320125579833984, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": -0.86057448387146, "rewards/wrapped_format_reward": 0.0, "step": 442 }, { "completion_length": 500.0, "epoch": 88.6, "grad_norm": 0.43213698267936707, "kl": 0.11996592581272125, "learning_rate": 4.89182245434401e-06, "loss": 0.0048, "reward": -1.3073043823242188, "reward_std": 3.148372173309326, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.265625, "rewards/wrapped_driving_reward": -2.0729293823242188, "rewards/wrapped_format_reward": 0.0, "step": 443 }, { "completion_length": 500.0, "epoch": 88.8, "grad_norm": 0.5716166496276855, "kl": 0.3549193739891052, "learning_rate": 4.890761889907589e-06, "loss": 0.0142, "reward": 0.5434389114379883, "reward_std": 3.20829701423645, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.43717533349990845, "rewards/wrapped_driving_reward": -1.0187362432479858, "rewards/wrapped_format_reward": 0.375, "step": 444 }, { "completion_length": 500.0, "epoch": 89.0, "grad_norm": 0.4193194806575775, "kl": 0.20122253894805908, "learning_rate": 4.889696268057349e-06, "loss": 0.008, "reward": -0.5508648753166199, "reward_std": 3.4957611560821533, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.375, "rewards/wrapped_driving_reward": -1.9258649349212646, "rewards/wrapped_format_reward": 0.5, "step": 445 }, { "completion_length": 500.0, "epoch": 89.2, "grad_norm": 0.4390241801738739, "kl": 0.015177929773926735, "learning_rate": 4.888625591047505e-06, "loss": 0.0006, "reward": 1.724879264831543, "reward_std": 0.7149723768234253, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.3999999761581421, "rewards/wrapped_driving_reward": 0.07487919926643372, "rewards/wrapped_format_reward": 0.25, "step": 446 }, { "completion_length": 312.0, "epoch": 89.4, "grad_norm": 2.1881051063537598, "kl": 1.9536460638046265, "learning_rate": 4.887549861142967e-06, "loss": 0.0781, "reward": 0.2170354127883911, "reward_std": 2.8930094242095947, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3794642686843872, "rewards/wrapped_driving_reward": -1.287428855895996, "rewards/wrapped_format_reward": 0.375, "step": 447 }, { "completion_length": 500.0, "epoch": 89.6, "grad_norm": 1.2952624559402466, "kl": 1.0334632396697998, "learning_rate": 4.88646908061933e-06, "loss": 0.0413, "reward": -0.2442917823791504, "reward_std": 4.344462871551514, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.34375, "rewards/wrapped_driving_reward": -1.5880416631698608, "rewards/wrapped_format_reward": 0.5, "step": 448 }, { "completion_length": 500.0, "epoch": 89.8, "grad_norm": 0.4427298605442047, "kl": 0.6207051873207092, "learning_rate": 4.885383251762877e-06, "loss": 0.0248, "reward": 2.1898977756500244, "reward_std": 0.5821254849433899, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7702991366386414, "rewards/wrapped_driving_reward": -0.2054014354944229, "rewards/wrapped_format_reward": 0.625, "step": 449 }, { "completion_length": 500.0, "epoch": 90.0, "grad_norm": 0.5388855934143066, "kl": 1.0115077495574951, "learning_rate": 4.884292376870567e-06, "loss": 0.0405, "reward": 2.037525177001953, "reward_std": 0.5797269344329834, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.643750011920929, "rewards/wrapped_driving_reward": 0.1437750905752182, "rewards/wrapped_format_reward": 0.25, "step": 450 }, { "completion_length": 500.0, "epoch": 90.2, "grad_norm": 1.7166131734848022, "kl": 1.5529717206954956, "learning_rate": 4.883196458250037e-06, "loss": 0.0621, "reward": 2.3594932556152344, "reward_std": 0.8208974599838257, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6888889074325562, "rewards/wrapped_driving_reward": 0.17060428857803345, "rewards/wrapped_format_reward": 0.5, "step": 451 }, { "completion_length": 500.0, "epoch": 90.4, "grad_norm": 0.8398679494857788, "kl": 0.9991617798805237, "learning_rate": 4.8820954982195905e-06, "loss": 0.04, "reward": 1.959275484085083, "reward_std": 0.25932568311691284, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.875, "rewards/wrapped_driving_reward": -0.290724515914917, "rewards/wrapped_format_reward": 0.375, "step": 452 }, { "completion_length": 500.0, "epoch": 90.6, "grad_norm": 0.4421575367450714, "kl": 0.6176390051841736, "learning_rate": 4.880989499108196e-06, "loss": 0.0247, "reward": 0.9777252078056335, "reward_std": 3.386754274368286, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.25, "rewards/wrapped_driving_reward": -0.39727485179901123, "rewards/wrapped_format_reward": 0.375, "step": 453 }, { "completion_length": 500.0, "epoch": 90.8, "grad_norm": 0.6734241247177124, "kl": 0.6196687817573547, "learning_rate": 4.879878463255483e-06, "loss": 0.0248, "reward": 0.035056740045547485, "reward_std": 2.8004674911499023, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.1607142835855484, "rewards/wrapped_driving_reward": -1.125657558441162, "rewards/wrapped_format_reward": 0.25, "step": 454 }, { "completion_length": 500.0, "epoch": 91.0, "grad_norm": 0.432086706161499, "kl": 0.23082022368907928, "learning_rate": 4.878762393011735e-06, "loss": 0.0092, "reward": 0.7312225103378296, "reward_std": 3.202765703201294, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.518750011920929, "rewards/wrapped_driving_reward": -0.9125275611877441, "rewards/wrapped_format_reward": 0.375, "step": 455 }, { "completion_length": 500.0, "epoch": 91.2, "grad_norm": 0.4672556221485138, "kl": 0.2818826138973236, "learning_rate": 4.8776412907378845e-06, "loss": 0.0113, "reward": -1.3701595067977905, "reward_std": 3.0371835231781006, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.21875, "rewards/wrapped_driving_reward": -2.088909387588501, "rewards/wrapped_format_reward": 0.0, "step": 456 }, { "completion_length": 500.0, "epoch": 91.4, "grad_norm": 0.8472755551338196, "kl": 1.7518972158432007, "learning_rate": 4.87651515880551e-06, "loss": 0.0701, "reward": 1.7843856811523438, "reward_std": 0.242258682847023, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7857142686843872, "rewards/wrapped_driving_reward": -0.0013285325840115547, "rewards/wrapped_format_reward": 0.0, "step": 457 }, { "completion_length": 500.0, "epoch": 91.6, "grad_norm": 0.48336735367774963, "kl": 0.4738996624946594, "learning_rate": 4.875383999596828e-06, "loss": 0.019, "reward": 2.511242151260376, "reward_std": 0.3284090459346771, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5847222208976746, "rewards/wrapped_driving_reward": 0.5515199899673462, "rewards/wrapped_format_reward": 0.375, "step": 458 }, { "completion_length": 500.0, "epoch": 91.8, "grad_norm": 0.6792411208152771, "kl": 1.5720727443695068, "learning_rate": 4.874247815504693e-06, "loss": 0.0629, "reward": 2.0669398307800293, "reward_std": 0.740801990032196, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6636363863945007, "rewards/wrapped_driving_reward": 0.1533033847808838, "rewards/wrapped_format_reward": 0.25, "step": 459 }, { "completion_length": 500.0, "epoch": 92.0, "grad_norm": 1.1329480409622192, "kl": 1.284619927406311, "learning_rate": 4.873106608932585e-06, "loss": 0.0514, "reward": 0.6621590852737427, "reward_std": 1.5119678974151611, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8068181872367859, "rewards/wrapped_driving_reward": -1.2696590423583984, "rewards/wrapped_format_reward": 0.125, "step": 460 }, { "completion_length": 500.0, "epoch": 92.2, "grad_norm": 3.491309642791748, "kl": 1.2074986696243286, "learning_rate": 4.871960382294611e-06, "loss": 0.0483, "reward": 1.2318992614746094, "reward_std": 0.664622962474823, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4752747416496277, "rewards/wrapped_driving_reward": -0.4933754801750183, "rewards/wrapped_format_reward": 0.25, "step": 461 }, { "completion_length": 500.0, "epoch": 92.4, "grad_norm": 0.6012648940086365, "kl": 0.7964257001876831, "learning_rate": 4.870809138015499e-06, "loss": 0.0319, "reward": 0.7039540410041809, "reward_std": 3.1964340209960938, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5625, "rewards/wrapped_driving_reward": -0.9835458993911743, "rewards/wrapped_format_reward": 0.375, "step": 462 }, { "completion_length": 500.0, "epoch": 92.6, "grad_norm": 0.7076395153999329, "kl": 1.378025770187378, "learning_rate": 4.869652878530586e-06, "loss": 0.0551, "reward": 1.9368897676467896, "reward_std": 0.9331097602844238, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6136363744735718, "rewards/wrapped_driving_reward": -0.17674663662910461, "rewards/wrapped_format_reward": 0.5, "step": 463 }, { "completion_length": 500.0, "epoch": 92.8, "grad_norm": 0.37601929903030396, "kl": 0.07725519686937332, "learning_rate": 4.868491606285823e-06, "loss": 0.0031, "reward": 2.537005662918091, "reward_std": 0.17966555058956146, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6748737096786499, "rewards/wrapped_driving_reward": 0.7371318340301514, "rewards/wrapped_format_reward": 0.125, "step": 464 }, { "completion_length": 500.0, "epoch": 93.0, "grad_norm": 0.3961866796016693, "kl": 0.06434271484613419, "learning_rate": 4.867325323737765e-06, "loss": 0.0026, "reward": -0.08214747905731201, "reward_std": 2.7034084796905518, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4636363685131073, "rewards/wrapped_driving_reward": -1.4207837581634521, "rewards/wrapped_format_reward": 0.125, "step": 465 }, { "completion_length": 500.0, "epoch": 93.2, "grad_norm": 0.515503466129303, "kl": 0.7670941948890686, "learning_rate": 4.866154033353561e-06, "loss": 0.0307, "reward": 0.5504806041717529, "reward_std": 3.077503204345703, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42045456171035767, "rewards/wrapped_driving_reward": -1.11997389793396, "rewards/wrapped_format_reward": 0.5, "step": 466 }, { "completion_length": 500.0, "epoch": 93.4, "grad_norm": 0.4547137916088104, "kl": 1.259108543395996, "learning_rate": 4.864977737610959e-06, "loss": 0.0504, "reward": 2.1892905235290527, "reward_std": 0.13488423824310303, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.628125011920929, "rewards/wrapped_driving_reward": 0.18616540729999542, "rewards/wrapped_format_reward": 0.375, "step": 467 }, { "completion_length": 500.0, "epoch": 93.6, "grad_norm": 0.39233630895614624, "kl": 0.8455955982208252, "learning_rate": 4.863796438998293e-06, "loss": 0.0338, "reward": 1.7829570770263672, "reward_std": 1.168465495109558, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.620707094669342, "rewards/wrapped_driving_reward": -0.3377498984336853, "rewards/wrapped_format_reward": 0.5, "step": 468 }, { "completion_length": 500.0, "epoch": 93.8, "grad_norm": 0.4378278851509094, "kl": 0.7639008164405823, "learning_rate": 4.862610140014479e-06, "loss": 0.0306, "reward": 2.175501823425293, "reward_std": 0.711540937423706, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6541666388511658, "rewards/wrapped_driving_reward": 0.2713351547718048, "rewards/wrapped_format_reward": 0.25, "step": 469 }, { "completion_length": 500.0, "epoch": 94.0, "grad_norm": 0.37471047043800354, "kl": 0.33596715331077576, "learning_rate": 4.861418843169012e-06, "loss": 0.0134, "reward": 1.253250002861023, "reward_std": 0.31675300002098083, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.71875, "rewards/wrapped_driving_reward": -0.46549999713897705, "rewards/wrapped_format_reward": 0.0, "step": 470 }, { "completion_length": 500.0, "epoch": 94.2, "grad_norm": 0.41066431999206543, "kl": 0.7373389005661011, "learning_rate": 4.860222550981961e-06, "loss": 0.0295, "reward": 1.3983137607574463, "reward_std": 3.6222870349884033, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.625, "rewards/wrapped_driving_reward": -0.4766860902309418, "rewards/wrapped_format_reward": 0.5, "step": 471 }, { "completion_length": 500.0, "epoch": 94.4, "grad_norm": 0.5545728802680969, "kl": 1.111506462097168, "learning_rate": 4.859021265983959e-06, "loss": 0.0445, "reward": 0.44755399227142334, "reward_std": 3.0138490200042725, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4422348737716675, "rewards/wrapped_driving_reward": -0.9946808815002441, "rewards/wrapped_format_reward": 0.25, "step": 472 }, { "completion_length": 340.0, "epoch": 94.6, "grad_norm": 0.5120311379432678, "kl": 1.3492473363876343, "learning_rate": 4.8578149907162035e-06, "loss": 0.054, "reward": 2.156475067138672, "reward_std": 0.6874570846557617, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6458333134651184, "rewards/wrapped_driving_reward": 0.010641898959875107, "rewards/wrapped_format_reward": 0.5, "step": 473 }, { "completion_length": 500.0, "epoch": 94.8, "grad_norm": 0.44791045784950256, "kl": 0.6434275507926941, "learning_rate": 4.856603727730446e-06, "loss": 0.0257, "reward": -1.7244491577148438, "reward_std": 2.2597899436950684, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3499999940395355, "rewards/wrapped_driving_reward": -3.074449062347412, "rewards/wrapped_format_reward": 0.25, "step": 474 }, { "completion_length": 500.0, "epoch": 95.0, "grad_norm": 0.8485005497932434, "kl": 1.348606824874878, "learning_rate": 4.855387479588991e-06, "loss": 0.0539, "reward": 1.8527848720550537, "reward_std": 0.7476688623428345, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.4541666507720947, "rewards/wrapped_driving_reward": 0.023618236184120178, "rewards/wrapped_format_reward": 0.375, "step": 475 }, { "completion_length": 500.0, "epoch": 95.2, "grad_norm": 0.3853321671485901, "kl": 0.24151970446109772, "learning_rate": 4.854166248864689e-06, "loss": 0.0097, "reward": 0.4717317819595337, "reward_std": 2.6661465167999268, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.5151515007019043, "rewards/wrapped_driving_reward": -1.2934197187423706, "rewards/wrapped_format_reward": 0.5, "step": 476 }, { "completion_length": 500.0, "epoch": 95.4, "grad_norm": 0.5518341660499573, "kl": 0.753800630569458, "learning_rate": 4.852940038140927e-06, "loss": 0.0302, "reward": 1.9437934160232544, "reward_std": 0.5748332142829895, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.8422619104385376, "rewards/wrapped_driving_reward": -0.2734684646129608, "rewards/wrapped_format_reward": 0.375, "step": 477 }, { "completion_length": 500.0, "epoch": 95.6, "grad_norm": 0.5776206851005554, "kl": 0.9822722673416138, "learning_rate": 4.851708850011631e-06, "loss": 0.0393, "reward": 0.7107340097427368, "reward_std": 3.164551258087158, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.3901515305042267, "rewards/wrapped_driving_reward": -0.8044174909591675, "rewards/wrapped_format_reward": 0.375, "step": 478 }, { "completion_length": 500.0, "epoch": 95.8, "grad_norm": 3.6034088134765625, "kl": 1.1710273027420044, "learning_rate": 4.850472687081253e-06, "loss": 0.0468, "reward": 0.6904103755950928, "reward_std": 3.202547311782837, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.6590908765792847, "rewards/wrapped_driving_reward": -1.218680500984192, "rewards/wrapped_format_reward": 0.5, "step": 479 }, { "completion_length": 500.0, "epoch": 96.0, "grad_norm": 0.45567023754119873, "kl": 0.40337690711021423, "learning_rate": 4.849231551964771e-06, "loss": 0.0161, "reward": -0.5991268754005432, "reward_std": 3.64422869682312, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.2604166865348816, "rewards/wrapped_driving_reward": -1.6095435619354248, "rewards/wrapped_format_reward": 0.25, "step": 480 }, { "completion_length": 500.0, "epoch": 96.2, "grad_norm": 0.5505725741386414, "kl": 0.7656367421150208, "learning_rate": 4.847985447287681e-06, "loss": 0.0306, "reward": 2.3976004123687744, "reward_std": 0.4736846089363098, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7408459782600403, "rewards/wrapped_driving_reward": 0.03175440803170204, "rewards/wrapped_format_reward": 0.625, "step": 481 }, { "completion_length": 500.0, "epoch": 96.4, "grad_norm": 0.45343244075775146, "kl": 0.6199319958686829, "learning_rate": 4.846734375685989e-06, "loss": 0.0248, "reward": 2.3475000858306885, "reward_std": 0.8652529120445251, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6261904835700989, "rewards/wrapped_driving_reward": 0.3463096022605896, "rewards/wrapped_format_reward": 0.375, "step": 482 }, { "completion_length": 500.0, "epoch": 96.6, "grad_norm": 0.3729810416698456, "kl": 0.47601577639579773, "learning_rate": 4.845478339806211e-06, "loss": 0.019, "reward": 0.2215256690979004, "reward_std": 2.8646438121795654, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.38749998807907104, "rewards/wrapped_driving_reward": -1.2909743785858154, "rewards/wrapped_format_reward": 0.375, "step": 483 }, { "completion_length": 500.0, "epoch": 96.8, "grad_norm": 0.43768781423568726, "kl": 0.5539119839668274, "learning_rate": 4.844217342305363e-06, "loss": 0.0222, "reward": -2.151515007019043, "reward_std": 0.9406149983406067, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4734848737716675, "rewards/wrapped_driving_reward": -4.0, "rewards/wrapped_format_reward": 0.625, "step": 484 }, { "completion_length": 500.0, "epoch": 97.0, "grad_norm": 1.7781202793121338, "kl": 2.4269611835479736, "learning_rate": 4.842951385850957e-06, "loss": 0.0971, "reward": 1.8921316862106323, "reward_std": 0.6051745414733887, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.5, "rewards/wrapped_driving_reward": 0.017131807282567024, "rewards/wrapped_format_reward": 0.375, "step": 485 }, { "completion_length": 500.0, "epoch": 97.2, "grad_norm": 1.187008023262024, "kl": 1.609662652015686, "learning_rate": 4.841680473120994e-06, "loss": 0.0644, "reward": 1.960210919380188, "reward_std": 0.6634494066238403, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.53125, "rewards/wrapped_driving_reward": 0.05396085977554321, "rewards/wrapped_format_reward": 0.375, "step": 486 }, { "completion_length": 500.0, "epoch": 97.4, "grad_norm": 0.5124778151512146, "kl": 0.3628022372722626, "learning_rate": 4.840404606803963e-06, "loss": 0.0145, "reward": 0.39914047718048096, "reward_std": 2.975532054901123, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4023042917251587, "rewards/wrapped_driving_reward": -1.2531636953353882, "rewards/wrapped_format_reward": 0.5, "step": 487 }, { "completion_length": 500.0, "epoch": 97.6, "grad_norm": 0.6879832744598389, "kl": 0.7469982504844666, "learning_rate": 4.839123789598829e-06, "loss": 0.0299, "reward": 0.6401175260543823, "reward_std": 0.6992170214653015, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6193910241127014, "rewards/wrapped_driving_reward": -1.2292735576629639, "rewards/wrapped_format_reward": 0.25, "step": 488 }, { "completion_length": 399.0, "epoch": 97.8, "grad_norm": 0.6237967610359192, "kl": 1.441658616065979, "learning_rate": 4.83783802421503e-06, "loss": 0.0577, "reward": 2.1392810344696045, "reward_std": 0.6474350690841675, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6000000238418579, "rewards/wrapped_driving_reward": -0.0857190191745758, "rewards/wrapped_format_reward": 0.625, "step": 489 }, { "completion_length": 500.0, "epoch": 98.0, "grad_norm": 0.4759804904460907, "kl": 0.377165824174881, "learning_rate": 4.836547313372472e-06, "loss": 0.0151, "reward": -1.1276339292526245, "reward_std": 3.3167312145233154, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.19696970283985138, "rewards/wrapped_driving_reward": -1.9496036767959595, "rewards/wrapped_format_reward": 0.125, "step": 490 }, { "completion_length": 500.0, "epoch": 98.2, "grad_norm": 0.5299642086029053, "kl": 1.3804445266723633, "learning_rate": 4.835251659801522e-06, "loss": 0.0552, "reward": -1.6679158210754395, "reward_std": 0.3146505653858185, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.7202380895614624, "rewards/wrapped_driving_reward": -3.8881540298461914, "rewards/wrapped_format_reward": 0.5, "step": 491 }, { "completion_length": 500.0, "epoch": 98.4, "grad_norm": 0.8449365496635437, "kl": 0.88936448097229, "learning_rate": 4.833951066243004e-06, "loss": 0.0356, "reward": -0.9219812154769897, "reward_std": 3.307687282562256, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.3541666567325592, "rewards/wrapped_driving_reward": -2.0261478424072266, "rewards/wrapped_format_reward": 0.25, "step": 492 }, { "completion_length": 500.0, "epoch": 98.6, "grad_norm": 0.49737241864204407, "kl": 0.24093657732009888, "learning_rate": 4.832645535448193e-06, "loss": 0.0096, "reward": -0.3710002899169922, "reward_std": 3.9087464809417725, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.4464285671710968, "rewards/wrapped_driving_reward": -1.692428708076477, "rewards/wrapped_format_reward": 0.375, "step": 493 }, { "completion_length": 500.0, "epoch": 98.8, "grad_norm": 0.548768937587738, "kl": 0.6372228264808655, "learning_rate": 4.8313350701788054e-06, "loss": 0.0255, "reward": 0.4414939880371094, "reward_std": 2.9852190017700195, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.4437499940395355, "rewards/wrapped_driving_reward": -1.0022560358047485, "rewards/wrapped_format_reward": 0.25, "step": 494 }, { "completion_length": 500.0, "epoch": 99.0, "grad_norm": 0.5245958566665649, "kl": 0.9060522317886353, "learning_rate": 4.830019673206997e-06, "loss": 0.0362, "reward": 1.7761955261230469, "reward_std": 0.7908852696418762, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6812500357627869, "rewards/wrapped_driving_reward": -0.1550544649362564, "rewards/wrapped_format_reward": 0.25, "step": 495 }, { "completion_length": 500.0, "epoch": 99.2, "grad_norm": 0.5090546607971191, "kl": 0.9043675661087036, "learning_rate": 4.828699347315357e-06, "loss": 0.0362, "reward": 0.39988845586776733, "reward_std": 2.9460675716400146, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.369047611951828, "rewards/wrapped_driving_reward": -0.8441591262817383, "rewards/wrapped_format_reward": 0.125, "step": 496 }, { "completion_length": 500.0, "epoch": 99.4, "grad_norm": 0.7547634840011597, "kl": 0.936507523059845, "learning_rate": 4.8273740952969e-06, "loss": 0.0375, "reward": -0.501820981502533, "reward_std": 3.7566311359405518, "rewards/mpc_param_extraction_reward": 0.5, "rewards/mpc_param_name_reward": 0.328125, "rewards/wrapped_driving_reward": -1.5799460411071777, "rewards/wrapped_format_reward": 0.25, "step": 497 }, { "completion_length": 500.0, "epoch": 99.6, "grad_norm": 0.6466436982154846, "kl": 1.7943109273910522, "learning_rate": 4.826043919955062e-06, "loss": 0.0718, "reward": 2.4197564125061035, "reward_std": 0.2094937115907669, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.800000011920929, "rewards/wrapped_driving_reward": -0.0052434951066970825, "rewards/wrapped_format_reward": 0.625, "step": 498 }, { "completion_length": 500.0, "epoch": 99.8, "grad_norm": 0.42996838688850403, "kl": 0.36297762393951416, "learning_rate": 4.824708824103694e-06, "loss": 0.0145, "reward": -0.16156989336013794, "reward_std": 2.625190019607544, "rewards/mpc_param_extraction_reward": 0.75, "rewards/mpc_param_name_reward": 0.42500001192092896, "rewards/wrapped_driving_reward": -1.586569905281067, "rewards/wrapped_format_reward": 0.25, "step": 499 }, { "completion_length": 500.0, "epoch": 100.0, "grad_norm": 0.8012004494667053, "kl": 1.8889646530151367, "learning_rate": 4.823368810567056e-06, "loss": 0.0756, "reward": 0.8908319473266602, "reward_std": 2.290397882461548, "rewards/mpc_param_extraction_reward": 1.0, "rewards/mpc_param_name_reward": 0.6818181872367859, "rewards/wrapped_driving_reward": -0.9159862399101257, "rewards/wrapped_format_reward": 0.125, "step": 500 } ], "logging_steps": 1, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 480, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }