nibauman's picture
Upload folder using huggingface_hub
a977566 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 500,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 500.0,
"epoch": 0.2,
"grad_norm": 144.23707580566406,
"kl": 51.48179244995117,
"learning_rate": 3.1250000000000005e-08,
"loss": 2.0593,
"reward": 0.9761996865272522,
"reward_std": 3.3251326084136963,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.148800253868103,
"rewards/wrapped_format_reward": 0.625,
"step": 1
},
{
"completion_length": 500.0,
"epoch": 0.4,
"grad_norm": 974.2139892578125,
"kl": 216.24957275390625,
"learning_rate": 6.250000000000001e-08,
"loss": 8.65,
"reward": -3.75,
"reward_std": 0.5,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.25,
"step": 2
},
{
"completion_length": 500.0,
"epoch": 0.6,
"grad_norm": 41697.91015625,
"kl": 3837.41943359375,
"learning_rate": 9.375e-08,
"loss": 153.4967,
"reward": -0.7961921691894531,
"reward_std": 3.700653076171875,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.4375,
"rewards/wrapped_driving_reward": -1.7336921691894531,
"rewards/wrapped_format_reward": 0.0,
"step": 3
},
{
"completion_length": 500.0,
"epoch": 0.8,
"grad_norm": 10122959.0,
"kl": 511094.90625,
"learning_rate": 1.2500000000000002e-07,
"loss": 20443.7988,
"reward": -2.338921546936035,
"reward_std": 3.322157144546509,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.838921546936035,
"rewards/wrapped_format_reward": 0.0,
"step": 4
},
{
"completion_length": 500.0,
"epoch": 1.0,
"grad_norm": 100702232.0,
"kl": 5416315.0,
"learning_rate": 1.5625e-07,
"loss": 216652.5938,
"reward": -0.16450506448745728,
"reward_std": 3.8515079021453857,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.6645050048828125,
"rewards/wrapped_format_reward": 0.5,
"step": 5
},
{
"completion_length": 500.0,
"epoch": 1.2,
"grad_norm": 17.33243751525879,
"kl": 7.672175884246826,
"learning_rate": 1.875e-07,
"loss": 0.3069,
"reward": 0.9893605709075928,
"reward_std": 1.5257619619369507,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -1.3856394290924072,
"rewards/wrapped_format_reward": 0.375,
"step": 6
},
{
"completion_length": 500.0,
"epoch": 1.4,
"grad_norm": 70.34513092041016,
"kl": 17.917146682739258,
"learning_rate": 2.1875e-07,
"loss": 0.7167,
"reward": 1.2267450094223022,
"reward_std": 3.4932949542999268,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.5232549905776978,
"rewards/wrapped_format_reward": 0.25,
"step": 7
},
{
"completion_length": 500.0,
"epoch": 1.6,
"grad_norm": 69448.7734375,
"kl": 9786.2802734375,
"learning_rate": 2.5000000000000004e-07,
"loss": 391.4512,
"reward": -0.967779815196991,
"reward_std": 3.5180184841156006,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.2177798748016357,
"rewards/wrapped_format_reward": 0.25,
"step": 8
},
{
"completion_length": 500.0,
"epoch": 1.8,
"grad_norm": 7205431.5,
"kl": 363326.15625,
"learning_rate": 2.8125e-07,
"loss": 14533.0439,
"reward": -0.4434952139854431,
"reward_std": 4.112156867980957,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.5684951543807983,
"rewards/wrapped_format_reward": 0.125,
"step": 9
},
{
"completion_length": 500.0,
"epoch": 2.0,
"grad_norm": 1344.39306640625,
"kl": 182.57179260253906,
"learning_rate": 3.125e-07,
"loss": 7.3029,
"reward": -0.5283111929893494,
"reward_std": 3.7256903648376465,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.6533112525939941,
"rewards/wrapped_format_reward": 0.125,
"step": 10
},
{
"completion_length": 500.0,
"epoch": 2.2,
"grad_norm": 663.510498046875,
"kl": 125.65680694580078,
"learning_rate": 3.4375000000000004e-07,
"loss": 5.0263,
"reward": -2.449397563934326,
"reward_std": 3.1012051105499268,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.949397563934326,
"rewards/wrapped_format_reward": 0.0,
"step": 11
},
{
"completion_length": 500.0,
"epoch": 2.4,
"grad_norm": 31.331226348876953,
"kl": 10.755382537841797,
"learning_rate": 3.75e-07,
"loss": 0.4302,
"reward": -2.1561226844787598,
"reward_std": 3.36269211769104,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.7811226844787598,
"rewards/wrapped_format_reward": 0.125,
"step": 12
},
{
"completion_length": 500.0,
"epoch": 2.6,
"grad_norm": 10.003079414367676,
"kl": 3.8625946044921875,
"learning_rate": 4.0625000000000003e-07,
"loss": 0.1545,
"reward": -2.1760454177856445,
"reward_std": 3.647908926010132,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.8010454177856445,
"rewards/wrapped_format_reward": 0.125,
"step": 13
},
{
"completion_length": 500.0,
"epoch": 2.8,
"grad_norm": 62.2325325012207,
"kl": 13.510702133178711,
"learning_rate": 4.375e-07,
"loss": 0.5404,
"reward": -2.294018030166626,
"reward_std": 3.0876402854919434,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.044018030166626,
"rewards/wrapped_format_reward": 0.25,
"step": 14
},
{
"completion_length": 500.0,
"epoch": 3.0,
"grad_norm": 96.1074447631836,
"kl": 10.679292678833008,
"learning_rate": 4.6875000000000006e-07,
"loss": 0.4272,
"reward": -0.6371059417724609,
"reward_std": 3.885227680206299,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.637105941772461,
"rewards/wrapped_format_reward": 0.0,
"step": 15
},
{
"completion_length": 500.0,
"epoch": 3.2,
"grad_norm": 1650.8782958984375,
"kl": 208.3596954345703,
"learning_rate": 5.000000000000001e-07,
"loss": 8.3344,
"reward": -4.0,
"reward_std": 0.0,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.0,
"step": 16
},
{
"completion_length": 500.0,
"epoch": 3.4,
"grad_norm": 17.093393325805664,
"kl": 5.905396461486816,
"learning_rate": 5.3125e-07,
"loss": 0.2362,
"reward": -2.4352118968963623,
"reward_std": 2.806159257888794,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.060211658477783,
"rewards/wrapped_format_reward": 0.125,
"step": 17
},
{
"completion_length": 500.0,
"epoch": 3.6,
"grad_norm": 78087.4140625,
"kl": 7675.3564453125,
"learning_rate": 5.625e-07,
"loss": 307.0142,
"reward": -0.4786604046821594,
"reward_std": 4.071903228759766,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8536603450775146,
"rewards/wrapped_format_reward": 0.375,
"step": 18
},
{
"completion_length": 500.0,
"epoch": 3.8,
"grad_norm": 2062.067626953125,
"kl": 105.56303405761719,
"learning_rate": 5.9375e-07,
"loss": 4.2225,
"reward": -0.4283701777458191,
"reward_std": 4.145442008972168,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8033702373504639,
"rewards/wrapped_format_reward": 0.375,
"step": 19
},
{
"completion_length": 500.0,
"epoch": 4.0,
"grad_norm": 513992.65625,
"kl": 39077.08984375,
"learning_rate": 6.25e-07,
"loss": 1563.0836,
"reward": -2.0327651500701904,
"reward_std": 3.3016297817230225,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.7827651500701904,
"rewards/wrapped_format_reward": 0.25,
"step": 20
},
{
"completion_length": 500.0,
"epoch": 4.2,
"grad_norm": 271.5398254394531,
"kl": 37.08869934082031,
"learning_rate": 6.562500000000001e-07,
"loss": 1.4835,
"reward": -3.5,
"reward_std": 0.5773502588272095,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.5,
"step": 21
},
{
"completion_length": 500.0,
"epoch": 4.4,
"grad_norm": 33774.453125,
"kl": 4115.1591796875,
"learning_rate": 6.875000000000001e-07,
"loss": 164.6064,
"reward": -1.1355788707733154,
"reward_std": 3.42315673828125,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.5105788707733154,
"rewards/wrapped_format_reward": 0.375,
"step": 22
},
{
"completion_length": 500.0,
"epoch": 4.6,
"grad_norm": 52.09832000732422,
"kl": 14.069100379943848,
"learning_rate": 7.1875e-07,
"loss": 0.5628,
"reward": 0.9047523736953735,
"reward_std": 3.2798702716827393,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.5952475666999817,
"rewards/wrapped_format_reward": 0.0,
"step": 23
},
{
"completion_length": 500.0,
"epoch": 4.8,
"grad_norm": 75.83870697021484,
"kl": 16.262989044189453,
"learning_rate": 7.5e-07,
"loss": 0.6505,
"reward": -0.5572073459625244,
"reward_std": 2.9710958003997803,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.0572073459625244,
"rewards/wrapped_format_reward": 0.0,
"step": 24
},
{
"completion_length": 500.0,
"epoch": 5.0,
"grad_norm": 8.57257080078125,
"kl": 3.3865182399749756,
"learning_rate": 7.8125e-07,
"loss": 0.1355,
"reward": -2.110412120819092,
"reward_std": 3.7791755199432373,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.860412120819092,
"rewards/wrapped_format_reward": 0.25,
"step": 25
},
{
"completion_length": 500.0,
"epoch": 5.2,
"grad_norm": 978403.0,
"kl": 89647.2578125,
"learning_rate": 8.125000000000001e-07,
"loss": 3585.8899,
"reward": -0.2944529056549072,
"reward_std": 4.2804718017578125,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.6694529056549072,
"rewards/wrapped_format_reward": 0.375,
"step": 26
},
{
"completion_length": 500.0,
"epoch": 5.4,
"grad_norm": 168.73036193847656,
"kl": 29.724079132080078,
"learning_rate": 8.437500000000001e-07,
"loss": 1.189,
"reward": -0.6913368701934814,
"reward_std": 3.5795211791992188,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.0663368701934814,
"rewards/wrapped_format_reward": 0.375,
"step": 27
},
{
"completion_length": 500.0,
"epoch": 5.6,
"grad_norm": 75324.5234375,
"kl": 7936.74267578125,
"learning_rate": 8.75e-07,
"loss": 317.4697,
"reward": 0.9355948567390442,
"reward_std": 3.3464736938476562,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.6894051432609558,
"rewards/wrapped_format_reward": 0.125,
"step": 28
},
{
"completion_length": 500.0,
"epoch": 5.8,
"grad_norm": 469.9671630859375,
"kl": 71.2878189086914,
"learning_rate": 9.0625e-07,
"loss": 2.8515,
"reward": -2.547950267791748,
"reward_std": 2.904099464416504,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.047950267791748,
"rewards/wrapped_format_reward": 0.0,
"step": 29
},
{
"completion_length": 500.0,
"epoch": 6.0,
"grad_norm": 36.652645111083984,
"kl": 13.148932456970215,
"learning_rate": 9.375000000000001e-07,
"loss": 0.526,
"reward": -3.375,
"reward_std": 0.9464846849441528,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 30
},
{
"completion_length": 500.0,
"epoch": 6.2,
"grad_norm": 30.199684143066406,
"kl": 9.480849266052246,
"learning_rate": 9.6875e-07,
"loss": 0.3792,
"reward": -2.0530290603637695,
"reward_std": 3.2615222930908203,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.9280290603637695,
"rewards/wrapped_format_reward": 0.375,
"step": 31
},
{
"completion_length": 500.0,
"epoch": 6.4,
"grad_norm": 78.3298568725586,
"kl": 26.2161865234375,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0486,
"reward": 1.33430814743042,
"reward_std": 3.5583572387695312,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.41569197177886963,
"rewards/wrapped_format_reward": 0.25,
"step": 32
},
{
"completion_length": 500.0,
"epoch": 6.6,
"grad_norm": 19.472774505615234,
"kl": 7.02009391784668,
"learning_rate": 1.03125e-06,
"loss": 0.2808,
"reward": -0.6657888889312744,
"reward_std": 3.853987216949463,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.6657888889312744,
"rewards/wrapped_format_reward": 0.0,
"step": 33
},
{
"completion_length": 500.0,
"epoch": 6.8,
"grad_norm": 65.95396423339844,
"kl": 18.14912223815918,
"learning_rate": 1.0625e-06,
"loss": 0.726,
"reward": -0.8661626577377319,
"reward_std": 3.6189870834350586,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.1161625385284424,
"rewards/wrapped_format_reward": 0.25,
"step": 34
},
{
"completion_length": 500.0,
"epoch": 7.0,
"grad_norm": 1.4008393287658691,
"kl": 0.8411699533462524,
"learning_rate": 1.0937500000000001e-06,
"loss": 0.0336,
"reward": -0.38928359746932983,
"reward_std": 4.171046257019043,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.6392836570739746,
"rewards/wrapped_format_reward": 0.25,
"step": 35
},
{
"completion_length": 500.0,
"epoch": 7.2,
"grad_norm": 7.80991268157959,
"kl": 4.218427658081055,
"learning_rate": 1.125e-06,
"loss": 0.1687,
"reward": -1.1417465209960938,
"reward_std": 3.381352186203003,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.1417465209960938,
"rewards/wrapped_format_reward": 0.0,
"step": 36
},
{
"completion_length": 500.0,
"epoch": 7.4,
"grad_norm": 4.478097438812256,
"kl": 2.112290143966675,
"learning_rate": 1.1562500000000002e-06,
"loss": 0.0845,
"reward": -2.102426528930664,
"reward_std": 3.1373467445373535,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.852426528930664,
"rewards/wrapped_format_reward": 0.25,
"step": 37
},
{
"completion_length": 500.0,
"epoch": 7.6,
"grad_norm": 9.182758331298828,
"kl": 4.63405179977417,
"learning_rate": 1.1875e-06,
"loss": 0.1854,
"reward": -2.2051236629486084,
"reward_std": 3.2649383544921875,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.9551236629486084,
"rewards/wrapped_format_reward": 0.25,
"step": 38
},
{
"completion_length": 500.0,
"epoch": 7.8,
"grad_norm": 203072.515625,
"kl": 20056.82421875,
"learning_rate": 1.21875e-06,
"loss": 802.2729,
"reward": -2.125791311264038,
"reward_std": 3.0907511711120605,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.000791311264038,
"rewards/wrapped_format_reward": 0.375,
"step": 39
},
{
"completion_length": 500.0,
"epoch": 8.0,
"grad_norm": 5.843477725982666,
"kl": 3.436691999435425,
"learning_rate": 1.25e-06,
"loss": 0.1375,
"reward": -0.9351435899734497,
"reward_std": 3.608586072921753,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.3101437091827393,
"rewards/wrapped_format_reward": 0.375,
"step": 40
},
{
"completion_length": 500.0,
"epoch": 8.2,
"grad_norm": 7742784.5,
"kl": 650933.375,
"learning_rate": 1.28125e-06,
"loss": 26037.334,
"reward": 1.5518302917480469,
"reward_std": 3.7290942668914795,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.4481697678565979,
"rewards/wrapped_format_reward": 0.5,
"step": 41
},
{
"completion_length": 500.0,
"epoch": 8.4,
"grad_norm": 8.937725067138672,
"kl": 3.0701639652252197,
"learning_rate": 1.3125000000000001e-06,
"loss": 0.1228,
"reward": -2.1904397010803223,
"reward_std": 3.2942306995391846,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.0654397010803223,
"rewards/wrapped_format_reward": 0.375,
"step": 42
},
{
"completion_length": 500.0,
"epoch": 8.6,
"grad_norm": 14.36681079864502,
"kl": 5.88793420791626,
"learning_rate": 1.34375e-06,
"loss": 0.2355,
"reward": 0.6519123315811157,
"reward_std": 3.105113983154297,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.973087728023529,
"rewards/wrapped_format_reward": 0.125,
"step": 43
},
{
"completion_length": 500.0,
"epoch": 8.8,
"grad_norm": 180.1724395751953,
"kl": 33.255760192871094,
"learning_rate": 1.3750000000000002e-06,
"loss": 1.3302,
"reward": -0.6511552333831787,
"reward_std": 3.9030916690826416,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.7761552333831787,
"rewards/wrapped_format_reward": 0.125,
"step": 44
},
{
"completion_length": 500.0,
"epoch": 9.0,
"grad_norm": 32.81709671020508,
"kl": 6.7791428565979,
"learning_rate": 1.40625e-06,
"loss": 0.2712,
"reward": -3.875,
"reward_std": 0.25,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 45
},
{
"completion_length": 500.0,
"epoch": 9.2,
"grad_norm": 4151.640625,
"kl": 350.8572082519531,
"learning_rate": 1.4375e-06,
"loss": 14.0343,
"reward": -1.1032943725585938,
"reward_std": 3.3453028202056885,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.1032943725585938,
"rewards/wrapped_format_reward": 0.0,
"step": 46
},
{
"completion_length": 500.0,
"epoch": 9.4,
"grad_norm": 15.409821510314941,
"kl": 5.346187114715576,
"learning_rate": 1.4687500000000001e-06,
"loss": 0.2138,
"reward": -1.1791430711746216,
"reward_std": 2.8540005683898926,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.929143190383911,
"rewards/wrapped_format_reward": 0.25,
"step": 47
},
{
"completion_length": 500.0,
"epoch": 9.6,
"grad_norm": 144.20443725585938,
"kl": 23.608051300048828,
"learning_rate": 1.5e-06,
"loss": 0.9443,
"reward": -2.530029058456421,
"reward_std": 2.939941883087158,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.20000000298023224,
"rewards/wrapped_driving_reward": -2.9800291061401367,
"rewards/wrapped_format_reward": 0.0,
"step": 48
},
{
"completion_length": 500.0,
"epoch": 9.8,
"grad_norm": 7.547443866729736,
"kl": 3.824962615966797,
"learning_rate": 1.5312500000000002e-06,
"loss": 0.153,
"reward": -0.7816690802574158,
"reward_std": 3.7201201915740967,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.906669020652771,
"rewards/wrapped_format_reward": 0.125,
"step": 49
},
{
"completion_length": 500.0,
"epoch": 10.0,
"grad_norm": 77.945556640625,
"kl": 16.699840545654297,
"learning_rate": 1.5625e-06,
"loss": 0.668,
"reward": -0.2709696292877197,
"reward_std": 4.022421360015869,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.6459696292877197,
"rewards/wrapped_format_reward": 0.375,
"step": 50
},
{
"completion_length": 500.0,
"epoch": 10.2,
"grad_norm": 11738.6953125,
"kl": 798.2957763671875,
"learning_rate": 1.59375e-06,
"loss": 31.9318,
"reward": -2.244354724884033,
"reward_std": 3.1866860389709473,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.869354724884033,
"rewards/wrapped_format_reward": 0.125,
"step": 51
},
{
"completion_length": 500.0,
"epoch": 10.4,
"grad_norm": 14.563969612121582,
"kl": 5.301497936248779,
"learning_rate": 1.6250000000000001e-06,
"loss": 0.2121,
"reward": 2.282280445098877,
"reward_std": 0.7978482246398926,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.21771956980228424,
"rewards/wrapped_format_reward": 0.5,
"step": 52
},
{
"completion_length": 500.0,
"epoch": 10.6,
"grad_norm": 6.280083656311035,
"kl": 3.3535187244415283,
"learning_rate": 1.65625e-06,
"loss": 0.1341,
"reward": 0.2996126413345337,
"reward_std": 2.949772357940674,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.3253873586654663,
"rewards/wrapped_format_reward": 0.125,
"step": 53
},
{
"completion_length": 500.0,
"epoch": 10.8,
"grad_norm": 223484.046875,
"kl": 25810.041015625,
"learning_rate": 1.6875000000000001e-06,
"loss": 1032.4015,
"reward": -0.344623327255249,
"reward_std": 3.944869041442871,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.719623327255249,
"rewards/wrapped_format_reward": 0.375,
"step": 54
},
{
"completion_length": 500.0,
"epoch": 11.0,
"grad_norm": 593881.0,
"kl": 99004.265625,
"learning_rate": 1.71875e-06,
"loss": 3960.1709,
"reward": -3.875,
"reward_std": 0.25,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 55
},
{
"completion_length": 500.0,
"epoch": 11.2,
"grad_norm": 6.934082508087158,
"kl": 2.1647584438323975,
"learning_rate": 1.75e-06,
"loss": 0.0866,
"reward": -1.2983622550964355,
"reward_std": 2.1767022609710693,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -3.1733622550964355,
"rewards/wrapped_format_reward": 0.375,
"step": 56
},
{
"completion_length": 500.0,
"epoch": 11.4,
"grad_norm": 11660.44921875,
"kl": 1039.497802734375,
"learning_rate": 1.78125e-06,
"loss": 41.5799,
"reward": 1.111867070198059,
"reward_std": 3.4727301597595215,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.6381329298019409,
"rewards/wrapped_format_reward": 0.25,
"step": 57
},
{
"completion_length": 500.0,
"epoch": 11.6,
"grad_norm": 116.61476135253906,
"kl": 30.001558303833008,
"learning_rate": 1.8125e-06,
"loss": 1.2001,
"reward": -3.875,
"reward_std": 0.25,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 58
},
{
"completion_length": 500.0,
"epoch": 11.8,
"grad_norm": 7.750627040863037,
"kl": 3.2049598693847656,
"learning_rate": 1.8437500000000003e-06,
"loss": 0.1282,
"reward": -3.875,
"reward_std": 0.25,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 59
},
{
"completion_length": 500.0,
"epoch": 12.0,
"grad_norm": 6.483101844787598,
"kl": 2.8182482719421387,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.1127,
"reward": 0.4944196343421936,
"reward_std": 2.8224055767059326,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.3805804252624512,
"rewards/wrapped_format_reward": 0.375,
"step": 60
},
{
"completion_length": 500.0,
"epoch": 12.2,
"grad_norm": 2.41937255859375,
"kl": 1.5243698358535767,
"learning_rate": 1.90625e-06,
"loss": 0.061,
"reward": -1.5549238920211792,
"reward_std": 3.3336286544799805,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.9299237728118896,
"rewards/wrapped_format_reward": 0.375,
"step": 61
},
{
"completion_length": 500.0,
"epoch": 12.4,
"grad_norm": 1.8835395574569702,
"kl": 1.3928029537200928,
"learning_rate": 1.9375e-06,
"loss": 0.0557,
"reward": 0.5334538817405701,
"reward_std": 3.0702548027038574,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.9665461778640747,
"rewards/wrapped_format_reward": 0.0,
"step": 62
},
{
"completion_length": 500.0,
"epoch": 12.6,
"grad_norm": 82.16962432861328,
"kl": 9.488784790039062,
"learning_rate": 1.96875e-06,
"loss": 0.3796,
"reward": -0.4010847806930542,
"reward_std": 4.155675411224365,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.7760847806930542,
"rewards/wrapped_format_reward": 0.375,
"step": 63
},
{
"completion_length": 500.0,
"epoch": 12.8,
"grad_norm": 1.6330454349517822,
"kl": 0.7770444750785828,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0311,
"reward": -1.0365194082260132,
"reward_std": 3.1399788856506348,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.46875,
"rewards/wrapped_driving_reward": -2.2552695274353027,
"rewards/wrapped_format_reward": 0.25,
"step": 64
},
{
"completion_length": 500.0,
"epoch": 13.0,
"grad_norm": 226.30535888671875,
"kl": 45.10585021972656,
"learning_rate": 2.0312500000000002e-06,
"loss": 1.8042,
"reward": -1.9248578548431396,
"reward_std": 3.5153682231903076,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.7998578548431396,
"rewards/wrapped_format_reward": 0.375,
"step": 65
},
{
"completion_length": 500.0,
"epoch": 13.2,
"grad_norm": 62.87789535522461,
"kl": 7.880831718444824,
"learning_rate": 2.0625e-06,
"loss": 0.3152,
"reward": 3.1509861946105957,
"reward_std": 0.29935261607170105,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.987500011920929,
"rewards/wrapped_driving_reward": 0.5384860038757324,
"rewards/wrapped_format_reward": 0.625,
"step": 66
},
{
"completion_length": 500.0,
"epoch": 13.4,
"grad_norm": 41.82035446166992,
"kl": 9.357061386108398,
"learning_rate": 2.09375e-06,
"loss": 0.3743,
"reward": -2.576190948486328,
"reward_std": 2.8476178646087646,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.201190948486328,
"rewards/wrapped_format_reward": 0.125,
"step": 67
},
{
"completion_length": 500.0,
"epoch": 13.6,
"grad_norm": 4.275550365447998,
"kl": 2.5297634601593018,
"learning_rate": 2.125e-06,
"loss": 0.1012,
"reward": -2.036945104598999,
"reward_std": 3.926109790802002,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.786945104598999,
"rewards/wrapped_format_reward": 0.25,
"step": 68
},
{
"completion_length": 500.0,
"epoch": 13.8,
"grad_norm": 12.396137237548828,
"kl": 3.0427801609039307,
"learning_rate": 2.1562500000000003e-06,
"loss": 0.1217,
"reward": 0.7283755540847778,
"reward_std": 3.323927879333496,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.2716244459152222,
"rewards/wrapped_format_reward": 0.5,
"step": 69
},
{
"completion_length": 500.0,
"epoch": 14.0,
"grad_norm": 1.5282281637191772,
"kl": 1.092595100402832,
"learning_rate": 2.1875000000000002e-06,
"loss": 0.0437,
"reward": -3.2356114387512207,
"reward_std": 1.528777003288269,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.8606114387512207,
"rewards/wrapped_format_reward": 0.125,
"step": 70
},
{
"completion_length": 500.0,
"epoch": 14.2,
"grad_norm": 1.2480015754699707,
"kl": 0.7834239602088928,
"learning_rate": 2.21875e-06,
"loss": 0.0313,
"reward": -2.0355887413024902,
"reward_std": 3.270659923553467,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.7855887413024902,
"rewards/wrapped_format_reward": 0.25,
"step": 71
},
{
"completion_length": 500.0,
"epoch": 14.4,
"grad_norm": 80.85037231445312,
"kl": 9.716327667236328,
"learning_rate": 2.25e-06,
"loss": 0.3887,
"reward": -3.375,
"reward_std": 1.25,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 72
},
{
"completion_length": 500.0,
"epoch": 14.6,
"grad_norm": 1.1875276565551758,
"kl": 0.9156450629234314,
"learning_rate": 2.28125e-06,
"loss": 0.0366,
"reward": 0.398318886756897,
"reward_std": 2.9533674716949463,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.3516812324523926,
"rewards/wrapped_format_reward": 0.25,
"step": 73
},
{
"completion_length": 500.0,
"epoch": 14.8,
"grad_norm": 5.200348854064941,
"kl": 2.256690502166748,
"learning_rate": 2.3125000000000003e-06,
"loss": 0.0903,
"reward": -0.17332077026367188,
"reward_std": 4.147373199462891,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.47727274894714355,
"rewards/wrapped_driving_reward": -1.650593638420105,
"rewards/wrapped_format_reward": 0.5,
"step": 74
},
{
"completion_length": 500.0,
"epoch": 15.0,
"grad_norm": 7008.42626953125,
"kl": 815.8104248046875,
"learning_rate": 2.3437500000000002e-06,
"loss": 32.6324,
"reward": -2.0281713008880615,
"reward_std": 3.3107235431671143,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.9031713008880615,
"rewards/wrapped_format_reward": 0.375,
"step": 75
},
{
"completion_length": 500.0,
"epoch": 15.2,
"grad_norm": 0.9020519852638245,
"kl": 0.9176801443099976,
"learning_rate": 2.375e-06,
"loss": 0.0367,
"reward": -2.1365058422088623,
"reward_std": 3.0964157581329346,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.1365058422088623,
"rewards/wrapped_format_reward": 0.5,
"step": 76
},
{
"completion_length": 500.0,
"epoch": 15.4,
"grad_norm": 2.2519280910491943,
"kl": 0.8236393332481384,
"learning_rate": 2.40625e-06,
"loss": 0.0329,
"reward": -1.22861909866333,
"reward_std": 2.9241857528686523,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.353619337081909,
"rewards/wrapped_format_reward": 0.125,
"step": 77
},
{
"completion_length": 500.0,
"epoch": 15.6,
"grad_norm": 1.5832031965255737,
"kl": 0.7527546286582947,
"learning_rate": 2.4375e-06,
"loss": 0.0301,
"reward": -0.6693365573883057,
"reward_std": 3.860503911972046,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.7943366765975952,
"rewards/wrapped_format_reward": 0.125,
"step": 78
},
{
"completion_length": 500.0,
"epoch": 15.8,
"grad_norm": 1.108726143836975,
"kl": 1.0248883962631226,
"learning_rate": 2.4687500000000003e-06,
"loss": 0.041,
"reward": -3.125,
"reward_std": 1.4361406564712524,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.375,
"step": 79
},
{
"completion_length": 500.0,
"epoch": 16.0,
"grad_norm": 1.0169743299484253,
"kl": 0.7592311501502991,
"learning_rate": 2.5e-06,
"loss": 0.0304,
"reward": 1.145168423652649,
"reward_std": 3.56965708732605,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.7298316359519958,
"rewards/wrapped_format_reward": 0.375,
"step": 80
},
{
"completion_length": 500.0,
"epoch": 16.2,
"grad_norm": 0.7822604179382324,
"kl": 0.560085117816925,
"learning_rate": 2.53125e-06,
"loss": 0.0224,
"reward": 1.2056835889816284,
"reward_std": 3.5178937911987305,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.6693164110183716,
"rewards/wrapped_format_reward": 0.375,
"step": 81
},
{
"completion_length": 500.0,
"epoch": 16.4,
"grad_norm": 0.8175077438354492,
"kl": 0.5752599239349365,
"learning_rate": 2.5625e-06,
"loss": 0.023,
"reward": 2.9252407550811768,
"reward_std": 0.7892647385597229,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.3002408742904663,
"rewards/wrapped_format_reward": 0.625,
"step": 82
},
{
"completion_length": 500.0,
"epoch": 16.6,
"grad_norm": 6.47392463684082,
"kl": 2.1055572032928467,
"learning_rate": 2.5937500000000004e-06,
"loss": 0.0842,
"reward": -2.7708332538604736,
"reward_std": 1.4678263664245605,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.4791666865348816,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.25,
"step": 83
},
{
"completion_length": 500.0,
"epoch": 16.8,
"grad_norm": 0.9220647811889648,
"kl": 0.7157851457595825,
"learning_rate": 2.6250000000000003e-06,
"loss": 0.0286,
"reward": -2.987729549407959,
"reward_std": 1.3781793117523193,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.987729549407959,
"rewards/wrapped_format_reward": 0.5,
"step": 84
},
{
"completion_length": 500.0,
"epoch": 17.0,
"grad_norm": 62.959754943847656,
"kl": 11.148348808288574,
"learning_rate": 2.65625e-06,
"loss": 0.4459,
"reward": -3.875,
"reward_std": 0.25,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.125,
"step": 85
},
{
"completion_length": 500.0,
"epoch": 17.2,
"grad_norm": 1.0555181503295898,
"kl": 0.7983899712562561,
"learning_rate": 2.6875e-06,
"loss": 0.0319,
"reward": -0.34822893142700195,
"reward_std": 1.868666648864746,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -2.973228931427002,
"rewards/wrapped_format_reward": 0.625,
"step": 86
},
{
"completion_length": 500.0,
"epoch": 17.4,
"grad_norm": 5.494264602661133,
"kl": 1.303008794784546,
"learning_rate": 2.71875e-06,
"loss": 0.0521,
"reward": -1.6160635948181152,
"reward_std": 3.132638931274414,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.7410635948181152,
"rewards/wrapped_format_reward": 0.125,
"step": 87
},
{
"completion_length": 500.0,
"epoch": 17.6,
"grad_norm": 4.55032205581665,
"kl": 0.8609241247177124,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.0344,
"reward": -0.07164722681045532,
"reward_std": 2.4449591636657715,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7045454978942871,
"rewards/wrapped_driving_reward": -1.9011927843093872,
"rewards/wrapped_format_reward": 0.375,
"step": 88
},
{
"completion_length": 500.0,
"epoch": 17.8,
"grad_norm": 0.9195135831832886,
"kl": 0.683874785900116,
"learning_rate": 2.7812500000000003e-06,
"loss": 0.0274,
"reward": -0.11890482902526855,
"reward_std": 3.1548802852630615,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.699999988079071,
"rewards/wrapped_driving_reward": -2.0689048767089844,
"rewards/wrapped_format_reward": 0.5,
"step": 89
},
{
"completion_length": 500.0,
"epoch": 18.0,
"grad_norm": 33.38914108276367,
"kl": 7.705580234527588,
"learning_rate": 2.8125e-06,
"loss": 0.3082,
"reward": 1.586517572402954,
"reward_std": 3.729795217514038,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.4134823679924011,
"rewards/wrapped_format_reward": 0.5,
"step": 90
},
{
"completion_length": 500.0,
"epoch": 18.2,
"grad_norm": 57.7512092590332,
"kl": 6.441009998321533,
"learning_rate": 2.84375e-06,
"loss": 0.2576,
"reward": -0.8143091201782227,
"reward_std": 3.7249650955200195,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.4444444477558136,
"rewards/wrapped_driving_reward": -2.008753776550293,
"rewards/wrapped_format_reward": 0.25,
"step": 91
},
{
"completion_length": 500.0,
"epoch": 18.4,
"grad_norm": 1.0778491497039795,
"kl": 0.7857025265693665,
"learning_rate": 2.875e-06,
"loss": 0.0314,
"reward": -0.4593994617462158,
"reward_std": 3.805197238922119,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8343994617462158,
"rewards/wrapped_format_reward": 0.375,
"step": 92
},
{
"completion_length": 500.0,
"epoch": 18.6,
"grad_norm": 1.1437242031097412,
"kl": 0.5162321925163269,
"learning_rate": 2.9062500000000003e-06,
"loss": 0.0206,
"reward": -1.5748236179351807,
"reward_std": 3.3948116302490234,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.8248236179351807,
"rewards/wrapped_format_reward": 0.25,
"step": 93
},
{
"completion_length": 500.0,
"epoch": 18.8,
"grad_norm": 0.782410204410553,
"kl": 0.4215336740016937,
"learning_rate": 2.9375000000000003e-06,
"loss": 0.0169,
"reward": -1.7746977806091309,
"reward_std": 3.3859715461730957,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.4375,
"rewards/wrapped_driving_reward": -2.962197780609131,
"rewards/wrapped_format_reward": 0.25,
"step": 94
},
{
"completion_length": 500.0,
"epoch": 19.0,
"grad_norm": 4.5705718994140625,
"kl": 2.0152359008789062,
"learning_rate": 2.96875e-06,
"loss": 0.0806,
"reward": 0.6353222131729126,
"reward_std": 3.1809890270233154,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.1146776676177979,
"rewards/wrapped_format_reward": 0.25,
"step": 95
},
{
"completion_length": 500.0,
"epoch": 19.2,
"grad_norm": 0.7173673510551453,
"kl": 0.5042878985404968,
"learning_rate": 3e-06,
"loss": 0.0202,
"reward": 3.3471288681030273,
"reward_std": 0.31114432215690613,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.722128689289093,
"rewards/wrapped_format_reward": 0.625,
"step": 96
},
{
"completion_length": 500.0,
"epoch": 19.4,
"grad_norm": 2.924496650695801,
"kl": 0.9709882736206055,
"learning_rate": 3.03125e-06,
"loss": 0.0388,
"reward": 1.2860008478164673,
"reward_std": 2.1523053646087646,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9166666865348816,
"rewards/wrapped_driving_reward": -1.1306657791137695,
"rewards/wrapped_format_reward": 0.5,
"step": 97
},
{
"completion_length": 500.0,
"epoch": 19.6,
"grad_norm": 1.3449220657348633,
"kl": 0.776192843914032,
"learning_rate": 3.0625000000000003e-06,
"loss": 0.031,
"reward": -1.2475244998931885,
"reward_std": 3.184887170791626,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.2475244998931885,
"rewards/wrapped_format_reward": 0.0,
"step": 98
},
{
"completion_length": 500.0,
"epoch": 19.8,
"grad_norm": 16.095233917236328,
"kl": 1.6037352085113525,
"learning_rate": 3.0937500000000002e-06,
"loss": 0.0641,
"reward": 0.7092133164405823,
"reward_std": 3.2424275875091553,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.040786862373352,
"rewards/wrapped_format_reward": 0.25,
"step": 99
},
{
"completion_length": 500.0,
"epoch": 20.0,
"grad_norm": 1.065063714981079,
"kl": 0.6967657208442688,
"learning_rate": 3.125e-06,
"loss": 0.0279,
"reward": 1.58005690574646,
"reward_std": 3.7297377586364746,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.41994309425354004,
"rewards/wrapped_format_reward": 0.5,
"step": 100
},
{
"completion_length": 500.0,
"epoch": 20.2,
"grad_norm": 0.7442240715026855,
"kl": 0.5057598352432251,
"learning_rate": 3.15625e-06,
"loss": 0.0202,
"reward": -0.4889770746231079,
"reward_std": 3.2432987689971924,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.1139769554138184,
"rewards/wrapped_format_reward": 0.625,
"step": 101
},
{
"completion_length": 500.0,
"epoch": 20.4,
"grad_norm": 0.6366997361183167,
"kl": 0.44367504119873047,
"learning_rate": 3.1875e-06,
"loss": 0.0177,
"reward": -2.3750693798065186,
"reward_std": 2.5939254760742188,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.2500693798065186,
"rewards/wrapped_format_reward": 0.375,
"step": 102
},
{
"completion_length": 500.0,
"epoch": 20.6,
"grad_norm": 2.0096611976623535,
"kl": 0.4689376652240753,
"learning_rate": 3.2187500000000003e-06,
"loss": 0.0188,
"reward": -0.5932518243789673,
"reward_std": 3.942629814147949,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.9682518243789673,
"rewards/wrapped_format_reward": 0.375,
"step": 103
},
{
"completion_length": 500.0,
"epoch": 20.8,
"grad_norm": 1.2003757953643799,
"kl": 0.3688035309314728,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.0148,
"reward": 2.4348607063293457,
"reward_std": 1.402535080909729,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.824999988079071,
"rewards/wrapped_driving_reward": 0.1098608672618866,
"rewards/wrapped_format_reward": 0.5,
"step": 104
},
{
"completion_length": 500.0,
"epoch": 21.0,
"grad_norm": 1.0951755046844482,
"kl": 0.6130416393280029,
"learning_rate": 3.28125e-06,
"loss": 0.0245,
"reward": 1.912153720855713,
"reward_std": 2.661609649658203,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.4628463387489319,
"rewards/wrapped_format_reward": 0.375,
"step": 105
},
{
"completion_length": 500.0,
"epoch": 21.2,
"grad_norm": 0.7108362317085266,
"kl": 0.4603574275970459,
"learning_rate": 3.3125e-06,
"loss": 0.0184,
"reward": -0.09991639852523804,
"reward_std": 2.701847791671753,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.699999988079071,
"rewards/wrapped_driving_reward": -1.674916386604309,
"rewards/wrapped_format_reward": 0.125,
"step": 106
},
{
"completion_length": 500.0,
"epoch": 21.4,
"grad_norm": 0.6701599955558777,
"kl": 0.40188899636268616,
"learning_rate": 3.34375e-06,
"loss": 0.0161,
"reward": -0.8211934566497803,
"reward_std": 3.6706011295318604,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.0711934566497803,
"rewards/wrapped_format_reward": 0.25,
"step": 107
},
{
"completion_length": 500.0,
"epoch": 21.6,
"grad_norm": 2.1547489166259766,
"kl": 1.2839192152023315,
"learning_rate": 3.3750000000000003e-06,
"loss": 0.0514,
"reward": 1.2595562934875488,
"reward_std": 3.514036178588867,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.36544373631477356,
"rewards/wrapped_format_reward": 0.125,
"step": 108
},
{
"completion_length": 500.0,
"epoch": 21.8,
"grad_norm": 1.838152527809143,
"kl": 0.49252963066101074,
"learning_rate": 3.40625e-06,
"loss": 0.0197,
"reward": 1.507678508758545,
"reward_std": 3.675663948059082,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7272727489471436,
"rewards/wrapped_driving_reward": -0.46959418058395386,
"rewards/wrapped_format_reward": 0.5,
"step": 109
},
{
"completion_length": 500.0,
"epoch": 22.0,
"grad_norm": 0.7439582347869873,
"kl": 0.595367431640625,
"learning_rate": 3.4375e-06,
"loss": 0.0238,
"reward": -3.625,
"reward_std": 0.4787135720252991,
"rewards/mpc_param_extraction_reward": 0.0,
"rewards/mpc_param_name_reward": 0.0,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.375,
"step": 110
},
{
"completion_length": 500.0,
"epoch": 22.2,
"grad_norm": 0.5864555835723877,
"kl": 0.3803044855594635,
"learning_rate": 3.46875e-06,
"loss": 0.0152,
"reward": 1.1622142791748047,
"reward_std": 3.198068618774414,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7272727489471436,
"rewards/wrapped_driving_reward": -1.0650583505630493,
"rewards/wrapped_format_reward": 0.75,
"step": 111
},
{
"completion_length": 500.0,
"epoch": 22.4,
"grad_norm": 4.526273250579834,
"kl": 0.46392467617988586,
"learning_rate": 3.5e-06,
"loss": 0.0186,
"reward": -0.7247750163078308,
"reward_std": 3.514815330505371,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.0997748374938965,
"rewards/wrapped_format_reward": 0.375,
"step": 112
},
{
"completion_length": 500.0,
"epoch": 22.6,
"grad_norm": 0.5704318881034851,
"kl": 0.31563645601272583,
"learning_rate": 3.5312500000000007e-06,
"loss": 0.0126,
"reward": -0.6594128608703613,
"reward_std": 3.574946880340576,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.0344128608703613,
"rewards/wrapped_format_reward": 0.375,
"step": 113
},
{
"completion_length": 500.0,
"epoch": 22.8,
"grad_norm": 0.671708881855011,
"kl": 0.44867807626724243,
"learning_rate": 3.5625e-06,
"loss": 0.0179,
"reward": 1.6787878274917603,
"reward_std": 0.9073445200920105,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.9462121725082397,
"rewards/wrapped_format_reward": 0.625,
"step": 114
},
{
"completion_length": 500.0,
"epoch": 23.0,
"grad_norm": 1.2352200746536255,
"kl": 0.46100977063179016,
"learning_rate": 3.59375e-06,
"loss": 0.0184,
"reward": -0.30863749980926514,
"reward_std": 3.9797427654266357,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8086374998092651,
"rewards/wrapped_format_reward": 0.5,
"step": 115
},
{
"completion_length": 500.0,
"epoch": 23.2,
"grad_norm": 0.783157229423523,
"kl": 0.44468817114830017,
"learning_rate": 3.625e-06,
"loss": 0.0178,
"reward": 1.9851404428482056,
"reward_std": 1.0498998165130615,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.2648596167564392,
"rewards/wrapped_format_reward": 0.25,
"step": 116
},
{
"completion_length": 500.0,
"epoch": 23.4,
"grad_norm": 0.6096097826957703,
"kl": 0.36140069365501404,
"learning_rate": 3.65625e-06,
"loss": 0.0145,
"reward": -0.9730753898620605,
"reward_std": 2.724126100540161,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -3.0980753898620605,
"rewards/wrapped_format_reward": 0.625,
"step": 117
},
{
"completion_length": 500.0,
"epoch": 23.6,
"grad_norm": 2.4817147254943848,
"kl": 0.3356289267539978,
"learning_rate": 3.6875000000000007e-06,
"loss": 0.0134,
"reward": 2.9043874740600586,
"reward_std": 0.34505343437194824,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.5293872952461243,
"rewards/wrapped_format_reward": 0.375,
"step": 118
},
{
"completion_length": 500.0,
"epoch": 23.8,
"grad_norm": 1.3457905054092407,
"kl": 0.32610735297203064,
"learning_rate": 3.7187500000000006e-06,
"loss": 0.013,
"reward": -0.004844188690185547,
"reward_std": 3.559382915496826,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.7548441886901855,
"rewards/wrapped_format_reward": 0.25,
"step": 119
},
{
"completion_length": 500.0,
"epoch": 24.0,
"grad_norm": 0.8271002769470215,
"kl": 0.592341423034668,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0237,
"reward": 0.23737984895706177,
"reward_std": 2.8921873569488525,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.6000000238418579,
"rewards/wrapped_driving_reward": -1.2376201152801514,
"rewards/wrapped_format_reward": 0.125,
"step": 120
},
{
"completion_length": 500.0,
"epoch": 24.2,
"grad_norm": 1.575377106666565,
"kl": 0.31468361616134644,
"learning_rate": 3.78125e-06,
"loss": 0.0126,
"reward": 0.08798408508300781,
"reward_std": 3.364243984222412,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.9120157957077026,
"rewards/wrapped_format_reward": 0.5,
"step": 121
},
{
"completion_length": 500.0,
"epoch": 24.4,
"grad_norm": 1.548542857170105,
"kl": 0.7125066518783569,
"learning_rate": 3.8125e-06,
"loss": 0.0285,
"reward": 3.202035903930664,
"reward_std": 0.5515704154968262,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.8958333134651184,
"rewards/wrapped_driving_reward": 0.6812027096748352,
"rewards/wrapped_format_reward": 0.625,
"step": 122
},
{
"completion_length": 500.0,
"epoch": 24.6,
"grad_norm": 0.6466585397720337,
"kl": 0.33141595125198364,
"learning_rate": 3.84375e-06,
"loss": 0.0133,
"reward": -0.8563422560691833,
"reward_std": 2.9308738708496094,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.981342315673828,
"rewards/wrapped_format_reward": 0.625,
"step": 123
},
{
"completion_length": 500.0,
"epoch": 24.8,
"grad_norm": 0.9053751826286316,
"kl": 0.3941192626953125,
"learning_rate": 3.875e-06,
"loss": 0.0158,
"reward": -0.9088470935821533,
"reward_std": 2.4116313457489014,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.6588470935821533,
"rewards/wrapped_format_reward": 0.25,
"step": 124
},
{
"completion_length": 500.0,
"epoch": 25.0,
"grad_norm": 0.7404253482818604,
"kl": 0.3537856936454773,
"learning_rate": 3.90625e-06,
"loss": 0.0142,
"reward": -0.08935052156448364,
"reward_std": 4.237273693084717,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.5893504619598389,
"rewards/wrapped_format_reward": 0.5,
"step": 125
},
{
"completion_length": 500.0,
"epoch": 25.2,
"grad_norm": 0.5974608659744263,
"kl": 0.31292691826820374,
"learning_rate": 3.9375e-06,
"loss": 0.0125,
"reward": -0.5100458860397339,
"reward_std": 3.746746063232422,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8850458860397339,
"rewards/wrapped_format_reward": 0.375,
"step": 126
},
{
"completion_length": 500.0,
"epoch": 25.4,
"grad_norm": 0.9886866807937622,
"kl": 0.3266676068305969,
"learning_rate": 3.96875e-06,
"loss": 0.0131,
"reward": 3.5397558212280273,
"reward_std": 0.24529722332954407,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.7897558212280273,
"rewards/wrapped_format_reward": 0.75,
"step": 127
},
{
"completion_length": 500.0,
"epoch": 25.6,
"grad_norm": 0.6569087505340576,
"kl": 0.28314509987831116,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0113,
"reward": -0.5303106904029846,
"reward_std": 4.0666303634643555,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.375,
"rewards/wrapped_driving_reward": -1.7803106307983398,
"rewards/wrapped_format_reward": 0.375,
"step": 128
},
{
"completion_length": 500.0,
"epoch": 25.8,
"grad_norm": 1.4679771661758423,
"kl": 0.4160246253013611,
"learning_rate": 4.031250000000001e-06,
"loss": 0.0166,
"reward": -0.5868573188781738,
"reward_std": 3.941485643386841,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8368571996688843,
"rewards/wrapped_format_reward": 0.25,
"step": 129
},
{
"completion_length": 500.0,
"epoch": 26.0,
"grad_norm": 0.5985941290855408,
"kl": 0.31736528873443604,
"learning_rate": 4.0625000000000005e-06,
"loss": 0.0127,
"reward": 1.1113789081573486,
"reward_std": 3.429651975631714,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.0136209726333618,
"rewards/wrapped_format_reward": 0.625,
"step": 130
},
{
"completion_length": 500.0,
"epoch": 26.2,
"grad_norm": 0.6327362656593323,
"kl": 0.40226221084594727,
"learning_rate": 4.09375e-06,
"loss": 0.0161,
"reward": 0.7944153547286987,
"reward_std": 2.8826069831848145,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7250000238418579,
"rewards/wrapped_driving_reward": -1.3055846691131592,
"rewards/wrapped_format_reward": 0.625,
"step": 131
},
{
"completion_length": 500.0,
"epoch": 26.4,
"grad_norm": 0.6797881722450256,
"kl": 0.4582635164260864,
"learning_rate": 4.125e-06,
"loss": 0.0183,
"reward": 2.8031327724456787,
"reward_std": 0.7006269097328186,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.05313277989625931,
"rewards/wrapped_format_reward": 0.75,
"step": 132
},
{
"completion_length": 500.0,
"epoch": 26.6,
"grad_norm": 0.5752917528152466,
"kl": 0.36153456568717957,
"learning_rate": 4.15625e-06,
"loss": 0.0145,
"reward": -2.564105987548828,
"reward_std": 2.871788263320923,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.189105987548828,
"rewards/wrapped_format_reward": 0.125,
"step": 133
},
{
"completion_length": 500.0,
"epoch": 26.8,
"grad_norm": 0.569823145866394,
"kl": 0.3600581884384155,
"learning_rate": 4.1875e-06,
"loss": 0.0144,
"reward": 3.2037861347198486,
"reward_std": 0.1732039451599121,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.8287861943244934,
"rewards/wrapped_format_reward": 0.375,
"step": 134
},
{
"completion_length": 500.0,
"epoch": 27.0,
"grad_norm": 11.942618370056152,
"kl": 2.177290678024292,
"learning_rate": 4.21875e-06,
"loss": 0.0871,
"reward": -2.3714582920074463,
"reward_std": 1.8921570777893066,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -3.7464582920074463,
"rewards/wrapped_format_reward": 0.375,
"step": 135
},
{
"completion_length": 500.0,
"epoch": 27.2,
"grad_norm": 0.5660642385482788,
"kl": 0.2908819019794464,
"learning_rate": 4.25e-06,
"loss": 0.0116,
"reward": -0.6192033290863037,
"reward_std": 3.6331570148468018,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.1192033290863037,
"rewards/wrapped_format_reward": 0.5,
"step": 136
},
{
"completion_length": 500.0,
"epoch": 27.4,
"grad_norm": 1.4041975736618042,
"kl": 0.463067889213562,
"learning_rate": 4.28125e-06,
"loss": 0.0185,
"reward": -2.75,
"reward_std": 1.1902379989624023,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.25,
"step": 137
},
{
"completion_length": 500.0,
"epoch": 27.6,
"grad_norm": 0.4801470637321472,
"kl": 0.2532914876937866,
"learning_rate": 4.312500000000001e-06,
"loss": 0.0101,
"reward": -2.304798126220703,
"reward_std": 3.3904037475585938,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.054798126220703,
"rewards/wrapped_format_reward": 0.25,
"step": 138
},
{
"completion_length": 500.0,
"epoch": 27.8,
"grad_norm": 0.6999854445457458,
"kl": 0.4938638210296631,
"learning_rate": 4.3437500000000006e-06,
"loss": 0.0198,
"reward": -1.9556548595428467,
"reward_std": 3.430131196975708,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.9556548595428467,
"rewards/wrapped_format_reward": 0.5,
"step": 139
},
{
"completion_length": 500.0,
"epoch": 28.0,
"grad_norm": 1.7622352838516235,
"kl": 0.32535520195961,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.013,
"reward": 3.048956871032715,
"reward_std": 0.7497459053993225,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.42395687103271484,
"rewards/wrapped_format_reward": 0.625,
"step": 140
},
{
"completion_length": 500.0,
"epoch": 28.2,
"grad_norm": 1.0910435914993286,
"kl": 0.3166691064834595,
"learning_rate": 4.40625e-06,
"loss": 0.0127,
"reward": 2.1717541217803955,
"reward_std": 2.45133900642395,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.45324593782424927,
"rewards/wrapped_format_reward": 0.625,
"step": 141
},
{
"completion_length": 500.0,
"epoch": 28.4,
"grad_norm": 0.563035249710083,
"kl": 0.34334975481033325,
"learning_rate": 4.4375e-06,
"loss": 0.0137,
"reward": -0.30545544624328613,
"reward_std": 2.531362295150757,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.71875,
"rewards/wrapped_driving_reward": -2.149205446243286,
"rewards/wrapped_format_reward": 0.375,
"step": 142
},
{
"completion_length": 500.0,
"epoch": 28.6,
"grad_norm": 0.6513370871543884,
"kl": 0.2893451154232025,
"learning_rate": 4.46875e-06,
"loss": 0.0116,
"reward": -0.7655331492424011,
"reward_std": 3.818908214569092,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.265533208847046,
"rewards/wrapped_format_reward": 0.5,
"step": 143
},
{
"completion_length": 500.0,
"epoch": 28.8,
"grad_norm": 0.6747258305549622,
"kl": 0.4012701213359833,
"learning_rate": 4.5e-06,
"loss": 0.0161,
"reward": -1.631712794303894,
"reward_std": 2.7382092475891113,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -3.2567129135131836,
"rewards/wrapped_format_reward": 0.625,
"step": 144
},
{
"completion_length": 500.0,
"epoch": 29.0,
"grad_norm": 1.388415813446045,
"kl": 0.3030587136745453,
"learning_rate": 4.53125e-06,
"loss": 0.0121,
"reward": -0.3709021210670471,
"reward_std": 3.0999691486358643,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.6666666865348816,
"rewards/wrapped_driving_reward": -2.0375688076019287,
"rewards/wrapped_format_reward": 0.25,
"step": 145
},
{
"completion_length": 500.0,
"epoch": 29.2,
"grad_norm": 1.3835958242416382,
"kl": 0.5185285806655884,
"learning_rate": 4.5625e-06,
"loss": 0.0207,
"reward": -0.25881457328796387,
"reward_std": 2.5411531925201416,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9010416865348816,
"rewards/wrapped_driving_reward": -2.6598563194274902,
"rewards/wrapped_format_reward": 0.5,
"step": 146
},
{
"completion_length": 500.0,
"epoch": 29.4,
"grad_norm": 1.0529229640960693,
"kl": 0.304034560918808,
"learning_rate": 4.59375e-06,
"loss": 0.0122,
"reward": 0.5033270120620728,
"reward_std": 3.8062596321105957,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.6216729879379272,
"rewards/wrapped_format_reward": 0.625,
"step": 147
},
{
"completion_length": 500.0,
"epoch": 29.6,
"grad_norm": 1.3924496173858643,
"kl": 0.3519279956817627,
"learning_rate": 4.625000000000001e-06,
"loss": 0.0141,
"reward": 1.081155776977539,
"reward_std": 2.07023286819458,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9750000238418579,
"rewards/wrapped_driving_reward": -1.1438441276550293,
"rewards/wrapped_format_reward": 0.25,
"step": 148
},
{
"completion_length": 500.0,
"epoch": 29.8,
"grad_norm": 1.6642379760742188,
"kl": 0.5595217347145081,
"learning_rate": 4.6562500000000005e-06,
"loss": 0.0224,
"reward": 2.879631519317627,
"reward_std": 0.5703426003456116,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.12963154911994934,
"rewards/wrapped_format_reward": 0.75,
"step": 149
},
{
"completion_length": 500.0,
"epoch": 30.0,
"grad_norm": 0.5775982737541199,
"kl": 0.2810514271259308,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.0112,
"reward": 0.10444420576095581,
"reward_std": 3.2514774799346924,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.0205557346343994,
"rewards/wrapped_format_reward": 0.625,
"step": 150
},
{
"completion_length": 500.0,
"epoch": 30.2,
"grad_norm": 0.9198185801506042,
"kl": 0.28956174850463867,
"learning_rate": 4.71875e-06,
"loss": 0.0116,
"reward": -0.260436087846756,
"reward_std": 2.8927173614501953,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.2604360580444336,
"rewards/wrapped_format_reward": 0.5,
"step": 151
},
{
"completion_length": 500.0,
"epoch": 30.4,
"grad_norm": 0.7754166722297668,
"kl": 0.38463443517684937,
"learning_rate": 4.75e-06,
"loss": 0.0154,
"reward": 0.4493406414985657,
"reward_std": 2.646808385848999,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.300659418106079,
"rewards/wrapped_format_reward": 0.25,
"step": 152
},
{
"completion_length": 500.0,
"epoch": 30.6,
"grad_norm": 0.5780096650123596,
"kl": 0.3385607898235321,
"learning_rate": 4.781250000000001e-06,
"loss": 0.0135,
"reward": -1.7927734851837158,
"reward_std": 3.755190849304199,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -2.792773485183716,
"rewards/wrapped_format_reward": 0.5,
"step": 153
},
{
"completion_length": 500.0,
"epoch": 30.8,
"grad_norm": 0.5552729964256287,
"kl": 0.28436288237571716,
"learning_rate": 4.8125e-06,
"loss": 0.0114,
"reward": 0.6222386360168457,
"reward_std": 2.1850173473358154,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -1.8777613639831543,
"rewards/wrapped_format_reward": 0.5,
"step": 154
},
{
"completion_length": 500.0,
"epoch": 31.0,
"grad_norm": 0.9199939370155334,
"kl": 0.37593454122543335,
"learning_rate": 4.84375e-06,
"loss": 0.015,
"reward": 0.4306233525276184,
"reward_std": 3.5992963314056396,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.6607142686843872,
"rewards/wrapped_driving_reward": -1.605090856552124,
"rewards/wrapped_format_reward": 0.625,
"step": 155
},
{
"completion_length": 500.0,
"epoch": 31.2,
"grad_norm": 0.5603945851325989,
"kl": 0.3141997754573822,
"learning_rate": 4.875e-06,
"loss": 0.0126,
"reward": -2.0,
"reward_std": 1.3540064096450806,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.5,
"step": 156
},
{
"completion_length": 500.0,
"epoch": 31.4,
"grad_norm": 0.6190344095230103,
"kl": 0.27537742257118225,
"learning_rate": 4.90625e-06,
"loss": 0.011,
"reward": 1.0282058715820312,
"reward_std": 3.394833564758301,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.846794068813324,
"rewards/wrapped_format_reward": 0.375,
"step": 157
},
{
"completion_length": 500.0,
"epoch": 31.6,
"grad_norm": 0.6877399682998657,
"kl": 0.2958383858203888,
"learning_rate": 4.937500000000001e-06,
"loss": 0.0118,
"reward": -0.755041241645813,
"reward_std": 3.4646472930908203,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.1300413608551025,
"rewards/wrapped_format_reward": 0.375,
"step": 158
},
{
"completion_length": 500.0,
"epoch": 31.8,
"grad_norm": 31.595932006835938,
"kl": 6.78364372253418,
"learning_rate": 4.9687500000000005e-06,
"loss": 0.2713,
"reward": 0.35857605934143066,
"reward_std": 2.9158554077148438,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.7664239406585693,
"rewards/wrapped_format_reward": 0.625,
"step": 159
},
{
"completion_length": 500.0,
"epoch": 32.0,
"grad_norm": 1.7048529386520386,
"kl": 0.35252463817596436,
"learning_rate": 5e-06,
"loss": 0.0141,
"reward": 2.64233136177063,
"reward_std": 0.7985239624977112,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.14233140647411346,
"rewards/wrapped_format_reward": 0.5,
"step": 160
},
{
"completion_length": 500.0,
"epoch": 32.2,
"grad_norm": 0.9699507355690002,
"kl": 0.3963090479373932,
"learning_rate": 4.99999405044338e-06,
"loss": 0.0159,
"reward": 2.578547477722168,
"reward_std": 0.16937123239040375,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.046452634036540985,
"rewards/wrapped_format_reward": 0.625,
"step": 161
},
{
"completion_length": 500.0,
"epoch": 32.4,
"grad_norm": 0.6427643299102783,
"kl": 0.2770542800426483,
"learning_rate": 4.999976201801837e-06,
"loss": 0.0111,
"reward": 2.2058539390563965,
"reward_std": 1.1022424697875977,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.41914597153663635,
"rewards/wrapped_format_reward": 0.625,
"step": 162
},
{
"completion_length": 500.0,
"epoch": 32.6,
"grad_norm": 0.6896190047264099,
"kl": 0.27440541982650757,
"learning_rate": 4.999946454160323e-06,
"loss": 0.011,
"reward": 1.1675429344177246,
"reward_std": 3.4587650299072266,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7142857313156128,
"rewards/wrapped_driving_reward": -0.7967426180839539,
"rewards/wrapped_format_reward": 0.5,
"step": 163
},
{
"completion_length": 500.0,
"epoch": 32.8,
"grad_norm": 0.7695831060409546,
"kl": 0.4198772609233856,
"learning_rate": 4.9999048076604286e-06,
"loss": 0.0168,
"reward": -1.912316918373108,
"reward_std": 2.555265426635742,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.9123167991638184,
"rewards/wrapped_format_reward": 0.0,
"step": 164
},
{
"completion_length": 500.0,
"epoch": 33.0,
"grad_norm": 0.5920954942703247,
"kl": 0.28969520330429077,
"learning_rate": 4.999851262500375e-06,
"loss": 0.0116,
"reward": 3.1377110481262207,
"reward_std": 0.5497580170631409,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.5127109289169312,
"rewards/wrapped_format_reward": 0.625,
"step": 165
},
{
"completion_length": 500.0,
"epoch": 33.2,
"grad_norm": 6.077564716339111,
"kl": 0.44747114181518555,
"learning_rate": 4.999785818935018e-06,
"loss": 0.0179,
"reward": 2.047877073287964,
"reward_std": 2.722182035446167,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.45212286710739136,
"rewards/wrapped_format_reward": 0.5,
"step": 166
},
{
"completion_length": 500.0,
"epoch": 33.4,
"grad_norm": 0.6889002919197083,
"kl": 0.37658053636550903,
"learning_rate": 4.999708477275846e-06,
"loss": 0.0151,
"reward": -2.284590482711792,
"reward_std": 3.106440782546997,
"rewards/mpc_param_extraction_reward": 0.25,
"rewards/mpc_param_name_reward": 0.25,
"rewards/wrapped_driving_reward": -3.159590482711792,
"rewards/wrapped_format_reward": 0.375,
"step": 167
},
{
"completion_length": 500.0,
"epoch": 33.6,
"grad_norm": 1.8645473718643188,
"kl": 0.3408987522125244,
"learning_rate": 4.9996192378909785e-06,
"loss": 0.0136,
"reward": 0.917718231678009,
"reward_std": 2.948974132537842,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.699999988079071,
"rewards/wrapped_driving_reward": -1.157281756401062,
"rewards/wrapped_format_reward": 0.625,
"step": 168
},
{
"completion_length": 500.0,
"epoch": 33.8,
"grad_norm": 0.535763680934906,
"kl": 0.25453072786331177,
"learning_rate": 4.999518101205162e-06,
"loss": 0.0102,
"reward": 3.604552745819092,
"reward_std": 0.45598289370536804,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.7295528054237366,
"rewards/wrapped_format_reward": 0.875,
"step": 169
},
{
"completion_length": 500.0,
"epoch": 34.0,
"grad_norm": 1.0853776931762695,
"kl": 0.2871979773044586,
"learning_rate": 4.999405067699773e-06,
"loss": 0.0115,
"reward": 0.7697337865829468,
"reward_std": 3.0176069736480713,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.3552662134170532,
"rewards/wrapped_format_reward": 0.625,
"step": 170
},
{
"completion_length": 500.0,
"epoch": 34.2,
"grad_norm": 2.175551176071167,
"kl": 0.7303879261016846,
"learning_rate": 4.99928013791281e-06,
"loss": 0.0292,
"reward": 0.010015249252319336,
"reward_std": 4.346557140350342,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.46875,
"rewards/wrapped_driving_reward": -1.5837347507476807,
"rewards/wrapped_format_reward": 0.625,
"step": 171
},
{
"completion_length": 500.0,
"epoch": 34.4,
"grad_norm": 1.3378883600234985,
"kl": 0.2555471658706665,
"learning_rate": 4.999143312438893e-06,
"loss": 0.0102,
"reward": 1.064118504524231,
"reward_std": 1.464298963546753,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -1.4358813762664795,
"rewards/wrapped_format_reward": 0.5,
"step": 172
},
{
"completion_length": 500.0,
"epoch": 34.6,
"grad_norm": 1.6005758047103882,
"kl": 0.3272940516471863,
"learning_rate": 4.998994591929266e-06,
"loss": 0.0131,
"reward": 3.320277214050293,
"reward_std": 0.5942137241363525,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.8202772736549377,
"rewards/wrapped_format_reward": 0.5,
"step": 173
},
{
"completion_length": 500.0,
"epoch": 34.8,
"grad_norm": 0.8775622844696045,
"kl": 0.3981474041938782,
"learning_rate": 4.998833977091783e-06,
"loss": 0.0159,
"reward": 2.548191547393799,
"reward_std": 0.13038182258605957,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.17319151759147644,
"rewards/wrapped_format_reward": 0.375,
"step": 174
},
{
"completion_length": 500.0,
"epoch": 35.0,
"grad_norm": 0.5131356716156006,
"kl": 0.26495081186294556,
"learning_rate": 4.998661468690914e-06,
"loss": 0.0106,
"reward": 0.2881455421447754,
"reward_std": 3.1594552993774414,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.7118544578552246,
"rewards/wrapped_format_reward": 0.5,
"step": 175
},
{
"completion_length": 500.0,
"epoch": 35.2,
"grad_norm": 1.4990577697753906,
"kl": 0.3656232953071594,
"learning_rate": 4.99847706754774e-06,
"loss": 0.0146,
"reward": 2.0933961868286133,
"reward_std": 0.39702948927879333,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9375,
"rewards/wrapped_driving_reward": -0.594103991985321,
"rewards/wrapped_format_reward": 0.75,
"step": 176
},
{
"completion_length": 500.0,
"epoch": 35.4,
"grad_norm": 0.5740483999252319,
"kl": 0.265653520822525,
"learning_rate": 4.998280774539943e-06,
"loss": 0.0106,
"reward": 1.1700050830841064,
"reward_std": 3.1657402515411377,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.8299949765205383,
"rewards/wrapped_format_reward": 0.5,
"step": 177
},
{
"completion_length": 500.0,
"epoch": 35.6,
"grad_norm": 0.6564896702766418,
"kl": 0.265337198972702,
"learning_rate": 4.998072590601808e-06,
"loss": 0.0106,
"reward": -0.852949857711792,
"reward_std": 3.3822429180145264,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.102949857711792,
"rewards/wrapped_format_reward": 0.25,
"step": 178
},
{
"completion_length": 500.0,
"epoch": 35.8,
"grad_norm": 23.83641242980957,
"kl": 4.303451061248779,
"learning_rate": 4.9978525167242176e-06,
"loss": 0.1721,
"reward": 0.764412522315979,
"reward_std": 2.8684115409851074,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7250000238418579,
"rewards/wrapped_driving_reward": -1.085587501525879,
"rewards/wrapped_format_reward": 0.375,
"step": 179
},
{
"completion_length": 500.0,
"epoch": 36.0,
"grad_norm": 0.7350974082946777,
"kl": 0.30466321110725403,
"learning_rate": 4.997620553954645e-06,
"loss": 0.0122,
"reward": -0.10997164249420166,
"reward_std": 2.883012056350708,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.109971523284912,
"rewards/wrapped_format_reward": 0.5,
"step": 180
},
{
"completion_length": 500.0,
"epoch": 36.2,
"grad_norm": 1.8978265523910522,
"kl": 0.5050737857818604,
"learning_rate": 4.997376703397151e-06,
"loss": 0.0202,
"reward": -0.35431569814682007,
"reward_std": 4.209678649902344,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.8543156385421753,
"rewards/wrapped_format_reward": 0.5,
"step": 181
},
{
"completion_length": 500.0,
"epoch": 36.4,
"grad_norm": 0.6739000678062439,
"kl": 0.3342580497264862,
"learning_rate": 4.9971209662123774e-06,
"loss": 0.0134,
"reward": 1.24358332157135,
"reward_std": 3.5022475719451904,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.5064166188240051,
"rewards/wrapped_format_reward": 0.25,
"step": 182
},
{
"completion_length": 500.0,
"epoch": 36.6,
"grad_norm": 0.8527255654335022,
"kl": 0.44380900263786316,
"learning_rate": 4.996853343617542e-06,
"loss": 0.0178,
"reward": 1.3519909381866455,
"reward_std": 2.9203834533691406,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.71875,
"rewards/wrapped_driving_reward": -0.6167589426040649,
"rewards/wrapped_format_reward": 0.5,
"step": 183
},
{
"completion_length": 500.0,
"epoch": 36.8,
"grad_norm": 0.6037353277206421,
"kl": 0.3514931797981262,
"learning_rate": 4.9965738368864345e-06,
"loss": 0.0141,
"reward": 2.4617958068847656,
"reward_std": 0.43256813287734985,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.1632043421268463,
"rewards/wrapped_format_reward": 0.625,
"step": 184
},
{
"completion_length": 500.0,
"epoch": 37.0,
"grad_norm": 0.6498645544052124,
"kl": 0.39014145731925964,
"learning_rate": 4.996282447349408e-06,
"loss": 0.0156,
"reward": 2.696049451828003,
"reward_std": 0.6518055200576782,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.17895053327083588,
"rewards/wrapped_format_reward": 0.875,
"step": 185
},
{
"completion_length": 500.0,
"epoch": 37.2,
"grad_norm": 0.6228243708610535,
"kl": 0.2633248567581177,
"learning_rate": 4.995979176393372e-06,
"loss": 0.0105,
"reward": 1.1363269090652466,
"reward_std": 3.4644434452056885,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.9886730313301086,
"rewards/wrapped_format_reward": 0.625,
"step": 186
},
{
"completion_length": 500.0,
"epoch": 37.4,
"grad_norm": 8.40079402923584,
"kl": 1.8278297185897827,
"learning_rate": 4.99566402546179e-06,
"loss": 0.0731,
"reward": -0.7244951725006104,
"reward_std": 3.783473491668701,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.3705357313156128,
"rewards/wrapped_driving_reward": -2.0950307846069336,
"rewards/wrapped_format_reward": 0.5,
"step": 187
},
{
"completion_length": 500.0,
"epoch": 37.6,
"grad_norm": 0.5168763399124146,
"kl": 0.2395801991224289,
"learning_rate": 4.995336996054668e-06,
"loss": 0.0096,
"reward": 1.9002426862716675,
"reward_std": 2.223823070526123,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9772727489471436,
"rewards/wrapped_driving_reward": -0.7020300626754761,
"rewards/wrapped_format_reward": 0.625,
"step": 188
},
{
"completion_length": 500.0,
"epoch": 37.8,
"grad_norm": 0.9863908290863037,
"kl": 0.27976277470588684,
"learning_rate": 4.99499808972855e-06,
"loss": 0.0112,
"reward": -0.028857052326202393,
"reward_std": 2.8702406883239746,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.1538569927215576,
"rewards/wrapped_format_reward": 0.625,
"step": 189
},
{
"completion_length": 500.0,
"epoch": 38.0,
"grad_norm": 0.8377166986465454,
"kl": 0.48623228073120117,
"learning_rate": 4.994647308096509e-06,
"loss": 0.0194,
"reward": 2.531177043914795,
"reward_std": 0.5673744082450867,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9166666865348816,
"rewards/wrapped_driving_reward": 0.4895104467868805,
"rewards/wrapped_format_reward": 0.125,
"step": 190
},
{
"completion_length": 500.0,
"epoch": 38.2,
"grad_norm": 0.9249876737594604,
"kl": 0.4526787996292114,
"learning_rate": 4.994284652828143e-06,
"loss": 0.0181,
"reward": 0.6909130215644836,
"reward_std": 3.1517491340637207,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.1840870380401611,
"rewards/wrapped_format_reward": 0.375,
"step": 191
},
{
"completion_length": 500.0,
"epoch": 38.4,
"grad_norm": 0.5216014385223389,
"kl": 0.2844958007335663,
"learning_rate": 4.993910125649561e-06,
"loss": 0.0114,
"reward": 1.347219705581665,
"reward_std": 3.583749771118164,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.527780294418335,
"rewards/wrapped_format_reward": 0.375,
"step": 192
},
{
"completion_length": 500.0,
"epoch": 38.6,
"grad_norm": 0.7675309181213379,
"kl": 0.46290096640586853,
"learning_rate": 4.99352372834338e-06,
"loss": 0.0185,
"reward": 1.28756582736969,
"reward_std": 3.200143337249756,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7083333134651184,
"rewards/wrapped_driving_reward": -0.5457674860954285,
"rewards/wrapped_format_reward": 0.375,
"step": 193
},
{
"completion_length": 500.0,
"epoch": 38.8,
"grad_norm": 0.5267873406410217,
"kl": 0.27246928215026855,
"learning_rate": 4.993125462748714e-06,
"loss": 0.0109,
"reward": 0.5119737386703491,
"reward_std": 2.572335958480835,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -1.9880262613296509,
"rewards/wrapped_format_reward": 0.5,
"step": 194
},
{
"completion_length": 500.0,
"epoch": 39.0,
"grad_norm": 0.557345449924469,
"kl": 0.33223679661750793,
"learning_rate": 4.992715330761167e-06,
"loss": 0.0133,
"reward": 1.9005041122436523,
"reward_std": 1.5405527353286743,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.5994958281517029,
"rewards/wrapped_format_reward": 0.5,
"step": 195
},
{
"completion_length": 500.0,
"epoch": 39.2,
"grad_norm": 0.5145586729049683,
"kl": 0.27872464060783386,
"learning_rate": 4.992293334332821e-06,
"loss": 0.0111,
"reward": 0.08070141077041626,
"reward_std": 2.161402702331543,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -2.9192986488342285,
"rewards/wrapped_format_reward": 1.0,
"step": 196
},
{
"completion_length": 500.0,
"epoch": 39.4,
"grad_norm": 0.5731538534164429,
"kl": 0.2947344481945038,
"learning_rate": 4.9918594754722286e-06,
"loss": 0.0118,
"reward": 1.089212417602539,
"reward_std": 3.5704760551452637,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.9107875823974609,
"rewards/wrapped_format_reward": 0.5,
"step": 197
},
{
"completion_length": 500.0,
"epoch": 39.6,
"grad_norm": 1.0262069702148438,
"kl": 0.36793074011802673,
"learning_rate": 4.991413756244404e-06,
"loss": 0.0147,
"reward": 2.804293632507324,
"reward_std": 0.05172164365649223,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.3042936325073242,
"rewards/wrapped_format_reward": 0.5,
"step": 198
},
{
"completion_length": 500.0,
"epoch": 39.8,
"grad_norm": 0.7235340476036072,
"kl": 0.4867457151412964,
"learning_rate": 4.990956178770814e-06,
"loss": 0.0195,
"reward": 2.4924705028533936,
"reward_std": 0.6009870767593384,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.949999988079071,
"rewards/wrapped_driving_reward": -0.08252956718206406,
"rewards/wrapped_format_reward": 0.625,
"step": 199
},
{
"completion_length": 500.0,
"epoch": 40.0,
"grad_norm": 0.8564599752426147,
"kl": 0.4650922119617462,
"learning_rate": 4.990486745229364e-06,
"loss": 0.0186,
"reward": 2.757322311401367,
"reward_std": 0.5960695743560791,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9444444179534912,
"rewards/wrapped_driving_reward": 0.18787765502929688,
"rewards/wrapped_format_reward": 0.625,
"step": 200
},
{
"completion_length": 500.0,
"epoch": 40.2,
"grad_norm": 0.6181848645210266,
"kl": 0.33555763959884644,
"learning_rate": 4.990005457854392e-06,
"loss": 0.0134,
"reward": 0.935232937335968,
"reward_std": 2.9882521629333496,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7222222089767456,
"rewards/wrapped_driving_reward": -0.7869893312454224,
"rewards/wrapped_format_reward": 0.25,
"step": 201
},
{
"completion_length": 500.0,
"epoch": 40.4,
"grad_norm": 0.8061473369598389,
"kl": 0.3526011109352112,
"learning_rate": 4.989512318936654e-06,
"loss": 0.0141,
"reward": 2.038607597351074,
"reward_std": 1.286082148551941,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9772727489471436,
"rewards/wrapped_driving_reward": -0.4386652112007141,
"rewards/wrapped_format_reward": 0.5,
"step": 202
},
{
"completion_length": 500.0,
"epoch": 40.6,
"grad_norm": 1.0745853185653687,
"kl": 0.7225068807601929,
"learning_rate": 4.989007330823319e-06,
"loss": 0.0289,
"reward": 3.327683210372925,
"reward_std": 0.45302456617355347,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.5776832103729248,
"rewards/wrapped_format_reward": 0.75,
"step": 203
},
{
"completion_length": 500.0,
"epoch": 40.8,
"grad_norm": 0.6797990202903748,
"kl": 0.49457883834838867,
"learning_rate": 4.988490495917948e-06,
"loss": 0.0198,
"reward": 1.4564661979675293,
"reward_std": 3.6745243072509766,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.7935338020324707,
"rewards/wrapped_format_reward": 0.75,
"step": 204
},
{
"completion_length": 500.0,
"epoch": 41.0,
"grad_norm": 0.5719887018203735,
"kl": 0.3025702238082886,
"learning_rate": 4.987961816680493e-06,
"loss": 0.0121,
"reward": 0.8813665509223938,
"reward_std": 3.3135292530059814,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.685606062412262,
"rewards/wrapped_driving_reward": -1.1792395114898682,
"rewards/wrapped_format_reward": 0.625,
"step": 205
},
{
"completion_length": 500.0,
"epoch": 41.2,
"grad_norm": 0.7324315905570984,
"kl": 0.387521356344223,
"learning_rate": 4.987421295627279e-06,
"loss": 0.0155,
"reward": 3.60201358795166,
"reward_std": 0.17326904833316803,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9722222089767456,
"rewards/wrapped_driving_reward": 0.7547914981842041,
"rewards/wrapped_format_reward": 0.875,
"step": 206
},
{
"completion_length": 500.0,
"epoch": 41.4,
"grad_norm": 1.4426076412200928,
"kl": 0.3239262104034424,
"learning_rate": 4.986868935330998e-06,
"loss": 0.013,
"reward": 1.1451337337493896,
"reward_std": 3.175523042678833,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.9798662662506104,
"rewards/wrapped_format_reward": 0.625,
"step": 207
},
{
"completion_length": 500.0,
"epoch": 41.6,
"grad_norm": 0.6265994310379028,
"kl": 0.31086966395378113,
"learning_rate": 4.986304738420684e-06,
"loss": 0.0124,
"reward": -0.08087223768234253,
"reward_std": 3.9480772018432617,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.5808722972869873,
"rewards/wrapped_format_reward": 0.5,
"step": 208
},
{
"completion_length": 500.0,
"epoch": 41.8,
"grad_norm": 0.5122293829917908,
"kl": 0.22147461771965027,
"learning_rate": 4.985728707581717e-06,
"loss": 0.0089,
"reward": 2.2255654335021973,
"reward_std": 0.4417201578617096,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9821428656578064,
"rewards/wrapped_driving_reward": -0.2565774619579315,
"rewards/wrapped_format_reward": 0.5,
"step": 209
},
{
"completion_length": 500.0,
"epoch": 42.0,
"grad_norm": 0.5366212725639343,
"kl": 0.2860429286956787,
"learning_rate": 4.985140845555799e-06,
"loss": 0.0114,
"reward": -1.875,
"reward_std": 1.108677864074707,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.625,
"step": 210
},
{
"completion_length": 500.0,
"epoch": 42.2,
"grad_norm": 0.757074773311615,
"kl": 0.5041708946228027,
"learning_rate": 4.984541155140945e-06,
"loss": 0.0202,
"reward": 1.3050158023834229,
"reward_std": 3.2698206901550293,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.6428571343421936,
"rewards/wrapped_driving_reward": -0.7128414511680603,
"rewards/wrapped_format_reward": 0.625,
"step": 211
},
{
"completion_length": 500.0,
"epoch": 42.4,
"grad_norm": 0.5149911046028137,
"kl": 0.24131189286708832,
"learning_rate": 4.9839296391914696e-06,
"loss": 0.0097,
"reward": -0.5590072870254517,
"reward_std": 3.6906325817108154,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.9340074062347412,
"rewards/wrapped_format_reward": 0.375,
"step": 212
},
{
"completion_length": 500.0,
"epoch": 42.6,
"grad_norm": 0.7922428250312805,
"kl": 0.4074100852012634,
"learning_rate": 4.98330630061797e-06,
"loss": 0.0163,
"reward": 0.7251100540161133,
"reward_std": 3.205897569656372,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.1498900651931763,
"rewards/wrapped_format_reward": 0.375,
"step": 213
},
{
"completion_length": 500.0,
"epoch": 42.8,
"grad_norm": 0.8499237298965454,
"kl": 0.533706784248352,
"learning_rate": 4.982671142387316e-06,
"loss": 0.0213,
"reward": 1.2264912128448486,
"reward_std": 3.1925883293151855,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.7735086679458618,
"rewards/wrapped_format_reward": 0.5,
"step": 214
},
{
"completion_length": 500.0,
"epoch": 43.0,
"grad_norm": 0.5848891139030457,
"kl": 0.4833756983280182,
"learning_rate": 4.982024167522638e-06,
"loss": 0.0193,
"reward": 2.640871524810791,
"reward_std": 0.3350675404071808,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.935606062412262,
"rewards/wrapped_driving_reward": -0.1697344183921814,
"rewards/wrapped_format_reward": 0.875,
"step": 215
},
{
"completion_length": 500.0,
"epoch": 43.2,
"grad_norm": 1.0190398693084717,
"kl": 0.5212844014167786,
"learning_rate": 4.981365379103306e-06,
"loss": 0.0209,
"reward": 1.518845796585083,
"reward_std": 1.8981057405471802,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9772727489471436,
"rewards/wrapped_driving_reward": -1.0834269523620605,
"rewards/wrapped_format_reward": 0.625,
"step": 216
},
{
"completion_length": 500.0,
"epoch": 43.4,
"grad_norm": 0.6003134250640869,
"kl": 0.2476293295621872,
"learning_rate": 4.980694780264918e-06,
"loss": 0.0099,
"reward": 2.3462984561920166,
"reward_std": 0.5958766937255859,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.4037014842033386,
"rewards/wrapped_format_reward": 0.75,
"step": 217
},
{
"completion_length": 500.0,
"epoch": 43.6,
"grad_norm": 0.5352597832679749,
"kl": 0.33760789036750793,
"learning_rate": 4.980012374199288e-06,
"loss": 0.0135,
"reward": 1.1177078485488892,
"reward_std": 3.422083854675293,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.8822920918464661,
"rewards/wrapped_format_reward": 0.5,
"step": 218
},
{
"completion_length": 500.0,
"epoch": 43.8,
"grad_norm": 0.78425532579422,
"kl": 0.45192739367485046,
"learning_rate": 4.979318164154426e-06,
"loss": 0.0181,
"reward": 3.3331549167633057,
"reward_std": 0.4030221104621887,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.8331548571586609,
"rewards/wrapped_format_reward": 0.5,
"step": 219
},
{
"completion_length": 500.0,
"epoch": 44.0,
"grad_norm": 0.5511319041252136,
"kl": 0.2625429630279541,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0105,
"reward": 3.4739222526550293,
"reward_std": 0.35263335704803467,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.5989223122596741,
"rewards/wrapped_format_reward": 0.875,
"step": 220
},
{
"completion_length": 500.0,
"epoch": 44.2,
"grad_norm": 0.8345232009887695,
"kl": 0.5118071436882019,
"learning_rate": 4.97789434539995e-06,
"loss": 0.0205,
"reward": 1.788142442703247,
"reward_std": 2.3180289268493652,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -1.086857557296753,
"rewards/wrapped_format_reward": 0.875,
"step": 221
},
{
"completion_length": 500.0,
"epoch": 44.4,
"grad_norm": 0.8292976021766663,
"kl": 0.5234676003456116,
"learning_rate": 4.977164743467206e-06,
"loss": 0.0209,
"reward": 1.3859682083129883,
"reward_std": 3.6182608604431152,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.6875,
"rewards/wrapped_driving_reward": -0.5515317916870117,
"rewards/wrapped_format_reward": 0.5,
"step": 222
},
{
"completion_length": 500.0,
"epoch": 44.6,
"grad_norm": 0.8200549483299255,
"kl": 0.3950418531894684,
"learning_rate": 4.976423351108943e-06,
"loss": 0.0158,
"reward": 1.9203238487243652,
"reward_std": 1.1563453674316406,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.7046762704849243,
"rewards/wrapped_format_reward": 0.625,
"step": 223
},
{
"completion_length": 500.0,
"epoch": 44.8,
"grad_norm": 0.6968622207641602,
"kl": 0.2271728217601776,
"learning_rate": 4.975670171853926e-06,
"loss": 0.0091,
"reward": -0.3170052766799927,
"reward_std": 2.1093220710754395,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9772727489471436,
"rewards/wrapped_driving_reward": -2.919278144836426,
"rewards/wrapped_format_reward": 0.625,
"step": 224
},
{
"completion_length": 500.0,
"epoch": 45.0,
"grad_norm": 0.7795050144195557,
"kl": 0.4355601966381073,
"learning_rate": 4.97490520928702e-06,
"loss": 0.0174,
"reward": 2.6324033737182617,
"reward_std": 0.5314469933509827,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.11759641766548157,
"rewards/wrapped_format_reward": 0.75,
"step": 225
},
{
"completion_length": 500.0,
"epoch": 45.2,
"grad_norm": 0.5524005889892578,
"kl": 0.30952146649360657,
"learning_rate": 4.974128467049177e-06,
"loss": 0.0124,
"reward": -2.125,
"reward_std": 1.314977765083313,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.375,
"step": 226
},
{
"completion_length": 500.0,
"epoch": 45.4,
"grad_norm": 0.5645884871482849,
"kl": 0.4939887821674347,
"learning_rate": 4.9733399488374115e-06,
"loss": 0.0198,
"reward": 2.418989658355713,
"reward_std": 0.14345024526119232,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.08101026713848114,
"rewards/wrapped_format_reward": 0.5,
"step": 227
},
{
"completion_length": 500.0,
"epoch": 45.6,
"grad_norm": 0.9631263017654419,
"kl": 0.6647568941116333,
"learning_rate": 4.972539658404793e-06,
"loss": 0.0266,
"reward": -0.0228692889213562,
"reward_std": 3.135000228881836,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.272869110107422,
"rewards/wrapped_format_reward": 0.75,
"step": 228
},
{
"completion_length": 500.0,
"epoch": 45.8,
"grad_norm": 0.5868902802467346,
"kl": 0.536701500415802,
"learning_rate": 4.971727599560418e-06,
"loss": 0.0215,
"reward": 2.595135450363159,
"reward_std": 0.5522119402885437,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.949999988079071,
"rewards/wrapped_driving_reward": 0.020135482773184776,
"rewards/wrapped_format_reward": 0.625,
"step": 229
},
{
"completion_length": 500.0,
"epoch": 46.0,
"grad_norm": 0.6927148103713989,
"kl": 0.5391973257064819,
"learning_rate": 4.970903776169403e-06,
"loss": 0.0216,
"reward": 3.2273426055908203,
"reward_std": 0.38745206594467163,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9642857313156128,
"rewards/wrapped_driving_reward": 0.7630569934844971,
"rewards/wrapped_format_reward": 0.5,
"step": 230
},
{
"completion_length": 500.0,
"epoch": 46.2,
"grad_norm": 2.157358407974243,
"kl": 0.5963761210441589,
"learning_rate": 4.9700681921528495e-06,
"loss": 0.0239,
"reward": 3.3556950092315674,
"reward_std": 0.5486971735954285,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.7306950092315674,
"rewards/wrapped_format_reward": 0.625,
"step": 231
},
{
"completion_length": 500.0,
"epoch": 46.4,
"grad_norm": 0.5409197211265564,
"kl": 0.31054040789604187,
"learning_rate": 4.9692208514878445e-06,
"loss": 0.0124,
"reward": -1.75,
"reward_std": 1.1902379989624023,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.75,
"step": 232
},
{
"completion_length": 500.0,
"epoch": 46.6,
"grad_norm": 0.8271388411521912,
"kl": 0.5030784606933594,
"learning_rate": 4.968361758207428e-06,
"loss": 0.0201,
"reward": 2.2951016426086426,
"reward_std": 0.6324443817138672,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.07989836484193802,
"rewards/wrapped_format_reward": 0.375,
"step": 233
},
{
"completion_length": 500.0,
"epoch": 46.8,
"grad_norm": 0.9013113975524902,
"kl": 0.527148425579071,
"learning_rate": 4.9674909164005805e-06,
"loss": 0.0211,
"reward": -0.08311975002288818,
"reward_std": 4.243640422821045,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -1.5831197500228882,
"rewards/wrapped_format_reward": 0.5,
"step": 234
},
{
"completion_length": 500.0,
"epoch": 47.0,
"grad_norm": 0.621760368347168,
"kl": 0.5894174575805664,
"learning_rate": 4.966608330212198e-06,
"loss": 0.0236,
"reward": 2.69521427154541,
"reward_std": 0.2680894732475281,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9722222089767456,
"rewards/wrapped_driving_reward": -0.15200814604759216,
"rewards/wrapped_format_reward": 0.875,
"step": 235
},
{
"completion_length": 500.0,
"epoch": 47.2,
"grad_norm": 0.6673128604888916,
"kl": 0.42412999272346497,
"learning_rate": 4.965714003843079e-06,
"loss": 0.017,
"reward": -2.0,
"reward_std": 1.0801234245300293,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -4.0,
"rewards/wrapped_format_reward": 0.5,
"step": 236
},
{
"completion_length": 500.0,
"epoch": 47.4,
"grad_norm": 0.6826753616333008,
"kl": 0.48437440395355225,
"learning_rate": 4.9648079415499e-06,
"loss": 0.0194,
"reward": 2.6671550273895264,
"reward_std": 0.6421502828598022,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -0.20784501731395721,
"rewards/wrapped_format_reward": 0.875,
"step": 237
},
{
"completion_length": 500.0,
"epoch": 47.6,
"grad_norm": 0.7442097663879395,
"kl": 0.5179538130760193,
"learning_rate": 4.963890147645195e-06,
"loss": 0.0207,
"reward": 0.023519575595855713,
"reward_std": 1.7913424968719482,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": -2.351480484008789,
"rewards/wrapped_format_reward": 0.375,
"step": 238
},
{
"completion_length": 500.0,
"epoch": 47.8,
"grad_norm": 0.9971833825111389,
"kl": 0.2566893994808197,
"learning_rate": 4.962960626497339e-06,
"loss": 0.0103,
"reward": 1.0741076469421387,
"reward_std": 3.4465811252593994,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -0.6758923530578613,
"rewards/wrapped_format_reward": 0.25,
"step": 239
},
{
"completion_length": 500.0,
"epoch": 48.0,
"grad_norm": 0.776371955871582,
"kl": 0.6667019724845886,
"learning_rate": 4.962019382530521e-06,
"loss": 0.0267,
"reward": 0.7681245803833008,
"reward_std": 3.63140606880188,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.6068754196166992,
"rewards/wrapped_format_reward": 0.875,
"step": 240
},
{
"completion_length": 500.0,
"epoch": 48.2,
"grad_norm": 0.958461344242096,
"kl": 0.6015651226043701,
"learning_rate": 4.961066420224729e-06,
"loss": 0.0241,
"reward": 0.8900174498558044,
"reward_std": 2.1547889709472656,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.9583333134651184,
"rewards/wrapped_driving_reward": -1.568315863609314,
"rewards/wrapped_format_reward": 0.5,
"step": 241
},
{
"completion_length": 500.0,
"epoch": 48.4,
"grad_norm": 0.8577614426612854,
"kl": 0.7052382230758667,
"learning_rate": 4.960101744115727e-06,
"loss": 0.0282,
"reward": 0.500007152557373,
"reward_std": 3.6019463539123535,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.685606062412262,
"rewards/wrapped_driving_reward": -1.6855988502502441,
"rewards/wrapped_format_reward": 0.75,
"step": 242
},
{
"completion_length": 500.0,
"epoch": 48.6,
"grad_norm": 0.6088186502456665,
"kl": 0.3410260081291199,
"learning_rate": 4.959125358795031e-06,
"loss": 0.0136,
"reward": 1.2359226942062378,
"reward_std": 3.157292127609253,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.1390773057937622,
"rewards/wrapped_format_reward": 0.875,
"step": 243
},
{
"completion_length": 500.0,
"epoch": 48.8,
"grad_norm": 0.6780346035957336,
"kl": 0.47339513897895813,
"learning_rate": 4.958137268909887e-06,
"loss": 0.0189,
"reward": 1.3727295398712158,
"reward_std": 3.2822999954223633,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.7250000238418579,
"rewards/wrapped_driving_reward": -0.8522703647613525,
"rewards/wrapped_format_reward": 0.75,
"step": 244
},
{
"completion_length": 500.0,
"epoch": 49.0,
"grad_norm": 0.6219626069068909,
"kl": 0.3212871849536896,
"learning_rate": 4.957137479163253e-06,
"loss": 0.0129,
"reward": 0.08353948593139648,
"reward_std": 2.884551525115967,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.2914605140686035,
"rewards/wrapped_format_reward": 0.875,
"step": 245
},
{
"completion_length": 500.0,
"epoch": 49.2,
"grad_norm": 0.8742188811302185,
"kl": 0.6009516716003418,
"learning_rate": 4.956125994313775e-06,
"loss": 0.024,
"reward": 3.219036817550659,
"reward_std": 0.6377858519554138,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 0.8035714626312256,
"rewards/wrapped_driving_reward": 0.5404652953147888,
"rewards/wrapped_format_reward": 0.875,
"step": 246
},
{
"completion_length": 500.0,
"epoch": 49.4,
"grad_norm": 3.8272242546081543,
"kl": 1.5439887046813965,
"learning_rate": 4.95510281917576e-06,
"loss": 0.0618,
"reward": 3.679497241973877,
"reward_std": 0.29719072580337524,
"rewards/mpc_param_extraction_reward": 1.0,
"rewards/mpc_param_name_reward": 1.0,
"rewards/wrapped_driving_reward": 0.6794970631599426,
"rewards/wrapped_format_reward": 1.0,
"step": 247
},
{
"completion_length": 500.0,
"epoch": 49.6,
"grad_norm": 0.5449193120002747,
"kl": 0.3074452579021454,
"learning_rate": 4.9540679586191605e-06,
"loss": 0.0123,
"reward": -0.8099073171615601,
"reward_std": 2.768624782562256,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -2.9349074363708496,
"rewards/wrapped_format_reward": 0.625,
"step": 248
},
{
"completion_length": 500.0,
"epoch": 49.8,
"grad_norm": 0.9541939496994019,
"kl": 0.5022038221359253,
"learning_rate": 4.953021417569545e-06,
"loss": 0.0201,
"reward": 0.9676476120948792,
"reward_std": 3.3370866775512695,
"rewards/mpc_param_extraction_reward": 0.75,
"rewards/mpc_param_name_reward": 0.75,
"rewards/wrapped_driving_reward": -1.1573524475097656,
"rewards/wrapped_format_reward": 0.625,
"step": 249
},
{
"completion_length": 500.0,
"epoch": 50.0,
"grad_norm": 0.6913716197013855,
"kl": 0.22377586364746094,
"learning_rate": 4.9519632010080765e-06,
"loss": 0.009,
"reward": -0.7356908917427063,
"reward_std": 3.197190761566162,
"rewards/mpc_param_extraction_reward": 0.5,
"rewards/mpc_param_name_reward": 0.5,
"rewards/wrapped_driving_reward": -2.4856908321380615,
"rewards/wrapped_format_reward": 0.75,
"step": 250
}
],
"logging_steps": 1,
"max_steps": 1600,
"num_input_tokens_seen": 0,
"num_train_epochs": 320,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}