{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9935691318327975,
  "eval_steps": 500,
  "global_step": 206,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 2578.0,
      "completions/mean_length": 995.7604370117188,
      "completions/mean_terminated_length": 963.1263427734375,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.00964630225080386,
      "grad_norm": 0.6591727137565613,
      "learning_rate": 0.0,
      "loss": 0.066,
      "num_tokens": 136897.0,
      "reward": 0.6407926082611084,
      "reward_std": 0.26224446296691895,
      "rewards/constexpr_reward/mean": 0.11666667461395264,
      "rewards/constexpr_reward/std": 0.0991189256310463,
      "rewards/imports_decorator_reward/mean": 0.1770833283662796,
      "rewards/imports_decorator_reward/std": 0.06403809040784836,
      "rewards/masks_load_store_reward/mean": 0.0885416641831398,
      "rewards/masks_load_store_reward/std": 0.03201904892921448,
      "rewards/one_code_blob_reward/mean": 0.02701156586408615,
      "rewards/one_code_blob_reward/std": 0.054558686912059784,
      "rewards/reward_code_runs/mean": -0.13697917759418488,
      "rewards/reward_code_runs/std": 0.28388699889183044,
      "rewards/think_reward/mean": 0.19867680966854095,
      "rewards/think_reward/std": 0.009136492386460304,
      "rewards/torch_empty_penalty/mean": -0.01458333432674408,
      "rewards/torch_empty_penalty/std": 0.03547917678952217,
      "rewards/torch_zeros_reward/mean": 0.01145833358168602,
      "rewards/torch_zeros_reward/std": 0.03201904520392418,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3924.0,
      "completions/max_terminated_length": 3924.0,
      "completions/mean_length": 1379.291748046875,
      "completions/mean_terminated_length": 1379.291748046875,
      "completions/min_length": 453.0,
      "completions/min_terminated_length": 453.0,
      "epoch": 0.01929260450160772,
      "grad_norm": 0.5995727181434631,
      "learning_rate": 1e-08,
      "loss": -0.0234,
      "num_tokens": 313889.0,
      "reward": 0.3980224132537842,
      "reward_std": 0.2479538917541504,
      "rewards/constexpr_reward/mean": 0.10000000149011612,
      "rewards/constexpr_reward/std": 0.10052493959665298,
      "rewards/imports_decorator_reward/mean": 0.17916667461395264,
      "rewards/imports_decorator_reward/std": 0.061416033655405045,
      "rewards/masks_load_store_reward/mean": 0.0677083358168602,
      "rewards/masks_load_store_reward/std": 0.047004569321870804,
      "rewards/one_code_blob_reward/mean": 0.003911829087883234,
      "rewards/one_code_blob_reward/std": 0.06139358878135681,
      "rewards/reward_code_runs/mean": -0.2239583283662796,
      "rewards/reward_code_runs/std": 0.17946985363960266,
      "rewards/think_reward/mean": 0.19098560512065887,
      "rewards/think_reward/std": 0.03435641527175903,
      "rewards/torch_empty_penalty/mean": -0.02500000037252903,
      "rewards/torch_empty_penalty/std": 0.04352857545018196,
      "rewards/torch_zeros_reward/mean": 0.01770833320915699,
      "rewards/torch_zeros_reward/std": 0.03837431222200394,
      "rewards/valid_tl_methods_reward/mean": 0.08749999850988388,
      "rewards/valid_tl_methods_reward/std": 0.09973649680614471,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3371.0,
      "completions/max_terminated_length": 3371.0,
      "completions/mean_length": 1258.229248046875,
      "completions/mean_terminated_length": 1258.229248046875,
      "completions/min_length": 616.0,
      "completions/min_terminated_length": 616.0,
      "epoch": 0.028938906752411574,
      "grad_norm": 0.5150277018547058,
      "learning_rate": 2e-08,
      "loss": -0.0327,
      "num_tokens": 480339.0,
      "reward": 0.5664182901382446,
      "reward_std": 0.22327932715415955,
      "rewards/constexpr_reward/mean": 0.15416666865348816,
      "rewards/constexpr_reward/std": 0.08450059592723846,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06562500447034836,
      "rewards/masks_load_store_reward/std": 0.04774521291255951,
      "rewards/one_code_blob_reward/mean": -0.003944525495171547,
      "rewards/one_code_blob_reward/std": 0.06892073899507523,
      "rewards/reward_code_runs/mean": -0.20624999701976776,
      "rewards/reward_code_runs/std": 0.2225746363401413,
      "rewards/think_reward/mean": 0.19223777949810028,
      "rewards/think_reward/std": 0.030695218592882156,
      "rewards/torch_empty_penalty/mean": -0.01979166641831398,
      "rewards/torch_empty_penalty/std": 0.04005204886198044,
      "rewards/torch_zeros_reward/mean": 0.046875,
      "rewards/torch_zeros_reward/std": 0.05016420781612396,
      "rewards/valid_tl_methods_reward/mean": 0.1458333283662796,
      "rewards/valid_tl_methods_reward/std": 0.08934459090232849,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3301.0,
      "completions/max_terminated_length": 3301.0,
      "completions/mean_length": 1140.697998046875,
      "completions/mean_terminated_length": 1140.697998046875,
      "completions/min_length": 608.0,
      "completions/min_terminated_length": 608.0,
      "epoch": 0.03858520900321544,
      "grad_norm": 0.4870540201663971,
      "learning_rate": 3e-08,
      "loss": -0.0068,
      "num_tokens": 636226.0,
      "reward": 0.46876463294029236,
      "reward_std": 0.16259050369262695,
      "rewards/constexpr_reward/mean": 0.13125000894069672,
      "rewards/constexpr_reward/std": 0.09549043327569962,
      "rewards/imports_decorator_reward/mean": 0.18958334624767303,
      "rewards/imports_decorator_reward/std": 0.044672295451164246,
      "rewards/masks_load_store_reward/mean": 0.06562500447034836,
      "rewards/masks_load_store_reward/std": 0.04774521291255951,
      "rewards/one_code_blob_reward/mean": 0.006447285413742065,
      "rewards/one_code_blob_reward/std": 0.06022736802697182,
      "rewards/reward_code_runs/mean": -0.23593749105930328,
      "rewards/reward_code_runs/std": 0.07870769500732422,
      "rewards/think_reward/mean": 0.1992965191602707,
      "rewards/think_reward/std": 0.006892777048051357,
      "rewards/torch_empty_penalty/mean": -0.03750000149011612,
      "rewards/torch_empty_penalty/std": 0.04866642504930496,
      "rewards/torch_zeros_reward/mean": 0.010416666977107525,
      "rewards/torch_zeros_reward/std": 0.03070802055299282,
      "rewards/valid_tl_methods_reward/mean": 0.13958333432674408,
      "rewards/valid_tl_methods_reward/std": 0.09231429547071457,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2648.0,
      "completions/max_terminated_length": 2648.0,
      "completions/mean_length": 1042.760498046875,
      "completions/mean_terminated_length": 1042.760498046875,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.04823151125401929,
      "grad_norm": 0.5751297473907471,
      "learning_rate": 4e-08,
      "loss": -0.0088,
      "num_tokens": 778943.0,
      "reward": 0.5530459880828857,
      "reward_std": 0.2552254796028137,
      "rewards/constexpr_reward/mean": 0.12916666269302368,
      "rewards/constexpr_reward/std": 0.0961541160941124,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.06354167312383652,
      "rewards/masks_load_store_reward/std": 0.04838397353887558,
      "rewards/one_code_blob_reward/mean": 0.01439767237752676,
      "rewards/one_code_blob_reward/std": 0.064917653799057,
      "rewards/reward_code_runs/mean": -0.17552083730697632,
      "rewards/reward_code_runs/std": 0.28441956639289856,
      "rewards/think_reward/mean": 0.19333581626415253,
      "rewards/think_reward/std": 0.03038639947772026,
      "rewards/torch_empty_penalty/mean": -0.03333333507180214,
      "rewards/torch_empty_penalty/std": 0.04738790914416313,
      "rewards/torch_zeros_reward/mean": 0.03229166939854622,
      "rewards/torch_zeros_reward/std": 0.047004569321870804,
      "rewards/valid_tl_methods_reward/mean": 0.14166666567325592,
      "rewards/valid_tl_methods_reward/std": 0.09138313680887222,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3503.0,
      "completions/max_terminated_length": 3503.0,
      "completions/mean_length": 1263.84375,
      "completions/mean_terminated_length": 1263.84375,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.05787781350482315,
      "grad_norm": 0.51258385181427,
      "learning_rate": 5e-08,
      "loss": 0.0506,
      "num_tokens": 946052.0,
      "reward": 0.5646942853927612,
      "reward_std": 0.2725030779838562,
      "rewards/constexpr_reward/mean": 0.1041666641831398,
      "rewards/constexpr_reward/std": 0.10043764114379883,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.06145833432674408,
      "rewards/masks_load_store_reward/std": 0.04892484471201897,
      "rewards/one_code_blob_reward/mean": 0.0059766932390630245,
      "rewards/one_code_blob_reward/std": 0.05690459534525871,
      "rewards/reward_code_runs/mean": -0.17916667461395264,
      "rewards/reward_code_runs/std": 0.2631456255912781,
      "rewards/think_reward/mean": 0.1941341906785965,
      "rewards/think_reward/std": 0.03578920289874077,
      "rewards/torch_empty_penalty/mean": -0.010416666977107525,
      "rewards/torch_empty_penalty/std": 0.030708016827702522,
      "rewards/torch_zeros_reward/mean": 0.02812500111758709,
      "rewards/torch_zeros_reward/std": 0.04519693925976753,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284232854843,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2655.0,
      "completions/max_terminated_length": 2655.0,
      "completions/mean_length": 1081.010498046875,
      "completions/mean_terminated_length": 1081.010498046875,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.06752411575562701,
      "grad_norm": 0.703319251537323,
      "learning_rate": 6e-08,
      "loss": 0.017,
      "num_tokens": 1092393.0,
      "reward": 0.5540701746940613,
      "reward_std": 0.22611849009990692,
      "rewards/constexpr_reward/mean": 0.12708333134651184,
      "rewards/constexpr_reward/std": 0.09676794707775116,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.07604166865348816,
      "rewards/masks_load_store_reward/std": 0.042906977236270905,
      "rewards/one_code_blob_reward/mean": 0.021316377446055412,
      "rewards/one_code_blob_reward/std": 0.05409438535571098,
      "rewards/reward_code_runs/mean": -0.17812500894069672,
      "rewards/reward_code_runs/std": 0.24298717081546783,
      "rewards/think_reward/mean": 0.19942043721675873,
      "rewards/think_reward/std": 0.005678629036992788,
      "rewards/torch_empty_penalty/mean": -0.03333333507180214,
      "rewards/torch_empty_penalty/std": 0.04738790914416313,
      "rewards/torch_zeros_reward/mean": 0.02500000037252903,
      "rewards/torch_zeros_reward/std": 0.04352857545018196,
      "rewards/valid_tl_methods_reward/mean": 0.12916667759418488,
      "rewards/valid_tl_methods_reward/std": 0.0961541160941124,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2545.0,
      "completions/max_terminated_length": 2545.0,
      "completions/mean_length": 1092.4375,
      "completions/mean_terminated_length": 1092.4375,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.07717041800643087,
      "grad_norm": 0.5653499364852905,
      "learning_rate": 7e-08,
      "loss": 0.0581,
      "num_tokens": 1236051.0,
      "reward": 0.5787287950515747,
      "reward_std": 0.27947068214416504,
      "rewards/constexpr_reward/mean": 0.1145833358168602,
      "rewards/constexpr_reward/std": 0.0994502454996109,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06354166567325592,
      "rewards/masks_load_store_reward/std": 0.04838397353887558,
      "rewards/one_code_blob_reward/mean": 0.01976184733211994,
      "rewards/one_code_blob_reward/std": 0.04637972265481949,
      "rewards/reward_code_runs/mean": -0.14479167759418488,
      "rewards/reward_code_runs/std": 0.33217617869377136,
      "rewards/think_reward/mean": 0.19646687805652618,
      "rewards/think_reward/std": 0.017844805493950844,
      "rewards/torch_empty_penalty/mean": -0.03749999776482582,
      "rewards/torch_empty_penalty/std": 0.04866642504930496,
      "rewards/torch_zeros_reward/mean": 0.01458333432674408,
      "rewards/torch_zeros_reward/std": 0.03547917678952217,
      "rewards/valid_tl_methods_reward/mean": 0.16041666269302368,
      "rewards/valid_tl_methods_reward/std": 0.08010409772396088,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2317.0,
      "completions/max_terminated_length": 2317.0,
      "completions/mean_length": 1088.4583740234375,
      "completions/mean_terminated_length": 1088.4583740234375,
      "completions/min_length": 618.0,
      "completions/min_terminated_length": 618.0,
      "epoch": 0.08681672025723473,
      "grad_norm": 0.5193675756454468,
      "learning_rate": 8e-08,
      "loss": 0.0241,
      "num_tokens": 1387679.0,
      "reward": 0.6237555742263794,
      "reward_std": 0.23493945598602295,
      "rewards/constexpr_reward/mean": 0.125,
      "rewards/constexpr_reward/std": 0.09733285009860992,
      "rewards/imports_decorator_reward/mean": 0.18541665375232697,
      "rewards/imports_decorator_reward/std": 0.05227290466427803,
      "rewards/masks_load_store_reward/mean": 0.07499999552965164,
      "rewards/masks_load_store_reward/std": 0.04352857545018196,
      "rewards/one_code_blob_reward/mean": 0.011236711405217648,
      "rewards/one_code_blob_reward/std": 0.05193483084440231,
      "rewards/reward_code_runs/mean": -0.15312500298023224,
      "rewards/reward_code_runs/std": 0.3126131594181061,
      "rewards/think_reward/mean": 0.19793546199798584,
      "rewards/think_reward/std": 0.018091697245836258,
      "rewards/torch_empty_penalty/mean": -0.01875000074505806,
      "rewards/torch_empty_penalty/std": 0.039236124604940414,
      "rewards/torch_zeros_reward/mean": 0.02187499962747097,
      "rewards/torch_zeros_reward/std": 0.04155687242746353,
      "rewards/valid_tl_methods_reward/mean": 0.17916667461395264,
      "rewards/valid_tl_methods_reward/std": 0.061416033655405045,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04166666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 4025.0,
      "completions/mean_length": 1312.15625,
      "completions/mean_terminated_length": 1191.11962890625,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 0.09646302250803858,
      "grad_norm": 0.7497362494468689,
      "learning_rate": 9e-08,
      "loss": 0.0166,
      "num_tokens": 1554818.0,
      "reward": 0.6600733995437622,
      "reward_std": 0.34800395369529724,
      "rewards/constexpr_reward/mean": 0.1145833358168602,
      "rewards/constexpr_reward/std": 0.0994502454996109,
      "rewards/imports_decorator_reward/mean": 0.1770833283662796,
      "rewards/imports_decorator_reward/std": 0.06403809040784836,
      "rewards/masks_load_store_reward/mean": 0.06562500447034836,
      "rewards/masks_load_store_reward/std": 0.04774521291255951,
      "rewards/one_code_blob_reward/mean": 0.017952701076865196,
      "rewards/one_code_blob_reward/std": 0.06317053735256195,
      "rewards/reward_code_runs/mean": -0.08437500149011612,
      "rewards/reward_code_runs/std": 0.4170266091823578,
      "rewards/think_reward/mean": 0.19732896983623505,
      "rewards/think_reward/std": 0.020253153517842293,
      "rewards/torch_empty_penalty/mean": -0.015625,
      "rewards/torch_empty_penalty/std": 0.03649982064962387,
      "rewards/torch_zeros_reward/mean": 0.03125,
      "rewards/torch_zeros_reward/std": 0.04659455642104149,
      "rewards/valid_tl_methods_reward/mean": 0.15625,
      "rewards/valid_tl_methods_reward/std": 0.08311374485492706,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2730.0,
      "completions/max_terminated_length": 2730.0,
      "completions/mean_length": 980.1041870117188,
      "completions/mean_terminated_length": 980.1041870117188,
      "completions/min_length": 437.0,
      "completions/min_terminated_length": 437.0,
      "epoch": 0.10610932475884244,
      "grad_norm": 0.7204728126525879,
      "learning_rate": 1e-07,
      "loss": 0.0299,
      "num_tokens": 1689792.0,
      "reward": 0.5421775579452515,
      "reward_std": 0.2649068534374237,
      "rewards/constexpr_reward/mean": 0.1145833358168602,
      "rewards/constexpr_reward/std": 0.0994502454996109,
      "rewards/imports_decorator_reward/mean": 0.17916667461395264,
      "rewards/imports_decorator_reward/std": 0.061416033655405045,
      "rewards/masks_load_store_reward/mean": 0.0729166641831398,
      "rewards/masks_load_store_reward/std": 0.044672295451164246,
      "rewards/one_code_blob_reward/mean": 0.028092747554183006,
      "rewards/one_code_blob_reward/std": 0.04810455068945885,
      "rewards/reward_code_runs/mean": -0.15677082538604736,
      "rewards/reward_code_runs/std": 0.2936718463897705,
      "rewards/think_reward/mean": 0.19898061454296112,
      "rewards/think_reward/std": 0.009988076984882355,
      "rewards/torch_empty_penalty/mean": -0.04062500223517418,
      "rewards/torch_empty_penalty/std": 0.04937104508280754,
      "rewards/torch_zeros_reward/mean": 0.012500000186264515,
      "rewards/torch_zeros_reward/std": 0.033245496451854706,
      "rewards/valid_tl_methods_reward/mean": 0.13333334028720856,
      "rewards/valid_tl_methods_reward/std": 0.09477581828832626,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3685.0,
      "completions/max_terminated_length": 3685.0,
      "completions/mean_length": 1081.572998046875,
      "completions/mean_terminated_length": 1081.572998046875,
      "completions/min_length": 354.0,
      "completions/min_terminated_length": 354.0,
      "epoch": 0.1157556270096463,
      "grad_norm": 0.5205122828483582,
      "learning_rate": 1.0999999999999999e-07,
      "loss": 0.0225,
      "num_tokens": 1835971.0,
      "reward": 0.6614155173301697,
      "reward_std": 0.3161061406135559,
      "rewards/constexpr_reward/mean": 0.10208333283662796,
      "rewards/constexpr_reward/std": 0.1005031168460846,
      "rewards/imports_decorator_reward/mean": 0.18125002086162567,
      "rewards/imports_decorator_reward/std": 0.058602139353752136,
      "rewards/masks_load_store_reward/mean": 0.0625,
      "rewards/masks_load_store_reward/std": 0.04866642504930496,
      "rewards/one_code_blob_reward/mean": 0.015901053324341774,
      "rewards/one_code_blob_reward/std": 0.05575646832585335,
      "rewards/reward_code_runs/mean": -0.02760416828095913,
      "rewards/reward_code_runs/std": 0.4685852527618408,
      "rewards/think_reward/mean": 0.19811856746673584,
      "rewards/think_reward/std": 0.018434301018714905,
      "rewards/torch_empty_penalty/mean": -0.03854166716337204,
      "rewards/torch_empty_penalty/std": 0.04892484098672867,
      "rewards/torch_zeros_reward/mean": 0.0031250000465661287,
      "rewards/torch_zeros_reward/std": 0.017490599304437637,
      "rewards/valid_tl_methods_reward/mean": 0.16458334028720856,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3961.0,
      "completions/mean_length": 1053.5521240234375,
      "completions/mean_terminated_length": 1021.5263671875,
      "completions/min_length": 375.0,
      "completions/min_terminated_length": 375.0,
      "epoch": 0.12540192926045016,
      "grad_norm": 0.8472212553024292,
      "learning_rate": 1.2e-07,
      "loss": 0.0115,
      "num_tokens": 1982244.0,
      "reward": 0.5119444131851196,
      "reward_std": 0.19705253839492798,
      "rewards/constexpr_reward/mean": 0.12291666865348816,
      "rewards/constexpr_reward/std": 0.09784968197345734,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.06041666865348816,
      "rewards/masks_load_store_reward/std": 0.04915960505604744,
      "rewards/one_code_blob_reward/mean": 0.020093580707907677,
      "rewards/one_code_blob_reward/std": 0.061894889920949936,
      "rewards/reward_code_runs/mean": -0.2265625,
      "rewards/reward_code_runs/std": 0.10051266103982925,
      "rewards/think_reward/mean": 0.19341325759887695,
      "rewards/think_reward/std": 0.03378907963633537,
      "rewards/torch_empty_penalty/mean": -0.02083333395421505,
      "rewards/torch_empty_penalty/std": 0.040824830532073975,
      "rewards/torch_zeros_reward/mean": 0.02916666679084301,
      "rewards/torch_zeros_reward/std": 0.04569156840443611,
      "rewards/valid_tl_methods_reward/mean": 0.1458333283662796,
      "rewards/valid_tl_methods_reward/std": 0.08934459090232849,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2192.0,
      "completions/max_terminated_length": 2192.0,
      "completions/mean_length": 1119.21875,
      "completions/mean_terminated_length": 1119.21875,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 0.13504823151125403,
      "grad_norm": 0.5599005818367004,
      "learning_rate": 1.3e-07,
      "loss": 0.0018,
      "num_tokens": 2136117.0,
      "reward": 0.6745297312736511,
      "reward_std": 0.32885250449180603,
      "rewards/constexpr_reward/mean": 0.12291667610406876,
      "rewards/constexpr_reward/std": 0.09784968197345734,
      "rewards/imports_decorator_reward/mean": 0.18333333730697632,
      "rewards/imports_decorator_reward/std": 0.05556724593043327,
      "rewards/masks_load_store_reward/mean": 0.05937499925494194,
      "rewards/masks_load_store_reward/std": 0.04937104508280754,
      "rewards/one_code_blob_reward/mean": 0.00848993007093668,
      "rewards/one_code_blob_reward/std": 0.05749613419175148,
      "rewards/reward_code_runs/mean": -0.03229166939854622,
      "rewards/reward_code_runs/std": 0.4685351550579071,
      "rewards/think_reward/mean": 0.19937308132648468,
      "rewards/think_reward/std": 0.006142645608633757,
      "rewards/torch_empty_penalty/mean": -0.03020833432674408,
      "rewards/torch_empty_penalty/std": 0.046157147735357285,
      "rewards/torch_zeros_reward/mean": 0.00729166716337204,
      "rewards/torch_zeros_reward/std": 0.026136448606848717,
      "rewards/valid_tl_methods_reward/mean": 0.15625,
      "rewards/valid_tl_methods_reward/std": 0.08311374485492706,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3819.0,
      "completions/mean_length": 1086.15625,
      "completions/mean_terminated_length": 1054.4737548828125,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.14469453376205788,
      "grad_norm": 0.721663236618042,
      "learning_rate": 1.4e-07,
      "loss": 0.0261,
      "num_tokens": 2280180.0,
      "reward": 0.6691732406616211,
      "reward_std": 0.2556842565536499,
      "rewards/constexpr_reward/mean": 0.11874999850988388,
      "rewards/constexpr_reward/std": 0.09874209016561508,
      "rewards/imports_decorator_reward/mean": 0.18125002086162567,
      "rewards/imports_decorator_reward/std": 0.058602139353752136,
      "rewards/masks_load_store_reward/mean": 0.07708332687616348,
      "rewards/masks_load_store_reward/std": 0.04225030168890953,
      "rewards/one_code_blob_reward/mean": 0.025778064504265785,
      "rewards/one_code_blob_reward/std": 0.055621080100536346,
      "rewards/reward_code_runs/mean": -0.12031248956918716,
      "rewards/reward_code_runs/std": 0.324246346950531,
      "rewards/think_reward/mean": 0.19495761394500732,
      "rewards/think_reward/std": 0.03575357794761658,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.015625,
      "rewards/torch_zeros_reward/std": 0.03649982064962387,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3865.0,
      "completions/max_terminated_length": 3865.0,
      "completions/mean_length": 1319.541748046875,
      "completions/mean_terminated_length": 1319.541748046875,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 0.15434083601286175,
      "grad_norm": 0.6778246760368347,
      "learning_rate": 1.5e-07,
      "loss": 0.0155,
      "num_tokens": 2451844.0,
      "reward": 0.5091235637664795,
      "reward_std": 0.22904875874519348,
      "rewards/constexpr_reward/mean": 0.11041667312383652,
      "rewards/constexpr_reward/std": 0.09997806698083878,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06458333879709244,
      "rewards/masks_load_store_reward/std": 0.0480770580470562,
      "rewards/one_code_blob_reward/mean": 0.003854154609143734,
      "rewards/one_code_blob_reward/std": 0.06397390365600586,
      "rewards/reward_code_runs/mean": -0.21458333730697632,
      "rewards/reward_code_runs/std": 0.18944749236106873,
      "rewards/think_reward/mean": 0.19485266506671906,
      "rewards/think_reward/std": 0.02862134948372841,
      "rewards/torch_empty_penalty/mean": -0.02083333395421505,
      "rewards/torch_empty_penalty/std": 0.040824830532073975,
      "rewards/torch_zeros_reward/mean": 0.02291666716337204,
      "rewards/torch_zeros_reward/std": 0.04225029796361923,
      "rewards/valid_tl_methods_reward/mean": 0.15625,
      "rewards/valid_tl_methods_reward/std": 0.08311374485492706,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3195.0,
      "completions/max_terminated_length": 3195.0,
      "completions/mean_length": 1325.729248046875,
      "completions/mean_terminated_length": 1325.729248046875,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 0.1639871382636656,
      "grad_norm": 0.9080011248588562,
      "learning_rate": 1.6e-07,
      "loss": -0.0255,
      "num_tokens": 2625014.0,
      "reward": 0.5426409244537354,
      "reward_std": 0.24068626761436462,
      "rewards/constexpr_reward/mean": 0.12708333134651184,
      "rewards/constexpr_reward/std": 0.09676794707775116,
      "rewards/imports_decorator_reward/mean": 0.18333333730697632,
      "rewards/imports_decorator_reward/std": 0.05556724593043327,
      "rewards/masks_load_store_reward/mean": 0.06041666865348816,
      "rewards/masks_load_store_reward/std": 0.04915960505604744,
      "rewards/one_code_blob_reward/mean": 0.0076617044396698475,
      "rewards/one_code_blob_reward/std": 0.05307823792099953,
      "rewards/reward_code_runs/mean": -0.18385416269302368,
      "rewards/reward_code_runs/std": 0.2603130340576172,
      "rewards/think_reward/mean": 0.19695837795734406,
      "rewards/think_reward/std": 0.019501902163028717,
      "rewards/torch_empty_penalty/mean": -0.012500000186264515,
      "rewards/torch_empty_penalty/std": 0.033245496451854706,
      "rewards/torch_zeros_reward/mean": 0.03229166567325592,
      "rewards/torch_zeros_reward/std": 0.0470045730471611,
      "rewards/valid_tl_methods_reward/mean": 0.13125000894069672,
      "rewards/valid_tl_methods_reward/std": 0.09549042582511902,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2733.0,
      "completions/max_terminated_length": 2733.0,
      "completions/mean_length": 1176.71875,
      "completions/mean_terminated_length": 1176.71875,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.17363344051446947,
      "grad_norm": 0.611341118812561,
      "learning_rate": 1.7000000000000001e-07,
      "loss": -0.0074,
      "num_tokens": 2781767.0,
      "reward": 0.4905034005641937,
      "reward_std": 0.17990529537200928,
      "rewards/constexpr_reward/mean": 0.12291666865348816,
      "rewards/constexpr_reward/std": 0.09784968197345734,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.07187499850988388,
      "rewards/masks_load_store_reward/std": 0.04519694298505783,
      "rewards/one_code_blob_reward/mean": 0.014260203577578068,
      "rewards/one_code_blob_reward/std": 0.0600602962076664,
      "rewards/reward_code_runs/mean": -0.25,
      "rewards/reward_code_runs/std": 0.0,
      "rewards/think_reward/mean": 0.19707651436328888,
      "rewards/think_reward/std": 0.018288368359208107,
      "rewards/torch_empty_penalty/mean": -0.01979166828095913,
      "rewards/torch_empty_penalty/std": 0.04005204886198044,
      "rewards/torch_zeros_reward/mean": 0.03125,
      "rewards/torch_zeros_reward/std": 0.04659455642104149,
      "rewards/valid_tl_methods_reward/mean": 0.1354166716337204,
      "rewards/valid_tl_methods_reward/std": 0.0940091460943222,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3502.0,
      "completions/max_terminated_length": 3502.0,
      "completions/mean_length": 1105.3021240234375,
      "completions/mean_terminated_length": 1105.3021240234375,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 0.1832797427652733,
      "grad_norm": 0.5251040458679199,
      "learning_rate": 1.8e-07,
      "loss": 0.0387,
      "num_tokens": 2931664.0,
      "reward": 0.6205741167068481,
      "reward_std": 0.25367093086242676,
      "rewards/constexpr_reward/mean": 0.13125000894069672,
      "rewards/constexpr_reward/std": 0.09549042582511902,
      "rewards/imports_decorator_reward/mean": 0.17916667461395264,
      "rewards/imports_decorator_reward/std": 0.06141604110598564,
      "rewards/masks_load_store_reward/mean": 0.07083333283662796,
      "rewards/masks_load_store_reward/std": 0.04569156840443611,
      "rewards/one_code_blob_reward/mean": 0.021005704998970032,
      "rewards/one_code_blob_reward/std": 0.05132390931248665,
      "rewards/reward_code_runs/mean": -0.14427083730697632,
      "rewards/reward_code_runs/std": 0.24048960208892822,
      "rewards/think_reward/mean": 0.19800584018230438,
      "rewards/think_reward/std": 0.012335929088294506,
      "rewards/torch_empty_penalty/mean": -0.02291666716337204,
      "rewards/torch_empty_penalty/std": 0.04225030168890953,
      "rewards/torch_zeros_reward/mean": 0.02083333395421505,
      "rewards/torch_zeros_reward/std": 0.040824830532073975,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686063051224,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2679.0,
      "completions/max_terminated_length": 2679.0,
      "completions/mean_length": 950.1146240234375,
      "completions/mean_terminated_length": 950.1146240234375,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 0.19292604501607716,
      "grad_norm": 0.5570400357246399,
      "learning_rate": 1.8999999999999998e-07,
      "loss": 0.0206,
      "num_tokens": 3063267.0,
      "reward": 0.6459920406341553,
      "reward_std": 0.27166104316711426,
      "rewards/constexpr_reward/mean": 0.09375,
      "rewards/constexpr_reward/std": 0.10032841563224792,
      "rewards/imports_decorator_reward/mean": 0.18333333730697632,
      "rewards/imports_decorator_reward/std": 0.05556724593043327,
      "rewards/masks_load_store_reward/mean": 0.0572916679084301,
      "rewards/masks_load_store_reward/std": 0.04972511902451515,
      "rewards/one_code_blob_reward/mean": 0.020587759092450142,
      "rewards/one_code_blob_reward/std": 0.06836681067943573,
      "rewards/reward_code_runs/mean": -0.1015625,
      "rewards/reward_code_runs/std": 0.32922980189323425,
      "rewards/think_reward/mean": 0.19884181022644043,
      "rewards/think_reward/std": 0.007376207038760185,
      "rewards/torch_empty_penalty/mean": -0.02291666716337204,
      "rewards/torch_empty_penalty/std": 0.04225029796361923,
      "rewards/torch_zeros_reward/mean": 0.0416666679084301,
      "rewards/torch_zeros_reward/std": 0.04955946281552315,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2543.0,
      "completions/max_terminated_length": 2543.0,
      "completions/mean_length": 1102.510498046875,
      "completions/mean_terminated_length": 1102.510498046875,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.20257234726688103,
      "grad_norm": 0.7027966380119324,
      "learning_rate": 2e-07,
      "loss": 0.0014,
      "num_tokens": 3208468.0,
      "reward": 0.8018503189086914,
      "reward_std": 0.3390127122402191,
      "rewards/constexpr_reward/mean": 0.13125000894069672,
      "rewards/constexpr_reward/std": 0.09549042582511902,
      "rewards/imports_decorator_reward/mean": 0.18333333730697632,
      "rewards/imports_decorator_reward/std": 0.05556724593043327,
      "rewards/masks_load_store_reward/mean": 0.05937499925494194,
      "rewards/masks_load_store_reward/std": 0.04937104508280754,
      "rewards/one_code_blob_reward/mean": 0.013175372034311295,
      "rewards/one_code_blob_reward/std": 0.07021530717611313,
      "rewards/reward_code_runs/mean": 0.0572916679084301,
      "rewards/reward_code_runs/std": 0.50481778383255,
      "rewards/think_reward/mean": 0.19492490589618683,
      "rewards/think_reward/std": 0.026695001870393753,
      "rewards/torch_empty_penalty/mean": -0.03229166939854622,
      "rewards/torch_empty_penalty/std": 0.047004569321870804,
      "rewards/torch_zeros_reward/mean": 0.02187500149011612,
      "rewards/torch_zeros_reward/std": 0.04155687615275383,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284232854843,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 2754.0,
      "completions/mean_length": 1145.375,
      "completions/mean_terminated_length": 1114.3157958984375,
      "completions/min_length": 367.0,
      "completions/min_terminated_length": 367.0,
      "epoch": 0.21221864951768488,
      "grad_norm": 0.5780210494995117,
      "learning_rate": 2.0999999999999997e-07,
      "loss": 0.0393,
      "num_tokens": 3364180.0,
      "reward": 0.5374256372451782,
      "reward_std": 0.23710334300994873,
      "rewards/constexpr_reward/mean": 0.11666667461395264,
      "rewards/constexpr_reward/std": 0.0991189256310463,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505794763565,
      "rewards/masks_load_store_reward/mean": 0.08749999850988388,
      "rewards/masks_load_store_reward/std": 0.033245496451854706,
      "rewards/one_code_blob_reward/mean": 0.014938410371541977,
      "rewards/one_code_blob_reward/std": 0.05469757318496704,
      "rewards/reward_code_runs/mean": -0.19687502086162567,
      "rewards/reward_code_runs/std": 0.22996710240840912,
      "rewards/think_reward/mean": 0.1943621188402176,
      "rewards/think_reward/std": 0.02727266401052475,
      "rewards/torch_empty_penalty/mean": -0.02916666865348816,
      "rewards/torch_empty_penalty/std": 0.04569156840443611,
      "rewards/torch_zeros_reward/mean": 0.02083333395421505,
      "rewards/torch_zeros_reward/std": 0.040824830532073975,
      "rewards/valid_tl_methods_reward/mean": 0.13750000298023224,
      "rewards/valid_tl_methods_reward/std": 0.09318911284208298,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3467.0,
      "completions/max_terminated_length": 3467.0,
      "completions/mean_length": 971.15625,
      "completions/mean_terminated_length": 971.15625,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.22186495176848875,
      "grad_norm": 0.6884453892707825,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.0707,
      "num_tokens": 3498811.0,
      "reward": 0.6988117098808289,
      "reward_std": 0.32961535453796387,
      "rewards/constexpr_reward/mean": 0.13958333432674408,
      "rewards/constexpr_reward/std": 0.09231429547071457,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.0729166716337204,
      "rewards/masks_load_store_reward/std": 0.044672295451164246,
      "rewards/one_code_blob_reward/mean": 0.030532771721482277,
      "rewards/one_code_blob_reward/std": 0.047944121062755585,
      "rewards/reward_code_runs/mean": -0.07395832985639572,
      "rewards/reward_code_runs/std": 0.4056170582771301,
      "rewards/think_reward/mean": 0.1932789534330368,
      "rewards/think_reward/std": 0.02945057302713394,
      "rewards/torch_empty_penalty/mean": -0.02708333171904087,
      "rewards/torch_empty_penalty/std": 0.044672295451164246,
      "rewards/torch_zeros_reward/mean": 0.01979166828095913,
      "rewards/torch_zeros_reward/std": 0.04005204886198044,
      "rewards/valid_tl_methods_reward/mean": 0.15208333730697632,
      "rewards/valid_tl_methods_reward/std": 0.0858139619231224,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2677.0,
      "completions/max_terminated_length": 2677.0,
      "completions/mean_length": 1099.4375,
      "completions/mean_terminated_length": 1099.4375,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.2315112540192926,
      "grad_norm": 0.6718233227729797,
      "learning_rate": 2.3e-07,
      "loss": -0.0482,
      "num_tokens": 3649573.0,
      "reward": 0.5938155651092529,
      "reward_std": 0.21998779475688934,
      "rewards/constexpr_reward/mean": 0.1562500149011612,
      "rewards/constexpr_reward/std": 0.08311374485492706,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505794763565,
      "rewards/masks_load_store_reward/mean": 0.06666667014360428,
      "rewards/masks_load_store_reward/std": 0.04738791286945343,
      "rewards/one_code_blob_reward/mean": 0.008211708627641201,
      "rewards/one_code_blob_reward/std": 0.06651072949171066,
      "rewards/reward_code_runs/mean": -0.19687502086162567,
      "rewards/reward_code_runs/std": 0.22996710240840912,
      "rewards/think_reward/mean": 0.1960204392671585,
      "rewards/think_reward/std": 0.019158130511641502,
      "rewards/torch_empty_penalty/mean": -0.01874999888241291,
      "rewards/torch_empty_penalty/std": 0.03923612833023071,
      "rewards/torch_zeros_reward/mean": 0.0364583320915699,
      "rewards/torch_zeros_reward/std": 0.04838397353887558,
      "rewards/valid_tl_methods_reward/mean": 0.15416665375232697,
      "rewards/valid_tl_methods_reward/std": 0.08450059592723846,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1504.0,
      "completions/max_terminated_length": 1504.0,
      "completions/mean_length": 762.09375,
      "completions/mean_terminated_length": 762.09375,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.24115755627009647,
      "grad_norm": 0.8245034217834473,
      "learning_rate": 2.4e-07,
      "loss": 0.0112,
      "num_tokens": 3763150.0,
      "reward": 0.5939554572105408,
      "reward_std": 0.2910269498825073,
      "rewards/constexpr_reward/mean": 0.08958333730697632,
      "rewards/constexpr_reward/std": 0.09997807443141937,
      "rewards/imports_decorator_reward/mean": 0.17916667461395264,
      "rewards/imports_decorator_reward/std": 0.061416033655405045,
      "rewards/masks_load_store_reward/mean": 0.08124999701976776,
      "rewards/masks_load_store_reward/std": 0.03923612833023071,
      "rewards/one_code_blob_reward/mean": 0.0392678827047348,
      "rewards/one_code_blob_reward/std": 0.04844178259372711,
      "rewards/reward_code_runs/mean": -0.15677084028720856,
      "rewards/reward_code_runs/std": 0.2936718463897705,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.02604166604578495,
      "rewards/torch_empty_penalty/std": 0.04411657154560089,
      "rewards/torch_zeros_reward/mean": 0.02500000037252903,
      "rewards/torch_zeros_reward/std": 0.04352857545018196,
      "rewards/valid_tl_methods_reward/mean": 0.16250000894069672,
      "rewards/valid_tl_methods_reward/std": 0.07847225666046143,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2847.0,
      "completions/max_terminated_length": 2847.0,
      "completions/mean_length": 1144.5208740234375,
      "completions/mean_terminated_length": 1144.5208740234375,
      "completions/min_length": 607.0,
      "completions/min_terminated_length": 607.0,
      "epoch": 0.2508038585209003,
      "grad_norm": 0.5811767578125,
      "learning_rate": 2.5e-07,
      "loss": 0.0298,
      "num_tokens": 3918792.0,
      "reward": 0.5160846710205078,
      "reward_std": 0.21970906853675842,
      "rewards/constexpr_reward/mean": 0.13333334028720856,
      "rewards/constexpr_reward/std": 0.09477582573890686,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06666667014360428,
      "rewards/masks_load_store_reward/std": 0.04738790914416313,
      "rewards/one_code_blob_reward/mean": 0.015152446925640106,
      "rewards/one_code_blob_reward/std": 0.04148025065660477,
      "rewards/reward_code_runs/mean": -0.2135416716337204,
      "rewards/reward_code_runs/std": 0.16050563752651215,
      "rewards/think_reward/mean": 0.1967654973268509,
      "rewards/think_reward/std": 0.022924818098545074,
      "rewards/torch_empty_penalty/mean": -0.03020833432674408,
      "rewards/torch_empty_penalty/std": 0.046157147735357285,
      "rewards/torch_zeros_reward/mean": 0.03333333507180214,
      "rewards/torch_zeros_reward/std": 0.04738790914416313,
      "rewards/valid_tl_methods_reward/mean": 0.12291666865348816,
      "rewards/valid_tl_methods_reward/std": 0.09784968197345734,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2650.0,
      "completions/max_terminated_length": 2650.0,
      "completions/mean_length": 1387.46875,
      "completions/mean_terminated_length": 1387.46875,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.2604501607717042,
      "grad_norm": 0.4483291506767273,
      "learning_rate": 2.6e-07,
      "loss": -0.0394,
      "num_tokens": 4101141.0,
      "reward": 0.454596608877182,
      "reward_std": 0.16984260082244873,
      "rewards/constexpr_reward/mean": 0.12916666269302368,
      "rewards/constexpr_reward/std": 0.0961541160941124,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981194883584976,
      "rewards/masks_load_store_reward/mean": 0.06354167312383652,
      "rewards/masks_load_store_reward/std": 0.04838397353887558,
      "rewards/one_code_blob_reward/mean": -0.005110291298478842,
      "rewards/one_code_blob_reward/std": 0.06428639590740204,
      "rewards/reward_code_runs/mean": -0.25,
      "rewards/reward_code_runs/std": 0.0,
      "rewards/think_reward/mean": 0.18783187866210938,
      "rewards/think_reward/std": 0.04612936079502106,
      "rewards/torch_empty_penalty/mean": -0.02812500111758709,
      "rewards/torch_empty_penalty/std": 0.04519693925976753,
      "rewards/torch_zeros_reward/mean": 0.02187499962747097,
      "rewards/torch_zeros_reward/std": 0.04155687242746353,
      "rewards/valid_tl_methods_reward/mean": 0.14166666567325592,
      "rewards/valid_tl_methods_reward/std": 0.09138313680887222,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2586.0,
      "completions/max_terminated_length": 2586.0,
      "completions/mean_length": 1038.1458740234375,
      "completions/mean_terminated_length": 1038.1458740234375,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 0.27009646302250806,
      "grad_norm": 0.5938194394111633,
      "learning_rate": 2.7e-07,
      "loss": 0.0157,
      "num_tokens": 4242947.0,
      "reward": 0.554918110370636,
      "reward_std": 0.23007027804851532,
      "rewards/constexpr_reward/mean": 0.13750000298023224,
      "rewards/constexpr_reward/std": 0.09318911284208298,
      "rewards/imports_decorator_reward/mean": 0.18333333730697632,
      "rewards/imports_decorator_reward/std": 0.05556724593043327,
      "rewards/masks_load_store_reward/mean": 0.0729166716337204,
      "rewards/masks_load_store_reward/std": 0.044672295451164246,
      "rewards/one_code_blob_reward/mean": 0.014723489992320538,
      "rewards/one_code_blob_reward/std": 0.059258438646793365,
      "rewards/reward_code_runs/mean": -0.18281249701976776,
      "rewards/reward_code_runs/std": 0.23993729054927826,
      "rewards/think_reward/mean": 0.19696544110774994,
      "rewards/think_reward/std": 0.014847621321678162,
      "rewards/torch_empty_penalty/mean": -0.03125,
      "rewards/torch_empty_penalty/std": 0.04659455642104149,
      "rewards/torch_zeros_reward/mean": 0.02395833469927311,
      "rewards/torch_zeros_reward/std": 0.0429069809615612,
      "rewards/valid_tl_methods_reward/mean": 0.13958333432674408,
      "rewards/valid_tl_methods_reward/std": 0.09231429547071457,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3154.0,
      "completions/mean_length": 1421.8125,
      "completions/mean_terminated_length": 1335.54833984375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.2797427652733119,
      "grad_norm": 0.5307338237762451,
      "learning_rate": 2.8e-07,
      "loss": -0.0047,
      "num_tokens": 4422857.0,
      "reward": 0.5003696084022522,
      "reward_std": 0.27471408247947693,
      "rewards/constexpr_reward/mean": 0.12708334624767303,
      "rewards/constexpr_reward/std": 0.09676794707775116,
      "rewards/imports_decorator_reward/mean": 0.18125002086162567,
      "rewards/imports_decorator_reward/std": 0.058602139353752136,
      "rewards/masks_load_store_reward/mean": 0.07083333283662796,
      "rewards/masks_load_store_reward/std": 0.04569156840443611,
      "rewards/one_code_blob_reward/mean": -0.004753956105560064,
      "rewards/one_code_blob_reward/std": 0.07447423040866852,
      "rewards/reward_code_runs/mean": -0.17916667461395264,
      "rewards/reward_code_runs/std": 0.2631456255912781,
      "rewards/think_reward/mean": 0.1915818601846695,
      "rewards/think_reward/std": 0.037495218217372894,
      "rewards/torch_empty_penalty/mean": -0.02083333395421505,
      "rewards/torch_empty_penalty/std": 0.040824830532073975,
      "rewards/torch_zeros_reward/mean": 0.01770833320915699,
      "rewards/torch_zeros_reward/std": 0.03837431222200394,
      "rewards/valid_tl_methods_reward/mean": 0.11666667461395264,
      "rewards/valid_tl_methods_reward/std": 0.0991189256310463,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3182.0,
      "completions/max_terminated_length": 3182.0,
      "completions/mean_length": 1014.4375,
      "completions/mean_terminated_length": 1014.4375,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.28938906752411575,
      "grad_norm": 0.6022129058837891,
      "learning_rate": 2.9e-07,
      "loss": 0.0434,
      "num_tokens": 4559843.0,
      "reward": 0.6135505437850952,
      "reward_std": 0.2433612048625946,
      "rewards/constexpr_reward/mean": 0.14791665971279144,
      "rewards/constexpr_reward/std": 0.08823314309120178,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.0677083358168602,
      "rewards/masks_load_store_reward/std": 0.0470045730471611,
      "rewards/one_code_blob_reward/mean": 0.021184049546718597,
      "rewards/one_code_blob_reward/std": 0.05145742744207382,
      "rewards/reward_code_runs/mean": -0.18645834922790527,
      "rewards/reward_code_runs/std": 0.2141665667295456,
      "rewards/think_reward/mean": 0.19861643016338348,
      "rewards/think_reward/std": 0.0135562838986516,
      "rewards/torch_empty_penalty/mean": -0.01666666753590107,
      "rewards/torch_empty_penalty/std": 0.037463437765836716,
      "rewards/torch_zeros_reward/mean": 0.02291666716337204,
      "rewards/torch_zeros_reward/std": 0.04225030168890953,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686808109283,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2744.0,
      "completions/max_terminated_length": 2744.0,
      "completions/mean_length": 1047.03125,
      "completions/mean_terminated_length": 1047.03125,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.2990353697749196,
      "grad_norm": 0.7429094314575195,
      "learning_rate": 3e-07,
      "loss": -0.0031,
      "num_tokens": 4698254.0,
      "reward": 0.7336772680282593,
      "reward_std": 0.3611759543418884,
      "rewards/constexpr_reward/mean": 0.1145833358168602,
      "rewards/constexpr_reward/std": 0.0994502454996109,
      "rewards/imports_decorator_reward/mean": 0.18958334624767303,
      "rewards/imports_decorator_reward/std": 0.044672295451164246,
      "rewards/masks_load_store_reward/mean": 0.07187499850988388,
      "rewards/masks_load_store_reward/std": 0.04519693925976753,
      "rewards/one_code_blob_reward/mean": 0.024039460346102715,
      "rewards/one_code_blob_reward/std": 0.049430347979068756,
      "rewards/reward_code_runs/mean": -0.01822916604578495,
      "rewards/reward_code_runs/std": 0.46854308247566223,
      "rewards/think_reward/mean": 0.1945335417985916,
      "rewards/think_reward/std": 0.0342429056763649,
      "rewards/torch_empty_penalty/mean": -0.02500000037252903,
      "rewards/torch_empty_penalty/std": 0.04352857545018196,
      "rewards/torch_zeros_reward/mean": 0.015625,
      "rewards/torch_zeros_reward/std": 0.03649982064962387,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686063051224,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2556.0,
      "completions/max_terminated_length": 2556.0,
      "completions/mean_length": 1054.416748046875,
      "completions/mean_terminated_length": 1054.416748046875,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 0.3086816720257235,
      "grad_norm": 0.6211071014404297,
      "learning_rate": 3.1e-07,
      "loss": -0.0106,
      "num_tokens": 4842738.0,
      "reward": 0.600447416305542,
      "reward_std": 0.24926647543907166,
      "rewards/constexpr_reward/mean": 0.12916666269302368,
      "rewards/constexpr_reward/std": 0.0961541160941124,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981198608875275,
      "rewards/masks_load_store_reward/mean": 0.06354167312383652,
      "rewards/masks_load_store_reward/std": 0.04838397353887558,
      "rewards/one_code_blob_reward/mean": 0.01908457837998867,
      "rewards/one_code_blob_reward/std": 0.051201947033405304,
      "rewards/reward_code_runs/mean": -0.1640625,
      "rewards/reward_code_runs/std": 0.25138595700263977,
      "rewards/think_reward/mean": 0.19959193468093872,
      "rewards/think_reward/std": 0.0039982725866138935,
      "rewards/torch_empty_penalty/mean": -0.0312500037252903,
      "rewards/torch_empty_penalty/std": 0.04659455269575119,
      "rewards/torch_zeros_reward/mean": 0.03229166567325592,
      "rewards/torch_zeros_reward/std": 0.0470045730471611,
      "rewards/valid_tl_methods_reward/mean": 0.15833334624767303,
      "rewards/valid_tl_methods_reward/std": 0.08164966106414795,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3608.0,
      "completions/max_terminated_length": 3608.0,
      "completions/mean_length": 1173.635498046875,
      "completions/mean_terminated_length": 1173.635498046875,
      "completions/min_length": 504.0,
      "completions/min_terminated_length": 504.0,
      "epoch": 0.3183279742765273,
      "grad_norm": 0.6180107593536377,
      "learning_rate": 3.2e-07,
      "loss": 0.039,
      "num_tokens": 5001895.0,
      "reward": 0.4860292375087738,
      "reward_std": 0.16718368232250214,
      "rewards/constexpr_reward/mean": 0.11874999850988388,
      "rewards/constexpr_reward/std": 0.09874209016561508,
      "rewards/imports_decorator_reward/mean": 0.18541665375232697,
      "rewards/imports_decorator_reward/std": 0.05227290466427803,
      "rewards/masks_load_store_reward/mean": 0.0729166641831398,
      "rewards/masks_load_store_reward/std": 0.044672295451164246,
      "rewards/one_code_blob_reward/mean": 0.01918347366154194,
      "rewards/one_code_blob_reward/std": 0.03821328654885292,
      "rewards/reward_code_runs/mean": -0.2265625,
      "rewards/reward_code_runs/std": 0.10051266103982925,
      "rewards/think_reward/mean": 0.1965332180261612,
      "rewards/think_reward/std": 0.024040669202804565,
      "rewards/torch_empty_penalty/mean": -0.02916666679084301,
      "rewards/torch_empty_penalty/std": 0.04569156840443611,
      "rewards/torch_zeros_reward/mean": 0.02604166604578495,
      "rewards/torch_zeros_reward/std": 0.04411657154560089,
      "rewards/valid_tl_methods_reward/mean": 0.12291667610406876,
      "rewards/valid_tl_methods_reward/std": 0.09784968197345734,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3269.0,
      "completions/max_terminated_length": 3269.0,
      "completions/mean_length": 1302.8125,
      "completions/mean_terminated_length": 1302.8125,
      "completions/min_length": 642.0,
      "completions/min_terminated_length": 642.0,
      "epoch": 0.3279742765273312,
      "grad_norm": 0.4564855396747589,
      "learning_rate": 3.3e-07,
      "loss": 0.0455,
      "num_tokens": 5174113.0,
      "reward": 0.5342321395874023,
      "reward_std": 0.17704753577709198,
      "rewards/constexpr_reward/mean": 0.1354166716337204,
      "rewards/constexpr_reward/std": 0.0940091460943222,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06354167312383652,
      "rewards/masks_load_store_reward/std": 0.04838397353887558,
      "rewards/one_code_blob_reward/mean": 0.009001667611300945,
      "rewards/one_code_blob_reward/std": 0.03968409448862076,
      "rewards/reward_code_runs/mean": -0.2265625,
      "rewards/reward_code_runs/std": 0.10051266103982925,
      "rewards/think_reward/mean": 0.19866792857646942,
      "rewards/think_reward/std": 0.009620300494134426,
      "rewards/torch_empty_penalty/mean": -0.0031250000465661287,
      "rewards/torch_empty_penalty/std": 0.017490599304437637,
      "rewards/torch_zeros_reward/mean": 0.01979166828095913,
      "rewards/torch_zeros_reward/std": 0.04005204886198044,
      "rewards/valid_tl_methods_reward/mean": 0.1458333283662796,
      "rewards/valid_tl_methods_reward/std": 0.08934459090232849,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2075.0,
      "completions/max_terminated_length": 2075.0,
      "completions/mean_length": 817.9896240234375,
      "completions/mean_terminated_length": 817.9896240234375,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.33762057877813506,
      "grad_norm": 0.6788144707679749,
      "learning_rate": 3.4000000000000003e-07,
      "loss": -0.0262,
      "num_tokens": 5287392.0,
      "reward": 0.8014768362045288,
      "reward_std": 0.35335928201675415,
      "rewards/constexpr_reward/mean": 0.13958333432674408,
      "rewards/constexpr_reward/std": 0.09231429547071457,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981198608875275,
      "rewards/masks_load_store_reward/mean": 0.07708332687616348,
      "rewards/masks_load_store_reward/std": 0.04225030168890953,
      "rewards/one_code_blob_reward/mean": 0.044637531042099,
      "rewards/one_code_blob_reward/std": 0.03677495941519737,
      "rewards/reward_code_runs/mean": -0.011458327062427998,
      "rewards/reward_code_runs/std": 0.4453549087047577,
      "rewards/think_reward/mean": 0.19850583374500275,
      "rewards/think_reward/std": 0.014639697037637234,
      "rewards/torch_empty_penalty/mean": -0.010416666977107525,
      "rewards/torch_empty_penalty/std": 0.03070801869034767,
      "rewards/torch_zeros_reward/mean": 0.009374999441206455,
      "rewards/torch_zeros_reward/std": 0.029301069676876068,
      "rewards/valid_tl_methods_reward/mean": 0.16041666269302368,
      "rewards/valid_tl_methods_reward/std": 0.08010409772396088,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 4069.0,
      "completions/mean_length": 1136.21875,
      "completions/mean_terminated_length": 1040.741943359375,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.34726688102893893,
      "grad_norm": 0.7359632849693298,
      "learning_rate": 3.5e-07,
      "loss": 0.0526,
      "num_tokens": 5434593.0,
      "reward": 0.7075538635253906,
      "reward_std": 0.2721782922744751,
      "rewards/constexpr_reward/mean": 0.14791665971279144,
      "rewards/constexpr_reward/std": 0.08823314309120178,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.0677083358168602,
      "rewards/masks_load_store_reward/std": 0.047004569321870804,
      "rewards/one_code_blob_reward/mean": 0.02682466246187687,
      "rewards/one_code_blob_reward/std": 0.059499479830265045,
      "rewards/reward_code_runs/mean": -0.1015625,
      "rewards/reward_code_runs/std": 0.32922980189323425,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.008333333767950535,
      "rewards/torch_empty_penalty/std": 0.027783624827861786,
      "rewards/torch_zeros_reward/mean": 0.010416666977107525,
      "rewards/torch_zeros_reward/std": 0.03070802055299282,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809785842896,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3331.0,
      "completions/max_terminated_length": 3331.0,
      "completions/mean_length": 1302.3333740234375,
      "completions/mean_terminated_length": 1302.3333740234375,
      "completions/min_length": 593.0,
      "completions/min_terminated_length": 593.0,
      "epoch": 0.35691318327974275,
      "grad_norm": 0.5119906663894653,
      "learning_rate": 3.6e-07,
      "loss": -0.0113,
      "num_tokens": 5607485.0,
      "reward": 0.632323145866394,
      "reward_std": 0.24545639753341675,
      "rewards/constexpr_reward/mean": 0.12291666120290756,
      "rewards/constexpr_reward/std": 0.09784968197345734,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.07708332687616348,
      "rewards/masks_load_store_reward/std": 0.04225030168890953,
      "rewards/one_code_blob_reward/mean": 0.010937422513961792,
      "rewards/one_code_blob_reward/std": 0.046592701226472855,
      "rewards/reward_code_runs/mean": -0.14270831644535065,
      "rewards/reward_code_runs/std": 0.29964709281921387,
      "rewards/think_reward/mean": 0.19742733240127563,
      "rewards/think_reward/std": 0.01643279939889908,
      "rewards/torch_empty_penalty/mean": -0.01875000074505806,
      "rewards/torch_empty_penalty/std": 0.039236124604940414,
      "rewards/torch_zeros_reward/mean": 0.01666666753590107,
      "rewards/torch_zeros_reward/std": 0.03746343404054642,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3039.0,
      "completions/max_terminated_length": 3039.0,
      "completions/mean_length": 1210.7083740234375,
      "completions/mean_terminated_length": 1210.7083740234375,
      "completions/min_length": 556.0,
      "completions/min_terminated_length": 556.0,
      "epoch": 0.3665594855305466,
      "grad_norm": 0.6607086062431335,
      "learning_rate": 3.7e-07,
      "loss": -0.0111,
      "num_tokens": 5767849.0,
      "reward": 0.5687481164932251,
      "reward_std": 0.19704470038414001,
      "rewards/constexpr_reward/mean": 0.14791667461395264,
      "rewards/constexpr_reward/std": 0.08823314309120178,
      "rewards/imports_decorator_reward/mean": 0.18958334624767303,
      "rewards/imports_decorator_reward/std": 0.044672295451164246,
      "rewards/masks_load_store_reward/mean": 0.07187499850988388,
      "rewards/masks_load_store_reward/std": 0.04519694298505783,
      "rewards/one_code_blob_reward/mean": 0.004737721756100655,
      "rewards/one_code_blob_reward/std": 0.06969740241765976,
      "rewards/reward_code_runs/mean": -0.22187499701976776,
      "rewards/reward_code_runs/std": 0.10949946194887161,
      "rewards/think_reward/mean": 0.19630199670791626,
      "rewards/think_reward/std": 0.0195402093231678,
      "rewards/torch_empty_penalty/mean": -0.0052083334885537624,
      "rewards/torch_empty_penalty/std": 0.022336145862936974,
      "rewards/torch_zeros_reward/mean": 0.03333333507180214,
      "rewards/torch_zeros_reward/std": 0.04738790914416313,
      "rewards/valid_tl_methods_reward/mean": 0.15208333730697632,
      "rewards/valid_tl_methods_reward/std": 0.0858139619231224,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2575.0,
      "completions/max_terminated_length": 2575.0,
      "completions/mean_length": 1134.291748046875,
      "completions/mean_terminated_length": 1134.291748046875,
      "completions/min_length": 486.0,
      "completions/min_terminated_length": 486.0,
      "epoch": 0.3762057877813505,
      "grad_norm": 0.5510346293449402,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.0248,
      "num_tokens": 5922641.0,
      "reward": 0.6064773797988892,
      "reward_std": 0.18955551087856293,
      "rewards/constexpr_reward/mean": 0.15833333134651184,
      "rewards/constexpr_reward/std": 0.08164966106414795,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981198608875275,
      "rewards/masks_load_store_reward/mean": 0.0781250074505806,
      "rewards/masks_load_store_reward/std": 0.04155687242746353,
      "rewards/one_code_blob_reward/mean": 0.023969531059265137,
      "rewards/one_code_blob_reward/std": 0.019068855792284012,
      "rewards/reward_code_runs/mean": -0.20156250894069672,
      "rewards/reward_code_runs/std": 0.2263501137495041,
      "rewards/think_reward/mean": 0.19865359365940094,
      "rewards/think_reward/std": 0.011305413208901882,
      "rewards/torch_empty_penalty/mean": -0.03854167088866234,
      "rewards/torch_empty_penalty/std": 0.04892484471201897,
      "rewards/torch_zeros_reward/mean": 0.014583333395421505,
      "rewards/torch_zeros_reward/std": 0.03547917678952217,
      "rewards/valid_tl_methods_reward/mean": 0.17916667461395264,
      "rewards/valid_tl_methods_reward/std": 0.061416033655405045,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2542.0,
      "completions/max_terminated_length": 2542.0,
      "completions/mean_length": 1113.197998046875,
      "completions/mean_terminated_length": 1113.197998046875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.3858520900321543,
      "grad_norm": 0.5919126868247986,
      "learning_rate": 3.8999999999999997e-07,
      "loss": -0.0312,
      "num_tokens": 6074520.0,
      "reward": 0.57293701171875,
      "reward_std": 0.16654378175735474,
      "rewards/constexpr_reward/mean": 0.15833333134651184,
      "rewards/constexpr_reward/std": 0.08164966106414795,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.07604166865348816,
      "rewards/masks_load_store_reward/std": 0.042906977236270905,
      "rewards/one_code_blob_reward/mean": 0.016401944682002068,
      "rewards/one_code_blob_reward/std": 0.05448228865861893,
      "rewards/reward_code_runs/mean": -0.203125,
      "rewards/reward_code_runs/std": 0.13818608224391937,
      "rewards/think_reward/mean": 0.197160005569458,
      "rewards/think_reward/std": 0.014792421832680702,
      "rewards/torch_empty_penalty/mean": -0.01770833320915699,
      "rewards/torch_empty_penalty/std": 0.03837431222200394,
      "rewards/torch_zeros_reward/mean": 0.03749999776482582,
      "rewards/torch_zeros_reward/std": 0.04866642504930496,
      "rewards/valid_tl_methods_reward/mean": 0.11250000447034836,
      "rewards/valid_tl_methods_reward/std": 0.09973649680614471,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2118.0,
      "completions/max_terminated_length": 2118.0,
      "completions/mean_length": 883.5104370117188,
      "completions/mean_terminated_length": 883.5104370117188,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.3954983922829582,
      "grad_norm": 0.6467012166976929,
      "learning_rate": 4e-07,
      "loss": -0.0067,
      "num_tokens": 6197725.0,
      "reward": 0.7270565032958984,
      "reward_std": 0.3342481553554535,
      "rewards/constexpr_reward/mean": 0.14374999701976776,
      "rewards/constexpr_reward/std": 0.09039388597011566,
      "rewards/imports_decorator_reward/mean": 0.1875,
      "rewards/imports_decorator_reward/std": 0.04866642504930496,
      "rewards/masks_load_store_reward/mean": 0.08020833134651184,
      "rewards/masks_load_store_reward/std": 0.04005204886198044,
      "rewards/one_code_blob_reward/mean": 0.0374731607735157,
      "rewards/one_code_blob_reward/std": 0.03838203102350235,
      "rewards/reward_code_runs/mean": -0.0729166641831398,
      "rewards/reward_code_runs/std": 0.3925568163394928,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.02708333171904087,
      "rewards/torch_empty_penalty/std": 0.044672295451164246,
      "rewards/torch_zeros_reward/mean": 0.01145833358168602,
      "rewards/torch_zeros_reward/std": 0.03201904892921448,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686808109283,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3387.0,
      "completions/max_terminated_length": 3387.0,
      "completions/mean_length": 1058.7396240234375,
      "completions/mean_terminated_length": 1058.7396240234375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 0.40514469453376206,
      "grad_norm": 0.6954807043075562,
      "learning_rate": 4.0999999999999994e-07,
      "loss": 0.0317,
      "num_tokens": 6342840.0,
      "reward": 0.6654441356658936,
      "reward_std": 0.29857248067855835,
      "rewards/constexpr_reward/mean": 0.15625,
      "rewards/constexpr_reward/std": 0.08311374485492706,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.08645833283662796,
      "rewards/masks_load_store_reward/std": 0.034396421164274216,
      "rewards/one_code_blob_reward/mean": 0.014418717473745346,
      "rewards/one_code_blob_reward/std": 0.05984903872013092,
      "rewards/reward_code_runs/mean": -0.11874999850988388,
      "rewards/reward_code_runs/std": 0.37015289068222046,
      "rewards/think_reward/mean": 0.19790036976337433,
      "rewards/think_reward/std": 0.017045380547642708,
      "rewards/torch_empty_penalty/mean": -0.04062500223517418,
      "rewards/torch_empty_penalty/std": 0.04937104508280754,
      "rewards/torch_zeros_reward/mean": 0.015625,
      "rewards/torch_zeros_reward/std": 0.03649982064962387,
      "rewards/valid_tl_methods_reward/mean": 0.16250000894069672,
      "rewards/valid_tl_methods_reward/std": 0.07847225666046143,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2707.0,
      "completions/max_terminated_length": 2707.0,
      "completions/mean_length": 1218.3958740234375,
      "completions/mean_terminated_length": 1218.3958740234375,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.41479099678456594,
      "grad_norm": 0.6541871428489685,
      "learning_rate": 4.1999999999999995e-07,
      "loss": -0.0297,
      "num_tokens": 6505454.0,
      "reward": 0.6576408743858337,
      "reward_std": 0.2851487398147583,
      "rewards/constexpr_reward/mean": 0.15000000596046448,
      "rewards/constexpr_reward/std": 0.08705715090036392,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06666667014360428,
      "rewards/masks_load_store_reward/std": 0.04738790914416313,
      "rewards/one_code_blob_reward/mean": 0.01206450629979372,
      "rewards/one_code_blob_reward/std": 0.04901020601391792,
      "rewards/reward_code_runs/mean": -0.13072915375232697,
      "rewards/reward_code_runs/std": 0.33696553111076355,
      "rewards/think_reward/mean": 0.19818048179149628,
      "rewards/think_reward/std": 0.015096686780452728,
      "rewards/torch_empty_penalty/mean": -0.01145833358168602,
      "rewards/torch_empty_penalty/std": 0.03201904520392418,
      "rewards/torch_zeros_reward/mean": 0.03125,
      "rewards/torch_zeros_reward/std": 0.04659455642104149,
      "rewards/valid_tl_methods_reward/mean": 0.15000000596046448,
      "rewards/valid_tl_methods_reward/std": 0.08705715090036392,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3599.0,
      "completions/max_terminated_length": 3599.0,
      "completions/mean_length": 1293.84375,
      "completions/mean_terminated_length": 1293.84375,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 0.42443729903536975,
      "grad_norm": 0.5934505462646484,
      "learning_rate": 4.2999999999999996e-07,
      "loss": 0.0274,
      "num_tokens": 6676295.0,
      "reward": 0.5304321646690369,
      "reward_std": 0.13345909118652344,
      "rewards/constexpr_reward/mean": 0.1770833283662796,
      "rewards/constexpr_reward/std": 0.06403809040784836,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.06979166716337204,
      "rewards/masks_load_store_reward/std": 0.046157147735357285,
      "rewards/one_code_blob_reward/mean": 0.013796854764223099,
      "rewards/one_code_blob_reward/std": 0.03560984879732132,
      "rewards/reward_code_runs/mean": -0.24531249701976776,
      "rewards/reward_code_runs/std": 0.04592793434858322,
      "rewards/think_reward/mean": 0.1963227540254593,
      "rewards/think_reward/std": 0.023702142760157585,
      "rewards/torch_empty_penalty/mean": -0.02708333544433117,
      "rewards/torch_empty_penalty/std": 0.044672295451164246,
      "rewards/torch_zeros_reward/mean": 0.02291666716337204,
      "rewards/torch_zeros_reward/std": 0.04225029796361923,
      "rewards/valid_tl_methods_reward/mean": 0.12708334624767303,
      "rewards/valid_tl_methods_reward/std": 0.09676794707775116,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2542.0,
      "completions/max_terminated_length": 2542.0,
      "completions/mean_length": 1152.0833740234375,
      "completions/mean_terminated_length": 1152.0833740234375,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.4340836012861736,
      "grad_norm": 0.7656160593032837,
      "learning_rate": 4.3999999999999997e-07,
      "loss": -0.0271,
      "num_tokens": 6832231.0,
      "reward": 0.6402577757835388,
      "reward_std": 0.24128377437591553,
      "rewards/constexpr_reward/mean": 0.16249999403953552,
      "rewards/constexpr_reward/std": 0.07847225666046143,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.06354167312383652,
      "rewards/masks_load_store_reward/std": 0.04838397353887558,
      "rewards/one_code_blob_reward/mean": 0.014514167793095112,
      "rewards/one_code_blob_reward/std": 0.04225924238562584,
      "rewards/reward_code_runs/mean": -0.10729166120290756,
      "rewards/reward_code_runs/std": 0.3435096740722656,
      "rewards/think_reward/mean": 0.19866019487380981,
      "rewards/think_reward/std": 0.009797739796340466,
      "rewards/torch_empty_penalty/mean": -0.04375000298023224,
      "rewards/torch_empty_penalty/std": 0.04986824840307236,
      "rewards/torch_zeros_reward/mean": 0.02708333171904087,
      "rewards/torch_zeros_reward/std": 0.044672295451164246,
      "rewards/valid_tl_methods_reward/mean": 0.13333332538604736,
      "rewards/valid_tl_methods_reward/std": 0.09477582573890686,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2606.0,
      "completions/max_terminated_length": 2606.0,
      "completions/mean_length": 890.1875,
      "completions/mean_terminated_length": 890.1875,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.4437299035369775,
      "grad_norm": 0.7248244285583496,
      "learning_rate": 4.5e-07,
      "loss": 0.0216,
      "num_tokens": 6955813.0,
      "reward": 0.6671415567398071,
      "reward_std": 0.21686410903930664,
      "rewards/constexpr_reward/mean": 0.14374999701976776,
      "rewards/constexpr_reward/std": 0.09039388597011566,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.06875000149011612,
      "rewards/masks_load_store_reward/std": 0.04659455642104149,
      "rewards/one_code_blob_reward/mean": 0.03583366796374321,
      "rewards/one_code_blob_reward/std": 0.034937743097543716,
      "rewards/reward_code_runs/mean": -0.11927083134651184,
      "rewards/reward_code_runs/std": 0.30791059136390686,
      "rewards/think_reward/mean": 0.19849534332752228,
      "rewards/think_reward/std": 0.014742674306035042,
      "rewards/torch_empty_penalty/mean": -0.01979166828095913,
      "rewards/torch_empty_penalty/std": 0.04005204886198044,
      "rewards/torch_zeros_reward/mean": 0.0072916666977107525,
      "rewards/torch_zeros_reward/std": 0.026136448606848717,
      "rewards/valid_tl_methods_reward/mean": 0.15416668355464935,
      "rewards/valid_tl_methods_reward/std": 0.08450059592723846,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2120.0,
      "completions/max_terminated_length": 2120.0,
      "completions/mean_length": 1056.447998046875,
      "completions/mean_terminated_length": 1056.447998046875,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 0.4533762057877814,
      "grad_norm": 0.6093177795410156,
      "learning_rate": 4.6e-07,
      "loss": -0.009,
      "num_tokens": 7101140.0,
      "reward": 0.6234441995620728,
      "reward_std": 0.2314853072166443,
      "rewards/constexpr_reward/mean": 0.17500001192092896,
      "rewards/constexpr_reward/std": 0.06649099290370941,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.011535567231476307,
      "rewards/one_code_blob_reward/std": 0.053763993084430695,
      "rewards/reward_code_runs/mean": -0.18645833432674408,
      "rewards/reward_code_runs/std": 0.2141665816307068,
      "rewards/think_reward/mean": 0.19836688041687012,
      "rewards/think_reward/std": 0.016001230105757713,
      "rewards/torch_empty_penalty/mean": -0.0364583358168602,
      "rewards/torch_empty_penalty/std": 0.04838397353887558,
      "rewards/torch_zeros_reward/mean": 0.012500000186264515,
      "rewards/torch_zeros_reward/std": 0.033245496451854706,
      "rewards/valid_tl_methods_reward/mean": 0.15625,
      "rewards/valid_tl_methods_reward/std": 0.08311374485492706,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2821.0,
      "completions/max_terminated_length": 2821.0,
      "completions/mean_length": 1075.125,
      "completions/mean_terminated_length": 1075.125,
      "completions/min_length": 581.0,
      "completions/min_terminated_length": 581.0,
      "epoch": 0.4630225080385852,
      "grad_norm": 0.5320653319358826,
      "learning_rate": 4.6999999999999995e-07,
      "loss": 0.0107,
      "num_tokens": 7250696.0,
      "reward": 0.601868748664856,
      "reward_std": 0.20453622937202454,
      "rewards/constexpr_reward/mean": 0.15625,
      "rewards/constexpr_reward/std": 0.08311375230550766,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.0677083358168602,
      "rewards/masks_load_store_reward/std": 0.0470045730471611,
      "rewards/one_code_blob_reward/mean": 0.015392146073281765,
      "rewards/one_code_blob_reward/std": 0.04831596091389656,
      "rewards/reward_code_runs/mean": -0.20520834624767303,
      "rewards/reward_code_runs/std": 0.1984783113002777,
      "rewards/think_reward/mean": 0.19793486595153809,
      "rewards/think_reward/std": 0.014705672860145569,
      "rewards/torch_empty_penalty/mean": -0.015625,
      "rewards/torch_empty_penalty/std": 0.03649982064962387,
      "rewards/torch_zeros_reward/mean": 0.03124999813735485,
      "rewards/torch_zeros_reward/std": 0.04659455269575119,
      "rewards/valid_tl_methods_reward/mean": 0.15833333134651184,
      "rewards/valid_tl_methods_reward/std": 0.08164966106414795,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2741.0,
      "completions/max_terminated_length": 2741.0,
      "completions/mean_length": 1201.479248046875,
      "completions/mean_terminated_length": 1201.479248046875,
      "completions/min_length": 537.0,
      "completions/min_terminated_length": 537.0,
      "epoch": 0.47266881028938906,
      "grad_norm": 0.6406047940254211,
      "learning_rate": 4.8e-07,
      "loss": 0.0062,
      "num_tokens": 7413042.0,
      "reward": 0.5506672859191895,
      "reward_std": 0.12405920028686523,
      "rewards/constexpr_reward/mean": 0.16458332538604736,
      "rewards/constexpr_reward/std": 0.07674862444400787,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981194883584976,
      "rewards/masks_load_store_reward/mean": 0.07499999552965164,
      "rewards/masks_load_store_reward/std": 0.04352857545018196,
      "rewards/one_code_blob_reward/mean": 0.018375607207417488,
      "rewards/one_code_blob_reward/std": 0.027684977278113365,
      "rewards/reward_code_runs/mean": -0.25,
      "rewards/reward_code_runs/std": 0.0,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.03020833432674408,
      "rewards/torch_empty_penalty/std": 0.046157147735357285,
      "rewards/torch_zeros_reward/mean": 0.02291666716337204,
      "rewards/torch_zeros_reward/std": 0.04225030168890953,
      "rewards/valid_tl_methods_reward/mean": 0.1562500149011612,
      "rewards/valid_tl_methods_reward/std": 0.08311374485492706,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2284.0,
      "completions/max_terminated_length": 2284.0,
      "completions/mean_length": 1079.1771240234375,
      "completions/mean_terminated_length": 1079.1771240234375,
      "completions/min_length": 424.0,
      "completions/min_terminated_length": 424.0,
      "epoch": 0.48231511254019294,
      "grad_norm": 0.6042345762252808,
      "learning_rate": 4.9e-07,
      "loss": 0.0487,
      "num_tokens": 7559171.0,
      "reward": 0.730362594127655,
      "reward_std": 0.30589550733566284,
      "rewards/constexpr_reward/mean": 0.16250000894069672,
      "rewards/constexpr_reward/std": 0.07847225666046143,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.08125000447034836,
      "rewards/masks_load_store_reward/std": 0.03923612833023071,
      "rewards/one_code_blob_reward/mean": 0.025675097480416298,
      "rewards/one_code_blob_reward/std": 0.04089103266596794,
      "rewards/reward_code_runs/mean": -0.1119791641831398,
      "rewards/reward_code_runs/std": 0.3423405885696411,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.012500000186264515,
      "rewards/torch_empty_penalty/std": 0.033245496451854706,
      "rewards/torch_zeros_reward/mean": 0.03333333507180214,
      "rewards/torch_zeros_reward/std": 0.04738790914416313,
      "rewards/valid_tl_methods_reward/mean": 0.15208333730697632,
      "rewards/valid_tl_methods_reward/std": 0.08581395447254181,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 2704.0,
      "completions/max_terminated_length": 2704.0,
      "completions/mean_length": 1062.40625,
      "completions/mean_terminated_length": 1048.4105224609375,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.4919614147909968,
      "grad_norm": 61.933326721191406,
      "learning_rate": 5e-07,
      "loss": -0.0098,
      "num_tokens": 7704446.0,
      "reward": 0.6786465644836426,
      "reward_std": 0.2720298171043396,
      "rewards/constexpr_reward/mean": 0.1666666716337204,
      "rewards/constexpr_reward/std": 0.07492686808109283,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0729166641831398,
      "rewards/masks_load_store_reward/std": 0.044672295451164246,
      "rewards/one_code_blob_reward/mean": 0.03380136936903,
      "rewards/one_code_blob_reward/std": 0.02556346170604229,
      "rewards/reward_code_runs/mean": -0.14374999701976776,
      "rewards/reward_code_runs/std": 0.31633177399635315,
      "rewards/think_reward/mean": 0.19692851603031158,
      "rewards/think_reward/std": 0.01767767407000065,
      "rewards/torch_empty_penalty/mean": -0.01874999888241291,
      "rewards/torch_empty_penalty/std": 0.03923612833023071,
      "rewards/torch_zeros_reward/mean": 0.010416666977107525,
      "rewards/torch_zeros_reward/std": 0.03070802055299282,
      "rewards/valid_tl_methods_reward/mean": 0.16041666269302368,
      "rewards/valid_tl_methods_reward/std": 0.08010409772396088,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2608.0,
      "completions/max_terminated_length": 2608.0,
      "completions/mean_length": 1029.1458740234375,
      "completions/mean_terminated_length": 1029.1458740234375,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 0.5016077170418006,
      "grad_norm": 0.5496352314949036,
      "learning_rate": 5.1e-07,
      "loss": 0.0153,
      "num_tokens": 7845580.0,
      "reward": 0.7204399108886719,
      "reward_std": 0.26986491680145264,
      "rewards/constexpr_reward/mean": 0.16875000298023224,
      "rewards/constexpr_reward/std": 0.07299964129924774,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.07395833730697632,
      "rewards/masks_load_store_reward/std": 0.04411657154560089,
      "rewards/one_code_blob_reward/mean": 0.02279212512075901,
      "rewards/one_code_blob_reward/std": 0.051034536212682724,
      "rewards/reward_code_runs/mean": -0.08385416120290756,
      "rewards/reward_code_runs/std": 0.34834155440330505,
      "rewards/think_reward/mean": 0.19712696969509125,
      "rewards/think_reward/std": 0.014551707543432713,
      "rewards/torch_empty_penalty/mean": -0.0020833334419876337,
      "rewards/torch_empty_penalty/std": 0.01435758825391531,
      "rewards/torch_zeros_reward/mean": 0.012500000186264515,
      "rewards/torch_zeros_reward/std": 0.033245496451854706,
      "rewards/valid_tl_methods_reward/mean": 0.13958333432674408,
      "rewards/valid_tl_methods_reward/std": 0.09231429547071457,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3084.0,
      "completions/mean_length": 1194.7396240234375,
      "completions/mean_terminated_length": 1164.2000732421875,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 0.5112540192926045,
      "grad_norm": 0.5003747940063477,
      "learning_rate": 5.2e-07,
      "loss": 0.073,
      "num_tokens": 8003787.0,
      "reward": 0.7361212968826294,
      "reward_std": 0.31317782402038574,
      "rewards/constexpr_reward/mean": 0.16458334028720856,
      "rewards/constexpr_reward/std": 0.07674862444400787,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.07916667312383652,
      "rewards/masks_load_store_reward/std": 0.040824830532073975,
      "rewards/one_code_blob_reward/mean": 0.011868351139128208,
      "rewards/one_code_blob_reward/std": 0.058140359818935394,
      "rewards/reward_code_runs/mean": -0.11406251043081284,
      "rewards/reward_code_runs/std": 0.37132078409194946,
      "rewards/think_reward/mean": 0.19352371990680695,
      "rewards/think_reward/std": 0.033737994730472565,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.03750000149011612,
      "rewards/torch_zeros_reward/std": 0.04866642504930496,
      "rewards/valid_tl_methods_reward/mean": 0.16875000298023224,
      "rewards/valid_tl_methods_reward/std": 0.07299964129924774,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3009.0,
      "completions/max_terminated_length": 3009.0,
      "completions/mean_length": 1083.697998046875,
      "completions/mean_terminated_length": 1083.697998046875,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 0.5209003215434084,
      "grad_norm": 0.7225779891014099,
      "learning_rate": 5.3e-07,
      "loss": 0.0123,
      "num_tokens": 8149246.0,
      "reward": 0.6963642835617065,
      "reward_std": 0.20251570641994476,
      "rewards/constexpr_reward/mean": 0.18125002086162567,
      "rewards/constexpr_reward/std": 0.058602139353752136,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.08437499403953552,
      "rewards/masks_load_store_reward/std": 0.03649982064962387,
      "rewards/one_code_blob_reward/mean": 0.02474900148808956,
      "rewards/one_code_blob_reward/std": 0.030808135867118835,
      "rewards/reward_code_runs/mean": -0.15833333134651184,
      "rewards/reward_code_runs/std": 0.23290687799453735,
      "rewards/think_reward/mean": 0.1955736130475998,
      "rewards/think_reward/std": 0.022120453417301178,
      "rewards/torch_empty_penalty/mean": -0.02083333395421505,
      "rewards/torch_empty_penalty/std": 0.040824830532073975,
      "rewards/torch_zeros_reward/mean": 0.02916666865348816,
      "rewards/torch_zeros_reward/std": 0.04569156840443611,
      "rewards/valid_tl_methods_reward/mean": 0.16458332538604736,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2648.0,
      "completions/max_terminated_length": 2648.0,
      "completions/mean_length": 976.3125,
      "completions/mean_terminated_length": 976.3125,
      "completions/min_length": 396.0,
      "completions/min_terminated_length": 396.0,
      "epoch": 0.5305466237942122,
      "grad_norm": 0.7034012675285339,
      "learning_rate": 5.4e-07,
      "loss": 0.0208,
      "num_tokens": 8281648.0,
      "reward": 0.7647324800491333,
      "reward_std": 0.23368114233016968,
      "rewards/constexpr_reward/mean": 0.17916667461395264,
      "rewards/constexpr_reward/std": 0.06141603738069534,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.0885416641831398,
      "rewards/masks_load_store_reward/std": 0.03201904520392418,
      "rewards/one_code_blob_reward/mean": 0.03195902332663536,
      "rewards/one_code_blob_reward/std": 0.04141776263713837,
      "rewards/reward_code_runs/mean": -0.11927083134651184,
      "rewards/reward_code_runs/std": 0.30791059136390686,
      "rewards/think_reward/mean": 0.1957942247390747,
      "rewards/think_reward/std": 0.02325473167002201,
      "rewards/torch_empty_penalty/mean": -0.010416666977107525,
      "rewards/torch_empty_penalty/std": 0.03070801869034767,
      "rewards/torch_zeros_reward/mean": 0.0364583320915699,
      "rewards/torch_zeros_reward/std": 0.04838397353887558,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686808109283,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3093.0,
      "completions/max_terminated_length": 3093.0,
      "completions/mean_length": 1219.8333740234375,
      "completions/mean_terminated_length": 1219.8333740234375,
      "completions/min_length": 562.0,
      "completions/min_terminated_length": 562.0,
      "epoch": 0.5401929260450161,
      "grad_norm": 0.4217401146888733,
      "learning_rate": 5.5e-07,
      "loss": -0.0083,
      "num_tokens": 8447400.0,
      "reward": 0.5094969868659973,
      "reward_std": 0.1287832260131836,
      "rewards/constexpr_reward/mean": 0.17500001192092896,
      "rewards/constexpr_reward/std": 0.06649099290370941,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981194883584976,
      "rewards/masks_load_store_reward/mean": 0.06979166716337204,
      "rewards/masks_load_store_reward/std": 0.046157147735357285,
      "rewards/one_code_blob_reward/mean": 0.011323712766170502,
      "rewards/one_code_blob_reward/std": 0.046563684940338135,
      "rewards/reward_code_runs/mean": -0.25,
      "rewards/reward_code_runs/std": 0.0,
      "rewards/think_reward/mean": 0.19713155925273895,
      "rewards/think_reward/std": 0.014952579513192177,
      "rewards/torch_empty_penalty/mean": -0.04270833730697632,
      "rewards/torch_empty_penalty/std": 0.04972512274980545,
      "rewards/torch_zeros_reward/mean": 0.02395833283662796,
      "rewards/torch_zeros_reward/std": 0.042906977236270905,
      "rewards/valid_tl_methods_reward/mean": 0.13125000894069672,
      "rewards/valid_tl_methods_reward/std": 0.09549042582511902,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3450.0,
      "completions/max_terminated_length": 3450.0,
      "completions/mean_length": 1081.125,
      "completions/mean_terminated_length": 1081.125,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 0.5498392282958199,
      "grad_norm": 0.5305235385894775,
      "learning_rate": 5.6e-07,
      "loss": 0.0105,
      "num_tokens": 8594748.0,
      "reward": 0.746235728263855,
      "reward_std": 0.19432096183300018,
      "rewards/constexpr_reward/mean": 0.1770833283662796,
      "rewards/constexpr_reward/std": 0.06403809040784836,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517837047577,
      "rewards/masks_load_store_reward/mean": 0.08229167014360428,
      "rewards/masks_load_store_reward/std": 0.03837431222200394,
      "rewards/one_code_blob_reward/mean": 0.022025473415851593,
      "rewards/one_code_blob_reward/std": 0.04012902453541756,
      "rewards/reward_code_runs/mean": -0.109375,
      "rewards/reward_code_runs/std": 0.3724253475666046,
      "rewards/think_reward/mean": 0.19816844165325165,
      "rewards/think_reward/std": 0.014005015604197979,
      "rewards/torch_empty_penalty/mean": -0.01875000074505806,
      "rewards/torch_empty_penalty/std": 0.03923612833023071,
      "rewards/torch_zeros_reward/mean": 0.03437500074505806,
      "rewards/torch_zeros_reward/std": 0.04774520918726921,
      "rewards/valid_tl_methods_reward/mean": 0.16458334028720856,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2933.0,
      "completions/max_terminated_length": 2933.0,
      "completions/mean_length": 976.96875,
      "completions/mean_terminated_length": 976.96875,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.5594855305466238,
      "grad_norm": 0.6822076439857483,
      "learning_rate": 5.699999999999999e-07,
      "loss": -0.019,
      "num_tokens": 8730561.0,
      "reward": 0.7617365717887878,
      "reward_std": 0.2241070717573166,
      "rewards/constexpr_reward/mean": 0.16875000298023224,
      "rewards/constexpr_reward/std": 0.07299964129924774,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08125000447034836,
      "rewards/masks_load_store_reward/std": 0.03923612833023071,
      "rewards/one_code_blob_reward/mean": 0.026567451655864716,
      "rewards/one_code_blob_reward/std": 0.05450519919395447,
      "rewards/reward_code_runs/mean": -0.12708333134651184,
      "rewards/reward_code_runs/std": 0.3533238172531128,
      "rewards/think_reward/mean": 0.19454412162303925,
      "rewards/think_reward/std": 0.03444638103246689,
      "rewards/torch_empty_penalty/mean": -0.015625,
      "rewards/torch_empty_penalty/std": 0.03649982064962387,
      "rewards/torch_zeros_reward/mean": 0.05625000223517418,
      "rewards/torch_zeros_reward/std": 0.04986824840307236,
      "rewards/valid_tl_methods_reward/mean": 0.17916667461395264,
      "rewards/valid_tl_methods_reward/std": 0.06141603738069534,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1891.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1022.7083740234375,
      "completions/mean_terminated_length": 1022.7083740234375,
      "completions/min_length": 584.0,
      "completions/min_terminated_length": 584.0,
      "epoch": 0.5691318327974276,
      "grad_norm": 0.5892595052719116,
      "learning_rate": 5.8e-07,
      "loss": 0.0007,
      "num_tokens": 8874965.0,
      "reward": 0.7719477415084839,
      "reward_std": 0.19649700820446014,
      "rewards/constexpr_reward/mean": 0.1770833283662796,
      "rewards/constexpr_reward/std": 0.06403809785842896,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0729166641831398,
      "rewards/masks_load_store_reward/std": 0.044672295451164246,
      "rewards/one_code_blob_reward/mean": 0.022468604147434235,
      "rewards/one_code_blob_reward/std": 0.036056578159332275,
      "rewards/reward_code_runs/mean": -0.10468748956918716,
      "rewards/reward_code_runs/std": 0.37346726655960083,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.01145833358168602,
      "rewards/torch_empty_penalty/std": 0.03201904892921448,
      "rewards/torch_zeros_reward/mean": 0.0364583320915699,
      "rewards/torch_zeros_reward/std": 0.04838397353887558,
      "rewards/valid_tl_methods_reward/mean": 0.18125002086162567,
      "rewards/valid_tl_methods_reward/std": 0.05860213562846184,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2705.0,
      "completions/max_terminated_length": 2705.0,
      "completions/mean_length": 1103.1146240234375,
      "completions/mean_terminated_length": 1103.1146240234375,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 0.5787781350482315,
      "grad_norm": 0.5568946599960327,
      "learning_rate": 5.9e-07,
      "loss": 0.0115,
      "num_tokens": 9024520.0,
      "reward": 0.7087312936782837,
      "reward_std": 0.2077682912349701,
      "rewards/constexpr_reward/mean": 0.18958334624767303,
      "rewards/constexpr_reward/std": 0.044672295451164246,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981198608875275,
      "rewards/masks_load_store_reward/mean": 0.0833333358168602,
      "rewards/masks_load_store_reward/std": 0.03746343404054642,
      "rewards/one_code_blob_reward/mean": 0.019941674545407295,
      "rewards/one_code_blob_reward/std": 0.037697289139032364,
      "rewards/reward_code_runs/mean": -0.1666666567325592,
      "rewards/reward_code_runs/std": 0.20347851514816284,
      "rewards/think_reward/mean": 0.19816450774669647,
      "rewards/think_reward/std": 0.017156243324279785,
      "rewards/torch_empty_penalty/mean": -0.015625,
      "rewards/torch_empty_penalty/std": 0.03649982064962387,
      "rewards/torch_zeros_reward/mean": 0.03125,
      "rewards/torch_zeros_reward/std": 0.04659455642104149,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1910.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 960.59375,
      "completions/mean_terminated_length": 960.59375,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.5884244372990354,
      "grad_norm": 0.6765536665916443,
      "learning_rate": 6e-07,
      "loss": -0.014,
      "num_tokens": 9159625.0,
      "reward": 1.0015864372253418,
      "reward_std": 0.2235845923423767,
      "rewards/constexpr_reward/mean": 0.18541668355464935,
      "rewards/constexpr_reward/std": 0.05227290466427803,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981198608875275,
      "rewards/masks_load_store_reward/mean": 0.08125000447034836,
      "rewards/masks_load_store_reward/std": 0.03923612833023071,
      "rewards/one_code_blob_reward/mean": 0.03283637762069702,
      "rewards/one_code_blob_reward/std": 0.03562239184975624,
      "rewards/reward_code_runs/mean": 0.14166666567325592,
      "rewards/reward_code_runs/std": 0.5730190873146057,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.02604166604578495,
      "rewards/torch_empty_penalty/std": 0.04411657154560089,
      "rewards/torch_zeros_reward/mean": 0.02604166604578495,
      "rewards/torch_zeros_reward/std": 0.04411657154560089,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686808109283,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3111.0,
      "completions/max_terminated_length": 3111.0,
      "completions/mean_length": 1079.375,
      "completions/mean_terminated_length": 1079.375,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.5980707395498392,
      "grad_norm": 0.7044626474380493,
      "learning_rate": 6.1e-07,
      "loss": 0.0289,
      "num_tokens": 9304165.0,
      "reward": 0.7447230219841003,
      "reward_std": 0.20032760500907898,
      "rewards/constexpr_reward/mean": 0.18333333730697632,
      "rewards/constexpr_reward/std": 0.05556724593043327,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.08125000447034836,
      "rewards/masks_load_store_reward/std": 0.03923612833023071,
      "rewards/one_code_blob_reward/mean": 0.028499729931354523,
      "rewards/one_code_blob_reward/std": 0.03636397048830986,
      "rewards/reward_code_runs/mean": -0.10624999552965164,
      "rewards/reward_code_runs/std": 0.3280926048755646,
      "rewards/think_reward/mean": 0.19226489961147308,
      "rewards/think_reward/std": 0.03887832909822464,
      "rewards/torch_empty_penalty/mean": -0.02083333395421505,
      "rewards/torch_empty_penalty/std": 0.040824830532073975,
      "rewards/torch_zeros_reward/mean": 0.02395833283662796,
      "rewards/torch_zeros_reward/std": 0.042906977236270905,
      "rewards/valid_tl_methods_reward/mean": 0.16250000894069672,
      "rewards/valid_tl_methods_reward/std": 0.07847225666046143,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3746.0,
      "completions/max_terminated_length": 3746.0,
      "completions/mean_length": 1197.572998046875,
      "completions/mean_terminated_length": 1197.572998046875,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 0.6077170418006431,
      "grad_norm": 0.617486298084259,
      "learning_rate": 6.2e-07,
      "loss": 0.0432,
      "num_tokens": 9462644.0,
      "reward": 0.7086712121963501,
      "reward_std": 0.21238891780376434,
      "rewards/constexpr_reward/mean": 0.1875,
      "rewards/constexpr_reward/std": 0.04866642504930496,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08749999850988388,
      "rewards/masks_load_store_reward/std": 0.033245496451854706,
      "rewards/one_code_blob_reward/mean": 0.029755033552646637,
      "rewards/one_code_blob_reward/std": 0.02773762121796608,
      "rewards/reward_code_runs/mean": -0.12708333134651184,
      "rewards/reward_code_runs/std": 0.3533238172531128,
      "rewards/think_reward/mean": 0.19662445783615112,
      "rewards/think_reward/std": 0.023549221456050873,
      "rewards/torch_empty_penalty/mean": -0.004166666883975267,
      "rewards/torch_empty_penalty/std": 0.020087527111172676,
      "rewards/torch_zeros_reward/mean": 0.01770833320915699,
      "rewards/torch_zeros_reward/std": 0.03837431222200394,
      "rewards/valid_tl_methods_reward/mean": 0.12291666865348816,
      "rewards/valid_tl_methods_reward/std": 0.09784968197345734,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3416.0,
      "completions/max_terminated_length": 3416.0,
      "completions/mean_length": 974.75,
      "completions/mean_terminated_length": 974.75,
      "completions/min_length": 575.0,
      "completions/min_terminated_length": 575.0,
      "epoch": 0.617363344051447,
      "grad_norm": 0.5682595372200012,
      "learning_rate": 6.3e-07,
      "loss": 0.077,
      "num_tokens": 9595640.0,
      "reward": 0.6542726755142212,
      "reward_std": 0.2039661705493927,
      "rewards/constexpr_reward/mean": 0.1875,
      "rewards/constexpr_reward/std": 0.04866642504930496,
      "rewards/imports_decorator_reward/mean": 0.1937500238418579,
      "rewards/imports_decorator_reward/std": 0.034981198608875275,
      "rewards/masks_load_store_reward/mean": 0.08958333730697632,
      "rewards/masks_load_store_reward/std": 0.030708016827702522,
      "rewards/one_code_blob_reward/mean": 0.025626754388213158,
      "rewards/one_code_blob_reward/std": 0.03773731365799904,
      "rewards/reward_code_runs/mean": -0.19947916269302368,
      "rewards/reward_code_runs/std": 0.1758430451154709,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.02395833469927311,
      "rewards/torch_empty_penalty/std": 0.0429069809615612,
      "rewards/torch_zeros_reward/mean": 0.02083333395421505,
      "rewards/torch_zeros_reward/std": 0.040824830532073975,
      "rewards/valid_tl_methods_reward/mean": 0.16041666269302368,
      "rewards/valid_tl_methods_reward/std": 0.08010409772396088,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3137.0,
      "completions/max_terminated_length": 3137.0,
      "completions/mean_length": 1027.65625,
      "completions/mean_terminated_length": 1027.65625,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.6270096463022508,
      "grad_norm": 0.920167088508606,
      "learning_rate": 6.4e-07,
      "loss": 0.007,
      "num_tokens": 9734891.0,
      "reward": 0.7769454717636108,
      "reward_std": 0.29669874906539917,
      "rewards/constexpr_reward/mean": 0.18958334624767303,
      "rewards/constexpr_reward/std": 0.044672295451164246,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.07187499850988388,
      "rewards/masks_load_store_reward/std": 0.04519694298505783,
      "rewards/one_code_blob_reward/mean": 0.02731100283563137,
      "rewards/one_code_blob_reward/std": 0.03964829444885254,
      "rewards/reward_code_runs/mean": -0.08697917312383652,
      "rewards/reward_code_runs/std": 0.39056265354156494,
      "rewards/think_reward/mean": 0.19807188212871552,
      "rewards/think_reward/std": 0.014280364848673344,
      "rewards/torch_empty_penalty/mean": -0.02187499962747097,
      "rewards/torch_empty_penalty/std": 0.04155687242746353,
      "rewards/torch_zeros_reward/mean": 0.02187500149011612,
      "rewards/torch_zeros_reward/std": 0.04155687615275383,
      "rewards/valid_tl_methods_reward/mean": 0.18125002086162567,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3469.0,
      "completions/max_terminated_length": 3469.0,
      "completions/mean_length": 1270.28125,
      "completions/mean_terminated_length": 1270.28125,
      "completions/min_length": 645.0,
      "completions/min_terminated_length": 645.0,
      "epoch": 0.6366559485530546,
      "grad_norm": 0.5540661215782166,
      "learning_rate": 6.5e-07,
      "loss": 0.0347,
      "num_tokens": 9900650.0,
      "reward": 0.6175910234451294,
      "reward_std": 0.151606023311615,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08125000447034836,
      "rewards/masks_load_store_reward/std": 0.03923612833023071,
      "rewards/one_code_blob_reward/mean": 0.018527232110500336,
      "rewards/one_code_blob_reward/std": 0.014282294549047947,
      "rewards/reward_code_runs/mean": -0.21718747913837433,
      "rewards/reward_code_runs/std": 0.1176140308380127,
      "rewards/think_reward/mean": 0.19541792571544647,
      "rewards/think_reward/std": 0.033646970987319946,
      "rewards/torch_empty_penalty/mean": -0.03229166567325592,
      "rewards/torch_empty_penalty/std": 0.0470045730471611,
      "rewards/torch_zeros_reward/mean": 0.0364583320915699,
      "rewards/torch_zeros_reward/std": 0.04838396981358528,
      "rewards/valid_tl_methods_reward/mean": 0.1458333283662796,
      "rewards/valid_tl_methods_reward/std": 0.08934459090232849,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1970.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 920.7916870117188,
      "completions/mean_terminated_length": 920.7916870117188,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.6463022508038585,
      "grad_norm": 0.7348382472991943,
      "learning_rate": 6.6e-07,
      "loss": 0.0654,
      "num_tokens": 10030314.0,
      "reward": 0.782570481300354,
      "reward_std": 0.26678401231765747,
      "rewards/constexpr_reward/mean": 0.1666666716337204,
      "rewards/constexpr_reward/std": 0.07492686808109283,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.07708332687616348,
      "rewards/masks_load_store_reward/std": 0.04225030168890953,
      "rewards/one_code_blob_reward/mean": 0.03829953074455261,
      "rewards/one_code_blob_reward/std": 0.03580077737569809,
      "rewards/reward_code_runs/mean": -0.06197916343808174,
      "rewards/reward_code_runs/std": 0.4319932162761688,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": -0.01666666753590107,
      "rewards/torch_empty_penalty/std": 0.03746343404054642,
      "rewards/torch_zeros_reward/mean": 0.01770833320915699,
      "rewards/torch_zeros_reward/std": 0.03837431222200394,
      "rewards/valid_tl_methods_reward/mean": 0.16458334028720856,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2447.0,
      "completions/max_terminated_length": 2447.0,
      "completions/mean_length": 1012.15625,
      "completions/mean_terminated_length": 1012.15625,
      "completions/min_length": 528.0,
      "completions/min_terminated_length": 528.0,
      "epoch": 0.6559485530546624,
      "grad_norm": 0.5175846219062805,
      "learning_rate": 6.7e-07,
      "loss": 0.0515,
      "num_tokens": 10169061.0,
      "reward": 0.7850543856620789,
      "reward_std": 0.2515929341316223,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.08437500149011612,
      "rewards/masks_load_store_reward/std": 0.03649982064962387,
      "rewards/one_code_blob_reward/mean": 0.03036683052778244,
      "rewards/one_code_blob_reward/std": 0.017092684283852577,
      "rewards/reward_code_runs/mean": -0.12968748807907104,
      "rewards/reward_code_runs/std": 0.3213113248348236,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.00729166716337204,
      "rewards/torch_empty_penalty/std": 0.026136448606848717,
      "rewards/torch_zeros_reward/mean": 0.03020833432674408,
      "rewards/torch_zeros_reward/std": 0.046157147735357285,
      "rewards/valid_tl_methods_reward/mean": 0.18333333730697632,
      "rewards/valid_tl_methods_reward/std": 0.05556724593043327,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.10416666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3485.0,
      "completions/mean_length": 1298.5208740234375,
      "completions/mean_terminated_length": 973.2325439453125,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.6655948553054662,
      "grad_norm": 0.6217107176780701,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0101,
      "num_tokens": 10336235.0,
      "reward": 0.7228078842163086,
      "reward_std": 0.2464483678340912,
      "rewards/constexpr_reward/mean": 0.14791665971279144,
      "rewards/constexpr_reward/std": 0.08823314309120178,
      "rewards/imports_decorator_reward/mean": 0.17500001192092896,
      "rewards/imports_decorator_reward/std": 0.06649099290370941,
      "rewards/masks_load_store_reward/mean": 0.07187499850988388,
      "rewards/masks_load_store_reward/std": 0.04519694298505783,
      "rewards/one_code_blob_reward/mean": 0.01695355586707592,
      "rewards/one_code_blob_reward/std": 0.08903989940881729,
      "rewards/reward_code_runs/mean": -0.06041666865348816,
      "rewards/reward_code_runs/std": 0.3515317440032959,
      "rewards/think_reward/mean": 0.19543762505054474,
      "rewards/think_reward/std": 0.03356730937957764,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.02291666716337204,
      "rewards/torch_zeros_reward/std": 0.04225030168890953,
      "rewards/valid_tl_methods_reward/mean": 0.15416668355464935,
      "rewards/valid_tl_methods_reward/std": 0.08450059592723846,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.02083333333333337,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 2915.0,
      "completions/mean_length": 1177.7708740234375,
      "completions/mean_terminated_length": 1115.6807861328125,
      "completions/min_length": 538.0,
      "completions/min_terminated_length": 538.0,
      "epoch": 0.6752411575562701,
      "grad_norm": 0.5847841501235962,
      "learning_rate": 6.9e-07,
      "loss": 0.0669,
      "num_tokens": 10496965.0,
      "reward": 0.59849613904953,
      "reward_std": 0.2042919248342514,
      "rewards/constexpr_reward/mean": 0.18333333730697632,
      "rewards/constexpr_reward/std": 0.05556724593043327,
      "rewards/imports_decorator_reward/mean": 0.18958334624767303,
      "rewards/imports_decorator_reward/std": 0.044672295451164246,
      "rewards/masks_load_store_reward/mean": 0.08645833283662796,
      "rewards/masks_load_store_reward/std": 0.034396424889564514,
      "rewards/one_code_blob_reward/mean": 0.016383716836571693,
      "rewards/one_code_blob_reward/std": 0.05423387512564659,
      "rewards/reward_code_runs/mean": -0.22760416567325592,
      "rewards/reward_code_runs/std": 0.14213962852954865,
      "rewards/think_reward/mean": 0.19617490470409393,
      "rewards/think_reward/std": 0.019116153940558434,
      "rewards/torch_empty_penalty/mean": -0.01145833358168602,
      "rewards/torch_empty_penalty/std": 0.03201904520392418,
      "rewards/torch_zeros_reward/mean": 0.02395833469927311,
      "rewards/torch_zeros_reward/std": 0.0429069809615612,
      "rewards/valid_tl_methods_reward/mean": 0.14166666567325592,
      "rewards/valid_tl_methods_reward/std": 0.09138313680887222,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3453.0,
      "completions/max_terminated_length": 3453.0,
      "completions/mean_length": 1173.96875,
      "completions/mean_terminated_length": 1173.96875,
      "completions/min_length": 406.0,
      "completions/min_terminated_length": 406.0,
      "epoch": 0.684887459807074,
      "grad_norm": 0.5115599036216736,
      "learning_rate": 7e-07,
      "loss": 0.0136,
      "num_tokens": 10652350.0,
      "reward": 0.8516645431518555,
      "reward_std": 0.3125598132610321,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09270834177732468,
      "rewards/masks_load_store_reward/std": 0.026136452332139015,
      "rewards/one_code_blob_reward/mean": 0.030832627788186073,
      "rewards/one_code_blob_reward/std": 0.023144574835896492,
      "rewards/reward_code_runs/mean": -0.030208328738808632,
      "rewards/reward_code_runs/std": 0.4455321729183197,
      "rewards/think_reward/mean": 0.19687356054782867,
      "rewards/think_reward/std": 0.021740630269050598,
      "rewards/torch_empty_penalty/mean": -0.0020833334419876337,
      "rewards/torch_empty_penalty/std": 0.01435758825391531,
      "rewards/torch_zeros_reward/mean": 0.02395833283662796,
      "rewards/torch_zeros_reward/std": 0.042906977236270905,
      "rewards/valid_tl_methods_reward/mean": 0.14374999701976776,
      "rewards/valid_tl_methods_reward/std": 0.09039388597011566,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3558.0,
      "completions/max_terminated_length": 3558.0,
      "completions/mean_length": 1290.6875,
      "completions/mean_terminated_length": 1290.6875,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.6945337620578779,
      "grad_norm": 0.6105243563652039,
      "learning_rate": 7.1e-07,
      "loss": 0.0294,
      "num_tokens": 10822744.0,
      "reward": 0.6542137861251831,
      "reward_std": 0.15812623500823975,
      "rewards/constexpr_reward/mean": 0.18333333730697632,
      "rewards/constexpr_reward/std": 0.05556724593043327,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.07916667312383652,
      "rewards/masks_load_store_reward/std": 0.040824830532073975,
      "rewards/one_code_blob_reward/mean": 0.020676322281360626,
      "rewards/one_code_blob_reward/std": 0.04006450995802879,
      "rewards/reward_code_runs/mean": -0.20781250298023224,
      "rewards/reward_code_runs/std": 0.1318548172712326,
      "rewards/think_reward/mean": 0.19447489082813263,
      "rewards/think_reward/std": 0.025090038776397705,
      "rewards/torch_empty_penalty/mean": -0.008333333767950535,
      "rewards/torch_empty_penalty/std": 0.027783622965216637,
      "rewards/torch_zeros_reward/mean": 0.0572916679084301,
      "rewards/torch_zeros_reward/std": 0.04972511902451515,
      "rewards/valid_tl_methods_reward/mean": 0.1354166716337204,
      "rewards/valid_tl_methods_reward/std": 0.0940091460943222,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2092.0,
      "completions/max_terminated_length": 2092.0,
      "completions/mean_length": 997.2916870117188,
      "completions/mean_terminated_length": 997.2916870117188,
      "completions/min_length": 513.0,
      "completions/min_terminated_length": 513.0,
      "epoch": 0.7041800643086816,
      "grad_norm": 0.5655916333198547,
      "learning_rate": 7.2e-07,
      "loss": 0.0062,
      "num_tokens": 10963064.0,
      "reward": 0.6285203099250793,
      "reward_std": 0.1138053834438324,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08749999850988388,
      "rewards/masks_load_store_reward/std": 0.033245496451854706,
      "rewards/one_code_blob_reward/mean": 0.03685358539223671,
      "rewards/one_code_blob_reward/std": 0.023610996082425117,
      "rewards/reward_code_runs/mean": -0.24062500894069672,
      "rewards/reward_code_runs/std": 0.06460914760828018,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.02083333395421505,
      "rewards/torch_empty_penalty/std": 0.040824830532073975,
      "rewards/torch_zeros_reward/mean": 0.04479166865348816,
      "rewards/torch_zeros_reward/std": 0.049989037215709686,
      "rewards/valid_tl_methods_reward/mean": 0.13125000894069672,
      "rewards/valid_tl_methods_reward/std": 0.09549042582511902,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3602.0,
      "completions/max_terminated_length": 3602.0,
      "completions/mean_length": 1127.0625,
      "completions/mean_terminated_length": 1127.0625,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 0.7138263665594855,
      "grad_norm": 0.6025066375732422,
      "learning_rate": 7.3e-07,
      "loss": 0.0371,
      "num_tokens": 11111102.0,
      "reward": 0.7693430185317993,
      "reward_std": 0.14001044631004333,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08229167014360428,
      "rewards/masks_load_store_reward/std": 0.03837431222200394,
      "rewards/one_code_blob_reward/mean": 0.033576302230358124,
      "rewards/one_code_blob_reward/std": 0.023212797939777374,
      "rewards/reward_code_runs/mean": -0.14687500894069672,
      "rewards/reward_code_runs/std": 0.19012632966041565,
      "rewards/think_reward/mean": 0.19514165818691254,
      "rewards/think_reward/std": 0.02538818120956421,
      "rewards/torch_empty_penalty/mean": -0.009374999441206455,
      "rewards/torch_empty_penalty/std": 0.029301069676876068,
      "rewards/torch_zeros_reward/mean": 0.02916666679084301,
      "rewards/torch_zeros_reward/std": 0.04569156840443611,
      "rewards/valid_tl_methods_reward/mean": 0.1875,
      "rewards/valid_tl_methods_reward/std": 0.04866642504930496,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3908.0,
      "completions/max_terminated_length": 3908.0,
      "completions/mean_length": 1304.0521240234375,
      "completions/mean_terminated_length": 1304.0521240234375,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.7234726688102894,
      "grad_norm": 0.7116039395332336,
      "learning_rate": 7.4e-07,
      "loss": 0.0137,
      "num_tokens": 11281699.0,
      "reward": 0.7607399821281433,
      "reward_std": 0.1809384822845459,
      "rewards/constexpr_reward/mean": 0.18958334624767303,
      "rewards/constexpr_reward/std": 0.044672295451164246,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0885416641831398,
      "rewards/masks_load_store_reward/std": 0.03201904520392418,
      "rewards/one_code_blob_reward/mean": 0.032614920288324356,
      "rewards/one_code_blob_reward/std": 0.026224695146083832,
      "rewards/reward_code_runs/mean": -0.1354166716337204,
      "rewards/reward_code_runs/std": 0.3354428708553314,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.013541667722165585,
      "rewards/torch_empty_penalty/std": 0.034396421164274216,
      "rewards/torch_zeros_reward/mean": 0.05104166641831398,
      "rewards/torch_zeros_reward/std": 0.0502515584230423,
      "rewards/valid_tl_methods_reward/mean": 0.14791665971279144,
      "rewards/valid_tl_methods_reward/std": 0.08823314309120178,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3088.0,
      "completions/mean_length": 1423.822998046875,
      "completions/mean_terminated_length": 1395.69482421875,
      "completions/min_length": 633.0,
      "completions/min_terminated_length": 633.0,
      "epoch": 0.7331189710610932,
      "grad_norm": 0.6728214025497437,
      "learning_rate": 7.5e-07,
      "loss": 0.0324,
      "num_tokens": 11466194.0,
      "reward": 0.7873832583427429,
      "reward_std": 0.2371460199356079,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.014753940515220165,
      "rewards/one_code_blob_reward/std": 0.02616616152226925,
      "rewards/reward_code_runs/mean": -0.12968750298023224,
      "rewards/reward_code_runs/std": 0.3213113248348236,
      "rewards/think_reward/mean": 0.196066752076149,
      "rewards/think_reward/std": 0.020336557179689407,
      "rewards/torch_empty_penalty/mean": -0.004166666883975267,
      "rewards/torch_empty_penalty/std": 0.020087525248527527,
      "rewards/torch_zeros_reward/mean": 0.04375000298023224,
      "rewards/torch_zeros_reward/std": 0.04986824840307236,
      "rewards/valid_tl_methods_reward/mean": 0.17083333432674408,
      "rewards/valid_tl_methods_reward/std": 0.07095835357904434,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2564.0,
      "completions/max_terminated_length": 2564.0,
      "completions/mean_length": 1124.21875,
      "completions/mean_terminated_length": 1124.21875,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 0.7427652733118971,
      "grad_norm": 1.729048490524292,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.0727,
      "num_tokens": 11622479.0,
      "reward": 0.7299603223800659,
      "reward_std": 0.18497233092784882,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0885416641831398,
      "rewards/masks_load_store_reward/std": 0.03201904520392418,
      "rewards/one_code_blob_reward/mean": 0.022904405370354652,
      "rewards/one_code_blob_reward/std": 0.01825665310025215,
      "rewards/reward_code_runs/mean": -0.1875,
      "rewards/reward_code_runs/std": 0.23675435781478882,
      "rewards/think_reward/mean": 0.19768084585666656,
      "rewards/think_reward/std": 0.01613238826394081,
      "rewards/torch_empty_penalty/mean": -0.01666666753590107,
      "rewards/torch_empty_penalty/std": 0.037463437765836716,
      "rewards/torch_zeros_reward/mean": 0.05625000596046448,
      "rewards/torch_zeros_reward/std": 0.04986824840307236,
      "rewards/valid_tl_methods_reward/mean": 0.16875000298023224,
      "rewards/valid_tl_methods_reward/std": 0.07299964129924774,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1884.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 860.9166870117188,
      "completions/mean_terminated_length": 860.9166870117188,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.752411575562701,
      "grad_norm": 0.6792352795600891,
      "learning_rate": 7.699999999999999e-07,
      "loss": 0.0305,
      "num_tokens": 11744499.0,
      "reward": 0.9666028022766113,
      "reward_std": 0.25859588384628296,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.08437500149011612,
      "rewards/masks_load_store_reward/std": 0.03649982064962387,
      "rewards/one_code_blob_reward/mean": 0.04160277917981148,
      "rewards/one_code_blob_reward/std": 0.03801194950938225,
      "rewards/reward_code_runs/mean": 0.05937500670552254,
      "rewards/reward_code_runs/std": 0.48315370082855225,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0052083334885537624,
      "rewards/torch_empty_penalty/std": 0.022336147725582123,
      "rewards/torch_zeros_reward/mean": 0.01979166641831398,
      "rewards/torch_zeros_reward/std": 0.04005204886198044,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284232854843,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2805.0,
      "completions/max_terminated_length": 2805.0,
      "completions/mean_length": 925.71875,
      "completions/mean_terminated_length": 925.71875,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 0.7620578778135049,
      "grad_norm": 0.688829243183136,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.0321,
      "num_tokens": 11874984.0,
      "reward": 1.1787655353546143,
      "reward_std": 0.2676607370376587,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.03898598626255989,
      "rewards/one_code_blob_reward/std": 0.027744870632886887,
      "rewards/reward_code_runs/mean": 0.23854166269302368,
      "rewards/reward_code_runs/std": 0.5910952687263489,
      "rewards/think_reward/mean": 0.19811280071735382,
      "rewards/think_reward/std": 0.013295148499310017,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.03541666641831398,
      "rewards/torch_zeros_reward/std": 0.0480770580470562,
      "rewards/valid_tl_methods_reward/mean": 0.18125002086162567,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1763.0,
      "completions/max_terminated_length": 1763.0,
      "completions/mean_length": 800.3958740234375,
      "completions/mean_terminated_length": 800.3958740234375,
      "completions/min_length": 417.0,
      "completions/min_terminated_length": 417.0,
      "epoch": 0.7717041800643086,
      "grad_norm": 0.7262923717498779,
      "learning_rate": 7.9e-07,
      "loss": 0.0328,
      "num_tokens": 11992622.0,
      "reward": 1.040205955505371,
      "reward_std": 0.33069169521331787,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09270834177732468,
      "rewards/masks_load_store_reward/std": 0.026136452332139015,
      "rewards/one_code_blob_reward/mean": 0.05322667583823204,
      "rewards/one_code_blob_reward/std": 0.026531290262937546,
      "rewards/reward_code_runs/mean": 0.12343750149011612,
      "rewards/reward_code_runs/std": 0.5305882096290588,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.008333333767950535,
      "rewards/torch_zeros_reward/std": 0.027783622965216637,
      "rewards/valid_tl_methods_reward/mean": 0.16458334028720856,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.07291666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3992.0,
      "completions/mean_length": 1334.541748046875,
      "completions/mean_terminated_length": 1117.3482666015625,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.7813504823151125,
      "grad_norm": 0.7282072305679321,
      "learning_rate": 8e-07,
      "loss": 0.0188,
      "num_tokens": 12165150.0,
      "reward": 0.6720718145370483,
      "reward_std": 0.21535718441009521,
      "rewards/constexpr_reward/mean": 0.18125002086162567,
      "rewards/constexpr_reward/std": 0.05860213562846184,
      "rewards/imports_decorator_reward/mean": 0.18541665375232697,
      "rewards/imports_decorator_reward/std": 0.05227290466427803,
      "rewards/masks_load_store_reward/mean": 0.08020833134651184,
      "rewards/masks_load_store_reward/std": 0.04005204886198044,
      "rewards/one_code_blob_reward/mean": 0.015300924889743328,
      "rewards/one_code_blob_reward/std": 0.0648474469780922,
      "rewards/reward_code_runs/mean": -0.15156249701976776,
      "rewards/reward_code_runs/std": 0.1870059221982956,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.01770833320915699,
      "rewards/torch_empty_penalty/std": 0.03837431222200394,
      "rewards/torch_zeros_reward/mean": 0.01874999888241291,
      "rewards/torch_zeros_reward/std": 0.03923613205552101,
      "rewards/valid_tl_methods_reward/mean": 0.16041666269302368,
      "rewards/valid_tl_methods_reward/std": 0.08010409772396088,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1576.0,
      "completions/max_terminated_length": 1576.0,
      "completions/mean_length": 805.71875,
      "completions/mean_terminated_length": 805.71875,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 0.7909967845659164,
      "grad_norm": 0.6383414268493652,
      "learning_rate": 8.1e-07,
      "loss": 0.0475,
      "num_tokens": 12283503.0,
      "reward": 0.8771345019340515,
      "reward_std": 0.24294182658195496,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.08541666716337204,
      "rewards/masks_load_store_reward/std": 0.03547917678952217,
      "rewards/one_code_blob_reward/mean": 0.04328025504946709,
      "rewards/one_code_blob_reward/std": 0.022545376792550087,
      "rewards/reward_code_runs/mean": -0.06093750521540642,
      "rewards/reward_code_runs/std": 0.4197244346141815,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0031250000465661287,
      "rewards/torch_empty_penalty/std": 0.017490599304437637,
      "rewards/torch_zeros_reward/mean": 0.03749999776482582,
      "rewards/torch_zeros_reward/std": 0.04866642504930496,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3559.0,
      "completions/max_terminated_length": 3559.0,
      "completions/mean_length": 956.09375,
      "completions/mean_terminated_length": 956.09375,
      "completions/min_length": 393.0,
      "completions/min_terminated_length": 393.0,
      "epoch": 0.8006430868167203,
      "grad_norm": 1.5789874792099,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.0652,
      "num_tokens": 12417984.0,
      "reward": 0.9814224243164062,
      "reward_std": 0.1926470398902893,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09375,
      "rewards/masks_load_store_reward/std": 0.02433321252465248,
      "rewards/one_code_blob_reward/mean": 0.04589207097887993,
      "rewards/one_code_blob_reward/std": 0.03024989366531372,
      "rewards/reward_code_runs/mean": 0.02552083134651184,
      "rewards/reward_code_runs/std": 0.49857667088508606,
      "rewards/think_reward/mean": 0.19855110347270966,
      "rewards/think_reward/std": 0.014196311123669147,
      "rewards/torch_empty_penalty/mean": -0.0020833334419876337,
      "rewards/torch_empty_penalty/std": 0.01435758825391531,
      "rewards/torch_zeros_reward/mean": 0.04479166865348816,
      "rewards/torch_zeros_reward/std": 0.04998903349041939,
      "rewards/valid_tl_methods_reward/mean": 0.17916667461395264,
      "rewards/valid_tl_methods_reward/std": 0.06141604110598564,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3734.0,
      "completions/max_terminated_length": 3734.0,
      "completions/mean_length": 1361.791748046875,
      "completions/mean_terminated_length": 1361.791748046875,
      "completions/min_length": 521.0,
      "completions/min_terminated_length": 521.0,
      "epoch": 0.8102893890675241,
      "grad_norm": 0.5281466245651245,
      "learning_rate": 8.299999999999999e-07,
      "loss": 0.0101,
      "num_tokens": 12596164.0,
      "reward": 0.6960060000419617,
      "reward_std": 0.1029006764292717,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.08020833134651184,
      "rewards/masks_load_store_reward/std": 0.04005204886198044,
      "rewards/one_code_blob_reward/mean": 0.02231888473033905,
      "rewards/one_code_blob_reward/std": 0.020131021738052368,
      "rewards/reward_code_runs/mean": -0.21249999105930328,
      "rewards/reward_code_runs/std": 0.12502631545066833,
      "rewards/think_reward/mean": 0.19868700206279755,
      "rewards/think_reward/std": 0.011553033255040646,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.04270833730697632,
      "rewards/torch_zeros_reward/std": 0.04972512647509575,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686808109283,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2050.0,
      "completions/max_terminated_length": 2050.0,
      "completions/mean_length": 795.3646240234375,
      "completions/mean_terminated_length": 795.3646240234375,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.819935691318328,
      "grad_norm": 0.7214101552963257,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.0325,
      "num_tokens": 12710763.0,
      "reward": 1.0834388732910156,
      "reward_std": 0.15569153428077698,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.05166803300380707,
      "rewards/one_code_blob_reward/std": 0.02839084342122078,
      "rewards/reward_code_runs/mean": 0.12656250596046448,
      "rewards/reward_code_runs/std": 0.4989965260028839,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0020833334419876337,
      "rewards/torch_empty_penalty/std": 0.01435758825391531,
      "rewards/torch_zeros_reward/mean": 0.02291666716337204,
      "rewards/torch_zeros_reward/std": 0.04225029796361923,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2049.0,
      "completions/max_terminated_length": 2049.0,
      "completions/mean_length": 856.7396240234375,
      "completions/mean_terminated_length": 856.7396240234375,
      "completions/min_length": 449.0,
      "completions/min_terminated_length": 449.0,
      "epoch": 0.8295819935691319,
      "grad_norm": 0.7173665165901184,
      "learning_rate": 8.499999999999999e-07,
      "loss": 0.0199,
      "num_tokens": 12834506.0,
      "reward": 0.7910616397857666,
      "reward_std": 0.1610206663608551,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.09062501043081284,
      "rewards/masks_load_store_reward/std": 0.029301069676876068,
      "rewards/one_code_blob_reward/mean": 0.042103271931409836,
      "rewards/one_code_blob_reward/std": 0.02515607886016369,
      "rewards/reward_code_runs/mean": -0.17604166269302368,
      "rewards/reward_code_runs/std": 0.19654636085033417,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0052083334885537624,
      "rewards/torch_empty_penalty/std": 0.022336147725582123,
      "rewards/torch_zeros_reward/mean": 0.05416667088866234,
      "rewards/torch_zeros_reward/std": 0.050087641924619675,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2101.0,
      "completions/max_terminated_length": 2101.0,
      "completions/mean_length": 1009.5,
      "completions/mean_terminated_length": 1009.5,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 0.8392282958199357,
      "grad_norm": 0.5952999591827393,
      "learning_rate": 8.599999999999999e-07,
      "loss": -0.0152,
      "num_tokens": 12976166.0,
      "reward": 0.7066213488578796,
      "reward_std": 0.1371747851371765,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09166666865348816,
      "rewards/masks_load_store_reward/std": 0.027783622965216637,
      "rewards/one_code_blob_reward/mean": 0.034746263176202774,
      "rewards/one_code_blob_reward/std": 0.028374899178743362,
      "rewards/reward_code_runs/mean": -0.16562499105930328,
      "rewards/reward_code_runs/std": 0.1765625774860382,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": -0.010416666977107525,
      "rewards/torch_empty_penalty/std": 0.03070801869034767,
      "rewards/torch_zeros_reward/mean": 0.01979166641831398,
      "rewards/torch_zeros_reward/std": 0.04005204886198044,
      "rewards/valid_tl_methods_reward/mean": 0.14999999105930328,
      "rewards/valid_tl_methods_reward/std": 0.08705715090036392,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2930.0,
      "completions/max_terminated_length": 2930.0,
      "completions/mean_length": 1109.635498046875,
      "completions/mean_terminated_length": 1109.635498046875,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 0.8488745980707395,
      "grad_norm": 0.5833906531333923,
      "learning_rate": 8.699999999999999e-07,
      "loss": 0.0117,
      "num_tokens": 13126791.0,
      "reward": 0.8823820948600769,
      "reward_std": 0.13785666227340698,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08958333730697632,
      "rewards/masks_load_store_reward/std": 0.030708016827702522,
      "rewards/one_code_blob_reward/mean": 0.03290291130542755,
      "rewards/one_code_blob_reward/std": 0.04009070619940758,
      "rewards/reward_code_runs/mean": -0.04322915896773338,
      "rewards/reward_code_runs/std": 0.43320226669311523,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.02916666679084301,
      "rewards/torch_zeros_reward/std": 0.04569156840443611,
      "rewards/valid_tl_methods_reward/mean": 0.1833333522081375,
      "rewards/valid_tl_methods_reward/std": 0.05556724965572357,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2153.0,
      "completions/max_terminated_length": 2153.0,
      "completions/mean_length": 870.3854370117188,
      "completions/mean_terminated_length": 870.3854370117188,
      "completions/min_length": 400.0,
      "completions/min_terminated_length": 400.0,
      "epoch": 0.8585209003215434,
      "grad_norm": 0.5769517421722412,
      "learning_rate": 8.799999999999999e-07,
      "loss": -0.0023,
      "num_tokens": 13251808.0,
      "reward": 0.9388400912284851,
      "reward_std": 0.26629945635795593,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517837047577,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.03936079144477844,
      "rewards/one_code_blob_reward/std": 0.034906670451164246,
      "rewards/reward_code_runs/mean": -0.0005208253860473633,
      "rewards/reward_code_runs/std": 0.4790612757205963,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.01666666753590107,
      "rewards/torch_empty_penalty/std": 0.03746343404054642,
      "rewards/torch_zeros_reward/mean": 0.05416667088866234,
      "rewards/torch_zeros_reward/std": 0.05008764564990997,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666666666663,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3096.0,
      "completions/mean_length": 1160.875,
      "completions/mean_terminated_length": 1129.97900390625,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 0.8681672025723473,
      "grad_norm": 0.5800571441650391,
      "learning_rate": 8.9e-07,
      "loss": 0.0986,
      "num_tokens": 13405612.0,
      "reward": 0.8594149351119995,
      "reward_std": 0.17765943706035614,
      "rewards/constexpr_reward/mean": 0.18958334624767303,
      "rewards/constexpr_reward/std": 0.044672295451164246,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09270834177732468,
      "rewards/masks_load_store_reward/std": 0.026136452332139015,
      "rewards/one_code_blob_reward/mean": 0.03177860751748085,
      "rewards/one_code_blob_reward/std": 0.03633953258395195,
      "rewards/reward_code_runs/mean": -0.05052083358168602,
      "rewards/reward_code_runs/std": 0.40778568387031555,
      "rewards/think_reward/mean": 0.19794876873493195,
      "rewards/think_reward/std": 0.020097941160202026,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05000000074505806,
      "rewards/torch_zeros_reward/std": 0.05026246979832649,
      "rewards/valid_tl_methods_reward/mean": 0.14791667461395264,
      "rewards/valid_tl_methods_reward/std": 0.08823314309120178,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1805.0,
      "completions/max_terminated_length": 1805.0,
      "completions/mean_length": 1031.885498046875,
      "completions/mean_terminated_length": 1031.885498046875,
      "completions/min_length": 546.0,
      "completions/min_terminated_length": 546.0,
      "epoch": 0.8778135048231511,
      "grad_norm": 0.6327739357948303,
      "learning_rate": 9e-07,
      "loss": 0.0262,
      "num_tokens": 13551725.0,
      "reward": 0.6960409879684448,
      "reward_std": 0.13087350130081177,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09062501043081284,
      "rewards/masks_load_store_reward/std": 0.029301069676876068,
      "rewards/one_code_blob_reward/mean": 0.026249190792441368,
      "rewards/one_code_blob_reward/std": 0.01706741750240326,
      "rewards/reward_code_runs/mean": -0.2135416716337204,
      "rewards/reward_code_runs/std": 0.16050563752651215,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0364583358168602,
      "rewards/torch_zeros_reward/std": 0.04838397353887558,
      "rewards/valid_tl_methods_reward/mean": 0.15833334624767303,
      "rewards/valid_tl_methods_reward/std": 0.08164966106414795,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2159.0,
      "completions/max_terminated_length": 2159.0,
      "completions/mean_length": 1042.375,
      "completions/mean_terminated_length": 1042.375,
      "completions/min_length": 513.0,
      "completions/min_terminated_length": 513.0,
      "epoch": 0.887459807073955,
      "grad_norm": 0.549281120300293,
      "learning_rate": 9.1e-07,
      "loss": 0.0378,
      "num_tokens": 13699337.0,
      "reward": 0.8041318655014038,
      "reward_std": 0.18684861063957214,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09375,
      "rewards/masks_load_store_reward/std": 0.02433321252465248,
      "rewards/one_code_blob_reward/mean": 0.02809007279574871,
      "rewards/one_code_blob_reward/std": 0.0198042131960392,
      "rewards/reward_code_runs/mean": -0.12187498807907104,
      "rewards/reward_code_runs/std": 0.27065345644950867,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0072916666977107525,
      "rewards/torch_empty_penalty/std": 0.026136452332139015,
      "rewards/torch_zeros_reward/mean": 0.0364583358168602,
      "rewards/torch_zeros_reward/std": 0.04838397353887558,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809785842896,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2761.0,
      "completions/max_terminated_length": 2761.0,
      "completions/mean_length": 852.375,
      "completions/mean_terminated_length": 852.375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.8971061093247589,
      "grad_norm": 0.6425236463546753,
      "learning_rate": 9.2e-07,
      "loss": -0.0237,
      "num_tokens": 13822505.0,
      "reward": 1.0936288833618164,
      "reward_std": 0.28550925850868225,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087528973817825,
      "rewards/one_code_blob_reward/mean": 0.04154547303915024,
      "rewards/one_code_blob_reward/std": 0.0370657816529274,
      "rewards/reward_code_runs/mean": 0.16770833730697632,
      "rewards/reward_code_runs/std": 0.5830491185188293,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862103641033,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.01458333432674408,
      "rewards/torch_zeros_reward/std": 0.03547917678952217,
      "rewards/valid_tl_methods_reward/mean": 0.18124999105930328,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2859.0,
      "completions/max_terminated_length": 2859.0,
      "completions/mean_length": 1155.875,
      "completions/mean_terminated_length": 1155.875,
      "completions/min_length": 657.0,
      "completions/min_terminated_length": 657.0,
      "epoch": 0.9067524115755627,
      "grad_norm": 0.560306966304779,
      "learning_rate": 9.3e-07,
      "loss": -0.0126,
      "num_tokens": 13979489.0,
      "reward": 0.6780275106430054,
      "reward_std": 0.11005106568336487,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.08229167014360428,
      "rewards/masks_load_store_reward/std": 0.03837431222200394,
      "rewards/one_code_blob_reward/mean": 0.020052263513207436,
      "rewards/one_code_blob_reward/std": 0.028812218457460403,
      "rewards/reward_code_runs/mean": -0.24531249701976776,
      "rewards/reward_code_runs/std": 0.04592793434858322,
      "rewards/think_reward/mean": 0.19912105798721313,
      "rewards/think_reward/std": 0.008611898869276047,
      "rewards/torch_empty_penalty/mean": -0.01458333432674408,
      "rewards/torch_empty_penalty/std": 0.03547917678952217,
      "rewards/torch_zeros_reward/mean": 0.07395833730697632,
      "rewards/torch_zeros_reward/std": 0.04411657154560089,
      "rewards/valid_tl_methods_reward/mean": 0.1666666716337204,
      "rewards/valid_tl_methods_reward/std": 0.07492686063051224,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1804.0,
      "completions/max_terminated_length": 1804.0,
      "completions/mean_length": 664.3333740234375,
      "completions/mean_terminated_length": 664.3333740234375,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.9163987138263665,
      "grad_norm": 0.6537352204322815,
      "learning_rate": 9.399999999999999e-07,
      "loss": -0.0326,
      "num_tokens": 14077177.0,
      "reward": 1.3599352836608887,
      "reward_std": 0.34259817004203796,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517837047577,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.06201860308647156,
      "rewards/one_code_blob_reward/std": 0.03674319013953209,
      "rewards/reward_code_runs/mean": 0.4000000059604645,
      "rewards/reward_code_runs/std": 0.5574377179145813,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.015625,
      "rewards/torch_zeros_reward/std": 0.03649982064962387,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3091.0,
      "completions/max_terminated_length": 3091.0,
      "completions/mean_length": 1097.6875,
      "completions/mean_terminated_length": 1097.6875,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 0.9260450160771704,
      "grad_norm": 0.6797481775283813,
      "learning_rate": 9.499999999999999e-07,
      "loss": 0.0296,
      "num_tokens": 14226619.0,
      "reward": 0.7398571968078613,
      "reward_std": 0.2273358702659607,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.19166667759418488,
      "rewards/imports_decorator_reward/std": 0.04017505422234535,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.024232149124145508,
      "rewards/one_code_blob_reward/std": 0.04735946282744408,
      "rewards/reward_code_runs/mean": -0.18645833432674408,
      "rewards/reward_code_runs/std": 0.2141665816307068,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0052083334885537624,
      "rewards/torch_empty_penalty/std": 0.022336147725582123,
      "rewards/torch_zeros_reward/mean": 0.0520833320915699,
      "rewards/torch_zeros_reward/std": 0.050218820571899414,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2312.0,
      "completions/max_terminated_length": 2312.0,
      "completions/mean_length": 997.28125,
      "completions/mean_terminated_length": 997.28125,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.9356913183279743,
      "grad_norm": 0.5603955984115601,
      "learning_rate": 9.6e-07,
      "loss": 0.025,
      "num_tokens": 14366218.0,
      "reward": 1.0203857421875,
      "reward_std": 0.1987045258283615,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.03861479088664055,
      "rewards/one_code_blob_reward/std": 0.030727189034223557,
      "rewards/reward_code_runs/mean": 0.07447917014360428,
      "rewards/reward_code_runs/std": 0.469153493642807,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.01874999888241291,
      "rewards/torch_zeros_reward/std": 0.03923612833023071,
      "rewards/valid_tl_methods_reward/mean": 0.19375000894069672,
      "rewards/valid_tl_methods_reward/std": 0.034981198608875275,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2277.0,
      "completions/max_terminated_length": 2277.0,
      "completions/mean_length": 1014.8541870117188,
      "completions/mean_terminated_length": 1014.8541870117188,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "epoch": 0.9453376205787781,
      "grad_norm": 0.6686782240867615,
      "learning_rate": 9.7e-07,
      "loss": -0.0485,
      "num_tokens": 14510132.0,
      "reward": 0.7968265414237976,
      "reward_std": 0.2724398076534271,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.0885416641831398,
      "rewards/masks_load_store_reward/std": 0.03201904892921448,
      "rewards/one_code_blob_reward/mean": 0.02547227405011654,
      "rewards/one_code_blob_reward/std": 0.04091016575694084,
      "rewards/reward_code_runs/mean": -0.09739583730697632,
      "rewards/reward_code_runs/std": 0.40205851197242737,
      "rewards/think_reward/mean": 0.19375000894069672,
      "rewards/think_reward/std": 0.043072767555713654,
      "rewards/torch_empty_penalty/mean": -0.010416666977107525,
      "rewards/torch_empty_penalty/std": 0.03070801869034767,
      "rewards/torch_zeros_reward/mean": 0.04062500223517418,
      "rewards/torch_zeros_reward/std": 0.04937104508280754,
      "rewards/valid_tl_methods_reward/mean": 0.16875000298023224,
      "rewards/valid_tl_methods_reward/std": 0.07299964129924774,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3915.0,
      "completions/max_terminated_length": 3915.0,
      "completions/mean_length": 1276.166748046875,
      "completions/mean_terminated_length": 1276.166748046875,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.954983922829582,
      "grad_norm": 0.5864729881286621,
      "learning_rate": 9.8e-07,
      "loss": -0.023,
      "num_tokens": 14676672.0,
      "reward": 0.7543246746063232,
      "reward_std": 0.1961607187986374,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.03349132835865021,
      "rewards/one_code_blob_reward/std": 0.0393284372985363,
      "rewards/reward_code_runs/mean": -0.15729166567325592,
      "rewards/reward_code_runs/std": 0.2097591608762741,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862103641033,
      "rewards/torch_empty_penalty/mean": -0.008333333767950535,
      "rewards/torch_empty_penalty/std": 0.027783624827861786,
      "rewards/torch_zeros_reward/mean": 0.01666666753590107,
      "rewards/torch_zeros_reward/std": 0.03746343404054642,
      "rewards/valid_tl_methods_reward/mean": 0.17916667461395264,
      "rewards/valid_tl_methods_reward/std": 0.06141603738069534,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1944.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 901.5625,
      "completions/mean_terminated_length": 901.5625,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 0.9646302250803859,
      "grad_norm": 0.7356076836585999,
      "learning_rate": 9.9e-07,
      "loss": 0.0596,
      "num_tokens": 14810406.0,
      "reward": 0.7643704414367676,
      "reward_std": 0.1145286113023758,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09687501192092896,
      "rewards/masks_load_store_reward/std": 0.017490599304437637,
      "rewards/one_code_blob_reward/mean": 0.033641304820775986,
      "rewards/one_code_blob_reward/std": 0.019893698394298553,
      "rewards/reward_code_runs/mean": -0.16093750298023224,
      "rewards/reward_code_runs/std": 0.18023422360420227,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0072916666977107525,
      "rewards/torch_empty_penalty/std": 0.026136448606848717,
      "rewards/torch_zeros_reward/mean": 0.02083333395421505,
      "rewards/torch_zeros_reward/std": 0.040824830532073975,
      "rewards/valid_tl_methods_reward/mean": 0.1875,
      "rewards/valid_tl_methods_reward/std": 0.04866642504930496,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2148.0,
      "completions/max_terminated_length": 2148.0,
      "completions/mean_length": 800.1979370117188,
      "completions/mean_terminated_length": 800.1979370117188,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 0.9742765273311897,
      "grad_norm": 1.3324309587478638,
      "learning_rate": 1e-06,
      "loss": 0.0146,
      "num_tokens": 14928589.0,
      "reward": 1.0799825191497803,
      "reward_std": 0.20546208322048187,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.04612823203206062,
      "rewards/one_code_blob_reward/std": 0.030145816504955292,
      "rewards/reward_code_runs/mean": 0.1223958358168602,
      "rewards/reward_code_runs/std": 0.5407047271728516,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0520833320915699,
      "rewards/torch_zeros_reward/std": 0.050218820571899414,
      "rewards/valid_tl_methods_reward/mean": 0.16458334028720856,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1403.0,
      "completions/max_terminated_length": 1403.0,
      "completions/mean_length": 639.2291870117188,
      "completions/mean_terminated_length": 639.2291870117188,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.9839228295819936,
      "grad_norm": 0.8389772176742554,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 15028019.0,
      "reward": 1.0647220611572266,
      "reward_std": 0.1348954290151596,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09375,
      "rewards/masks_load_store_reward/std": 0.02433321252465248,
      "rewards/one_code_blob_reward/mean": 0.06732618808746338,
      "rewards/one_code_blob_reward/std": 0.02876145765185356,
      "rewards/reward_code_runs/mean": 0.1067708358168602,
      "rewards/reward_code_runs/std": 0.5149665474891663,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.012500000186264515,
      "rewards/torch_empty_penalty/std": 0.033245496451854706,
      "rewards/torch_zeros_reward/mean": 0.03437500074505806,
      "rewards/torch_zeros_reward/std": 0.04774520918726921,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1444.0,
      "completions/max_terminated_length": 1444.0,
      "completions/mean_length": 757.4833984375,
      "completions/mean_terminated_length": 757.4833984375,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 0.9935691318327974,
      "grad_norm": 0.9760256409645081,
      "learning_rate": 1e-06,
      "loss": 0.0163,
      "num_tokens": 15145295.0,
      "reward": 0.9058343768119812,
      "reward_std": 0.31626075506210327,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505794763565,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05062602832913399,
      "rewards/one_code_blob_reward/std": 0.02906649000942707,
      "rewards/reward_code_runs/mean": -0.04270833358168602,
      "rewards/reward_code_runs/std": 0.36749356985092163,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.02187499962747097,
      "rewards/torch_zeros_reward/std": 0.04155687242746353,
      "rewards/valid_tl_methods_reward/mean": 0.18541665375232697,
      "rewards/valid_tl_methods_reward/std": 0.05227290466427803,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1830.0,
      "completions/max_terminated_length": 1830.0,
      "completions/mean_length": 961.4583740234375,
      "completions/mean_terminated_length": 961.4583740234375,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 1.0096463022508038,
      "grad_norm": 0.7094870209693909,
      "learning_rate": 1e-06,
      "loss": -0.0097,
      "num_tokens": 15278767.0,
      "reward": 1.114925742149353,
      "reward_std": 0.25318485498428345,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09375,
      "rewards/masks_load_store_reward/std": 0.02433321252465248,
      "rewards/one_code_blob_reward/mean": 0.04044640436768532,
      "rewards/one_code_blob_reward/std": 0.033034007996320724,
      "rewards/reward_code_runs/mean": 0.19947916269302368,
      "rewards/reward_code_runs/std": 0.5805847644805908,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.03020833432674408,
      "rewards/torch_zeros_reward/std": 0.046157147735357285,
      "rewards/valid_tl_methods_reward/mean": 0.15625,
      "rewards/valid_tl_methods_reward/std": 0.08311375230550766,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2418.0,
      "completions/max_terminated_length": 2418.0,
      "completions/mean_length": 913.8541870117188,
      "completions/mean_terminated_length": 913.8541870117188,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 1.0192926045016077,
      "grad_norm": 0.6050485968589783,
      "learning_rate": 1e-06,
      "loss": -0.0108,
      "num_tokens": 15410021.0,
      "reward": 0.907778263092041,
      "reward_std": 0.11176653951406479,
      "rewards/constexpr_reward/mean": 0.19375000894069672,
      "rewards/constexpr_reward/std": 0.034981194883584976,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09062501043081284,
      "rewards/masks_load_store_reward/std": 0.02930106781423092,
      "rewards/one_code_blob_reward/mean": 0.04423652961850166,
      "rewards/one_code_blob_reward/std": 0.03256293758749962,
      "rewards/reward_code_runs/mean": -0.06562499701976776,
      "rewards/reward_code_runs/std": 0.4192921817302704,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0020833334419876337,
      "rewards/torch_empty_penalty/std": 0.014357589185237885,
      "rewards/torch_zeros_reward/mean": 0.0572916679084301,
      "rewards/torch_zeros_reward/std": 0.04972512274980545,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2130.0,
      "completions/max_terminated_length": 2130.0,
      "completions/mean_length": 750.1875,
      "completions/mean_terminated_length": 750.1875,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 1.0289389067524115,
      "grad_norm": 0.7807785868644714,
      "learning_rate": 1e-06,
      "loss": 0.0336,
      "num_tokens": 15525131.0,
      "reward": 1.070699691772461,
      "reward_std": 0.20007085800170898,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517837047577,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.054553885012865067,
      "rewards/one_code_blob_reward/std": 0.03134790062904358,
      "rewards/reward_code_runs/mean": 0.10260417312383652,
      "rewards/reward_code_runs/std": 0.5553268194198608,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.046875,
      "rewards/torch_zeros_reward/std": 0.05016420781612396,
      "rewards/valid_tl_methods_reward/mean": 0.17083333432674408,
      "rewards/valid_tl_methods_reward/std": 0.07095835357904434,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1782.0,
      "completions/max_terminated_length": 1782.0,
      "completions/mean_length": 879.8854370117188,
      "completions/mean_terminated_length": 879.8854370117188,
      "completions/min_length": 291.0,
      "completions/min_terminated_length": 291.0,
      "epoch": 1.0385852090032155,
      "grad_norm": 0.6552996039390564,
      "learning_rate": 1e-06,
      "loss": 0.0405,
      "num_tokens": 15652152.0,
      "reward": 1.0106452703475952,
      "reward_std": 0.12692387402057648,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.04189518466591835,
      "rewards/one_code_blob_reward/std": 0.031889814883470535,
      "rewards/reward_code_runs/mean": 0.05104166641831398,
      "rewards/reward_code_runs/std": 0.47356316447257996,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.04791666939854622,
      "rewards/torch_zeros_reward/std": 0.050218820571899414,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1908.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 852.1875,
      "completions/mean_terminated_length": 852.1875,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 1.0482315112540193,
      "grad_norm": 0.5427823066711426,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 15776010.0,
      "reward": 0.9478980898857117,
      "reward_std": 0.18901358544826508,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.03800217807292938,
      "rewards/one_code_blob_reward/std": 0.04549255967140198,
      "rewards/reward_code_runs/mean": 0.0036458373069763184,
      "rewards/reward_code_runs/std": 0.4326323866844177,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.02604166604578495,
      "rewards/torch_zeros_reward/std": 0.04411657154560089,
      "rewards/valid_tl_methods_reward/mean": 0.19166667759418488,
      "rewards/valid_tl_methods_reward/std": 0.04017505422234535,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1514.0,
      "completions/max_terminated_length": 1514.0,
      "completions/mean_length": 835.4583740234375,
      "completions/mean_terminated_length": 835.4583740234375,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 1.0578778135048232,
      "grad_norm": 0.9861233830451965,
      "learning_rate": 1e-06,
      "loss": 0.0045,
      "num_tokens": 15899906.0,
      "reward": 0.8864466547966003,
      "reward_std": 0.15781597793102264,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.043217454105615616,
      "rewards/one_code_blob_reward/std": 0.02705226093530655,
      "rewards/reward_code_runs/mean": -0.07864583283662796,
      "rewards/reward_code_runs/std": 0.40501755475997925,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.046875,
      "rewards/torch_zeros_reward/std": 0.05016420781612396,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1851.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 789.4271240234375,
      "completions/mean_terminated_length": 789.4271240234375,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 1.067524115755627,
      "grad_norm": 0.8766092658042908,
      "learning_rate": 1e-06,
      "loss": 0.001,
      "num_tokens": 16018987.0,
      "reward": 0.7993344068527222,
      "reward_std": 0.13429999351501465,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.0482926070690155,
      "rewards/one_code_blob_reward/std": 0.03107845038175583,
      "rewards/reward_code_runs/mean": -0.18437498807907104,
      "rewards/reward_code_runs/std": 0.15965628623962402,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06458333134651184,
      "rewards/torch_zeros_reward/std": 0.0480770580470562,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2100.0,
      "completions/max_terminated_length": 2100.0,
      "completions/mean_length": 759.7708740234375,
      "completions/mean_terminated_length": 759.7708740234375,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 1.077170418006431,
      "grad_norm": 0.789993941783905,
      "learning_rate": 1e-06,
      "loss": 0.0078,
      "num_tokens": 16133469.0,
      "reward": 0.9797508120536804,
      "reward_std": 0.142560213804245,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09687501192092896,
      "rewards/masks_load_store_reward/std": 0.017490597441792488,
      "rewards/one_code_blob_reward/mean": 0.056834083050489426,
      "rewards/one_code_blob_reward/std": 0.031814008951187134,
      "rewards/reward_code_runs/mean": -0.009374993853271008,
      "rewards/reward_code_runs/std": 0.42098334431648254,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0572916679084301,
      "rewards/torch_zeros_reward/std": 0.04972512274980545,
      "rewards/valid_tl_methods_reward/mean": 0.18124999105930328,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1706.0,
      "completions/max_terminated_length": 1706.0,
      "completions/mean_length": 767.8333740234375,
      "completions/mean_terminated_length": 767.8333740234375,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 1.0868167202572347,
      "grad_norm": 0.8064549565315247,
      "learning_rate": 1e-06,
      "loss": -0.0186,
      "num_tokens": 16249697.0,
      "reward": 0.8837992548942566,
      "reward_std": 0.17673146724700928,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.046299200505018234,
      "rewards/one_code_blob_reward/std": 0.03371729701757431,
      "rewards/reward_code_runs/mean": -0.05416667088866234,
      "rewards/reward_code_runs/std": 0.39441272616386414,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.02500000037252903,
      "rewards/torch_zeros_reward/std": 0.04352857545018196,
      "rewards/valid_tl_methods_reward/mean": 0.17500001192092896,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1651.0,
      "completions/max_terminated_length": 1651.0,
      "completions/mean_length": 805.28125,
      "completions/mean_terminated_length": 805.28125,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 1.0964630225080385,
      "grad_norm": 0.7702875733375549,
      "learning_rate": 1e-06,
      "loss": -0.0468,
      "num_tokens": 16373240.0,
      "reward": 0.9052786827087402,
      "reward_std": 0.2217210829257965,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.04486199840903282,
      "rewards/one_code_blob_reward/std": 0.039526235312223434,
      "rewards/reward_code_runs/mean": -0.01666666567325592,
      "rewards/reward_code_runs/std": 0.39541226625442505,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.02187499962747097,
      "rewards/torch_zeros_reward/std": 0.04155687242746353,
      "rewards/valid_tl_methods_reward/mean": 0.16458334028720856,
      "rewards/valid_tl_methods_reward/std": 0.07674862444400787,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1377.0,
      "completions/max_terminated_length": 1377.0,
      "completions/mean_length": 663.2083740234375,
      "completions/mean_terminated_length": 663.2083740234375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 1.1061093247588425,
      "grad_norm": 0.7669295072555542,
      "learning_rate": 1e-06,
      "loss": 0.0035,
      "num_tokens": 16476820.0,
      "reward": 1.043578028678894,
      "reward_std": 0.11954164505004883,
      "rewards/constexpr_reward/mean": 0.17916667461395264,
      "rewards/constexpr_reward/std": 0.061416033655405045,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.05659870803356171,
      "rewards/one_code_blob_reward/std": 0.04853541776537895,
      "rewards/reward_code_runs/mean": 0.09531249850988388,
      "rewards/reward_code_runs/std": 0.5377378463745117,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.03541666641831398,
      "rewards/torch_zeros_reward/std": 0.0480770580470562,
      "rewards/valid_tl_methods_reward/mean": 0.18125002086162567,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1228.0,
      "completions/max_terminated_length": 1228.0,
      "completions/mean_length": 583.5729370117188,
      "completions/mean_terminated_length": 583.5729370117188,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 1.1157556270096463,
      "grad_norm": 0.8292638659477234,
      "learning_rate": 1e-06,
      "loss": -0.0067,
      "num_tokens": 16570607.0,
      "reward": 1.4030187129974365,
      "reward_std": 0.24441388249397278,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.07333113998174667,
      "rewards/one_code_blob_reward/std": 0.0357937254011631,
      "rewards/reward_code_runs/mean": 0.40677082538604736,
      "rewards/reward_code_runs/std": 0.6152713894844055,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.04270833730697632,
      "rewards/torch_zeros_reward/std": 0.04972512647509575,
      "rewards/valid_tl_methods_reward/mean": 0.18124999105930328,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1615.0,
      "completions/max_terminated_length": 1615.0,
      "completions/mean_length": 790.4791870117188,
      "completions/mean_terminated_length": 790.4791870117188,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 1.1254019292604502,
      "grad_norm": 0.790634274482727,
      "learning_rate": 1e-06,
      "loss": -0.0193,
      "num_tokens": 16690557.0,
      "reward": 0.8124825358390808,
      "reward_std": 0.1920197457075119,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09062501043081284,
      "rewards/masks_load_store_reward/std": 0.029301069676876068,
      "rewards/one_code_blob_reward/mean": 0.05206575617194176,
      "rewards/one_code_blob_reward/std": 0.03424833342432976,
      "rewards/reward_code_runs/mean": -0.15208333730697632,
      "rewards/reward_code_runs/std": 0.2957521080970764,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05312499776482582,
      "rewards/torch_zeros_reward/std": 0.05016420781612396,
      "rewards/valid_tl_methods_reward/mean": 0.1729166954755783,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1508.0,
      "completions/max_terminated_length": 1508.0,
      "completions/mean_length": 724.5,
      "completions/mean_terminated_length": 724.5,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 1.135048231511254,
      "grad_norm": 0.7953296303749084,
      "learning_rate": 1e-06,
      "loss": -0.0229,
      "num_tokens": 16807929.0,
      "reward": 0.7903190851211548,
      "reward_std": 0.15812592208385468,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.043444037437438965,
      "rewards/one_code_blob_reward/std": 0.024329539388418198,
      "rewards/reward_code_runs/mean": -0.17604167759418488,
      "rewards/reward_code_runs/std": 0.19654637575149536,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05833333358168602,
      "rewards/torch_zeros_reward/std": 0.04955946281552315,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1664.0,
      "completions/max_terminated_length": 1664.0,
      "completions/mean_length": 624.9375,
      "completions/mean_terminated_length": 624.9375,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 1.144694533762058,
      "grad_norm": 0.8499788641929626,
      "learning_rate": 1e-06,
      "loss": 0.0358,
      "num_tokens": 16908783.0,
      "reward": 1.0993988513946533,
      "reward_std": 0.14713454246520996,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06085706874728203,
      "rewards/one_code_blob_reward/std": 0.030229216441512108,
      "rewards/reward_code_runs/mean": 0.109375,
      "rewards/reward_code_runs/std": 0.5343620181083679,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.03750000149011612,
      "rewards/torch_zeros_reward/std": 0.04866642504930496,
      "rewards/valid_tl_methods_reward/mean": 0.19166667759418488,
      "rewards/valid_tl_methods_reward/std": 0.04017505422234535,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1902.0,
      "completions/max_terminated_length": 1902.0,
      "completions/mean_length": 829.9791870117188,
      "completions/mean_terminated_length": 829.9791870117188,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 1.1543408360128617,
      "grad_norm": 0.8699808120727539,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 17031757.0,
      "reward": 0.9007784128189087,
      "reward_std": 0.08680151402950287,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.04348668456077576,
      "rewards/one_code_blob_reward/std": 0.031017502769827843,
      "rewards/reward_code_runs/mean": -0.06562500447034836,
      "rewards/reward_code_runs/std": 0.4192921221256256,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.03333333507180214,
      "rewards/torch_zeros_reward/std": 0.04738790914416313,
      "rewards/valid_tl_methods_reward/mean": 0.1937500238418579,
      "rewards/valid_tl_methods_reward/std": 0.034981198608875275,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1994.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 847.6979370117188,
      "completions/mean_terminated_length": 847.6979370117188,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 1.1639871382636655,
      "grad_norm": 0.7712662220001221,
      "learning_rate": 1e-06,
      "loss": -0.0355,
      "num_tokens": 17158976.0,
      "reward": 0.7087725400924683,
      "reward_std": 0.09143199771642685,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.03741825371980667,
      "rewards/one_code_blob_reward/std": 0.03352433815598488,
      "rewards/reward_code_runs/mean": -0.24531249701976776,
      "rewards/reward_code_runs/std": 0.04592793434858322,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.046875,
      "rewards/torch_zeros_reward/std": 0.05016420781612396,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1157.0,
      "completions/max_terminated_length": 1157.0,
      "completions/mean_length": 601.2604370117188,
      "completions/mean_terminated_length": 601.2604370117188,
      "completions/min_length": 266.0,
      "completions/min_terminated_length": 266.0,
      "epoch": 1.1736334405144695,
      "grad_norm": 0.6979901790618896,
      "learning_rate": 1e-06,
      "loss": 0.0226,
      "num_tokens": 17256777.0,
      "reward": 0.9565267562866211,
      "reward_std": 0.09983557462692261,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06277672201395035,
      "rewards/one_code_blob_reward/std": 0.02563941292464733,
      "rewards/reward_code_runs/mean": -0.028124993667006493,
      "rewards/reward_code_runs/std": 0.4212645888328552,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.04895833134651184,
      "rewards/torch_zeros_reward/std": 0.050251562148332596,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1296.0,
      "completions/max_terminated_length": 1296.0,
      "completions/mean_length": 720.125,
      "completions/mean_terminated_length": 720.125,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 1.1832797427652733,
      "grad_norm": 1.1423685550689697,
      "learning_rate": 1e-06,
      "loss": 0.0302,
      "num_tokens": 17369217.0,
      "reward": 0.9213575124740601,
      "reward_std": 0.08902470767498016,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.04844085872173309,
      "rewards/one_code_blob_reward/std": 0.0287802591919899,
      "rewards/reward_code_runs/mean": -0.046875,
      "rewards/reward_code_runs/std": 0.4207019507884979,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.03020833432674408,
      "rewards/torch_zeros_reward/std": 0.046157147735357285,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2428.0,
      "completions/max_terminated_length": 2428.0,
      "completions/mean_length": 791.34375,
      "completions/mean_terminated_length": 791.34375,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 1.1929260450160772,
      "grad_norm": 0.8820392489433289,
      "learning_rate": 1e-06,
      "loss": -0.0095,
      "num_tokens": 17487174.0,
      "reward": 0.9861018061637878,
      "reward_std": 0.08739300817251205,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.05380998179316521,
      "rewards/one_code_blob_reward/std": 0.038780417293310165,
      "rewards/reward_code_runs/mean": -0.01875000260770321,
      "rewards/reward_code_runs/std": 0.42122939229011536,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05520833656191826,
      "rewards/torch_zeros_reward/std": 0.04998903349041939,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1886.0,
      "completions/max_terminated_length": 1886.0,
      "completions/mean_length": 1003.7396240234375,
      "completions/mean_terminated_length": 1003.7396240234375,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 1.202572347266881,
      "grad_norm": 0.719910740852356,
      "learning_rate": 1e-06,
      "loss": 0.0595,
      "num_tokens": 17629193.0,
      "reward": 0.8898797035217285,
      "reward_std": 0.11359749734401703,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.035712968558073044,
      "rewards/one_code_blob_reward/std": 0.03590528666973114,
      "rewards/reward_code_runs/mean": -0.0625,
      "rewards/reward_code_runs/std": 0.3800969123840332,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.03958333283662796,
      "rewards/torch_zeros_reward/std": 0.04915960505604744,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1455.0,
      "completions/max_terminated_length": 1455.0,
      "completions/mean_length": 684.3541870117188,
      "completions/mean_terminated_length": 684.3541870117188,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "epoch": 1.212218649517685,
      "grad_norm": 0.7489015460014343,
      "learning_rate": 1e-06,
      "loss": -0.0006,
      "num_tokens": 17736483.0,
      "reward": 1.0232198238372803,
      "reward_std": 0.21418210864067078,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.0570739321410656,
      "rewards/one_code_blob_reward/std": 0.03297010809183121,
      "rewards/reward_code_runs/mean": 0.02239583432674408,
      "rewards/reward_code_runs/std": 0.43096399307250977,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05520833656191826,
      "rewards/torch_zeros_reward/std": 0.04998903349041939,
      "rewards/valid_tl_methods_reward/mean": 0.19583334028720856,
      "rewards/valid_tl_methods_reward/std": 0.02871517650783062,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 4096.0,
      "completions/max_terminated_length": 3551.0,
      "completions/mean_length": 937.75,
      "completions/mean_terminated_length": 727.2000122070312,
      "completions/min_length": 273.0,
      "completions/min_terminated_length": 273.0,
      "epoch": 1.2218649517684887,
      "grad_norm": 0.9929730892181396,
      "learning_rate": 1e-06,
      "loss": 0.0323,
      "num_tokens": 17864571.0,
      "reward": 1.134056806564331,
      "reward_std": 0.26930850744247437,
      "rewards/constexpr_reward/mean": 0.1833333522081375,
      "rewards/constexpr_reward/std": 0.05556724965572357,
      "rewards/imports_decorator_reward/mean": 0.1833333522081375,
      "rewards/imports_decorator_reward/std": 0.05556724965572357,
      "rewards/masks_load_store_reward/mean": 0.09270834177732468,
      "rewards/masks_load_store_reward/std": 0.026136452332139015,
      "rewards/one_code_blob_reward/mean": 0.04603588208556175,
      "rewards/one_code_blob_reward/std": 0.07669390738010406,
      "rewards/reward_code_runs/mean": 0.19947917759418488,
      "rewards/reward_code_runs/std": 0.4922822415828705,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.04791666567325592,
      "rewards/torch_zeros_reward/std": 0.050218820571899414,
      "rewards/valid_tl_methods_reward/mean": 0.18125002086162567,
      "rewards/valid_tl_methods_reward/std": 0.058602139353752136,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1398.0,
      "completions/max_terminated_length": 1398.0,
      "completions/mean_length": 612.3333740234375,
      "completions/mean_terminated_length": 612.3333740234375,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 1.2315112540192925,
      "grad_norm": 0.766198992729187,
      "learning_rate": 1e-06,
      "loss": -0.0102,
      "num_tokens": 17966759.0,
      "reward": 1.213801383972168,
      "reward_std": 0.15725013613700867,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06432215124368668,
      "rewards/one_code_blob_reward/std": 0.024616621434688568,
      "rewards/reward_code_runs/mean": 0.19947917759418488,
      "rewards/reward_code_runs/std": 0.49228227138519287,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05625000596046448,
      "rewards/torch_zeros_reward/std": 0.04986824467778206,
      "rewards/valid_tl_methods_reward/mean": 0.1937500238418579,
      "rewards/valid_tl_methods_reward/std": 0.034981198608875275,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1922.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 714.3646240234375,
      "completions/mean_terminated_length": 714.3646240234375,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 1.2411575562700965,
      "grad_norm": 0.7098949551582336,
      "learning_rate": 1e-06,
      "loss": 0.019,
      "num_tokens": 18078058.0,
      "reward": 1.1137888431549072,
      "reward_std": 0.12167000770568848,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.05753864720463753,
      "rewards/one_code_blob_reward/std": 0.037805669009685516,
      "rewards/reward_code_runs/mean": 0.08541666716337204,
      "rewards/reward_code_runs/std": 0.49936363101005554,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0729166641831398,
      "rewards/torch_zeros_reward/std": 0.044672295451164246,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1433.0,
      "completions/max_terminated_length": 1433.0,
      "completions/mean_length": 653.8125,
      "completions/mean_terminated_length": 653.8125,
      "completions/min_length": 263.0,
      "completions/min_terminated_length": 263.0,
      "epoch": 1.2508038585209003,
      "grad_norm": 0.8204249739646912,
      "learning_rate": 1e-06,
      "loss": 0.038,
      "num_tokens": 18181300.0,
      "reward": 0.9050588607788086,
      "reward_std": 0.05716240406036377,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.054017141461372375,
      "rewards/one_code_blob_reward/std": 0.02962045557796955,
      "rewards/reward_code_runs/mean": -0.08437500149011612,
      "rewards/reward_code_runs/std": 0.4170266091823578,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06666667014360428,
      "rewards/torch_zeros_reward/std": 0.04738791286945343,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1499.0,
      "completions/max_terminated_length": 1499.0,
      "completions/mean_length": 729.5416870117188,
      "completions/mean_terminated_length": 729.5416870117188,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 1.2604501607717042,
      "grad_norm": 0.6134868264198303,
      "learning_rate": 1e-06,
      "loss": 0.0328,
      "num_tokens": 18298652.0,
      "reward": 0.8414579629898071,
      "reward_std": 0.19312511384487152,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09687501192092896,
      "rewards/masks_load_store_reward/std": 0.017490599304437637,
      "rewards/one_code_blob_reward/mean": 0.04874952509999275,
      "rewards/one_code_blob_reward/std": 0.020982850342988968,
      "rewards/reward_code_runs/mean": -0.1510416716337204,
      "rewards/reward_code_runs/std": 0.27786585688591003,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05937500298023224,
      "rewards/torch_zeros_reward/std": 0.04937104508280754,
      "rewards/valid_tl_methods_reward/mean": 0.19583334028720856,
      "rewards/valid_tl_methods_reward/std": 0.02871517650783062,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1769.0,
      "completions/max_terminated_length": 1769.0,
      "completions/mean_length": 725.5104370117188,
      "completions/mean_terminated_length": 725.5104370117188,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 1.270096463022508,
      "grad_norm": 0.7332961559295654,
      "learning_rate": 1e-06,
      "loss": 0.0498,
      "num_tokens": 18414645.0,
      "reward": 0.9153237342834473,
      "reward_std": 0.17812936007976532,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05386530980467796,
      "rewards/one_code_blob_reward/std": 0.03133547678589821,
      "rewards/reward_code_runs/mean": -0.04583333060145378,
      "rewards/reward_code_runs/std": 0.4080548584461212,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.03333333507180214,
      "rewards/torch_zeros_reward/std": 0.04738790914416313,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3680.0,
      "completions/max_terminated_length": 3680.0,
      "completions/mean_length": 1063.2708740234375,
      "completions/mean_terminated_length": 1063.2708740234375,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 1.279742765273312,
      "grad_norm": 0.7587729096412659,
      "learning_rate": 1e-06,
      "loss": 0.0326,
      "num_tokens": 18562499.0,
      "reward": 0.8570469617843628,
      "reward_std": 0.1749621331691742,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.03673437237739563,
      "rewards/one_code_blob_reward/std": 0.02618933655321598,
      "rewards/reward_code_runs/mean": -0.11927083134651184,
      "rewards/reward_code_runs/std": 0.30791059136390686,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05104166641831398,
      "rewards/torch_zeros_reward/std": 0.0502515584230423,
      "rewards/valid_tl_methods_reward/mean": 0.19166667759418488,
      "rewards/valid_tl_methods_reward/std": 0.04017505422234535,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1409.0,
      "completions/max_terminated_length": 1409.0,
      "completions/mean_length": 752.0104370117188,
      "completions/mean_terminated_length": 752.0104370117188,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 1.2893890675241158,
      "grad_norm": 0.8954541087150574,
      "learning_rate": 1e-06,
      "loss": 0.0442,
      "num_tokens": 18681312.0,
      "reward": 0.9138407111167908,
      "reward_std": 0.18238215148448944,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.050298962742090225,
      "rewards/one_code_blob_reward/std": 0.02945978380739689,
      "rewards/reward_code_runs/mean": -0.0520833320915699,
      "rewards/reward_code_runs/std": 0.3669157922267914,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.02604166604578495,
      "rewards/torch_zeros_reward/std": 0.04411657154560089,
      "rewards/valid_tl_methods_reward/mean": 0.19375000894069672,
      "rewards/valid_tl_methods_reward/std": 0.034981198608875275,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1425.0,
      "completions/max_terminated_length": 1425.0,
      "completions/mean_length": 750.1146240234375,
      "completions/mean_terminated_length": 750.1146240234375,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 1.2990353697749195,
      "grad_norm": 0.6723171472549438,
      "learning_rate": 1e-06,
      "loss": -0.0271,
      "num_tokens": 18801695.0,
      "reward": 1.0500636100769043,
      "reward_std": 0.190835103392601,
      "rewards/constexpr_reward/mean": 0.1937500238418579,
      "rewards/constexpr_reward/std": 0.034981198608875275,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.04589679837226868,
      "rewards/one_code_blob_reward/std": 0.022632678970694542,
      "rewards/reward_code_runs/mean": 0.07500001043081284,
      "rewards/reward_code_runs/std": 0.5119621753692627,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.04583333432674408,
      "rewards/torch_zeros_reward/std": 0.050087641924619675,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1770.0,
      "completions/max_terminated_length": 1770.0,
      "completions/mean_length": 809.6354370117188,
      "completions/mean_terminated_length": 809.6354370117188,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 1.3086816720257235,
      "grad_norm": 0.9398924708366394,
      "learning_rate": 1e-06,
      "loss": 0.0377,
      "num_tokens": 18924696.0,
      "reward": 0.9232425093650818,
      "reward_std": 0.14786306023597717,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.043034106492996216,
      "rewards/one_code_blob_reward/std": 0.03087618201971054,
      "rewards/reward_code_runs/mean": -0.09062498807907104,
      "rewards/reward_code_runs/std": 0.3762217164039612,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08437500149011612,
      "rewards/torch_zeros_reward/std": 0.03649982064962387,
      "rewards/valid_tl_methods_reward/mean": 0.1875,
      "rewards/valid_tl_methods_reward/std": 0.04866642504930496,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1316.0,
      "completions/max_terminated_length": 1316.0,
      "completions/mean_length": 629.28125,
      "completions/mean_terminated_length": 629.28125,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 1.3183279742765273,
      "grad_norm": 0.9921103715896606,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 19029159.0,
      "reward": 0.8372518420219421,
      "reward_std": 0.1289333999156952,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05912676081061363,
      "rewards/one_code_blob_reward/std": 0.02672174572944641,
      "rewards/reward_code_runs/mean": -0.16562499105930328,
      "rewards/reward_code_runs/std": 0.1765625774860382,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05520833656191826,
      "rewards/torch_zeros_reward/std": 0.049989037215709686,
      "rewards/valid_tl_methods_reward/mean": 0.19375000894069672,
      "rewards/valid_tl_methods_reward/std": 0.034981198608875275,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1429.0,
      "completions/max_terminated_length": 1429.0,
      "completions/mean_length": 610.3854370117188,
      "completions/mean_terminated_length": 610.3854370117188,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 1.3279742765273312,
      "grad_norm": 0.8055822849273682,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 19129276.0,
      "reward": 1.0758776664733887,
      "reward_std": 0.13094481825828552,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06285682320594788,
      "rewards/one_code_blob_reward/std": 0.02713281475007534,
      "rewards/reward_code_runs/mean": 0.07968749850988388,
      "rewards/reward_code_runs/std": 0.40792015194892883,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0625,
      "rewards/torch_zeros_reward/std": 0.04866642504930496,
      "rewards/valid_tl_methods_reward/mean": 0.17500001192092896,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1079.0,
      "completions/max_terminated_length": 1079.0,
      "completions/mean_length": 663.25,
      "completions/mean_terminated_length": 663.25,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 1.337620578778135,
      "grad_norm": 0.9214664697647095,
      "learning_rate": 1e-06,
      "loss": 0.0272,
      "num_tokens": 19236664.0,
      "reward": 0.911638617515564,
      "reward_std": 0.11367520689964294,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.059555213898420334,
      "rewards/one_code_blob_reward/std": 0.03002886101603508,
      "rewards/reward_code_runs/mean": -0.10104166716337204,
      "rewards/reward_code_runs/std": 0.388043612241745,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06041666865348816,
      "rewards/torch_zeros_reward/std": 0.04915960505604744,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1607.0,
      "completions/max_terminated_length": 1607.0,
      "completions/mean_length": 784.28125,
      "completions/mean_terminated_length": 784.28125,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 1.347266881028939,
      "grad_norm": 0.7683331370353699,
      "learning_rate": 1e-06,
      "loss": 0.0197,
      "num_tokens": 19358323.0,
      "reward": 0.774795413017273,
      "reward_std": 0.11972075700759888,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.04250364378094673,
      "rewards/one_code_blob_reward/std": 0.027647629380226135,
      "rewards/reward_code_runs/mean": -0.22187499701976776,
      "rewards/reward_code_runs/std": 0.10949946194887161,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0833333358168602,
      "rewards/torch_zeros_reward/std": 0.03746343031525612,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1169.0,
      "completions/max_terminated_length": 1169.0,
      "completions/mean_length": 651.3646240234375,
      "completions/mean_terminated_length": 651.3646240234375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 1.3569131832797428,
      "grad_norm": 0.8256404399871826,
      "learning_rate": 1e-06,
      "loss": -0.0131,
      "num_tokens": 19469142.0,
      "reward": 0.809127926826477,
      "reward_std": 0.12502001225948334,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.05600285530090332,
      "rewards/one_code_blob_reward/std": 0.022366967052221298,
      "rewards/reward_code_runs/mean": -0.17499999701976776,
      "rewards/reward_code_runs/std": 0.16858543455600739,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0677083358168602,
      "rewards/torch_zeros_reward/std": 0.047004569321870804,
      "rewards/valid_tl_methods_reward/mean": 0.16250000894069672,
      "rewards/valid_tl_methods_reward/std": 0.07847225666046143,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1229.0,
      "completions/max_terminated_length": 1229.0,
      "completions/mean_length": 688.7291870117188,
      "completions/mean_terminated_length": 688.7291870117188,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 1.3665594855305465,
      "grad_norm": 0.8053058981895447,
      "learning_rate": 1e-06,
      "loss": 0.0236,
      "num_tokens": 19581064.0,
      "reward": 0.9577488899230957,
      "reward_std": 0.17412030696868896,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.05201972648501396,
      "rewards/one_code_blob_reward/std": 0.029080655425786972,
      "rewards/reward_code_runs/mean": -0.048437491059303284,
      "rewards/reward_code_runs/std": 0.381234347820282,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05833333358168602,
      "rewards/torch_zeros_reward/std": 0.04955946281552315,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1165.0,
      "completions/max_terminated_length": 1165.0,
      "completions/mean_length": 624.53125,
      "completions/mean_terminated_length": 624.53125,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 1.3762057877813505,
      "grad_norm": 0.7290429472923279,
      "learning_rate": 1e-06,
      "loss": 0.0215,
      "num_tokens": 19685803.0,
      "reward": 1.047572135925293,
      "reward_std": 0.19656646251678467,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.05798870325088501,
      "rewards/one_code_blob_reward/std": 0.02330818958580494,
      "rewards/reward_code_runs/mean": 0.03854166343808174,
      "rewards/reward_code_runs/std": 0.40356823801994324,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0572916679084301,
      "rewards/torch_zeros_reward/std": 0.04972511902451515,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1546.0,
      "completions/max_terminated_length": 1546.0,
      "completions/mean_length": 727.40625,
      "completions/mean_terminated_length": 727.40625,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 1.3858520900321543,
      "grad_norm": 0.7532760500907898,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 19797490.0,
      "reward": 0.940872311592102,
      "reward_std": 0.21272242069244385,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05910136178135872,
      "rewards/one_code_blob_reward/std": 0.036845460534095764,
      "rewards/reward_code_runs/mean": -0.06822916120290756,
      "rewards/reward_code_runs/std": 0.3931063711643219,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07604166865348816,
      "rewards/torch_zeros_reward/std": 0.042906977236270905,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1337.0,
      "completions/max_terminated_length": 1337.0,
      "completions/mean_length": 596.3333740234375,
      "completions/mean_terminated_length": 596.3333740234375,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 1.3954983922829582,
      "grad_norm": 0.9243506193161011,
      "learning_rate": 1e-06,
      "loss": 0.0259,
      "num_tokens": 19898106.0,
      "reward": 0.868034839630127,
      "reward_std": 0.15040241181850433,
      "rewards/constexpr_reward/mean": 0.17500001192092896,
      "rewards/constexpr_reward/std": 0.06649099290370941,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.014357589185237885,
      "rewards/one_code_blob_reward/mean": 0.06282646209001541,
      "rewards/one_code_blob_reward/std": 0.0359637513756752,
      "rewards/reward_code_runs/mean": -0.13749998807907104,
      "rewards/reward_code_runs/std": 0.19587858021259308,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09479167312383652,
      "rewards/torch_zeros_reward/std": 0.022336147725582123,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1775.0,
      "completions/max_terminated_length": 1775.0,
      "completions/mean_length": 768.75,
      "completions/mean_terminated_length": 768.75,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 1.405144694533762,
      "grad_norm": 0.7844826579093933,
      "learning_rate": 1e-06,
      "loss": 0.0971,
      "num_tokens": 20015946.0,
      "reward": 1.0762923955917358,
      "reward_std": 0.10215584933757782,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.048167187720537186,
      "rewards/one_code_blob_reward/std": 0.03205899894237518,
      "rewards/reward_code_runs/mean": 0.0729166641831398,
      "rewards/reward_code_runs/std": 0.5325171947479248,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06458333134651184,
      "rewards/torch_zeros_reward/std": 0.0480770580470562,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1564.0,
      "completions/max_terminated_length": 1564.0,
      "completions/mean_length": 654.3646240234375,
      "completions/mean_terminated_length": 654.3646240234375,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 1.414790996784566,
      "grad_norm": 0.9075904488563538,
      "learning_rate": 1e-06,
      "loss": -0.013,
      "num_tokens": 20120753.0,
      "reward": 1.1273137331008911,
      "reward_std": 0.10572604835033417,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517837047577,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06325113028287888,
      "rewards/one_code_blob_reward/std": 0.03051825985312462,
      "rewards/reward_code_runs/mean": 0.10468750447034836,
      "rewards/reward_code_runs/std": 0.5355311036109924,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06562500447034836,
      "rewards/torch_zeros_reward/std": 0.04774521291255951,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1215.0,
      "completions/max_terminated_length": 1215.0,
      "completions/mean_length": 592.09375,
      "completions/mean_terminated_length": 592.09375,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 1.4244372990353698,
      "grad_norm": 0.7394472360610962,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 20224310.0,
      "reward": 0.8296582698822021,
      "reward_std": 0.13804888725280762,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.06142908334732056,
      "rewards/one_code_blob_reward/std": 0.02213943563401699,
      "rewards/reward_code_runs/mean": -0.17239584028720856,
      "rewards/reward_code_runs/std": 0.2241791933774948,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0833333358168602,
      "rewards/torch_zeros_reward/std": 0.03746343404054642,
      "rewards/valid_tl_methods_reward/mean": 0.16250000894069672,
      "rewards/valid_tl_methods_reward/std": 0.07847225666046143,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1806.0,
      "completions/max_terminated_length": 1806.0,
      "completions/mean_length": 734.34375,
      "completions/mean_terminated_length": 734.34375,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 1.4340836012861735,
      "grad_norm": 0.9521190524101257,
      "learning_rate": 1e-06,
      "loss": 0.0554,
      "num_tokens": 20343263.0,
      "reward": 0.8669583201408386,
      "reward_std": 0.1845017373561859,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05341659113764763,
      "rewards/one_code_blob_reward/std": 0.028257377445697784,
      "rewards/reward_code_runs/mean": -0.16770832240581512,
      "rewards/reward_code_runs/std": 0.22722342610359192,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09270834177732468,
      "rewards/torch_zeros_reward/std": 0.026136452332139015,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1155.0,
      "completions/max_terminated_length": 1155.0,
      "completions/mean_length": 720.8854370117188,
      "completions/mean_terminated_length": 720.8854370117188,
      "completions/min_length": 298.0,
      "completions/min_terminated_length": 298.0,
      "epoch": 1.4437299035369775,
      "grad_norm": 0.9265972375869751,
      "learning_rate": 1e-06,
      "loss": 0.0182,
      "num_tokens": 20458680.0,
      "reward": 1.0103106498718262,
      "reward_std": 0.11800509691238403,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.04728983715176582,
      "rewards/one_code_blob_reward/std": 0.030017884448170662,
      "rewards/reward_code_runs/mean": -0.02343749813735485,
      "rewards/reward_code_runs/std": 0.4212733209133148,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08958333730697632,
      "rewards/torch_zeros_reward/std": 0.030708016827702522,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 977.0,
      "completions/max_terminated_length": 977.0,
      "completions/mean_length": 559.2604370117188,
      "completions/mean_terminated_length": 559.2604370117188,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 1.4533762057877815,
      "grad_norm": 0.9568269848823547,
      "learning_rate": 1e-06,
      "loss": 0.0077,
      "num_tokens": 20554105.0,
      "reward": 0.8368010520935059,
      "reward_std": 0.12439590692520142,
      "rewards/constexpr_reward/mean": 0.1666666716337204,
      "rewards/constexpr_reward/std": 0.07492686808109283,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09375,
      "rewards/masks_load_store_reward/std": 0.02433321252465248,
      "rewards/one_code_blob_reward/mean": 0.06961345672607422,
      "rewards/one_code_blob_reward/std": 0.04036742448806763,
      "rewards/reward_code_runs/mean": -0.15260417759418488,
      "rewards/reward_code_runs/std": 0.21267344057559967,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07604166865348816,
      "rewards/torch_zeros_reward/std": 0.042906977236270905,
      "rewards/valid_tl_methods_reward/mean": 0.18541665375232697,
      "rewards/valid_tl_methods_reward/std": 0.05227290466427803,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1103.0,
      "completions/max_terminated_length": 1103.0,
      "completions/mean_length": 655.3229370117188,
      "completions/mean_terminated_length": 655.3229370117188,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 1.4630225080385852,
      "grad_norm": 0.802787721157074,
      "learning_rate": 1e-06,
      "loss": 0.021,
      "num_tokens": 20662844.0,
      "reward": 1.101922869682312,
      "reward_std": 0.13189242780208588,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.059214454144239426,
      "rewards/one_code_blob_reward/std": 0.02514045685529709,
      "rewards/reward_code_runs/mean": 0.06562500447034836,
      "rewards/reward_code_runs/std": 0.41130807995796204,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07708333432674408,
      "rewards/torch_zeros_reward/std": 0.04225029796361923,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1408.0,
      "completions/max_terminated_length": 1408.0,
      "completions/mean_length": 690.8229370117188,
      "completions/mean_terminated_length": 690.8229370117188,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 1.472668810289389,
      "grad_norm": 1.0255579948425293,
      "learning_rate": 1e-06,
      "loss": 0.0057,
      "num_tokens": 20775195.0,
      "reward": 1.0822356939315796,
      "reward_std": 0.22474662959575653,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.059839725494384766,
      "rewards/one_code_blob_reward/std": 0.03606778010725975,
      "rewards/reward_code_runs/mean": 0.04114583507180214,
      "rewards/reward_code_runs/std": 0.4284608066082001,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08645833283662796,
      "rewards/torch_zeros_reward/std": 0.034396421164274216,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 870.0,
      "completions/max_terminated_length": 870.0,
      "completions/mean_length": 561.2604370117188,
      "completions/mean_terminated_length": 561.2604370117188,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 1.482315112540193,
      "grad_norm": 0.8129120469093323,
      "learning_rate": 1e-06,
      "loss": -0.0138,
      "num_tokens": 20872636.0,
      "reward": 1.1856117248535156,
      "reward_std": 0.33780115842819214,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.066340871155262,
      "rewards/one_code_blob_reward/std": 0.022852176800370216,
      "rewards/reward_code_runs/mean": 0.15781249105930328,
      "rewards/reward_code_runs/std": 0.4548179507255554,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06354167312383652,
      "rewards/torch_zeros_reward/std": 0.04838397353887558,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2653.0,
      "completions/max_terminated_length": 2653.0,
      "completions/mean_length": 746.875,
      "completions/mean_terminated_length": 746.875,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 1.4919614147909968,
      "grad_norm": 1.1085046529769897,
      "learning_rate": 1e-06,
      "loss": -0.0302,
      "num_tokens": 20986444.0,
      "reward": 1.183764100074768,
      "reward_std": 0.3158164620399475,
      "rewards/constexpr_reward/mean": 0.17291666567325592,
      "rewards/constexpr_reward/std": 0.06879284977912903,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517837047577,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07751400023698807,
      "rewards/one_code_blob_reward/std": 0.05076025426387787,
      "rewards/reward_code_runs/mean": 0.17604167759418488,
      "rewards/reward_code_runs/std": 0.5024141669273376,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06145833432674408,
      "rewards/torch_zeros_reward/std": 0.04892484098672867,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1292.0,
      "completions/max_terminated_length": 1292.0,
      "completions/mean_length": 594.2708740234375,
      "completions/mean_terminated_length": 594.2708740234375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 1.5016077170418005,
      "grad_norm": 0.7305670380592346,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 21087726.0,
      "reward": 1.0104423761367798,
      "reward_std": 0.2103710025548935,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06460893154144287,
      "rewards/one_code_blob_reward/std": 0.019467337056994438,
      "rewards/reward_code_runs/mean": -0.007291665766388178,
      "rewards/reward_code_runs/std": 0.24772429466247559,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08020833134651184,
      "rewards/torch_zeros_reward/std": 0.04005204886198044,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1198.0,
      "completions/max_terminated_length": 1198.0,
      "completions/mean_length": 592.4583740234375,
      "completions/mean_terminated_length": 592.4583740234375,
      "completions/min_length": 323.0,
      "completions/min_terminated_length": 323.0,
      "epoch": 1.5112540192926045,
      "grad_norm": 1.0602891445159912,
      "learning_rate": 1e-06,
      "loss": 0.0118,
      "num_tokens": 21186950.0,
      "reward": 1.043541669845581,
      "reward_std": 0.26119792461395264,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06385406851768494,
      "rewards/one_code_blob_reward/std": 0.027104245498776436,
      "rewards/reward_code_runs/mean": -0.004687500651925802,
      "rewards/reward_code_runs/std": 0.2869144380092621,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09479167312383652,
      "rewards/torch_zeros_reward/std": 0.022336147725582123,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1521.0,
      "completions/max_terminated_length": 1521.0,
      "completions/mean_length": 595.0625,
      "completions/mean_terminated_length": 595.0625,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 1.5209003215434085,
      "grad_norm": 1.1863574981689453,
      "learning_rate": 1e-06,
      "loss": 0.0414,
      "num_tokens": 21286340.0,
      "reward": 1.023719310760498,
      "reward_std": 0.17034493386745453,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.0674692839384079,
      "rewards/one_code_blob_reward/std": 0.027314169332385063,
      "rewards/reward_code_runs/mean": -0.02499999850988388,
      "rewards/reward_code_runs/std": 0.22618110477924347,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08541666716337204,
      "rewards/torch_zeros_reward/std": 0.03547917678952217,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1411.0,
      "completions/max_terminated_length": 1411.0,
      "completions/mean_length": 593.3125,
      "completions/mean_terminated_length": 593.3125,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 1.5305466237942122,
      "grad_norm": 0.9769577383995056,
      "learning_rate": 1e-06,
      "loss": 0.0688,
      "num_tokens": 21383450.0,
      "reward": 1.2167844772338867,
      "reward_std": 0.10052961111068726,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07355514913797379,
      "rewards/one_code_blob_reward/std": 0.03204425796866417,
      "rewards/reward_code_runs/mean": 0.1796875,
      "rewards/reward_code_runs/std": 0.5113232731819153,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07395833730697632,
      "rewards/torch_zeros_reward/std": 0.04411657154560089,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1328.0,
      "completions/max_terminated_length": 1328.0,
      "completions/mean_length": 676.1979370117188,
      "completions/mean_terminated_length": 676.1979370117188,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 1.540192926045016,
      "grad_norm": 0.8277693390846252,
      "learning_rate": 1e-06,
      "loss": 0.0246,
      "num_tokens": 21493533.0,
      "reward": 0.9875229597091675,
      "reward_std": 0.20137527585029602,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.05991869792342186,
      "rewards/one_code_blob_reward/std": 0.02951066941022873,
      "rewards/reward_code_runs/mean": -0.05052083358168602,
      "rewards/reward_code_runs/std": 0.26749271154403687,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08437500149011612,
      "rewards/torch_zeros_reward/std": 0.03649982064962387,
      "rewards/valid_tl_methods_reward/mean": 0.19375000894069672,
      "rewards/valid_tl_methods_reward/std": 0.034981198608875275,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1499.0,
      "completions/max_terminated_length": 1499.0,
      "completions/mean_length": 711.4271240234375,
      "completions/mean_terminated_length": 711.4271240234375,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 1.54983922829582,
      "grad_norm": 0.8331584930419922,
      "learning_rate": 1e-06,
      "loss": 0.0042,
      "num_tokens": 21609662.0,
      "reward": 0.8818552494049072,
      "reward_std": 0.14330685138702393,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05477190017700195,
      "rewards/one_code_blob_reward/std": 0.0240237507969141,
      "rewards/reward_code_runs/mean": -0.1197916641831398,
      "rewards/reward_code_runs/std": 0.22935599088668823,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.05833333730697632,
      "rewards/torch_zeros_reward/std": 0.04955946281552315,
      "rewards/valid_tl_methods_reward/mean": 0.19583334028720856,
      "rewards/valid_tl_methods_reward/std": 0.02871517650783062,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1242.0,
      "completions/max_terminated_length": 1242.0,
      "completions/mean_length": 664.8229370117188,
      "completions/mean_terminated_length": 664.8229370117188,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 1.5594855305466238,
      "grad_norm": 0.8680635690689087,
      "learning_rate": 1e-06,
      "loss": 0.0221,
      "num_tokens": 21718005.0,
      "reward": 0.8824254274368286,
      "reward_std": 0.18108677864074707,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.05690452456474304,
      "rewards/one_code_blob_reward/std": 0.02404734678566456,
      "rewards/reward_code_runs/mean": -0.14427082240581512,
      "rewards/reward_code_runs/std": 0.24048960208892822,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09270832687616348,
      "rewards/torch_zeros_reward/std": 0.026136452332139015,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1159.0,
      "completions/max_terminated_length": 1159.0,
      "completions/mean_length": 642.7083740234375,
      "completions/mean_terminated_length": 642.7083740234375,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 1.5691318327974275,
      "grad_norm": 0.7752233743667603,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 21823205.0,
      "reward": 1.1916781663894653,
      "reward_std": 0.2930299937725067,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.061990588903427124,
      "rewards/one_code_blob_reward/std": 0.027839384973049164,
      "rewards/reward_code_runs/mean": 0.13697917759418488,
      "rewards/reward_code_runs/std": 0.48494410514831543,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09479167312383652,
      "rewards/torch_zeros_reward/std": 0.022336147725582123,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 986.0,
      "completions/max_terminated_length": 986.0,
      "completions/mean_length": 568.9896240234375,
      "completions/mean_terminated_length": 568.9896240234375,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 1.5787781350482315,
      "grad_norm": 0.8281655311584473,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 21919996.0,
      "reward": 1.1239912509918213,
      "reward_std": 0.1698978841304779,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06930360198020935,
      "rewards/one_code_blob_reward/std": 0.02347411774098873,
      "rewards/reward_code_runs/mean": 0.10885417461395264,
      "rewards/reward_code_runs/std": 0.49353688955307007,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0729166641831398,
      "rewards/torch_zeros_reward/std": 0.044672295451164246,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1046.0,
      "completions/max_terminated_length": 1046.0,
      "completions/mean_length": 556.96875,
      "completions/mean_terminated_length": 556.96875,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 1.5884244372990355,
      "grad_norm": 0.9195791482925415,
      "learning_rate": 1e-06,
      "loss": 0.019,
      "num_tokens": 22016113.0,
      "reward": 1.1228070259094238,
      "reward_std": 0.25016868114471436,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07384868711233139,
      "rewards/one_code_blob_reward/std": 0.022700248286128044,
      "rewards/reward_code_runs/mean": 0.0625000074505806,
      "rewards/reward_code_runs/std": 0.4486822783946991,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09062499552965164,
      "rewards/torch_zeros_reward/std": 0.029301069676876068,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1407.0,
      "completions/max_terminated_length": 1407.0,
      "completions/mean_length": 623.90625,
      "completions/mean_terminated_length": 623.90625,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 1.5980707395498392,
      "grad_norm": 0.8318855166435242,
      "learning_rate": 1e-06,
      "loss": -0.0258,
      "num_tokens": 22118344.0,
      "reward": 0.9855782985687256,
      "reward_std": 0.16274294257164001,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.0610990971326828,
      "rewards/one_code_blob_reward/std": 0.03232499584555626,
      "rewards/reward_code_runs/mean": -0.05885416269302368,
      "rewards/reward_code_runs/std": 0.24602040648460388,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0833333358168602,
      "rewards/torch_zeros_reward/std": 0.03746343404054642,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1078.0,
      "completions/max_terminated_length": 1078.0,
      "completions/mean_length": 696.3854370117188,
      "completions/mean_terminated_length": 696.3854370117188,
      "completions/min_length": 353.0,
      "completions/min_terminated_length": 353.0,
      "epoch": 1.607717041800643,
      "grad_norm": 0.8529621362686157,
      "learning_rate": 1e-06,
      "loss": 0.0254,
      "num_tokens": 22232525.0,
      "reward": 0.8414582014083862,
      "reward_std": 0.14661264419555664,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.05343733727931976,
      "rewards/one_code_blob_reward/std": 0.01874612644314766,
      "rewards/reward_code_runs/mean": -0.17031247913837433,
      "rewards/reward_code_runs/std": 0.17268440127372742,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06875000149011612,
      "rewards/torch_zeros_reward/std": 0.04659455642104149,
      "rewards/valid_tl_methods_reward/mean": 0.19166667759418488,
      "rewards/valid_tl_methods_reward/std": 0.04017505422234535,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1035.0,
      "completions/max_terminated_length": 1035.0,
      "completions/mean_length": 612.2083740234375,
      "completions/mean_terminated_length": 612.2083740234375,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 1.617363344051447,
      "grad_norm": 0.8072406649589539,
      "learning_rate": 1e-06,
      "loss": 0.0471,
      "num_tokens": 22336597.0,
      "reward": 1.2598329782485962,
      "reward_std": 0.2021738737821579,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.05931195989251137,
      "rewards/one_code_blob_reward/std": 0.025900106877088547,
      "rewards/reward_code_runs/mean": 0.20364584028720856,
      "rewards/reward_code_runs/std": 0.4453410804271698,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1458.0,
      "completions/max_terminated_length": 1458.0,
      "completions/mean_length": 598.25,
      "completions/mean_terminated_length": 598.25,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 1.6270096463022508,
      "grad_norm": 1.389051079750061,
      "learning_rate": 1e-06,
      "loss": 0.0229,
      "num_tokens": 22434457.0,
      "reward": 1.1792818307876587,
      "reward_std": 0.2810722887516022,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.19583334028720856,
      "rewards/imports_decorator_reward/std": 0.02871517650783062,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.06938587129116058,
      "rewards/one_code_blob_reward/std": 0.04094384238123894,
      "rewards/reward_code_runs/mean": 0.12968750298023224,
      "rewards/reward_code_runs/std": 0.3488762378692627,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08958333730697632,
      "rewards/torch_zeros_reward/std": 0.03070801869034767,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1077.0,
      "completions/max_terminated_length": 1077.0,
      "completions/mean_length": 571.9896240234375,
      "completions/mean_terminated_length": 571.9896240234375,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 1.6366559485530545,
      "grad_norm": 0.7332437634468079,
      "learning_rate": 1e-06,
      "loss": -0.0358,
      "num_tokens": 22529976.0,
      "reward": 1.359358310699463,
      "reward_std": 0.21025454998016357,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06821238249540329,
      "rewards/one_code_blob_reward/std": 0.030263084918260574,
      "rewards/reward_code_runs/mean": 0.29114583134651184,
      "rewards/reward_code_runs/std": 0.5550246238708496,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1574.0,
      "completions/max_terminated_length": 1574.0,
      "completions/mean_length": 698.34375,
      "completions/mean_terminated_length": 698.34375,
      "completions/min_length": 284.0,
      "completions/min_terminated_length": 284.0,
      "epoch": 1.6463022508038585,
      "grad_norm": 0.8411015272140503,
      "learning_rate": 1e-06,
      "loss": 0.0503,
      "num_tokens": 22642857.0,
      "reward": 1.1509120464324951,
      "reward_std": 0.19191358983516693,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.051953624933958054,
      "rewards/one_code_blob_reward/std": 0.030798546969890594,
      "rewards/reward_code_runs/mean": 0.10312500596046448,
      "rewards/reward_code_runs/std": 0.40110453963279724,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09791667014360428,
      "rewards/torch_zeros_reward/std": 0.01435758825391531,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 909.0,
      "completions/max_terminated_length": 909.0,
      "completions/mean_length": 575.34375,
      "completions/mean_terminated_length": 575.34375,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 1.6559485530546625,
      "grad_norm": 0.9720630049705505,
      "learning_rate": 1e-06,
      "loss": 0.0254,
      "num_tokens": 22742142.0,
      "reward": 1.0138139724731445,
      "reward_std": 0.1578996777534485,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.0690222904086113,
      "rewards/one_code_blob_reward/std": 0.022182492539286613,
      "rewards/reward_code_runs/mean": -0.05416666343808174,
      "rewards/reward_code_runs/std": 0.2466263622045517,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0989583358168602,
      "rewards/torch_zeros_reward/std": 0.010206207633018494,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1171.0,
      "completions/max_terminated_length": 1171.0,
      "completions/mean_length": 618.8229370117188,
      "completions/mean_terminated_length": 618.8229370117188,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 1.6655948553054662,
      "grad_norm": 0.9359613060951233,
      "learning_rate": 1e-06,
      "loss": 0.0075,
      "num_tokens": 22846405.0,
      "reward": 1.0907633304595947,
      "reward_std": 0.21883492171764374,
      "rewards/constexpr_reward/mean": 0.19583334028720856,
      "rewards/constexpr_reward/std": 0.02871517650783062,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.05586738511919975,
      "rewards/one_code_blob_reward/std": 0.03974929451942444,
      "rewards/reward_code_runs/mean": 0.07343750447034836,
      "rewards/reward_code_runs/std": 0.48045775294303894,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09583333879709244,
      "rewards/torch_zeros_reward/std": 0.020087527111172676,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 981.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 539.7916870117188,
      "completions/mean_terminated_length": 539.7916870117188,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 1.67524115755627,
      "grad_norm": 1.1559139490127563,
      "learning_rate": 1e-06,
      "loss": 0.0079,
      "num_tokens": 22940153.0,
      "reward": 1.1731069087982178,
      "reward_std": 0.24236929416656494,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07414843887090683,
      "rewards/one_code_blob_reward/std": 0.029914140701293945,
      "rewards/reward_code_runs/mean": 0.11145833134651184,
      "rewards/reward_code_runs/std": 0.41131874918937683,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08958333730697632,
      "rewards/torch_zeros_reward/std": 0.030708016827702522,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1343.0,
      "completions/max_terminated_length": 1343.0,
      "completions/mean_length": 534.3229370117188,
      "completions/mean_terminated_length": 534.3229370117188,
      "completions/min_length": 281.0,
      "completions/min_terminated_length": 281.0,
      "epoch": 1.684887459807074,
      "grad_norm": 0.7538516521453857,
      "learning_rate": 1e-06,
      "loss": 0.0254,
      "num_tokens": 23032812.0,
      "reward": 1.4256443977355957,
      "reward_std": 0.15253810584545135,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07772766798734665,
      "rewards/one_code_blob_reward/std": 0.02698957547545433,
      "rewards/reward_code_runs/mean": 0.359375,
      "rewards/reward_code_runs/std": 0.5299112200737,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09062501043081284,
      "rewards/torch_zeros_reward/std": 0.02930106781423092,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1460.0,
      "completions/max_terminated_length": 1460.0,
      "completions/mean_length": 592.8646240234375,
      "completions/mean_terminated_length": 592.8646240234375,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 1.694533762057878,
      "grad_norm": 0.9495415091514587,
      "learning_rate": 1e-06,
      "loss": 0.0298,
      "num_tokens": 23131895.0,
      "reward": 1.1976202726364136,
      "reward_std": 0.254415363073349,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07053681463003159,
      "rewards/one_code_blob_reward/std": 0.03324928507208824,
      "rewards/reward_code_runs/mean": 0.15937501192092896,
      "rewards/reward_code_runs/std": 0.4877074658870697,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.078125,
      "rewards/torch_zeros_reward/std": 0.04155687242746353,
      "rewards/valid_tl_methods_reward/mean": 0.19166667759418488,
      "rewards/valid_tl_methods_reward/std": 0.04017505422234535,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1105.0,
      "completions/max_terminated_length": 1105.0,
      "completions/mean_length": 614.21875,
      "completions/mean_terminated_length": 614.21875,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 1.7041800643086815,
      "grad_norm": 1.123584508895874,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 23237456.0,
      "reward": 1.0600738525390625,
      "reward_std": 0.2565411627292633,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.06371968239545822,
      "rewards/one_code_blob_reward/std": 0.025978902354836464,
      "rewards/reward_code_runs/mean": 0.0026041690725833178,
      "rewards/reward_code_runs/std": 0.3209664523601532,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09687501192092896,
      "rewards/torch_zeros_reward/std": 0.017490599304437637,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1024.0,
      "completions/mean_length": 692.0729370117188,
      "completions/mean_terminated_length": 692.0729370117188,
      "completions/min_length": 389.0,
      "completions/min_terminated_length": 389.0,
      "epoch": 1.7138263665594855,
      "grad_norm": 0.9737513661384583,
      "learning_rate": 1e-06,
      "loss": 0.0171,
      "num_tokens": 23353047.0,
      "reward": 0.899864673614502,
      "reward_std": 0.2174696922302246,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.047260504215955734,
      "rewards/one_code_blob_reward/std": 0.015411733649671078,
      "rewards/reward_code_runs/mean": -0.1171875,
      "rewards/reward_code_runs/std": 0.2723028361797333,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07187499850988388,
      "rewards/torch_zeros_reward/std": 0.04519694298505783,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 937.0,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 514.3333740234375,
      "completions/mean_terminated_length": 514.3333740234375,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 1.7234726688102895,
      "grad_norm": 0.9674484133720398,
      "learning_rate": 1e-06,
      "loss": 0.0208,
      "num_tokens": 23443451.0,
      "reward": 1.1143759489059448,
      "reward_std": 0.2083703726530075,
      "rewards/constexpr_reward/mean": 0.19166667759418488,
      "rewards/constexpr_reward/std": 0.04017505422234535,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.07427162677049637,
      "rewards/one_code_blob_reward/std": 0.04172372817993164,
      "rewards/reward_code_runs/mean": 0.06822916865348816,
      "rewards/reward_code_runs/std": 0.43559515476226807,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08749999850988388,
      "rewards/torch_zeros_reward/std": 0.033245496451854706,
      "rewards/valid_tl_methods_reward/mean": 0.19583334028720856,
      "rewards/valid_tl_methods_reward/std": 0.02871517650783062,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1690.0,
      "completions/max_terminated_length": 1690.0,
      "completions/mean_length": 710.8958740234375,
      "completions/mean_terminated_length": 710.8958740234375,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 1.7331189710610932,
      "grad_norm": 0.9569810032844543,
      "learning_rate": 1e-06,
      "loss": 0.0461,
      "num_tokens": 23556049.0,
      "reward": 1.02650785446167,
      "reward_std": 0.23694173991680145,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.059841081500053406,
      "rewards/one_code_blob_reward/std": 0.03418285399675369,
      "rewards/reward_code_runs/mean": -0.009375002235174179,
      "rewards/reward_code_runs/std": 0.28721094131469727,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0989583358168602,
      "rewards/torch_zeros_reward/std": 0.010206207633018494,
      "rewards/valid_tl_methods_reward/mean": 0.1770833283662796,
      "rewards/valid_tl_methods_reward/std": 0.06403809040784836,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1070.0,
      "completions/max_terminated_length": 1070.0,
      "completions/mean_length": 628.15625,
      "completions/mean_terminated_length": 628.15625,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 1.742765273311897,
      "grad_norm": 0.8146075010299683,
      "learning_rate": 1e-06,
      "loss": 0.0011,
      "num_tokens": 23663992.0,
      "reward": 1.0439410209655762,
      "reward_std": 0.24557435512542725,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.0606076605618,
      "rewards/one_code_blob_reward/std": 0.021280352026224136,
      "rewards/reward_code_runs/mean": 2.4835269396561444e-09,
      "rewards/reward_code_runs/std": 0.286540150642395,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": -0.0010416667209938169,
      "rewards/torch_empty_penalty/std": 0.010206207633018494,
      "rewards/torch_zeros_reward/mean": 0.0989583358168602,
      "rewards/torch_zeros_reward/std": 0.010206207633018494,
      "rewards/valid_tl_methods_reward/mean": 0.18541668355464935,
      "rewards/valid_tl_methods_reward/std": 0.05227290466427803,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1395.0,
      "completions/max_terminated_length": 1395.0,
      "completions/mean_length": 624.0625,
      "completions/mean_terminated_length": 624.0625,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 1.752411575562701,
      "grad_norm": 0.9790536761283875,
      "learning_rate": 1e-06,
      "loss": 0.0055,
      "num_tokens": 23769790.0,
      "reward": 1.292978286743164,
      "reward_std": 0.16918937861919403,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.058603256940841675,
      "rewards/one_code_blob_reward/std": 0.02803383395075798,
      "rewards/reward_code_runs/mean": 0.24895834922790527,
      "rewards/reward_code_runs/std": 0.4883365333080292,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08749999850988388,
      "rewards/torch_zeros_reward/std": 0.033245496451854706,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2086.0,
      "completions/max_terminated_length": 2086.0,
      "completions/mean_length": 633.5729370117188,
      "completions/mean_terminated_length": 633.5729370117188,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 1.762057877813505,
      "grad_norm": 0.8213247656822205,
      "learning_rate": 1e-06,
      "loss": 0.0247,
      "num_tokens": 23873021.0,
      "reward": 1.1788103580474854,
      "reward_std": 0.14977888762950897,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06631030887365341,
      "rewards/one_code_blob_reward/std": 0.030902283266186714,
      "rewards/reward_code_runs/mean": 0.11249998956918716,
      "rewards/reward_code_runs/std": 0.3979552984237671,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1249.0,
      "completions/max_terminated_length": 1249.0,
      "completions/mean_length": 575.8958740234375,
      "completions/mean_terminated_length": 575.8958740234375,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 1.7717041800643085,
      "grad_norm": 0.9315298199653625,
      "learning_rate": 1e-06,
      "loss": 0.0063,
      "num_tokens": 23970967.0,
      "reward": 1.3047760725021362,
      "reward_std": 0.23194283246994019,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06935928016901016,
      "rewards/one_code_blob_reward/std": 0.035950787365436554,
      "rewards/reward_code_runs/mean": 0.2541666626930237,
      "rewards/reward_code_runs/std": 0.5269408226013184,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0833333358168602,
      "rewards/torch_zeros_reward/std": 0.03746343404054642,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1222.0,
      "completions/max_terminated_length": 1222.0,
      "completions/mean_length": 509.57293701171875,
      "completions/mean_terminated_length": 509.57293701171875,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 1.7813504823151125,
      "grad_norm": 1.0050220489501953,
      "learning_rate": 1e-06,
      "loss": 0.0742,
      "num_tokens": 24061922.0,
      "reward": 1.1889946460723877,
      "reward_std": 0.1527276635169983,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.07649455219507217,
      "rewards/one_code_blob_reward/std": 0.02742423489689827,
      "rewards/reward_code_runs/mean": 0.11250000447034836,
      "rewards/reward_code_runs/std": 0.3979552984237671,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1045.0,
      "completions/max_terminated_length": 1045.0,
      "completions/mean_length": 449.47918701171875,
      "completions/mean_terminated_length": 449.47918701171875,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 1.7909967845659165,
      "grad_norm": 1.2509444952011108,
      "learning_rate": 1e-06,
      "loss": 0.0378,
      "num_tokens": 24144228.0,
      "reward": 1.2286624908447266,
      "reward_std": 0.21664278209209442,
      "rewards/constexpr_reward/mean": 0.18958334624767303,
      "rewards/constexpr_reward/std": 0.044672295451164246,
      "rewards/imports_decorator_reward/mean": 0.18958334624767303,
      "rewards/imports_decorator_reward/std": 0.044672295451164246,
      "rewards/masks_load_store_reward/mean": 0.09375,
      "rewards/masks_load_store_reward/std": 0.02433321252465248,
      "rewards/one_code_blob_reward/mean": 0.0922040268778801,
      "rewards/one_code_blob_reward/std": 0.03387906774878502,
      "rewards/reward_code_runs/mean": 0.20937500894069672,
      "rewards/reward_code_runs/std": 0.43025773763656616,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.06875000149011612,
      "rewards/torch_zeros_reward/std": 0.04659455642104149,
      "rewards/valid_tl_methods_reward/mean": 0.18541668355464935,
      "rewards/valid_tl_methods_reward/std": 0.05227290466427803,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 954.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 421.54168701171875,
      "completions/mean_terminated_length": 421.54168701171875,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 1.8006430868167203,
      "grad_norm": 1.323587417602539,
      "learning_rate": 1e-06,
      "loss": 0.028,
      "num_tokens": 24224920.0,
      "reward": 1.3313677310943604,
      "reward_std": 0.0972091406583786,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.09230508655309677,
      "rewards/one_code_blob_reward/std": 0.022745473310351372,
      "rewards/reward_code_runs/mean": 0.23906248807907104,
      "rewards/reward_code_runs/std": 0.3271248936653137,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 955.0,
      "completions/max_terminated_length": 955.0,
      "completions/mean_length": 513.0729370117188,
      "completions/mean_terminated_length": 513.0729370117188,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 1.810289389067524,
      "grad_norm": 0.9576202630996704,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 24314903.0,
      "reward": 1.1814203262329102,
      "reward_std": 0.14891770482063293,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09687501192092896,
      "rewards/masks_load_store_reward/std": 0.017490599304437637,
      "rewards/one_code_blob_reward/mean": 0.07517031580209732,
      "rewards/one_code_blob_reward/std": 0.027302829548716545,
      "rewards/reward_code_runs/mean": 0.12083333730697632,
      "rewards/reward_code_runs/std": 0.4080548584461212,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0989583358168602,
      "rewards/torch_zeros_reward/std": 0.010206207633018494,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 963.0,
      "completions/max_terminated_length": 963.0,
      "completions/mean_length": 566.3541870117188,
      "completions/mean_terminated_length": 566.3541870117188,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 1.819935691318328,
      "grad_norm": 1.361865758895874,
      "learning_rate": 1e-06,
      "loss": 0.0454,
      "num_tokens": 24415089.0,
      "reward": 1.1108605861663818,
      "reward_std": 0.12311562150716782,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.06815212219953537,
      "rewards/one_code_blob_reward/std": 0.02729409746825695,
      "rewards/reward_code_runs/mean": 0.05625000223517418,
      "rewards/reward_code_runs/std": 0.41328275203704834,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08645833283662796,
      "rewards/torch_zeros_reward/std": 0.034396424889564514,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1078.0,
      "completions/max_terminated_length": 1078.0,
      "completions/mean_length": 466.7708435058594,
      "completions/mean_terminated_length": 466.7708435058594,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 1.829581993569132,
      "grad_norm": 0.9256302714347839,
      "learning_rate": 1e-06,
      "loss": -0.0039,
      "num_tokens": 24500771.0,
      "reward": 1.3085904121398926,
      "reward_std": 0.16580632328987122,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.08827787637710571,
      "rewards/one_code_blob_reward/std": 0.02642660029232502,
      "rewards/reward_code_runs/mean": 0.22343750298023224,
      "rewards/reward_code_runs/std": 0.4222092926502228,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1132.0,
      "completions/max_terminated_length": 1132.0,
      "completions/mean_length": 482.71875,
      "completions/mean_terminated_length": 482.71875,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 1.8392282958199357,
      "grad_norm": 1.2202610969543457,
      "learning_rate": 1e-06,
      "loss": 0.0375,
      "num_tokens": 24593192.0,
      "reward": 1.159076452255249,
      "reward_std": 0.19445598125457764,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.08043061196804047,
      "rewards/one_code_blob_reward/std": 0.027151940390467644,
      "rewards/reward_code_runs/mean": 0.09010416269302368,
      "rewards/reward_code_runs/std": 0.39186593890190125,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0885416641831398,
      "rewards/torch_zeros_reward/std": 0.03201904520392418,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 852.0,
      "completions/max_terminated_length": 852.0,
      "completions/mean_length": 473.8645935058594,
      "completions/mean_terminated_length": 473.8645935058594,
      "completions/min_length": 309.0,
      "completions/min_terminated_length": 309.0,
      "epoch": 1.8488745980707395,
      "grad_norm": 1.0192776918411255,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 24682363.0,
      "reward": 0.9869474768638611,
      "reward_std": 0.13572761416435242,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.08173908293247223,
      "rewards/one_code_blob_reward/std": 0.034760426729917526,
      "rewards/reward_code_runs/mean": -0.06354166567325592,
      "rewards/reward_code_runs/std": 0.24532246589660645,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0989583358168602,
      "rewards/torch_zeros_reward/std": 0.010206207633018494,
      "rewards/valid_tl_methods_reward/mean": 0.17499999701976776,
      "rewards/valid_tl_methods_reward/std": 0.06649099290370941,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 731.0,
      "completions/max_terminated_length": 731.0,
      "completions/mean_length": 475.13543701171875,
      "completions/mean_terminated_length": 475.13543701171875,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 1.8585209003215435,
      "grad_norm": 1.1663234233856201,
      "learning_rate": 1e-06,
      "loss": 0.0225,
      "num_tokens": 24770492.0,
      "reward": 1.237094521522522,
      "reward_std": 0.20853112637996674,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.014357589185237885,
      "rewards/one_code_blob_reward/mean": 0.07928191870450974,
      "rewards/one_code_blob_reward/std": 0.03703931346535683,
      "rewards/reward_code_runs/mean": 0.1598958522081375,
      "rewards/reward_code_runs/std": 0.4301542043685913,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1129.0,
      "completions/max_terminated_length": 1129.0,
      "completions/mean_length": 506.625,
      "completions/mean_terminated_length": 506.625,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 1.8681672025723473,
      "grad_norm": 1.028404712677002,
      "learning_rate": 1e-06,
      "loss": 0.0222,
      "num_tokens": 24862640.0,
      "reward": 1.3358540534973145,
      "reward_std": 0.20964989066123962,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.08012475818395615,
      "rewards/one_code_blob_reward/std": 0.028094014152884483,
      "rewards/reward_code_runs/mean": 0.2588541805744171,
      "rewards/reward_code_runs/std": 0.42457374930381775,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 941.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 483.82293701171875,
      "completions/mean_terminated_length": 483.82293701171875,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 1.877813504823151,
      "grad_norm": 1.0317487716674805,
      "learning_rate": 1e-06,
      "loss": 0.0237,
      "num_tokens": 24948291.0,
      "reward": 1.1043274402618408,
      "reward_std": 0.20908395946025848,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09583333879709244,
      "rewards/masks_load_store_reward/std": 0.020087527111172676,
      "rewards/one_code_blob_reward/mean": 0.08870226889848709,
      "rewards/one_code_blob_reward/std": 0.023371906951069832,
      "rewards/reward_code_runs/mean": 0.02187499962747097,
      "rewards/reward_code_runs/std": 0.22121821343898773,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 732.0,
      "completions/max_terminated_length": 732.0,
      "completions/mean_length": 513.7291870117188,
      "completions/mean_terminated_length": 513.7291870117188,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 1.887459807073955,
      "grad_norm": 1.1328577995300293,
      "learning_rate": 1e-06,
      "loss": 0.0337,
      "num_tokens": 25046245.0,
      "reward": 1.0973868370056152,
      "reward_std": 0.1983201801776886,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.08176174759864807,
      "rewards/one_code_blob_reward/std": 0.019281970337033272,
      "rewards/reward_code_runs/mean": 0.03020833432674408,
      "rewards/reward_code_runs/std": 0.24208298325538635,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08645833283662796,
      "rewards/torch_zeros_reward/std": 0.034396424889564514,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 882.0,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 499.7708435058594,
      "completions/mean_terminated_length": 499.7708435058594,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 1.897106109324759,
      "grad_norm": 1.057507872581482,
      "learning_rate": 1e-06,
      "loss": 0.0446,
      "num_tokens": 25138323.0,
      "reward": 1.2629551887512207,
      "reward_std": 0.14245173335075378,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.0827467143535614,
      "rewards/one_code_blob_reward/std": 0.03230925649404526,
      "rewards/reward_code_runs/mean": 0.20416666567325592,
      "rewards/reward_code_runs/std": 0.3813871145248413,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07604166865348816,
      "rewards/torch_zeros_reward/std": 0.042906977236270905,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1365.0,
      "completions/max_terminated_length": 1365.0,
      "completions/mean_length": 664.9583740234375,
      "completions/mean_terminated_length": 664.9583740234375,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 1.9067524115755627,
      "grad_norm": 1.36724853515625,
      "learning_rate": 1e-06,
      "loss": 0.0326,
      "num_tokens": 25251311.0,
      "reward": 1.0337120294570923,
      "reward_std": 0.21500375866889954,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09479167312383652,
      "rewards/masks_load_store_reward/std": 0.022336147725582123,
      "rewards/one_code_blob_reward/mean": 0.060274433344602585,
      "rewards/one_code_blob_reward/std": 0.04737605154514313,
      "rewards/reward_code_runs/mean": 0.01718750037252903,
      "rewards/reward_code_runs/std": 0.2221696972846985,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0677083358168602,
      "rewards/torch_zeros_reward/std": 0.047004569321870804,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 905.0,
      "completions/max_terminated_length": 905.0,
      "completions/mean_length": 465.8645935058594,
      "completions/mean_terminated_length": 465.8645935058594,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 1.9163987138263665,
      "grad_norm": 1.077828049659729,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 25338658.0,
      "reward": 1.3010025024414062,
      "reward_std": 0.1482926309108734,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.0989583358168602,
      "rewards/masks_load_store_reward/std": 0.010206207633018494,
      "rewards/one_code_blob_reward/mean": 0.0885024294257164,
      "rewards/one_code_blob_reward/std": 0.03680278733372688,
      "rewards/reward_code_runs/mean": 0.22187499701976776,
      "rewards/reward_code_runs/std": 0.4921388626098633,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09791667014360428,
      "rewards/torch_zeros_reward/std": 0.01435758825391531,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 825.0,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 469.71875,
      "completions/mean_terminated_length": 469.71875,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 1.9260450160771705,
      "grad_norm": 1.516790509223938,
      "learning_rate": 1e-06,
      "loss": 0.0507,
      "num_tokens": 25425919.0,
      "reward": 1.1425931453704834,
      "reward_std": 0.15534614026546478,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.09207212179899216,
      "rewards/one_code_blob_reward/std": 0.03535056114196777,
      "rewards/reward_code_runs/mean": 0.07968749850988388,
      "rewards/reward_code_runs/std": 0.4079201817512512,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09791667014360428,
      "rewards/torch_zeros_reward/std": 0.01435758825391531,
      "rewards/valid_tl_methods_reward/mean": 0.17291666567325592,
      "rewards/valid_tl_methods_reward/std": 0.06879284977912903,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1290.0,
      "completions/max_terminated_length": 1290.0,
      "completions/mean_length": 515.4583740234375,
      "completions/mean_terminated_length": 515.4583740234375,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 1.9356913183279743,
      "grad_norm": 1.4440219402313232,
      "learning_rate": 1e-06,
      "loss": 0.0678,
      "num_tokens": 25519899.0,
      "reward": 1.2874916791915894,
      "reward_std": 0.09185895323753357,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.01435758825391531,
      "rewards/one_code_blob_reward/mean": 0.0786375030875206,
      "rewards/one_code_blob_reward/std": 0.039403628557920456,
      "rewards/reward_code_runs/mean": 0.2421875,
      "rewards/reward_code_runs/std": 0.5983014106750488,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07916666567325592,
      "rewards/torch_zeros_reward/std": 0.040824830532073975,
      "rewards/valid_tl_methods_reward/mean": 0.18958334624767303,
      "rewards/valid_tl_methods_reward/std": 0.044672295451164246,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1100.0,
      "completions/max_terminated_length": 1100.0,
      "completions/mean_length": 550.1458740234375,
      "completions/mean_terminated_length": 550.1458740234375,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 1.945337620578778,
      "grad_norm": 4.093983173370361,
      "learning_rate": 1e-06,
      "loss": 0.0547,
      "num_tokens": 25618949.0,
      "reward": 1.142181634902954,
      "reward_std": 0.13598135113716125,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.08072315901517868,
      "rewards/one_code_blob_reward/std": 0.026570141315460205,
      "rewards/reward_code_runs/mean": 0.06874999403953552,
      "rewards/reward_code_runs/std": 0.20561204850673676,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09791667014360428,
      "rewards/torch_zeros_reward/std": 0.01435758825391531,
      "rewards/valid_tl_methods_reward/mean": 0.1979166716337204,
      "rewards/valid_tl_methods_reward/std": 0.020412415266036987,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 807.0,
      "completions/max_terminated_length": 807.0,
      "completions/mean_length": 415.19793701171875,
      "completions/mean_terminated_length": 415.19793701171875,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 1.954983922829582,
      "grad_norm": 1.3485451936721802,
      "learning_rate": 1e-06,
      "loss": 0.0558,
      "num_tokens": 25698672.0,
      "reward": 1.3339226245880127,
      "reward_std": 0.17535977065563202,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.10371416807174683,
      "rewards/one_code_blob_reward/std": 0.03486758470535278,
      "rewards/reward_code_runs/mean": 0.26250001788139343,
      "rewards/reward_code_runs/std": 0.4343779683113098,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.07500000298023224,
      "rewards/torch_zeros_reward/std": 0.04352857545018196,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 765.0,
      "completions/max_terminated_length": 765.0,
      "completions/mean_length": 475.3645935058594,
      "completions/mean_terminated_length": 475.3645935058594,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 1.964630225080386,
      "grad_norm": 2.2553164958953857,
      "learning_rate": 1e-06,
      "loss": 0.0669,
      "num_tokens": 25789811.0,
      "reward": 1.1456621885299683,
      "reward_std": 0.17673367261886597,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.08837032318115234,
      "rewards/one_code_blob_reward/std": 0.033976636826992035,
      "rewards/reward_code_runs/mean": 0.078125,
      "rewards/reward_code_runs/std": 0.2010253369808197,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.08749999850988388,
      "rewards/torch_zeros_reward/std": 0.033245496451854706,
      "rewards/valid_tl_methods_reward/mean": 0.19166667759418488,
      "rewards/valid_tl_methods_reward/std": 0.04017505422234535,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1050.0,
      "completions/max_terminated_length": 1050.0,
      "completions/mean_length": 482.88543701171875,
      "completions/mean_terminated_length": 482.88543701171875,
      "completions/min_length": 287.0,
      "completions/min_terminated_length": 287.0,
      "epoch": 1.9742765273311897,
      "grad_norm": 2.0317275524139404,
      "learning_rate": 1e-06,
      "loss": 0.0553,
      "num_tokens": 25880376.0,
      "reward": 1.2459683418273926,
      "reward_std": 0.13240359723567963,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.08763498812913895,
      "rewards/one_code_blob_reward/std": 0.03719012811779976,
      "rewards/reward_code_runs/mean": 0.1614583432674408,
      "rewards/reward_code_runs/std": 0.34826478362083435,
      "rewards/think_reward/mean": 0.19687502086162567,
      "rewards/think_reward/std": 0.03061862289905548,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.10000000149011612,
      "rewards/torch_zeros_reward/std": 0.0,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 875.0,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 454.22918701171875,
      "completions/mean_terminated_length": 454.22918701171875,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 1.9839228295819935,
      "grad_norm": 2.1294896602630615,
      "learning_rate": 1e-06,
      "loss": 0.0397,
      "num_tokens": 25969666.0,
      "reward": 1.1490356922149658,
      "reward_std": 0.12597203254699707,
      "rewards/constexpr_reward/mean": 0.20000000298023224,
      "rewards/constexpr_reward/std": 0.0,
      "rewards/imports_decorator_reward/mean": 0.20000000298023224,
      "rewards/imports_decorator_reward/std": 0.0,
      "rewards/masks_load_store_reward/mean": 0.10000000149011612,
      "rewards/masks_load_store_reward/std": 0.0,
      "rewards/one_code_blob_reward/mean": 0.09486893564462662,
      "rewards/one_code_blob_reward/std": 0.03526860475540161,
      "rewards/reward_code_runs/mean": 0.05937499925494194,
      "rewards/reward_code_runs/std": 0.20967550575733185,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.09479167312383652,
      "rewards/torch_zeros_reward/std": 0.022336147725582123,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 702.0,
      "completions/max_terminated_length": 702.0,
      "completions/mean_length": 484.5333557128906,
      "completions/mean_terminated_length": 484.5333557128906,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 1.9935691318327975,
      "grad_norm": 1.774752140045166,
      "learning_rate": 1e-06,
      "loss": 0.0223,
      "num_tokens": 26056652.0,
      "reward": 1.3336315155029297,
      "reward_std": 0.21346089243888855,
      "rewards/constexpr_reward/mean": 0.1979166716337204,
      "rewards/constexpr_reward/std": 0.020412415266036987,
      "rewards/imports_decorator_reward/mean": 0.1979166716337204,
      "rewards/imports_decorator_reward/std": 0.020412415266036987,
      "rewards/masks_load_store_reward/mean": 0.09791667014360428,
      "rewards/masks_load_store_reward/std": 0.014357589185237885,
      "rewards/one_code_blob_reward/mean": 0.09613153338432312,
      "rewards/one_code_blob_reward/std": 0.017497001215815544,
      "rewards/reward_code_runs/mean": 0.2552083432674408,
      "rewards/reward_code_runs/std": 0.41450533270835876,
      "rewards/think_reward/mean": 0.20000000298023224,
      "rewards/think_reward/std": 0.0,
      "rewards/torch_empty_penalty/mean": 0.0,
      "rewards/torch_empty_penalty/std": 0.0,
      "rewards/torch_zeros_reward/mean": 0.0885416641831398,
      "rewards/torch_zeros_reward/std": 0.03201904892921448,
      "rewards/valid_tl_methods_reward/mean": 0.20000000298023224,
      "rewards/valid_tl_methods_reward/std": 0.0,
      "step": 206
    }
  ],
  "logging_steps": 1,
  "max_steps": 515,
  "num_input_tokens_seen": 26056652,
  "num_train_epochs": 5,
  "save_steps": 103,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}