{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9935691318327975, "eval_steps": 500, "global_step": 206, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 995.7604370117188, "completions/mean_terminated_length": 963.1263427734375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.00964630225080386, "grad_norm": 0.6591727137565613, "learning_rate": 0.0, "loss": 0.066, "num_tokens": 136897.0, "reward": 0.6407926082611084, "reward_std": 0.26224446296691895, "rewards/constexpr_reward/mean": 0.11666667461395264, "rewards/constexpr_reward/std": 0.0991189256310463, "rewards/imports_decorator_reward/mean": 0.1770833283662796, "rewards/imports_decorator_reward/std": 0.06403809040784836, "rewards/masks_load_store_reward/mean": 0.0885416641831398, "rewards/masks_load_store_reward/std": 0.03201904892921448, "rewards/one_code_blob_reward/mean": 0.02701156586408615, "rewards/one_code_blob_reward/std": 0.054558686912059784, "rewards/reward_code_runs/mean": -0.13697917759418488, "rewards/reward_code_runs/std": 0.28388699889183044, "rewards/think_reward/mean": 0.19867680966854095, "rewards/think_reward/std": 0.009136492386460304, "rewards/torch_empty_penalty/mean": -0.01458333432674408, "rewards/torch_empty_penalty/std": 0.03547917678952217, "rewards/torch_zeros_reward/mean": 0.01145833358168602, "rewards/torch_zeros_reward/std": 0.03201904520392418, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3924.0, "completions/max_terminated_length": 3924.0, "completions/mean_length": 1379.291748046875, "completions/mean_terminated_length": 1379.291748046875, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.01929260450160772, "grad_norm": 0.5995727181434631, "learning_rate": 1e-08, "loss": -0.0234, "num_tokens": 313889.0, "reward": 0.3980224132537842, "reward_std": 0.2479538917541504, "rewards/constexpr_reward/mean": 0.10000000149011612, "rewards/constexpr_reward/std": 0.10052493959665298, "rewards/imports_decorator_reward/mean": 0.17916667461395264, "rewards/imports_decorator_reward/std": 0.061416033655405045, "rewards/masks_load_store_reward/mean": 0.0677083358168602, "rewards/masks_load_store_reward/std": 0.047004569321870804, "rewards/one_code_blob_reward/mean": 0.003911829087883234, "rewards/one_code_blob_reward/std": 0.06139358878135681, "rewards/reward_code_runs/mean": -0.2239583283662796, "rewards/reward_code_runs/std": 0.17946985363960266, "rewards/think_reward/mean": 0.19098560512065887, "rewards/think_reward/std": 0.03435641527175903, "rewards/torch_empty_penalty/mean": -0.02500000037252903, "rewards/torch_empty_penalty/std": 0.04352857545018196, "rewards/torch_zeros_reward/mean": 0.01770833320915699, "rewards/torch_zeros_reward/std": 0.03837431222200394, "rewards/valid_tl_methods_reward/mean": 0.08749999850988388, "rewards/valid_tl_methods_reward/std": 0.09973649680614471, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3371.0, "completions/max_terminated_length": 3371.0, "completions/mean_length": 1258.229248046875, "completions/mean_terminated_length": 1258.229248046875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.028938906752411574, "grad_norm": 0.5150277018547058, "learning_rate": 2e-08, "loss": -0.0327, "num_tokens": 480339.0, "reward": 0.5664182901382446, "reward_std": 0.22327932715415955, "rewards/constexpr_reward/mean": 0.15416666865348816, "rewards/constexpr_reward/std": 0.08450059592723846, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06562500447034836, "rewards/masks_load_store_reward/std": 0.04774521291255951, "rewards/one_code_blob_reward/mean": -0.003944525495171547, "rewards/one_code_blob_reward/std": 0.06892073899507523, "rewards/reward_code_runs/mean": -0.20624999701976776, "rewards/reward_code_runs/std": 0.2225746363401413, "rewards/think_reward/mean": 0.19223777949810028, "rewards/think_reward/std": 0.030695218592882156, "rewards/torch_empty_penalty/mean": -0.01979166641831398, "rewards/torch_empty_penalty/std": 0.04005204886198044, "rewards/torch_zeros_reward/mean": 0.046875, "rewards/torch_zeros_reward/std": 0.05016420781612396, "rewards/valid_tl_methods_reward/mean": 0.1458333283662796, "rewards/valid_tl_methods_reward/std": 0.08934459090232849, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3301.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 1140.697998046875, "completions/mean_terminated_length": 1140.697998046875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.03858520900321544, "grad_norm": 0.4870540201663971, "learning_rate": 3e-08, "loss": -0.0068, "num_tokens": 636226.0, "reward": 0.46876463294029236, "reward_std": 0.16259050369262695, "rewards/constexpr_reward/mean": 0.13125000894069672, "rewards/constexpr_reward/std": 0.09549043327569962, "rewards/imports_decorator_reward/mean": 0.18958334624767303, "rewards/imports_decorator_reward/std": 0.044672295451164246, "rewards/masks_load_store_reward/mean": 0.06562500447034836, "rewards/masks_load_store_reward/std": 0.04774521291255951, "rewards/one_code_blob_reward/mean": 0.006447285413742065, "rewards/one_code_blob_reward/std": 0.06022736802697182, "rewards/reward_code_runs/mean": -0.23593749105930328, "rewards/reward_code_runs/std": 0.07870769500732422, "rewards/think_reward/mean": 0.1992965191602707, "rewards/think_reward/std": 0.006892777048051357, "rewards/torch_empty_penalty/mean": -0.03750000149011612, "rewards/torch_empty_penalty/std": 0.04866642504930496, "rewards/torch_zeros_reward/mean": 0.010416666977107525, "rewards/torch_zeros_reward/std": 0.03070802055299282, "rewards/valid_tl_methods_reward/mean": 0.13958333432674408, "rewards/valid_tl_methods_reward/std": 0.09231429547071457, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 1042.760498046875, "completions/mean_terminated_length": 1042.760498046875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.04823151125401929, "grad_norm": 0.5751297473907471, "learning_rate": 4e-08, "loss": -0.0088, "num_tokens": 778943.0, "reward": 0.5530459880828857, "reward_std": 0.2552254796028137, "rewards/constexpr_reward/mean": 0.12916666269302368, "rewards/constexpr_reward/std": 0.0961541160941124, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.06354167312383652, "rewards/masks_load_store_reward/std": 0.04838397353887558, "rewards/one_code_blob_reward/mean": 0.01439767237752676, "rewards/one_code_blob_reward/std": 0.064917653799057, "rewards/reward_code_runs/mean": -0.17552083730697632, "rewards/reward_code_runs/std": 0.28441956639289856, "rewards/think_reward/mean": 0.19333581626415253, "rewards/think_reward/std": 0.03038639947772026, "rewards/torch_empty_penalty/mean": -0.03333333507180214, "rewards/torch_empty_penalty/std": 0.04738790914416313, "rewards/torch_zeros_reward/mean": 0.03229166939854622, "rewards/torch_zeros_reward/std": 0.047004569321870804, "rewards/valid_tl_methods_reward/mean": 0.14166666567325592, "rewards/valid_tl_methods_reward/std": 0.09138313680887222, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3503.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 1263.84375, "completions/mean_terminated_length": 1263.84375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.05787781350482315, "grad_norm": 0.51258385181427, "learning_rate": 5e-08, "loss": 0.0506, "num_tokens": 946052.0, "reward": 0.5646942853927612, "reward_std": 0.2725030779838562, "rewards/constexpr_reward/mean": 0.1041666641831398, "rewards/constexpr_reward/std": 0.10043764114379883, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.06145833432674408, "rewards/masks_load_store_reward/std": 0.04892484471201897, "rewards/one_code_blob_reward/mean": 0.0059766932390630245, "rewards/one_code_blob_reward/std": 0.05690459534525871, "rewards/reward_code_runs/mean": -0.17916667461395264, "rewards/reward_code_runs/std": 0.2631456255912781, "rewards/think_reward/mean": 0.1941341906785965, "rewards/think_reward/std": 0.03578920289874077, "rewards/torch_empty_penalty/mean": -0.010416666977107525, "rewards/torch_empty_penalty/std": 0.030708016827702522, "rewards/torch_zeros_reward/mean": 0.02812500111758709, "rewards/torch_zeros_reward/std": 0.04519693925976753, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284232854843, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1081.010498046875, "completions/mean_terminated_length": 1081.010498046875, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.06752411575562701, "grad_norm": 0.703319251537323, "learning_rate": 6e-08, "loss": 0.017, "num_tokens": 1092393.0, "reward": 0.5540701746940613, "reward_std": 0.22611849009990692, "rewards/constexpr_reward/mean": 0.12708333134651184, "rewards/constexpr_reward/std": 0.09676794707775116, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.07604166865348816, "rewards/masks_load_store_reward/std": 0.042906977236270905, "rewards/one_code_blob_reward/mean": 0.021316377446055412, "rewards/one_code_blob_reward/std": 0.05409438535571098, "rewards/reward_code_runs/mean": -0.17812500894069672, "rewards/reward_code_runs/std": 0.24298717081546783, "rewards/think_reward/mean": 0.19942043721675873, "rewards/think_reward/std": 0.005678629036992788, "rewards/torch_empty_penalty/mean": -0.03333333507180214, "rewards/torch_empty_penalty/std": 0.04738790914416313, "rewards/torch_zeros_reward/mean": 0.02500000037252903, "rewards/torch_zeros_reward/std": 0.04352857545018196, "rewards/valid_tl_methods_reward/mean": 0.12916667759418488, "rewards/valid_tl_methods_reward/std": 0.0961541160941124, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 1092.4375, "completions/mean_terminated_length": 1092.4375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.07717041800643087, "grad_norm": 0.5653499364852905, "learning_rate": 7e-08, "loss": 0.0581, "num_tokens": 1236051.0, "reward": 0.5787287950515747, "reward_std": 0.27947068214416504, "rewards/constexpr_reward/mean": 0.1145833358168602, "rewards/constexpr_reward/std": 0.0994502454996109, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06354166567325592, "rewards/masks_load_store_reward/std": 0.04838397353887558, "rewards/one_code_blob_reward/mean": 0.01976184733211994, "rewards/one_code_blob_reward/std": 0.04637972265481949, "rewards/reward_code_runs/mean": -0.14479167759418488, "rewards/reward_code_runs/std": 0.33217617869377136, "rewards/think_reward/mean": 0.19646687805652618, "rewards/think_reward/std": 0.017844805493950844, "rewards/torch_empty_penalty/mean": -0.03749999776482582, "rewards/torch_empty_penalty/std": 0.04866642504930496, "rewards/torch_zeros_reward/mean": 0.01458333432674408, "rewards/torch_zeros_reward/std": 0.03547917678952217, "rewards/valid_tl_methods_reward/mean": 0.16041666269302368, "rewards/valid_tl_methods_reward/std": 0.08010409772396088, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1088.4583740234375, "completions/mean_terminated_length": 1088.4583740234375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 0.08681672025723473, "grad_norm": 0.5193675756454468, "learning_rate": 8e-08, "loss": 0.0241, "num_tokens": 1387679.0, "reward": 0.6237555742263794, "reward_std": 0.23493945598602295, "rewards/constexpr_reward/mean": 0.125, "rewards/constexpr_reward/std": 0.09733285009860992, "rewards/imports_decorator_reward/mean": 0.18541665375232697, "rewards/imports_decorator_reward/std": 0.05227290466427803, "rewards/masks_load_store_reward/mean": 0.07499999552965164, "rewards/masks_load_store_reward/std": 0.04352857545018196, "rewards/one_code_blob_reward/mean": 0.011236711405217648, "rewards/one_code_blob_reward/std": 0.05193483084440231, "rewards/reward_code_runs/mean": -0.15312500298023224, "rewards/reward_code_runs/std": 0.3126131594181061, "rewards/think_reward/mean": 0.19793546199798584, "rewards/think_reward/std": 0.018091697245836258, "rewards/torch_empty_penalty/mean": -0.01875000074505806, "rewards/torch_empty_penalty/std": 0.039236124604940414, "rewards/torch_zeros_reward/mean": 0.02187499962747097, "rewards/torch_zeros_reward/std": 0.04155687242746353, "rewards/valid_tl_methods_reward/mean": 0.17916667461395264, "rewards/valid_tl_methods_reward/std": 0.061416033655405045, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 1312.15625, "completions/mean_terminated_length": 1191.11962890625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.09646302250803858, "grad_norm": 0.7497362494468689, "learning_rate": 9e-08, "loss": 0.0166, "num_tokens": 1554818.0, "reward": 0.6600733995437622, "reward_std": 0.34800395369529724, "rewards/constexpr_reward/mean": 0.1145833358168602, "rewards/constexpr_reward/std": 0.0994502454996109, "rewards/imports_decorator_reward/mean": 0.1770833283662796, "rewards/imports_decorator_reward/std": 0.06403809040784836, "rewards/masks_load_store_reward/mean": 0.06562500447034836, "rewards/masks_load_store_reward/std": 0.04774521291255951, "rewards/one_code_blob_reward/mean": 0.017952701076865196, "rewards/one_code_blob_reward/std": 0.06317053735256195, "rewards/reward_code_runs/mean": -0.08437500149011612, "rewards/reward_code_runs/std": 0.4170266091823578, "rewards/think_reward/mean": 0.19732896983623505, "rewards/think_reward/std": 0.020253153517842293, "rewards/torch_empty_penalty/mean": -0.015625, "rewards/torch_empty_penalty/std": 0.03649982064962387, "rewards/torch_zeros_reward/mean": 0.03125, "rewards/torch_zeros_reward/std": 0.04659455642104149, "rewards/valid_tl_methods_reward/mean": 0.15625, "rewards/valid_tl_methods_reward/std": 0.08311374485492706, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 980.1041870117188, "completions/mean_terminated_length": 980.1041870117188, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.10610932475884244, "grad_norm": 0.7204728126525879, "learning_rate": 1e-07, "loss": 0.0299, "num_tokens": 1689792.0, "reward": 0.5421775579452515, "reward_std": 0.2649068534374237, "rewards/constexpr_reward/mean": 0.1145833358168602, "rewards/constexpr_reward/std": 0.0994502454996109, "rewards/imports_decorator_reward/mean": 0.17916667461395264, "rewards/imports_decorator_reward/std": 0.061416033655405045, "rewards/masks_load_store_reward/mean": 0.0729166641831398, "rewards/masks_load_store_reward/std": 0.044672295451164246, "rewards/one_code_blob_reward/mean": 0.028092747554183006, "rewards/one_code_blob_reward/std": 0.04810455068945885, "rewards/reward_code_runs/mean": -0.15677082538604736, "rewards/reward_code_runs/std": 0.2936718463897705, "rewards/think_reward/mean": 0.19898061454296112, "rewards/think_reward/std": 0.009988076984882355, "rewards/torch_empty_penalty/mean": -0.04062500223517418, "rewards/torch_empty_penalty/std": 0.04937104508280754, "rewards/torch_zeros_reward/mean": 0.012500000186264515, "rewards/torch_zeros_reward/std": 0.033245496451854706, "rewards/valid_tl_methods_reward/mean": 0.13333334028720856, "rewards/valid_tl_methods_reward/std": 0.09477581828832626, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3685.0, "completions/max_terminated_length": 3685.0, "completions/mean_length": 1081.572998046875, "completions/mean_terminated_length": 1081.572998046875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.1157556270096463, "grad_norm": 0.5205122828483582, "learning_rate": 1.0999999999999999e-07, "loss": 0.0225, "num_tokens": 1835971.0, "reward": 0.6614155173301697, "reward_std": 0.3161061406135559, "rewards/constexpr_reward/mean": 0.10208333283662796, "rewards/constexpr_reward/std": 0.1005031168460846, "rewards/imports_decorator_reward/mean": 0.18125002086162567, "rewards/imports_decorator_reward/std": 0.058602139353752136, "rewards/masks_load_store_reward/mean": 0.0625, "rewards/masks_load_store_reward/std": 0.04866642504930496, "rewards/one_code_blob_reward/mean": 0.015901053324341774, "rewards/one_code_blob_reward/std": 0.05575646832585335, "rewards/reward_code_runs/mean": -0.02760416828095913, "rewards/reward_code_runs/std": 0.4685852527618408, "rewards/think_reward/mean": 0.19811856746673584, "rewards/think_reward/std": 0.018434301018714905, "rewards/torch_empty_penalty/mean": -0.03854166716337204, "rewards/torch_empty_penalty/std": 0.04892484098672867, "rewards/torch_zeros_reward/mean": 0.0031250000465661287, "rewards/torch_zeros_reward/std": 0.017490599304437637, "rewards/valid_tl_methods_reward/mean": 0.16458334028720856, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3961.0, "completions/mean_length": 1053.5521240234375, "completions/mean_terminated_length": 1021.5263671875, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.12540192926045016, "grad_norm": 0.8472212553024292, "learning_rate": 1.2e-07, "loss": 0.0115, "num_tokens": 1982244.0, "reward": 0.5119444131851196, "reward_std": 0.19705253839492798, "rewards/constexpr_reward/mean": 0.12291666865348816, "rewards/constexpr_reward/std": 0.09784968197345734, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.06041666865348816, "rewards/masks_load_store_reward/std": 0.04915960505604744, "rewards/one_code_blob_reward/mean": 0.020093580707907677, "rewards/one_code_blob_reward/std": 0.061894889920949936, "rewards/reward_code_runs/mean": -0.2265625, "rewards/reward_code_runs/std": 0.10051266103982925, "rewards/think_reward/mean": 0.19341325759887695, "rewards/think_reward/std": 0.03378907963633537, "rewards/torch_empty_penalty/mean": -0.02083333395421505, "rewards/torch_empty_penalty/std": 0.040824830532073975, "rewards/torch_zeros_reward/mean": 0.02916666679084301, "rewards/torch_zeros_reward/std": 0.04569156840443611, "rewards/valid_tl_methods_reward/mean": 0.1458333283662796, "rewards/valid_tl_methods_reward/std": 0.08934459090232849, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1119.21875, "completions/mean_terminated_length": 1119.21875, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.13504823151125403, "grad_norm": 0.5599005818367004, "learning_rate": 1.3e-07, "loss": 0.0018, "num_tokens": 2136117.0, "reward": 0.6745297312736511, "reward_std": 0.32885250449180603, "rewards/constexpr_reward/mean": 0.12291667610406876, "rewards/constexpr_reward/std": 0.09784968197345734, "rewards/imports_decorator_reward/mean": 0.18333333730697632, "rewards/imports_decorator_reward/std": 0.05556724593043327, "rewards/masks_load_store_reward/mean": 0.05937499925494194, "rewards/masks_load_store_reward/std": 0.04937104508280754, "rewards/one_code_blob_reward/mean": 0.00848993007093668, "rewards/one_code_blob_reward/std": 0.05749613419175148, "rewards/reward_code_runs/mean": -0.03229166939854622, "rewards/reward_code_runs/std": 0.4685351550579071, "rewards/think_reward/mean": 0.19937308132648468, "rewards/think_reward/std": 0.006142645608633757, "rewards/torch_empty_penalty/mean": -0.03020833432674408, "rewards/torch_empty_penalty/std": 0.046157147735357285, "rewards/torch_zeros_reward/mean": 0.00729166716337204, "rewards/torch_zeros_reward/std": 0.026136448606848717, "rewards/valid_tl_methods_reward/mean": 0.15625, "rewards/valid_tl_methods_reward/std": 0.08311374485492706, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3819.0, "completions/mean_length": 1086.15625, "completions/mean_terminated_length": 1054.4737548828125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.14469453376205788, "grad_norm": 0.721663236618042, "learning_rate": 1.4e-07, "loss": 0.0261, "num_tokens": 2280180.0, "reward": 0.6691732406616211, "reward_std": 0.2556842565536499, "rewards/constexpr_reward/mean": 0.11874999850988388, "rewards/constexpr_reward/std": 0.09874209016561508, "rewards/imports_decorator_reward/mean": 0.18125002086162567, "rewards/imports_decorator_reward/std": 0.058602139353752136, "rewards/masks_load_store_reward/mean": 0.07708332687616348, "rewards/masks_load_store_reward/std": 0.04225030168890953, "rewards/one_code_blob_reward/mean": 0.025778064504265785, "rewards/one_code_blob_reward/std": 0.055621080100536346, "rewards/reward_code_runs/mean": -0.12031248956918716, "rewards/reward_code_runs/std": 0.324246346950531, "rewards/think_reward/mean": 0.19495761394500732, "rewards/think_reward/std": 0.03575357794761658, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.015625, "rewards/torch_zeros_reward/std": 0.03649982064962387, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3865.0, "completions/max_terminated_length": 3865.0, "completions/mean_length": 1319.541748046875, "completions/mean_terminated_length": 1319.541748046875, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.15434083601286175, "grad_norm": 0.6778246760368347, "learning_rate": 1.5e-07, "loss": 0.0155, "num_tokens": 2451844.0, "reward": 0.5091235637664795, "reward_std": 0.22904875874519348, "rewards/constexpr_reward/mean": 0.11041667312383652, "rewards/constexpr_reward/std": 0.09997806698083878, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06458333879709244, "rewards/masks_load_store_reward/std": 0.0480770580470562, "rewards/one_code_blob_reward/mean": 0.003854154609143734, "rewards/one_code_blob_reward/std": 0.06397390365600586, "rewards/reward_code_runs/mean": -0.21458333730697632, "rewards/reward_code_runs/std": 0.18944749236106873, "rewards/think_reward/mean": 0.19485266506671906, "rewards/think_reward/std": 0.02862134948372841, "rewards/torch_empty_penalty/mean": -0.02083333395421505, "rewards/torch_empty_penalty/std": 0.040824830532073975, "rewards/torch_zeros_reward/mean": 0.02291666716337204, "rewards/torch_zeros_reward/std": 0.04225029796361923, "rewards/valid_tl_methods_reward/mean": 0.15625, "rewards/valid_tl_methods_reward/std": 0.08311374485492706, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3195.0, "completions/max_terminated_length": 3195.0, "completions/mean_length": 1325.729248046875, "completions/mean_terminated_length": 1325.729248046875, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.1639871382636656, "grad_norm": 0.9080011248588562, "learning_rate": 1.6e-07, "loss": -0.0255, "num_tokens": 2625014.0, "reward": 0.5426409244537354, "reward_std": 0.24068626761436462, "rewards/constexpr_reward/mean": 0.12708333134651184, "rewards/constexpr_reward/std": 0.09676794707775116, "rewards/imports_decorator_reward/mean": 0.18333333730697632, "rewards/imports_decorator_reward/std": 0.05556724593043327, "rewards/masks_load_store_reward/mean": 0.06041666865348816, "rewards/masks_load_store_reward/std": 0.04915960505604744, "rewards/one_code_blob_reward/mean": 0.0076617044396698475, "rewards/one_code_blob_reward/std": 0.05307823792099953, "rewards/reward_code_runs/mean": -0.18385416269302368, "rewards/reward_code_runs/std": 0.2603130340576172, "rewards/think_reward/mean": 0.19695837795734406, "rewards/think_reward/std": 0.019501902163028717, "rewards/torch_empty_penalty/mean": -0.012500000186264515, "rewards/torch_empty_penalty/std": 0.033245496451854706, "rewards/torch_zeros_reward/mean": 0.03229166567325592, "rewards/torch_zeros_reward/std": 0.0470045730471611, "rewards/valid_tl_methods_reward/mean": 0.13125000894069672, "rewards/valid_tl_methods_reward/std": 0.09549042582511902, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 1176.71875, "completions/mean_terminated_length": 1176.71875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.17363344051446947, "grad_norm": 0.611341118812561, "learning_rate": 1.7000000000000001e-07, "loss": -0.0074, "num_tokens": 2781767.0, "reward": 0.4905034005641937, "reward_std": 0.17990529537200928, "rewards/constexpr_reward/mean": 0.12291666865348816, "rewards/constexpr_reward/std": 0.09784968197345734, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.07187499850988388, "rewards/masks_load_store_reward/std": 0.04519694298505783, "rewards/one_code_blob_reward/mean": 0.014260203577578068, "rewards/one_code_blob_reward/std": 0.0600602962076664, "rewards/reward_code_runs/mean": -0.25, "rewards/reward_code_runs/std": 0.0, "rewards/think_reward/mean": 0.19707651436328888, "rewards/think_reward/std": 0.018288368359208107, "rewards/torch_empty_penalty/mean": -0.01979166828095913, "rewards/torch_empty_penalty/std": 0.04005204886198044, "rewards/torch_zeros_reward/mean": 0.03125, "rewards/torch_zeros_reward/std": 0.04659455642104149, "rewards/valid_tl_methods_reward/mean": 0.1354166716337204, "rewards/valid_tl_methods_reward/std": 0.0940091460943222, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3502.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 1105.3021240234375, "completions/mean_terminated_length": 1105.3021240234375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.1832797427652733, "grad_norm": 0.5251040458679199, "learning_rate": 1.8e-07, "loss": 0.0387, "num_tokens": 2931664.0, "reward": 0.6205741167068481, "reward_std": 0.25367093086242676, "rewards/constexpr_reward/mean": 0.13125000894069672, "rewards/constexpr_reward/std": 0.09549042582511902, "rewards/imports_decorator_reward/mean": 0.17916667461395264, "rewards/imports_decorator_reward/std": 0.06141604110598564, "rewards/masks_load_store_reward/mean": 0.07083333283662796, "rewards/masks_load_store_reward/std": 0.04569156840443611, "rewards/one_code_blob_reward/mean": 0.021005704998970032, "rewards/one_code_blob_reward/std": 0.05132390931248665, "rewards/reward_code_runs/mean": -0.14427083730697632, "rewards/reward_code_runs/std": 0.24048960208892822, "rewards/think_reward/mean": 0.19800584018230438, "rewards/think_reward/std": 0.012335929088294506, "rewards/torch_empty_penalty/mean": -0.02291666716337204, "rewards/torch_empty_penalty/std": 0.04225030168890953, "rewards/torch_zeros_reward/mean": 0.02083333395421505, "rewards/torch_zeros_reward/std": 0.040824830532073975, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686063051224, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 950.1146240234375, "completions/mean_terminated_length": 950.1146240234375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.19292604501607716, "grad_norm": 0.5570400357246399, "learning_rate": 1.8999999999999998e-07, "loss": 0.0206, "num_tokens": 3063267.0, "reward": 0.6459920406341553, "reward_std": 0.27166104316711426, "rewards/constexpr_reward/mean": 0.09375, "rewards/constexpr_reward/std": 0.10032841563224792, "rewards/imports_decorator_reward/mean": 0.18333333730697632, "rewards/imports_decorator_reward/std": 0.05556724593043327, "rewards/masks_load_store_reward/mean": 0.0572916679084301, "rewards/masks_load_store_reward/std": 0.04972511902451515, "rewards/one_code_blob_reward/mean": 0.020587759092450142, "rewards/one_code_blob_reward/std": 0.06836681067943573, "rewards/reward_code_runs/mean": -0.1015625, "rewards/reward_code_runs/std": 0.32922980189323425, "rewards/think_reward/mean": 0.19884181022644043, "rewards/think_reward/std": 0.007376207038760185, "rewards/torch_empty_penalty/mean": -0.02291666716337204, "rewards/torch_empty_penalty/std": 0.04225029796361923, "rewards/torch_zeros_reward/mean": 0.0416666679084301, "rewards/torch_zeros_reward/std": 0.04955946281552315, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2543.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 1102.510498046875, "completions/mean_terminated_length": 1102.510498046875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.20257234726688103, "grad_norm": 0.7027966380119324, "learning_rate": 2e-07, "loss": 0.0014, "num_tokens": 3208468.0, "reward": 0.8018503189086914, "reward_std": 0.3390127122402191, "rewards/constexpr_reward/mean": 0.13125000894069672, "rewards/constexpr_reward/std": 0.09549042582511902, "rewards/imports_decorator_reward/mean": 0.18333333730697632, "rewards/imports_decorator_reward/std": 0.05556724593043327, "rewards/masks_load_store_reward/mean": 0.05937499925494194, "rewards/masks_load_store_reward/std": 0.04937104508280754, "rewards/one_code_blob_reward/mean": 0.013175372034311295, "rewards/one_code_blob_reward/std": 0.07021530717611313, "rewards/reward_code_runs/mean": 0.0572916679084301, "rewards/reward_code_runs/std": 0.50481778383255, "rewards/think_reward/mean": 0.19492490589618683, "rewards/think_reward/std": 0.026695001870393753, "rewards/torch_empty_penalty/mean": -0.03229166939854622, "rewards/torch_empty_penalty/std": 0.047004569321870804, "rewards/torch_zeros_reward/mean": 0.02187500149011612, "rewards/torch_zeros_reward/std": 0.04155687615275383, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284232854843, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 1145.375, "completions/mean_terminated_length": 1114.3157958984375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.21221864951768488, "grad_norm": 0.5780210494995117, "learning_rate": 2.0999999999999997e-07, "loss": 0.0393, "num_tokens": 3364180.0, "reward": 0.5374256372451782, "reward_std": 0.23710334300994873, "rewards/constexpr_reward/mean": 0.11666667461395264, "rewards/constexpr_reward/std": 0.0991189256310463, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505794763565, "rewards/masks_load_store_reward/mean": 0.08749999850988388, "rewards/masks_load_store_reward/std": 0.033245496451854706, "rewards/one_code_blob_reward/mean": 0.014938410371541977, "rewards/one_code_blob_reward/std": 0.05469757318496704, "rewards/reward_code_runs/mean": -0.19687502086162567, "rewards/reward_code_runs/std": 0.22996710240840912, "rewards/think_reward/mean": 0.1943621188402176, "rewards/think_reward/std": 0.02727266401052475, "rewards/torch_empty_penalty/mean": -0.02916666865348816, "rewards/torch_empty_penalty/std": 0.04569156840443611, "rewards/torch_zeros_reward/mean": 0.02083333395421505, "rewards/torch_zeros_reward/std": 0.040824830532073975, "rewards/valid_tl_methods_reward/mean": 0.13750000298023224, "rewards/valid_tl_methods_reward/std": 0.09318911284208298, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 971.15625, "completions/mean_terminated_length": 971.15625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.22186495176848875, "grad_norm": 0.6884453892707825, "learning_rate": 2.1999999999999998e-07, "loss": 0.0707, "num_tokens": 3498811.0, "reward": 0.6988117098808289, "reward_std": 0.32961535453796387, "rewards/constexpr_reward/mean": 0.13958333432674408, "rewards/constexpr_reward/std": 0.09231429547071457, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.0729166716337204, "rewards/masks_load_store_reward/std": 0.044672295451164246, "rewards/one_code_blob_reward/mean": 0.030532771721482277, "rewards/one_code_blob_reward/std": 0.047944121062755585, "rewards/reward_code_runs/mean": -0.07395832985639572, "rewards/reward_code_runs/std": 0.4056170582771301, "rewards/think_reward/mean": 0.1932789534330368, "rewards/think_reward/std": 0.02945057302713394, "rewards/torch_empty_penalty/mean": -0.02708333171904087, "rewards/torch_empty_penalty/std": 0.044672295451164246, "rewards/torch_zeros_reward/mean": 0.01979166828095913, "rewards/torch_zeros_reward/std": 0.04005204886198044, "rewards/valid_tl_methods_reward/mean": 0.15208333730697632, "rewards/valid_tl_methods_reward/std": 0.0858139619231224, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 1099.4375, "completions/mean_terminated_length": 1099.4375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.2315112540192926, "grad_norm": 0.6718233227729797, "learning_rate": 2.3e-07, "loss": -0.0482, "num_tokens": 3649573.0, "reward": 0.5938155651092529, "reward_std": 0.21998779475688934, "rewards/constexpr_reward/mean": 0.1562500149011612, "rewards/constexpr_reward/std": 0.08311374485492706, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505794763565, "rewards/masks_load_store_reward/mean": 0.06666667014360428, "rewards/masks_load_store_reward/std": 0.04738791286945343, "rewards/one_code_blob_reward/mean": 0.008211708627641201, "rewards/one_code_blob_reward/std": 0.06651072949171066, "rewards/reward_code_runs/mean": -0.19687502086162567, "rewards/reward_code_runs/std": 0.22996710240840912, "rewards/think_reward/mean": 0.1960204392671585, "rewards/think_reward/std": 0.019158130511641502, "rewards/torch_empty_penalty/mean": -0.01874999888241291, "rewards/torch_empty_penalty/std": 0.03923612833023071, "rewards/torch_zeros_reward/mean": 0.0364583320915699, "rewards/torch_zeros_reward/std": 0.04838397353887558, "rewards/valid_tl_methods_reward/mean": 0.15416665375232697, "rewards/valid_tl_methods_reward/std": 0.08450059592723846, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 762.09375, "completions/mean_terminated_length": 762.09375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.24115755627009647, "grad_norm": 0.8245034217834473, "learning_rate": 2.4e-07, "loss": 0.0112, "num_tokens": 3763150.0, "reward": 0.5939554572105408, "reward_std": 0.2910269498825073, "rewards/constexpr_reward/mean": 0.08958333730697632, "rewards/constexpr_reward/std": 0.09997807443141937, "rewards/imports_decorator_reward/mean": 0.17916667461395264, "rewards/imports_decorator_reward/std": 0.061416033655405045, "rewards/masks_load_store_reward/mean": 0.08124999701976776, "rewards/masks_load_store_reward/std": 0.03923612833023071, "rewards/one_code_blob_reward/mean": 0.0392678827047348, "rewards/one_code_blob_reward/std": 0.04844178259372711, "rewards/reward_code_runs/mean": -0.15677084028720856, "rewards/reward_code_runs/std": 0.2936718463897705, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.02604166604578495, "rewards/torch_empty_penalty/std": 0.04411657154560089, "rewards/torch_zeros_reward/mean": 0.02500000037252903, "rewards/torch_zeros_reward/std": 0.04352857545018196, "rewards/valid_tl_methods_reward/mean": 0.16250000894069672, "rewards/valid_tl_methods_reward/std": 0.07847225666046143, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2847.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 1144.5208740234375, "completions/mean_terminated_length": 1144.5208740234375, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.2508038585209003, "grad_norm": 0.5811767578125, "learning_rate": 2.5e-07, "loss": 0.0298, "num_tokens": 3918792.0, "reward": 0.5160846710205078, "reward_std": 0.21970906853675842, "rewards/constexpr_reward/mean": 0.13333334028720856, "rewards/constexpr_reward/std": 0.09477582573890686, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06666667014360428, "rewards/masks_load_store_reward/std": 0.04738790914416313, "rewards/one_code_blob_reward/mean": 0.015152446925640106, "rewards/one_code_blob_reward/std": 0.04148025065660477, "rewards/reward_code_runs/mean": -0.2135416716337204, "rewards/reward_code_runs/std": 0.16050563752651215, "rewards/think_reward/mean": 0.1967654973268509, "rewards/think_reward/std": 0.022924818098545074, "rewards/torch_empty_penalty/mean": -0.03020833432674408, "rewards/torch_empty_penalty/std": 0.046157147735357285, "rewards/torch_zeros_reward/mean": 0.03333333507180214, "rewards/torch_zeros_reward/std": 0.04738790914416313, "rewards/valid_tl_methods_reward/mean": 0.12291666865348816, "rewards/valid_tl_methods_reward/std": 0.09784968197345734, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 1387.46875, "completions/mean_terminated_length": 1387.46875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.2604501607717042, "grad_norm": 0.4483291506767273, "learning_rate": 2.6e-07, "loss": -0.0394, "num_tokens": 4101141.0, "reward": 0.454596608877182, "reward_std": 0.16984260082244873, "rewards/constexpr_reward/mean": 0.12916666269302368, "rewards/constexpr_reward/std": 0.0961541160941124, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981194883584976, "rewards/masks_load_store_reward/mean": 0.06354167312383652, "rewards/masks_load_store_reward/std": 0.04838397353887558, "rewards/one_code_blob_reward/mean": -0.005110291298478842, "rewards/one_code_blob_reward/std": 0.06428639590740204, "rewards/reward_code_runs/mean": -0.25, "rewards/reward_code_runs/std": 0.0, "rewards/think_reward/mean": 0.18783187866210938, "rewards/think_reward/std": 0.04612936079502106, "rewards/torch_empty_penalty/mean": -0.02812500111758709, "rewards/torch_empty_penalty/std": 0.04519693925976753, "rewards/torch_zeros_reward/mean": 0.02187499962747097, "rewards/torch_zeros_reward/std": 0.04155687242746353, "rewards/valid_tl_methods_reward/mean": 0.14166666567325592, "rewards/valid_tl_methods_reward/std": 0.09138313680887222, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 1038.1458740234375, "completions/mean_terminated_length": 1038.1458740234375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.27009646302250806, "grad_norm": 0.5938194394111633, "learning_rate": 2.7e-07, "loss": 0.0157, "num_tokens": 4242947.0, "reward": 0.554918110370636, "reward_std": 0.23007027804851532, "rewards/constexpr_reward/mean": 0.13750000298023224, "rewards/constexpr_reward/std": 0.09318911284208298, "rewards/imports_decorator_reward/mean": 0.18333333730697632, "rewards/imports_decorator_reward/std": 0.05556724593043327, "rewards/masks_load_store_reward/mean": 0.0729166716337204, "rewards/masks_load_store_reward/std": 0.044672295451164246, "rewards/one_code_blob_reward/mean": 0.014723489992320538, "rewards/one_code_blob_reward/std": 0.059258438646793365, "rewards/reward_code_runs/mean": -0.18281249701976776, "rewards/reward_code_runs/std": 0.23993729054927826, "rewards/think_reward/mean": 0.19696544110774994, "rewards/think_reward/std": 0.014847621321678162, "rewards/torch_empty_penalty/mean": -0.03125, "rewards/torch_empty_penalty/std": 0.04659455642104149, "rewards/torch_zeros_reward/mean": 0.02395833469927311, "rewards/torch_zeros_reward/std": 0.0429069809615612, "rewards/valid_tl_methods_reward/mean": 0.13958333432674408, "rewards/valid_tl_methods_reward/std": 0.09231429547071457, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3154.0, "completions/mean_length": 1421.8125, "completions/mean_terminated_length": 1335.54833984375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.2797427652733119, "grad_norm": 0.5307338237762451, "learning_rate": 2.8e-07, "loss": -0.0047, "num_tokens": 4422857.0, "reward": 0.5003696084022522, "reward_std": 0.27471408247947693, "rewards/constexpr_reward/mean": 0.12708334624767303, "rewards/constexpr_reward/std": 0.09676794707775116, "rewards/imports_decorator_reward/mean": 0.18125002086162567, "rewards/imports_decorator_reward/std": 0.058602139353752136, "rewards/masks_load_store_reward/mean": 0.07083333283662796, "rewards/masks_load_store_reward/std": 0.04569156840443611, "rewards/one_code_blob_reward/mean": -0.004753956105560064, "rewards/one_code_blob_reward/std": 0.07447423040866852, "rewards/reward_code_runs/mean": -0.17916667461395264, "rewards/reward_code_runs/std": 0.2631456255912781, "rewards/think_reward/mean": 0.1915818601846695, "rewards/think_reward/std": 0.037495218217372894, "rewards/torch_empty_penalty/mean": -0.02083333395421505, "rewards/torch_empty_penalty/std": 0.040824830532073975, "rewards/torch_zeros_reward/mean": 0.01770833320915699, "rewards/torch_zeros_reward/std": 0.03837431222200394, "rewards/valid_tl_methods_reward/mean": 0.11666667461395264, "rewards/valid_tl_methods_reward/std": 0.0991189256310463, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3182.0, "completions/max_terminated_length": 3182.0, "completions/mean_length": 1014.4375, "completions/mean_terminated_length": 1014.4375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.28938906752411575, "grad_norm": 0.6022129058837891, "learning_rate": 2.9e-07, "loss": 0.0434, "num_tokens": 4559843.0, "reward": 0.6135505437850952, "reward_std": 0.2433612048625946, "rewards/constexpr_reward/mean": 0.14791665971279144, "rewards/constexpr_reward/std": 0.08823314309120178, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.0677083358168602, "rewards/masks_load_store_reward/std": 0.0470045730471611, "rewards/one_code_blob_reward/mean": 0.021184049546718597, "rewards/one_code_blob_reward/std": 0.05145742744207382, "rewards/reward_code_runs/mean": -0.18645834922790527, "rewards/reward_code_runs/std": 0.2141665667295456, "rewards/think_reward/mean": 0.19861643016338348, "rewards/think_reward/std": 0.0135562838986516, "rewards/torch_empty_penalty/mean": -0.01666666753590107, "rewards/torch_empty_penalty/std": 0.037463437765836716, "rewards/torch_zeros_reward/mean": 0.02291666716337204, "rewards/torch_zeros_reward/std": 0.04225030168890953, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686808109283, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 1047.03125, "completions/mean_terminated_length": 1047.03125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.2990353697749196, "grad_norm": 0.7429094314575195, "learning_rate": 3e-07, "loss": -0.0031, "num_tokens": 4698254.0, "reward": 0.7336772680282593, "reward_std": 0.3611759543418884, "rewards/constexpr_reward/mean": 0.1145833358168602, "rewards/constexpr_reward/std": 0.0994502454996109, "rewards/imports_decorator_reward/mean": 0.18958334624767303, "rewards/imports_decorator_reward/std": 0.044672295451164246, "rewards/masks_load_store_reward/mean": 0.07187499850988388, "rewards/masks_load_store_reward/std": 0.04519693925976753, "rewards/one_code_blob_reward/mean": 0.024039460346102715, "rewards/one_code_blob_reward/std": 0.049430347979068756, "rewards/reward_code_runs/mean": -0.01822916604578495, "rewards/reward_code_runs/std": 0.46854308247566223, "rewards/think_reward/mean": 0.1945335417985916, "rewards/think_reward/std": 0.0342429056763649, "rewards/torch_empty_penalty/mean": -0.02500000037252903, "rewards/torch_empty_penalty/std": 0.04352857545018196, "rewards/torch_zeros_reward/mean": 0.015625, "rewards/torch_zeros_reward/std": 0.03649982064962387, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686063051224, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 1054.416748046875, "completions/mean_terminated_length": 1054.416748046875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.3086816720257235, "grad_norm": 0.6211071014404297, "learning_rate": 3.1e-07, "loss": -0.0106, "num_tokens": 4842738.0, "reward": 0.600447416305542, "reward_std": 0.24926647543907166, "rewards/constexpr_reward/mean": 0.12916666269302368, "rewards/constexpr_reward/std": 0.0961541160941124, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981198608875275, "rewards/masks_load_store_reward/mean": 0.06354167312383652, "rewards/masks_load_store_reward/std": 0.04838397353887558, "rewards/one_code_blob_reward/mean": 0.01908457837998867, "rewards/one_code_blob_reward/std": 0.051201947033405304, "rewards/reward_code_runs/mean": -0.1640625, "rewards/reward_code_runs/std": 0.25138595700263977, "rewards/think_reward/mean": 0.19959193468093872, "rewards/think_reward/std": 0.0039982725866138935, "rewards/torch_empty_penalty/mean": -0.0312500037252903, "rewards/torch_empty_penalty/std": 0.04659455269575119, "rewards/torch_zeros_reward/mean": 0.03229166567325592, "rewards/torch_zeros_reward/std": 0.0470045730471611, "rewards/valid_tl_methods_reward/mean": 0.15833334624767303, "rewards/valid_tl_methods_reward/std": 0.08164966106414795, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3608.0, "completions/max_terminated_length": 3608.0, "completions/mean_length": 1173.635498046875, "completions/mean_terminated_length": 1173.635498046875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.3183279742765273, "grad_norm": 0.6180107593536377, "learning_rate": 3.2e-07, "loss": 0.039, "num_tokens": 5001895.0, "reward": 0.4860292375087738, "reward_std": 0.16718368232250214, "rewards/constexpr_reward/mean": 0.11874999850988388, "rewards/constexpr_reward/std": 0.09874209016561508, "rewards/imports_decorator_reward/mean": 0.18541665375232697, "rewards/imports_decorator_reward/std": 0.05227290466427803, "rewards/masks_load_store_reward/mean": 0.0729166641831398, "rewards/masks_load_store_reward/std": 0.044672295451164246, "rewards/one_code_blob_reward/mean": 0.01918347366154194, "rewards/one_code_blob_reward/std": 0.03821328654885292, "rewards/reward_code_runs/mean": -0.2265625, "rewards/reward_code_runs/std": 0.10051266103982925, "rewards/think_reward/mean": 0.1965332180261612, "rewards/think_reward/std": 0.024040669202804565, "rewards/torch_empty_penalty/mean": -0.02916666679084301, "rewards/torch_empty_penalty/std": 0.04569156840443611, "rewards/torch_zeros_reward/mean": 0.02604166604578495, "rewards/torch_zeros_reward/std": 0.04411657154560089, "rewards/valid_tl_methods_reward/mean": 0.12291667610406876, "rewards/valid_tl_methods_reward/std": 0.09784968197345734, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3269.0, "completions/max_terminated_length": 3269.0, "completions/mean_length": 1302.8125, "completions/mean_terminated_length": 1302.8125, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.3279742765273312, "grad_norm": 0.4564855396747589, "learning_rate": 3.3e-07, "loss": 0.0455, "num_tokens": 5174113.0, "reward": 0.5342321395874023, "reward_std": 0.17704753577709198, "rewards/constexpr_reward/mean": 0.1354166716337204, "rewards/constexpr_reward/std": 0.0940091460943222, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06354167312383652, "rewards/masks_load_store_reward/std": 0.04838397353887558, "rewards/one_code_blob_reward/mean": 0.009001667611300945, "rewards/one_code_blob_reward/std": 0.03968409448862076, "rewards/reward_code_runs/mean": -0.2265625, "rewards/reward_code_runs/std": 0.10051266103982925, "rewards/think_reward/mean": 0.19866792857646942, "rewards/think_reward/std": 0.009620300494134426, "rewards/torch_empty_penalty/mean": -0.0031250000465661287, "rewards/torch_empty_penalty/std": 0.017490599304437637, "rewards/torch_zeros_reward/mean": 0.01979166828095913, "rewards/torch_zeros_reward/std": 0.04005204886198044, "rewards/valid_tl_methods_reward/mean": 0.1458333283662796, "rewards/valid_tl_methods_reward/std": 0.08934459090232849, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 817.9896240234375, "completions/mean_terminated_length": 817.9896240234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.33762057877813506, "grad_norm": 0.6788144707679749, "learning_rate": 3.4000000000000003e-07, "loss": -0.0262, "num_tokens": 5287392.0, "reward": 0.8014768362045288, "reward_std": 0.35335928201675415, "rewards/constexpr_reward/mean": 0.13958333432674408, "rewards/constexpr_reward/std": 0.09231429547071457, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981198608875275, "rewards/masks_load_store_reward/mean": 0.07708332687616348, "rewards/masks_load_store_reward/std": 0.04225030168890953, "rewards/one_code_blob_reward/mean": 0.044637531042099, "rewards/one_code_blob_reward/std": 0.03677495941519737, "rewards/reward_code_runs/mean": -0.011458327062427998, "rewards/reward_code_runs/std": 0.4453549087047577, "rewards/think_reward/mean": 0.19850583374500275, "rewards/think_reward/std": 0.014639697037637234, "rewards/torch_empty_penalty/mean": -0.010416666977107525, "rewards/torch_empty_penalty/std": 0.03070801869034767, "rewards/torch_zeros_reward/mean": 0.009374999441206455, "rewards/torch_zeros_reward/std": 0.029301069676876068, "rewards/valid_tl_methods_reward/mean": 0.16041666269302368, "rewards/valid_tl_methods_reward/std": 0.08010409772396088, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1136.21875, "completions/mean_terminated_length": 1040.741943359375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.34726688102893893, "grad_norm": 0.7359632849693298, "learning_rate": 3.5e-07, "loss": 0.0526, "num_tokens": 5434593.0, "reward": 0.7075538635253906, "reward_std": 0.2721782922744751, "rewards/constexpr_reward/mean": 0.14791665971279144, "rewards/constexpr_reward/std": 0.08823314309120178, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.0677083358168602, "rewards/masks_load_store_reward/std": 0.047004569321870804, "rewards/one_code_blob_reward/mean": 0.02682466246187687, "rewards/one_code_blob_reward/std": 0.059499479830265045, "rewards/reward_code_runs/mean": -0.1015625, "rewards/reward_code_runs/std": 0.32922980189323425, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.008333333767950535, "rewards/torch_empty_penalty/std": 0.027783624827861786, "rewards/torch_zeros_reward/mean": 0.010416666977107525, "rewards/torch_zeros_reward/std": 0.03070802055299282, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809785842896, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3331.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 1302.3333740234375, "completions/mean_terminated_length": 1302.3333740234375, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.35691318327974275, "grad_norm": 0.5119906663894653, "learning_rate": 3.6e-07, "loss": -0.0113, "num_tokens": 5607485.0, "reward": 0.632323145866394, "reward_std": 0.24545639753341675, "rewards/constexpr_reward/mean": 0.12291666120290756, "rewards/constexpr_reward/std": 0.09784968197345734, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.07708332687616348, "rewards/masks_load_store_reward/std": 0.04225030168890953, "rewards/one_code_blob_reward/mean": 0.010937422513961792, "rewards/one_code_blob_reward/std": 0.046592701226472855, "rewards/reward_code_runs/mean": -0.14270831644535065, "rewards/reward_code_runs/std": 0.29964709281921387, "rewards/think_reward/mean": 0.19742733240127563, "rewards/think_reward/std": 0.01643279939889908, "rewards/torch_empty_penalty/mean": -0.01875000074505806, "rewards/torch_empty_penalty/std": 0.039236124604940414, "rewards/torch_zeros_reward/mean": 0.01666666753590107, "rewards/torch_zeros_reward/std": 0.03746343404054642, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3039.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 1210.7083740234375, "completions/mean_terminated_length": 1210.7083740234375, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.3665594855305466, "grad_norm": 0.6607086062431335, "learning_rate": 3.7e-07, "loss": -0.0111, "num_tokens": 5767849.0, "reward": 0.5687481164932251, "reward_std": 0.19704470038414001, "rewards/constexpr_reward/mean": 0.14791667461395264, "rewards/constexpr_reward/std": 0.08823314309120178, "rewards/imports_decorator_reward/mean": 0.18958334624767303, "rewards/imports_decorator_reward/std": 0.044672295451164246, "rewards/masks_load_store_reward/mean": 0.07187499850988388, "rewards/masks_load_store_reward/std": 0.04519694298505783, "rewards/one_code_blob_reward/mean": 0.004737721756100655, "rewards/one_code_blob_reward/std": 0.06969740241765976, "rewards/reward_code_runs/mean": -0.22187499701976776, "rewards/reward_code_runs/std": 0.10949946194887161, "rewards/think_reward/mean": 0.19630199670791626, "rewards/think_reward/std": 0.0195402093231678, "rewards/torch_empty_penalty/mean": -0.0052083334885537624, "rewards/torch_empty_penalty/std": 0.022336145862936974, "rewards/torch_zeros_reward/mean": 0.03333333507180214, "rewards/torch_zeros_reward/std": 0.04738790914416313, "rewards/valid_tl_methods_reward/mean": 0.15208333730697632, "rewards/valid_tl_methods_reward/std": 0.0858139619231224, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 1134.291748046875, "completions/mean_terminated_length": 1134.291748046875, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.3762057877813505, "grad_norm": 0.5510346293449402, "learning_rate": 3.7999999999999996e-07, "loss": 0.0248, "num_tokens": 5922641.0, "reward": 0.6064773797988892, "reward_std": 0.18955551087856293, "rewards/constexpr_reward/mean": 0.15833333134651184, "rewards/constexpr_reward/std": 0.08164966106414795, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981198608875275, "rewards/masks_load_store_reward/mean": 0.0781250074505806, "rewards/masks_load_store_reward/std": 0.04155687242746353, "rewards/one_code_blob_reward/mean": 0.023969531059265137, "rewards/one_code_blob_reward/std": 0.019068855792284012, "rewards/reward_code_runs/mean": -0.20156250894069672, "rewards/reward_code_runs/std": 0.2263501137495041, "rewards/think_reward/mean": 0.19865359365940094, "rewards/think_reward/std": 0.011305413208901882, "rewards/torch_empty_penalty/mean": -0.03854167088866234, "rewards/torch_empty_penalty/std": 0.04892484471201897, "rewards/torch_zeros_reward/mean": 0.014583333395421505, "rewards/torch_zeros_reward/std": 0.03547917678952217, "rewards/valid_tl_methods_reward/mean": 0.17916667461395264, "rewards/valid_tl_methods_reward/std": 0.061416033655405045, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 1113.197998046875, "completions/mean_terminated_length": 1113.197998046875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.3858520900321543, "grad_norm": 0.5919126868247986, "learning_rate": 3.8999999999999997e-07, "loss": -0.0312, "num_tokens": 6074520.0, "reward": 0.57293701171875, "reward_std": 0.16654378175735474, "rewards/constexpr_reward/mean": 0.15833333134651184, "rewards/constexpr_reward/std": 0.08164966106414795, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.07604166865348816, "rewards/masks_load_store_reward/std": 0.042906977236270905, "rewards/one_code_blob_reward/mean": 0.016401944682002068, "rewards/one_code_blob_reward/std": 0.05448228865861893, "rewards/reward_code_runs/mean": -0.203125, "rewards/reward_code_runs/std": 0.13818608224391937, "rewards/think_reward/mean": 0.197160005569458, "rewards/think_reward/std": 0.014792421832680702, "rewards/torch_empty_penalty/mean": -0.01770833320915699, "rewards/torch_empty_penalty/std": 0.03837431222200394, "rewards/torch_zeros_reward/mean": 0.03749999776482582, "rewards/torch_zeros_reward/std": 0.04866642504930496, "rewards/valid_tl_methods_reward/mean": 0.11250000447034836, "rewards/valid_tl_methods_reward/std": 0.09973649680614471, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 883.5104370117188, "completions/mean_terminated_length": 883.5104370117188, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.3954983922829582, "grad_norm": 0.6467012166976929, "learning_rate": 4e-07, "loss": -0.0067, "num_tokens": 6197725.0, "reward": 0.7270565032958984, "reward_std": 0.3342481553554535, "rewards/constexpr_reward/mean": 0.14374999701976776, "rewards/constexpr_reward/std": 0.09039388597011566, "rewards/imports_decorator_reward/mean": 0.1875, "rewards/imports_decorator_reward/std": 0.04866642504930496, "rewards/masks_load_store_reward/mean": 0.08020833134651184, "rewards/masks_load_store_reward/std": 0.04005204886198044, "rewards/one_code_blob_reward/mean": 0.0374731607735157, "rewards/one_code_blob_reward/std": 0.03838203102350235, "rewards/reward_code_runs/mean": -0.0729166641831398, "rewards/reward_code_runs/std": 0.3925568163394928, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.02708333171904087, "rewards/torch_empty_penalty/std": 0.044672295451164246, "rewards/torch_zeros_reward/mean": 0.01145833358168602, "rewards/torch_zeros_reward/std": 0.03201904892921448, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686808109283, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3387.0, "completions/max_terminated_length": 3387.0, "completions/mean_length": 1058.7396240234375, "completions/mean_terminated_length": 1058.7396240234375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.40514469453376206, "grad_norm": 0.6954807043075562, "learning_rate": 4.0999999999999994e-07, "loss": 0.0317, "num_tokens": 6342840.0, "reward": 0.6654441356658936, "reward_std": 0.29857248067855835, "rewards/constexpr_reward/mean": 0.15625, "rewards/constexpr_reward/std": 0.08311374485492706, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.08645833283662796, "rewards/masks_load_store_reward/std": 0.034396421164274216, "rewards/one_code_blob_reward/mean": 0.014418717473745346, "rewards/one_code_blob_reward/std": 0.05984903872013092, "rewards/reward_code_runs/mean": -0.11874999850988388, "rewards/reward_code_runs/std": 0.37015289068222046, "rewards/think_reward/mean": 0.19790036976337433, "rewards/think_reward/std": 0.017045380547642708, "rewards/torch_empty_penalty/mean": -0.04062500223517418, "rewards/torch_empty_penalty/std": 0.04937104508280754, "rewards/torch_zeros_reward/mean": 0.015625, "rewards/torch_zeros_reward/std": 0.03649982064962387, "rewards/valid_tl_methods_reward/mean": 0.16250000894069672, "rewards/valid_tl_methods_reward/std": 0.07847225666046143, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 1218.3958740234375, "completions/mean_terminated_length": 1218.3958740234375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.41479099678456594, "grad_norm": 0.6541871428489685, "learning_rate": 4.1999999999999995e-07, "loss": -0.0297, "num_tokens": 6505454.0, "reward": 0.6576408743858337, "reward_std": 0.2851487398147583, "rewards/constexpr_reward/mean": 0.15000000596046448, "rewards/constexpr_reward/std": 0.08705715090036392, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06666667014360428, "rewards/masks_load_store_reward/std": 0.04738790914416313, "rewards/one_code_blob_reward/mean": 0.01206450629979372, "rewards/one_code_blob_reward/std": 0.04901020601391792, "rewards/reward_code_runs/mean": -0.13072915375232697, "rewards/reward_code_runs/std": 0.33696553111076355, "rewards/think_reward/mean": 0.19818048179149628, "rewards/think_reward/std": 0.015096686780452728, "rewards/torch_empty_penalty/mean": -0.01145833358168602, "rewards/torch_empty_penalty/std": 0.03201904520392418, "rewards/torch_zeros_reward/mean": 0.03125, "rewards/torch_zeros_reward/std": 0.04659455642104149, "rewards/valid_tl_methods_reward/mean": 0.15000000596046448, "rewards/valid_tl_methods_reward/std": 0.08705715090036392, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3599.0, "completions/max_terminated_length": 3599.0, "completions/mean_length": 1293.84375, "completions/mean_terminated_length": 1293.84375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.42443729903536975, "grad_norm": 0.5934505462646484, "learning_rate": 4.2999999999999996e-07, "loss": 0.0274, "num_tokens": 6676295.0, "reward": 0.5304321646690369, "reward_std": 0.13345909118652344, "rewards/constexpr_reward/mean": 0.1770833283662796, "rewards/constexpr_reward/std": 0.06403809040784836, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.06979166716337204, "rewards/masks_load_store_reward/std": 0.046157147735357285, "rewards/one_code_blob_reward/mean": 0.013796854764223099, "rewards/one_code_blob_reward/std": 0.03560984879732132, "rewards/reward_code_runs/mean": -0.24531249701976776, "rewards/reward_code_runs/std": 0.04592793434858322, "rewards/think_reward/mean": 0.1963227540254593, "rewards/think_reward/std": 0.023702142760157585, "rewards/torch_empty_penalty/mean": -0.02708333544433117, "rewards/torch_empty_penalty/std": 0.044672295451164246, "rewards/torch_zeros_reward/mean": 0.02291666716337204, "rewards/torch_zeros_reward/std": 0.04225029796361923, "rewards/valid_tl_methods_reward/mean": 0.12708334624767303, "rewards/valid_tl_methods_reward/std": 0.09676794707775116, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 1152.0833740234375, "completions/mean_terminated_length": 1152.0833740234375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.4340836012861736, "grad_norm": 0.7656160593032837, "learning_rate": 4.3999999999999997e-07, "loss": -0.0271, "num_tokens": 6832231.0, "reward": 0.6402577757835388, "reward_std": 0.24128377437591553, "rewards/constexpr_reward/mean": 0.16249999403953552, "rewards/constexpr_reward/std": 0.07847225666046143, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.06354167312383652, "rewards/masks_load_store_reward/std": 0.04838397353887558, "rewards/one_code_blob_reward/mean": 0.014514167793095112, "rewards/one_code_blob_reward/std": 0.04225924238562584, "rewards/reward_code_runs/mean": -0.10729166120290756, "rewards/reward_code_runs/std": 0.3435096740722656, "rewards/think_reward/mean": 0.19866019487380981, "rewards/think_reward/std": 0.009797739796340466, "rewards/torch_empty_penalty/mean": -0.04375000298023224, "rewards/torch_empty_penalty/std": 0.04986824840307236, "rewards/torch_zeros_reward/mean": 0.02708333171904087, "rewards/torch_zeros_reward/std": 0.044672295451164246, "rewards/valid_tl_methods_reward/mean": 0.13333332538604736, "rewards/valid_tl_methods_reward/std": 0.09477582573890686, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 890.1875, "completions/mean_terminated_length": 890.1875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.4437299035369775, "grad_norm": 0.7248244285583496, "learning_rate": 4.5e-07, "loss": 0.0216, "num_tokens": 6955813.0, "reward": 0.6671415567398071, "reward_std": 0.21686410903930664, "rewards/constexpr_reward/mean": 0.14374999701976776, "rewards/constexpr_reward/std": 0.09039388597011566, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.06875000149011612, "rewards/masks_load_store_reward/std": 0.04659455642104149, "rewards/one_code_blob_reward/mean": 0.03583366796374321, "rewards/one_code_blob_reward/std": 0.034937743097543716, "rewards/reward_code_runs/mean": -0.11927083134651184, "rewards/reward_code_runs/std": 0.30791059136390686, "rewards/think_reward/mean": 0.19849534332752228, "rewards/think_reward/std": 0.014742674306035042, "rewards/torch_empty_penalty/mean": -0.01979166828095913, "rewards/torch_empty_penalty/std": 0.04005204886198044, "rewards/torch_zeros_reward/mean": 0.0072916666977107525, "rewards/torch_zeros_reward/std": 0.026136448606848717, "rewards/valid_tl_methods_reward/mean": 0.15416668355464935, "rewards/valid_tl_methods_reward/std": 0.08450059592723846, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 1056.447998046875, "completions/mean_terminated_length": 1056.447998046875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.4533762057877814, "grad_norm": 0.6093177795410156, "learning_rate": 4.6e-07, "loss": -0.009, "num_tokens": 7101140.0, "reward": 0.6234441995620728, "reward_std": 0.2314853072166443, "rewards/constexpr_reward/mean": 0.17500001192092896, "rewards/constexpr_reward/std": 0.06649099290370941, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.011535567231476307, "rewards/one_code_blob_reward/std": 0.053763993084430695, "rewards/reward_code_runs/mean": -0.18645833432674408, "rewards/reward_code_runs/std": 0.2141665816307068, "rewards/think_reward/mean": 0.19836688041687012, "rewards/think_reward/std": 0.016001230105757713, "rewards/torch_empty_penalty/mean": -0.0364583358168602, "rewards/torch_empty_penalty/std": 0.04838397353887558, "rewards/torch_zeros_reward/mean": 0.012500000186264515, "rewards/torch_zeros_reward/std": 0.033245496451854706, "rewards/valid_tl_methods_reward/mean": 0.15625, "rewards/valid_tl_methods_reward/std": 0.08311374485492706, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2821.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 1075.125, "completions/mean_terminated_length": 1075.125, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.4630225080385852, "grad_norm": 0.5320653319358826, "learning_rate": 4.6999999999999995e-07, "loss": 0.0107, "num_tokens": 7250696.0, "reward": 0.601868748664856, "reward_std": 0.20453622937202454, "rewards/constexpr_reward/mean": 0.15625, "rewards/constexpr_reward/std": 0.08311375230550766, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.0677083358168602, "rewards/masks_load_store_reward/std": 0.0470045730471611, "rewards/one_code_blob_reward/mean": 0.015392146073281765, "rewards/one_code_blob_reward/std": 0.04831596091389656, "rewards/reward_code_runs/mean": -0.20520834624767303, "rewards/reward_code_runs/std": 0.1984783113002777, "rewards/think_reward/mean": 0.19793486595153809, "rewards/think_reward/std": 0.014705672860145569, "rewards/torch_empty_penalty/mean": -0.015625, "rewards/torch_empty_penalty/std": 0.03649982064962387, "rewards/torch_zeros_reward/mean": 0.03124999813735485, "rewards/torch_zeros_reward/std": 0.04659455269575119, "rewards/valid_tl_methods_reward/mean": 0.15833333134651184, "rewards/valid_tl_methods_reward/std": 0.08164966106414795, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 1201.479248046875, "completions/mean_terminated_length": 1201.479248046875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.47266881028938906, "grad_norm": 0.6406047940254211, "learning_rate": 4.8e-07, "loss": 0.0062, "num_tokens": 7413042.0, "reward": 0.5506672859191895, "reward_std": 0.12405920028686523, "rewards/constexpr_reward/mean": 0.16458332538604736, "rewards/constexpr_reward/std": 0.07674862444400787, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981194883584976, "rewards/masks_load_store_reward/mean": 0.07499999552965164, "rewards/masks_load_store_reward/std": 0.04352857545018196, "rewards/one_code_blob_reward/mean": 0.018375607207417488, "rewards/one_code_blob_reward/std": 0.027684977278113365, "rewards/reward_code_runs/mean": -0.25, "rewards/reward_code_runs/std": 0.0, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.03020833432674408, "rewards/torch_empty_penalty/std": 0.046157147735357285, "rewards/torch_zeros_reward/mean": 0.02291666716337204, "rewards/torch_zeros_reward/std": 0.04225030168890953, "rewards/valid_tl_methods_reward/mean": 0.1562500149011612, "rewards/valid_tl_methods_reward/std": 0.08311374485492706, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 1079.1771240234375, "completions/mean_terminated_length": 1079.1771240234375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.48231511254019294, "grad_norm": 0.6042345762252808, "learning_rate": 4.9e-07, "loss": 0.0487, "num_tokens": 7559171.0, "reward": 0.730362594127655, "reward_std": 0.30589550733566284, "rewards/constexpr_reward/mean": 0.16250000894069672, "rewards/constexpr_reward/std": 0.07847225666046143, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.08125000447034836, "rewards/masks_load_store_reward/std": 0.03923612833023071, "rewards/one_code_blob_reward/mean": 0.025675097480416298, "rewards/one_code_blob_reward/std": 0.04089103266596794, "rewards/reward_code_runs/mean": -0.1119791641831398, "rewards/reward_code_runs/std": 0.3423405885696411, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.012500000186264515, "rewards/torch_empty_penalty/std": 0.033245496451854706, "rewards/torch_zeros_reward/mean": 0.03333333507180214, "rewards/torch_zeros_reward/std": 0.04738790914416313, "rewards/valid_tl_methods_reward/mean": 0.15208333730697632, "rewards/valid_tl_methods_reward/std": 0.08581395447254181, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 1062.40625, "completions/mean_terminated_length": 1048.4105224609375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.4919614147909968, "grad_norm": 61.933326721191406, "learning_rate": 5e-07, "loss": -0.0098, "num_tokens": 7704446.0, "reward": 0.6786465644836426, "reward_std": 0.2720298171043396, "rewards/constexpr_reward/mean": 0.1666666716337204, "rewards/constexpr_reward/std": 0.07492686808109283, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0729166641831398, "rewards/masks_load_store_reward/std": 0.044672295451164246, "rewards/one_code_blob_reward/mean": 0.03380136936903, "rewards/one_code_blob_reward/std": 0.02556346170604229, "rewards/reward_code_runs/mean": -0.14374999701976776, "rewards/reward_code_runs/std": 0.31633177399635315, "rewards/think_reward/mean": 0.19692851603031158, "rewards/think_reward/std": 0.01767767407000065, "rewards/torch_empty_penalty/mean": -0.01874999888241291, "rewards/torch_empty_penalty/std": 0.03923612833023071, "rewards/torch_zeros_reward/mean": 0.010416666977107525, "rewards/torch_zeros_reward/std": 0.03070802055299282, "rewards/valid_tl_methods_reward/mean": 0.16041666269302368, "rewards/valid_tl_methods_reward/std": 0.08010409772396088, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 1029.1458740234375, "completions/mean_terminated_length": 1029.1458740234375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.5016077170418006, "grad_norm": 0.5496352314949036, "learning_rate": 5.1e-07, "loss": 0.0153, "num_tokens": 7845580.0, "reward": 0.7204399108886719, "reward_std": 0.26986491680145264, "rewards/constexpr_reward/mean": 0.16875000298023224, "rewards/constexpr_reward/std": 0.07299964129924774, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.07395833730697632, "rewards/masks_load_store_reward/std": 0.04411657154560089, "rewards/one_code_blob_reward/mean": 0.02279212512075901, "rewards/one_code_blob_reward/std": 0.051034536212682724, "rewards/reward_code_runs/mean": -0.08385416120290756, "rewards/reward_code_runs/std": 0.34834155440330505, "rewards/think_reward/mean": 0.19712696969509125, "rewards/think_reward/std": 0.014551707543432713, "rewards/torch_empty_penalty/mean": -0.0020833334419876337, "rewards/torch_empty_penalty/std": 0.01435758825391531, "rewards/torch_zeros_reward/mean": 0.012500000186264515, "rewards/torch_zeros_reward/std": 0.033245496451854706, "rewards/valid_tl_methods_reward/mean": 0.13958333432674408, "rewards/valid_tl_methods_reward/std": 0.09231429547071457, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 1194.7396240234375, "completions/mean_terminated_length": 1164.2000732421875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.5112540192926045, "grad_norm": 0.5003747940063477, "learning_rate": 5.2e-07, "loss": 0.073, "num_tokens": 8003787.0, "reward": 0.7361212968826294, "reward_std": 0.31317782402038574, "rewards/constexpr_reward/mean": 0.16458334028720856, "rewards/constexpr_reward/std": 0.07674862444400787, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.07916667312383652, "rewards/masks_load_store_reward/std": 0.040824830532073975, "rewards/one_code_blob_reward/mean": 0.011868351139128208, "rewards/one_code_blob_reward/std": 0.058140359818935394, "rewards/reward_code_runs/mean": -0.11406251043081284, "rewards/reward_code_runs/std": 0.37132078409194946, "rewards/think_reward/mean": 0.19352371990680695, "rewards/think_reward/std": 0.033737994730472565, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.03750000149011612, "rewards/torch_zeros_reward/std": 0.04866642504930496, "rewards/valid_tl_methods_reward/mean": 0.16875000298023224, "rewards/valid_tl_methods_reward/std": 0.07299964129924774, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 1083.697998046875, "completions/mean_terminated_length": 1083.697998046875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.5209003215434084, "grad_norm": 0.7225779891014099, "learning_rate": 5.3e-07, "loss": 0.0123, "num_tokens": 8149246.0, "reward": 0.6963642835617065, "reward_std": 0.20251570641994476, "rewards/constexpr_reward/mean": 0.18125002086162567, "rewards/constexpr_reward/std": 0.058602139353752136, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.08437499403953552, "rewards/masks_load_store_reward/std": 0.03649982064962387, "rewards/one_code_blob_reward/mean": 0.02474900148808956, "rewards/one_code_blob_reward/std": 0.030808135867118835, "rewards/reward_code_runs/mean": -0.15833333134651184, "rewards/reward_code_runs/std": 0.23290687799453735, "rewards/think_reward/mean": 0.1955736130475998, "rewards/think_reward/std": 0.022120453417301178, "rewards/torch_empty_penalty/mean": -0.02083333395421505, "rewards/torch_empty_penalty/std": 0.040824830532073975, "rewards/torch_zeros_reward/mean": 0.02916666865348816, "rewards/torch_zeros_reward/std": 0.04569156840443611, "rewards/valid_tl_methods_reward/mean": 0.16458332538604736, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 976.3125, "completions/mean_terminated_length": 976.3125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.5305466237942122, "grad_norm": 0.7034012675285339, "learning_rate": 5.4e-07, "loss": 0.0208, "num_tokens": 8281648.0, "reward": 0.7647324800491333, "reward_std": 0.23368114233016968, "rewards/constexpr_reward/mean": 0.17916667461395264, "rewards/constexpr_reward/std": 0.06141603738069534, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.0885416641831398, "rewards/masks_load_store_reward/std": 0.03201904520392418, "rewards/one_code_blob_reward/mean": 0.03195902332663536, "rewards/one_code_blob_reward/std": 0.04141776263713837, "rewards/reward_code_runs/mean": -0.11927083134651184, "rewards/reward_code_runs/std": 0.30791059136390686, "rewards/think_reward/mean": 0.1957942247390747, "rewards/think_reward/std": 0.02325473167002201, "rewards/torch_empty_penalty/mean": -0.010416666977107525, "rewards/torch_empty_penalty/std": 0.03070801869034767, "rewards/torch_zeros_reward/mean": 0.0364583320915699, "rewards/torch_zeros_reward/std": 0.04838397353887558, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686808109283, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3093.0, "completions/max_terminated_length": 3093.0, "completions/mean_length": 1219.8333740234375, "completions/mean_terminated_length": 1219.8333740234375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.5401929260450161, "grad_norm": 0.4217401146888733, "learning_rate": 5.5e-07, "loss": -0.0083, "num_tokens": 8447400.0, "reward": 0.5094969868659973, "reward_std": 0.1287832260131836, "rewards/constexpr_reward/mean": 0.17500001192092896, "rewards/constexpr_reward/std": 0.06649099290370941, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981194883584976, "rewards/masks_load_store_reward/mean": 0.06979166716337204, "rewards/masks_load_store_reward/std": 0.046157147735357285, "rewards/one_code_blob_reward/mean": 0.011323712766170502, "rewards/one_code_blob_reward/std": 0.046563684940338135, "rewards/reward_code_runs/mean": -0.25, "rewards/reward_code_runs/std": 0.0, "rewards/think_reward/mean": 0.19713155925273895, "rewards/think_reward/std": 0.014952579513192177, "rewards/torch_empty_penalty/mean": -0.04270833730697632, "rewards/torch_empty_penalty/std": 0.04972512274980545, "rewards/torch_zeros_reward/mean": 0.02395833283662796, "rewards/torch_zeros_reward/std": 0.042906977236270905, "rewards/valid_tl_methods_reward/mean": 0.13125000894069672, "rewards/valid_tl_methods_reward/std": 0.09549042582511902, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3450.0, "completions/max_terminated_length": 3450.0, "completions/mean_length": 1081.125, "completions/mean_terminated_length": 1081.125, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.5498392282958199, "grad_norm": 0.5305235385894775, "learning_rate": 5.6e-07, "loss": 0.0105, "num_tokens": 8594748.0, "reward": 0.746235728263855, "reward_std": 0.19432096183300018, "rewards/constexpr_reward/mean": 0.1770833283662796, "rewards/constexpr_reward/std": 0.06403809040784836, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517837047577, "rewards/masks_load_store_reward/mean": 0.08229167014360428, "rewards/masks_load_store_reward/std": 0.03837431222200394, "rewards/one_code_blob_reward/mean": 0.022025473415851593, "rewards/one_code_blob_reward/std": 0.04012902453541756, "rewards/reward_code_runs/mean": -0.109375, "rewards/reward_code_runs/std": 0.3724253475666046, "rewards/think_reward/mean": 0.19816844165325165, "rewards/think_reward/std": 0.014005015604197979, "rewards/torch_empty_penalty/mean": -0.01875000074505806, "rewards/torch_empty_penalty/std": 0.03923612833023071, "rewards/torch_zeros_reward/mean": 0.03437500074505806, "rewards/torch_zeros_reward/std": 0.04774520918726921, "rewards/valid_tl_methods_reward/mean": 0.16458334028720856, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2933.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 976.96875, "completions/mean_terminated_length": 976.96875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5594855305466238, "grad_norm": 0.6822076439857483, "learning_rate": 5.699999999999999e-07, "loss": -0.019, "num_tokens": 8730561.0, "reward": 0.7617365717887878, "reward_std": 0.2241070717573166, "rewards/constexpr_reward/mean": 0.16875000298023224, "rewards/constexpr_reward/std": 0.07299964129924774, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08125000447034836, "rewards/masks_load_store_reward/std": 0.03923612833023071, "rewards/one_code_blob_reward/mean": 0.026567451655864716, "rewards/one_code_blob_reward/std": 0.05450519919395447, "rewards/reward_code_runs/mean": -0.12708333134651184, "rewards/reward_code_runs/std": 0.3533238172531128, "rewards/think_reward/mean": 0.19454412162303925, "rewards/think_reward/std": 0.03444638103246689, "rewards/torch_empty_penalty/mean": -0.015625, "rewards/torch_empty_penalty/std": 0.03649982064962387, "rewards/torch_zeros_reward/mean": 0.05625000223517418, "rewards/torch_zeros_reward/std": 0.04986824840307236, "rewards/valid_tl_methods_reward/mean": 0.17916667461395264, "rewards/valid_tl_methods_reward/std": 0.06141603738069534, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 1022.7083740234375, "completions/mean_terminated_length": 1022.7083740234375, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.5691318327974276, "grad_norm": 0.5892595052719116, "learning_rate": 5.8e-07, "loss": 0.0007, "num_tokens": 8874965.0, "reward": 0.7719477415084839, "reward_std": 0.19649700820446014, "rewards/constexpr_reward/mean": 0.1770833283662796, "rewards/constexpr_reward/std": 0.06403809785842896, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0729166641831398, "rewards/masks_load_store_reward/std": 0.044672295451164246, "rewards/one_code_blob_reward/mean": 0.022468604147434235, "rewards/one_code_blob_reward/std": 0.036056578159332275, "rewards/reward_code_runs/mean": -0.10468748956918716, "rewards/reward_code_runs/std": 0.37346726655960083, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.01145833358168602, "rewards/torch_empty_penalty/std": 0.03201904892921448, "rewards/torch_zeros_reward/mean": 0.0364583320915699, "rewards/torch_zeros_reward/std": 0.04838397353887558, "rewards/valid_tl_methods_reward/mean": 0.18125002086162567, "rewards/valid_tl_methods_reward/std": 0.05860213562846184, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 1103.1146240234375, "completions/mean_terminated_length": 1103.1146240234375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.5787781350482315, "grad_norm": 0.5568946599960327, "learning_rate": 5.9e-07, "loss": 0.0115, "num_tokens": 9024520.0, "reward": 0.7087312936782837, "reward_std": 0.2077682912349701, "rewards/constexpr_reward/mean": 0.18958334624767303, "rewards/constexpr_reward/std": 0.044672295451164246, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981198608875275, "rewards/masks_load_store_reward/mean": 0.0833333358168602, "rewards/masks_load_store_reward/std": 0.03746343404054642, "rewards/one_code_blob_reward/mean": 0.019941674545407295, "rewards/one_code_blob_reward/std": 0.037697289139032364, "rewards/reward_code_runs/mean": -0.1666666567325592, "rewards/reward_code_runs/std": 0.20347851514816284, "rewards/think_reward/mean": 0.19816450774669647, "rewards/think_reward/std": 0.017156243324279785, "rewards/torch_empty_penalty/mean": -0.015625, "rewards/torch_empty_penalty/std": 0.03649982064962387, "rewards/torch_zeros_reward/mean": 0.03125, "rewards/torch_zeros_reward/std": 0.04659455642104149, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 960.59375, "completions/mean_terminated_length": 960.59375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5884244372990354, "grad_norm": 0.6765536665916443, "learning_rate": 6e-07, "loss": -0.014, "num_tokens": 9159625.0, "reward": 1.0015864372253418, "reward_std": 0.2235845923423767, "rewards/constexpr_reward/mean": 0.18541668355464935, "rewards/constexpr_reward/std": 0.05227290466427803, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981198608875275, "rewards/masks_load_store_reward/mean": 0.08125000447034836, "rewards/masks_load_store_reward/std": 0.03923612833023071, "rewards/one_code_blob_reward/mean": 0.03283637762069702, "rewards/one_code_blob_reward/std": 0.03562239184975624, "rewards/reward_code_runs/mean": 0.14166666567325592, "rewards/reward_code_runs/std": 0.5730190873146057, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.02604166604578495, "rewards/torch_empty_penalty/std": 0.04411657154560089, "rewards/torch_zeros_reward/mean": 0.02604166604578495, "rewards/torch_zeros_reward/std": 0.04411657154560089, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686808109283, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3111.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 1079.375, "completions/mean_terminated_length": 1079.375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.5980707395498392, "grad_norm": 0.7044626474380493, "learning_rate": 6.1e-07, "loss": 0.0289, "num_tokens": 9304165.0, "reward": 0.7447230219841003, "reward_std": 0.20032760500907898, "rewards/constexpr_reward/mean": 0.18333333730697632, "rewards/constexpr_reward/std": 0.05556724593043327, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.08125000447034836, "rewards/masks_load_store_reward/std": 0.03923612833023071, "rewards/one_code_blob_reward/mean": 0.028499729931354523, "rewards/one_code_blob_reward/std": 0.03636397048830986, "rewards/reward_code_runs/mean": -0.10624999552965164, "rewards/reward_code_runs/std": 0.3280926048755646, "rewards/think_reward/mean": 0.19226489961147308, "rewards/think_reward/std": 0.03887832909822464, "rewards/torch_empty_penalty/mean": -0.02083333395421505, "rewards/torch_empty_penalty/std": 0.040824830532073975, "rewards/torch_zeros_reward/mean": 0.02395833283662796, "rewards/torch_zeros_reward/std": 0.042906977236270905, "rewards/valid_tl_methods_reward/mean": 0.16250000894069672, "rewards/valid_tl_methods_reward/std": 0.07847225666046143, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3746.0, "completions/max_terminated_length": 3746.0, "completions/mean_length": 1197.572998046875, "completions/mean_terminated_length": 1197.572998046875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6077170418006431, "grad_norm": 0.617486298084259, "learning_rate": 6.2e-07, "loss": 0.0432, "num_tokens": 9462644.0, "reward": 0.7086712121963501, "reward_std": 0.21238891780376434, "rewards/constexpr_reward/mean": 0.1875, "rewards/constexpr_reward/std": 0.04866642504930496, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08749999850988388, "rewards/masks_load_store_reward/std": 0.033245496451854706, "rewards/one_code_blob_reward/mean": 0.029755033552646637, "rewards/one_code_blob_reward/std": 0.02773762121796608, "rewards/reward_code_runs/mean": -0.12708333134651184, "rewards/reward_code_runs/std": 0.3533238172531128, "rewards/think_reward/mean": 0.19662445783615112, "rewards/think_reward/std": 0.023549221456050873, "rewards/torch_empty_penalty/mean": -0.004166666883975267, "rewards/torch_empty_penalty/std": 0.020087527111172676, "rewards/torch_zeros_reward/mean": 0.01770833320915699, "rewards/torch_zeros_reward/std": 0.03837431222200394, "rewards/valid_tl_methods_reward/mean": 0.12291666865348816, "rewards/valid_tl_methods_reward/std": 0.09784968197345734, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3416.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 974.75, "completions/mean_terminated_length": 974.75, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.617363344051447, "grad_norm": 0.5682595372200012, "learning_rate": 6.3e-07, "loss": 0.077, "num_tokens": 9595640.0, "reward": 0.6542726755142212, "reward_std": 0.2039661705493927, "rewards/constexpr_reward/mean": 0.1875, "rewards/constexpr_reward/std": 0.04866642504930496, "rewards/imports_decorator_reward/mean": 0.1937500238418579, "rewards/imports_decorator_reward/std": 0.034981198608875275, "rewards/masks_load_store_reward/mean": 0.08958333730697632, "rewards/masks_load_store_reward/std": 0.030708016827702522, "rewards/one_code_blob_reward/mean": 0.025626754388213158, "rewards/one_code_blob_reward/std": 0.03773731365799904, "rewards/reward_code_runs/mean": -0.19947916269302368, "rewards/reward_code_runs/std": 0.1758430451154709, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.02395833469927311, "rewards/torch_empty_penalty/std": 0.0429069809615612, "rewards/torch_zeros_reward/mean": 0.02083333395421505, "rewards/torch_zeros_reward/std": 0.040824830532073975, "rewards/valid_tl_methods_reward/mean": 0.16041666269302368, "rewards/valid_tl_methods_reward/std": 0.08010409772396088, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3137.0, "completions/max_terminated_length": 3137.0, "completions/mean_length": 1027.65625, "completions/mean_terminated_length": 1027.65625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.6270096463022508, "grad_norm": 0.920167088508606, "learning_rate": 6.4e-07, "loss": 0.007, "num_tokens": 9734891.0, "reward": 0.7769454717636108, "reward_std": 0.29669874906539917, "rewards/constexpr_reward/mean": 0.18958334624767303, "rewards/constexpr_reward/std": 0.044672295451164246, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.07187499850988388, "rewards/masks_load_store_reward/std": 0.04519694298505783, "rewards/one_code_blob_reward/mean": 0.02731100283563137, "rewards/one_code_blob_reward/std": 0.03964829444885254, "rewards/reward_code_runs/mean": -0.08697917312383652, "rewards/reward_code_runs/std": 0.39056265354156494, "rewards/think_reward/mean": 0.19807188212871552, "rewards/think_reward/std": 0.014280364848673344, "rewards/torch_empty_penalty/mean": -0.02187499962747097, "rewards/torch_empty_penalty/std": 0.04155687242746353, "rewards/torch_zeros_reward/mean": 0.02187500149011612, "rewards/torch_zeros_reward/std": 0.04155687615275383, "rewards/valid_tl_methods_reward/mean": 0.18125002086162567, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3469.0, "completions/max_terminated_length": 3469.0, "completions/mean_length": 1270.28125, "completions/mean_terminated_length": 1270.28125, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.6366559485530546, "grad_norm": 0.5540661215782166, "learning_rate": 6.5e-07, "loss": 0.0347, "num_tokens": 9900650.0, "reward": 0.6175910234451294, "reward_std": 0.151606023311615, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08125000447034836, "rewards/masks_load_store_reward/std": 0.03923612833023071, "rewards/one_code_blob_reward/mean": 0.018527232110500336, "rewards/one_code_blob_reward/std": 0.014282294549047947, "rewards/reward_code_runs/mean": -0.21718747913837433, "rewards/reward_code_runs/std": 0.1176140308380127, "rewards/think_reward/mean": 0.19541792571544647, "rewards/think_reward/std": 0.033646970987319946, "rewards/torch_empty_penalty/mean": -0.03229166567325592, "rewards/torch_empty_penalty/std": 0.0470045730471611, "rewards/torch_zeros_reward/mean": 0.0364583320915699, "rewards/torch_zeros_reward/std": 0.04838396981358528, "rewards/valid_tl_methods_reward/mean": 0.1458333283662796, "rewards/valid_tl_methods_reward/std": 0.08934459090232849, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 920.7916870117188, "completions/mean_terminated_length": 920.7916870117188, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6463022508038585, "grad_norm": 0.7348382472991943, "learning_rate": 6.6e-07, "loss": 0.0654, "num_tokens": 10030314.0, "reward": 0.782570481300354, "reward_std": 0.26678401231765747, "rewards/constexpr_reward/mean": 0.1666666716337204, "rewards/constexpr_reward/std": 0.07492686808109283, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.07708332687616348, "rewards/masks_load_store_reward/std": 0.04225030168890953, "rewards/one_code_blob_reward/mean": 0.03829953074455261, "rewards/one_code_blob_reward/std": 0.03580077737569809, "rewards/reward_code_runs/mean": -0.06197916343808174, "rewards/reward_code_runs/std": 0.4319932162761688, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": -0.01666666753590107, "rewards/torch_empty_penalty/std": 0.03746343404054642, "rewards/torch_zeros_reward/mean": 0.01770833320915699, "rewards/torch_zeros_reward/std": 0.03837431222200394, "rewards/valid_tl_methods_reward/mean": 0.16458334028720856, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 1012.15625, "completions/mean_terminated_length": 1012.15625, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.6559485530546624, "grad_norm": 0.5175846219062805, "learning_rate": 6.7e-07, "loss": 0.0515, "num_tokens": 10169061.0, "reward": 0.7850543856620789, "reward_std": 0.2515929341316223, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.08437500149011612, "rewards/masks_load_store_reward/std": 0.03649982064962387, "rewards/one_code_blob_reward/mean": 0.03036683052778244, "rewards/one_code_blob_reward/std": 0.017092684283852577, "rewards/reward_code_runs/mean": -0.12968748807907104, "rewards/reward_code_runs/std": 0.3213113248348236, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.00729166716337204, "rewards/torch_empty_penalty/std": 0.026136448606848717, "rewards/torch_zeros_reward/mean": 0.03020833432674408, "rewards/torch_zeros_reward/std": 0.046157147735357285, "rewards/valid_tl_methods_reward/mean": 0.18333333730697632, "rewards/valid_tl_methods_reward/std": 0.05556724593043327, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10416666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 1298.5208740234375, "completions/mean_terminated_length": 973.2325439453125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6655948553054662, "grad_norm": 0.6217107176780701, "learning_rate": 6.800000000000001e-07, "loss": 0.0101, "num_tokens": 10336235.0, "reward": 0.7228078842163086, "reward_std": 0.2464483678340912, "rewards/constexpr_reward/mean": 0.14791665971279144, "rewards/constexpr_reward/std": 0.08823314309120178, "rewards/imports_decorator_reward/mean": 0.17500001192092896, "rewards/imports_decorator_reward/std": 0.06649099290370941, "rewards/masks_load_store_reward/mean": 0.07187499850988388, "rewards/masks_load_store_reward/std": 0.04519694298505783, "rewards/one_code_blob_reward/mean": 0.01695355586707592, "rewards/one_code_blob_reward/std": 0.08903989940881729, "rewards/reward_code_runs/mean": -0.06041666865348816, "rewards/reward_code_runs/std": 0.3515317440032959, "rewards/think_reward/mean": 0.19543762505054474, "rewards/think_reward/std": 0.03356730937957764, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.02291666716337204, "rewards/torch_zeros_reward/std": 0.04225030168890953, "rewards/valid_tl_methods_reward/mean": 0.15416668355464935, "rewards/valid_tl_methods_reward/std": 0.08450059592723846, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 1177.7708740234375, "completions/mean_terminated_length": 1115.6807861328125, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.6752411575562701, "grad_norm": 0.5847841501235962, "learning_rate": 6.9e-07, "loss": 0.0669, "num_tokens": 10496965.0, "reward": 0.59849613904953, "reward_std": 0.2042919248342514, "rewards/constexpr_reward/mean": 0.18333333730697632, "rewards/constexpr_reward/std": 0.05556724593043327, "rewards/imports_decorator_reward/mean": 0.18958334624767303, "rewards/imports_decorator_reward/std": 0.044672295451164246, "rewards/masks_load_store_reward/mean": 0.08645833283662796, "rewards/masks_load_store_reward/std": 0.034396424889564514, "rewards/one_code_blob_reward/mean": 0.016383716836571693, "rewards/one_code_blob_reward/std": 0.05423387512564659, "rewards/reward_code_runs/mean": -0.22760416567325592, "rewards/reward_code_runs/std": 0.14213962852954865, "rewards/think_reward/mean": 0.19617490470409393, "rewards/think_reward/std": 0.019116153940558434, "rewards/torch_empty_penalty/mean": -0.01145833358168602, "rewards/torch_empty_penalty/std": 0.03201904520392418, "rewards/torch_zeros_reward/mean": 0.02395833469927311, "rewards/torch_zeros_reward/std": 0.0429069809615612, "rewards/valid_tl_methods_reward/mean": 0.14166666567325592, "rewards/valid_tl_methods_reward/std": 0.09138313680887222, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3453.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 1173.96875, "completions/mean_terminated_length": 1173.96875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.684887459807074, "grad_norm": 0.5115599036216736, "learning_rate": 7e-07, "loss": 0.0136, "num_tokens": 10652350.0, "reward": 0.8516645431518555, "reward_std": 0.3125598132610321, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09270834177732468, "rewards/masks_load_store_reward/std": 0.026136452332139015, "rewards/one_code_blob_reward/mean": 0.030832627788186073, "rewards/one_code_blob_reward/std": 0.023144574835896492, "rewards/reward_code_runs/mean": -0.030208328738808632, "rewards/reward_code_runs/std": 0.4455321729183197, "rewards/think_reward/mean": 0.19687356054782867, "rewards/think_reward/std": 0.021740630269050598, "rewards/torch_empty_penalty/mean": -0.0020833334419876337, "rewards/torch_empty_penalty/std": 0.01435758825391531, "rewards/torch_zeros_reward/mean": 0.02395833283662796, "rewards/torch_zeros_reward/std": 0.042906977236270905, "rewards/valid_tl_methods_reward/mean": 0.14374999701976776, "rewards/valid_tl_methods_reward/std": 0.09039388597011566, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3558.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 1290.6875, "completions/mean_terminated_length": 1290.6875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.6945337620578779, "grad_norm": 0.6105243563652039, "learning_rate": 7.1e-07, "loss": 0.0294, "num_tokens": 10822744.0, "reward": 0.6542137861251831, "reward_std": 0.15812623500823975, "rewards/constexpr_reward/mean": 0.18333333730697632, "rewards/constexpr_reward/std": 0.05556724593043327, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.07916667312383652, "rewards/masks_load_store_reward/std": 0.040824830532073975, "rewards/one_code_blob_reward/mean": 0.020676322281360626, "rewards/one_code_blob_reward/std": 0.04006450995802879, "rewards/reward_code_runs/mean": -0.20781250298023224, "rewards/reward_code_runs/std": 0.1318548172712326, "rewards/think_reward/mean": 0.19447489082813263, "rewards/think_reward/std": 0.025090038776397705, "rewards/torch_empty_penalty/mean": -0.008333333767950535, "rewards/torch_empty_penalty/std": 0.027783622965216637, "rewards/torch_zeros_reward/mean": 0.0572916679084301, "rewards/torch_zeros_reward/std": 0.04972511902451515, "rewards/valid_tl_methods_reward/mean": 0.1354166716337204, "rewards/valid_tl_methods_reward/std": 0.0940091460943222, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2092.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 997.2916870117188, "completions/mean_terminated_length": 997.2916870117188, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.7041800643086816, "grad_norm": 0.5655916333198547, "learning_rate": 7.2e-07, "loss": 0.0062, "num_tokens": 10963064.0, "reward": 0.6285203099250793, "reward_std": 0.1138053834438324, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08749999850988388, "rewards/masks_load_store_reward/std": 0.033245496451854706, "rewards/one_code_blob_reward/mean": 0.03685358539223671, "rewards/one_code_blob_reward/std": 0.023610996082425117, "rewards/reward_code_runs/mean": -0.24062500894069672, "rewards/reward_code_runs/std": 0.06460914760828018, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.02083333395421505, "rewards/torch_empty_penalty/std": 0.040824830532073975, "rewards/torch_zeros_reward/mean": 0.04479166865348816, "rewards/torch_zeros_reward/std": 0.049989037215709686, "rewards/valid_tl_methods_reward/mean": 0.13125000894069672, "rewards/valid_tl_methods_reward/std": 0.09549042582511902, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3602.0, "completions/max_terminated_length": 3602.0, "completions/mean_length": 1127.0625, "completions/mean_terminated_length": 1127.0625, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.7138263665594855, "grad_norm": 0.6025066375732422, "learning_rate": 7.3e-07, "loss": 0.0371, "num_tokens": 11111102.0, "reward": 0.7693430185317993, "reward_std": 0.14001044631004333, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08229167014360428, "rewards/masks_load_store_reward/std": 0.03837431222200394, "rewards/one_code_blob_reward/mean": 0.033576302230358124, "rewards/one_code_blob_reward/std": 0.023212797939777374, "rewards/reward_code_runs/mean": -0.14687500894069672, "rewards/reward_code_runs/std": 0.19012632966041565, "rewards/think_reward/mean": 0.19514165818691254, "rewards/think_reward/std": 0.02538818120956421, "rewards/torch_empty_penalty/mean": -0.009374999441206455, "rewards/torch_empty_penalty/std": 0.029301069676876068, "rewards/torch_zeros_reward/mean": 0.02916666679084301, "rewards/torch_zeros_reward/std": 0.04569156840443611, "rewards/valid_tl_methods_reward/mean": 0.1875, "rewards/valid_tl_methods_reward/std": 0.04866642504930496, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3908.0, "completions/max_terminated_length": 3908.0, "completions/mean_length": 1304.0521240234375, "completions/mean_terminated_length": 1304.0521240234375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.7234726688102894, "grad_norm": 0.7116039395332336, "learning_rate": 7.4e-07, "loss": 0.0137, "num_tokens": 11281699.0, "reward": 0.7607399821281433, "reward_std": 0.1809384822845459, "rewards/constexpr_reward/mean": 0.18958334624767303, "rewards/constexpr_reward/std": 0.044672295451164246, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0885416641831398, "rewards/masks_load_store_reward/std": 0.03201904520392418, "rewards/one_code_blob_reward/mean": 0.032614920288324356, "rewards/one_code_blob_reward/std": 0.026224695146083832, "rewards/reward_code_runs/mean": -0.1354166716337204, "rewards/reward_code_runs/std": 0.3354428708553314, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.013541667722165585, "rewards/torch_empty_penalty/std": 0.034396421164274216, "rewards/torch_zeros_reward/mean": 0.05104166641831398, "rewards/torch_zeros_reward/std": 0.0502515584230423, "rewards/valid_tl_methods_reward/mean": 0.14791665971279144, "rewards/valid_tl_methods_reward/std": 0.08823314309120178, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1423.822998046875, "completions/mean_terminated_length": 1395.69482421875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.7331189710610932, "grad_norm": 0.6728214025497437, "learning_rate": 7.5e-07, "loss": 0.0324, "num_tokens": 11466194.0, "reward": 0.7873832583427429, "reward_std": 0.2371460199356079, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.014753940515220165, "rewards/one_code_blob_reward/std": 0.02616616152226925, "rewards/reward_code_runs/mean": -0.12968750298023224, "rewards/reward_code_runs/std": 0.3213113248348236, "rewards/think_reward/mean": 0.196066752076149, "rewards/think_reward/std": 0.020336557179689407, "rewards/torch_empty_penalty/mean": -0.004166666883975267, "rewards/torch_empty_penalty/std": 0.020087525248527527, "rewards/torch_zeros_reward/mean": 0.04375000298023224, "rewards/torch_zeros_reward/std": 0.04986824840307236, "rewards/valid_tl_methods_reward/mean": 0.17083333432674408, "rewards/valid_tl_methods_reward/std": 0.07095835357904434, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 1124.21875, "completions/mean_terminated_length": 1124.21875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.7427652733118971, "grad_norm": 1.729048490524292, "learning_rate": 7.599999999999999e-07, "loss": 0.0727, "num_tokens": 11622479.0, "reward": 0.7299603223800659, "reward_std": 0.18497233092784882, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0885416641831398, "rewards/masks_load_store_reward/std": 0.03201904520392418, "rewards/one_code_blob_reward/mean": 0.022904405370354652, "rewards/one_code_blob_reward/std": 0.01825665310025215, "rewards/reward_code_runs/mean": -0.1875, "rewards/reward_code_runs/std": 0.23675435781478882, "rewards/think_reward/mean": 0.19768084585666656, "rewards/think_reward/std": 0.01613238826394081, "rewards/torch_empty_penalty/mean": -0.01666666753590107, "rewards/torch_empty_penalty/std": 0.037463437765836716, "rewards/torch_zeros_reward/mean": 0.05625000596046448, "rewards/torch_zeros_reward/std": 0.04986824840307236, "rewards/valid_tl_methods_reward/mean": 0.16875000298023224, "rewards/valid_tl_methods_reward/std": 0.07299964129924774, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 860.9166870117188, "completions/mean_terminated_length": 860.9166870117188, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.752411575562701, "grad_norm": 0.6792352795600891, "learning_rate": 7.699999999999999e-07, "loss": 0.0305, "num_tokens": 11744499.0, "reward": 0.9666028022766113, "reward_std": 0.25859588384628296, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.08437500149011612, "rewards/masks_load_store_reward/std": 0.03649982064962387, "rewards/one_code_blob_reward/mean": 0.04160277917981148, "rewards/one_code_blob_reward/std": 0.03801194950938225, "rewards/reward_code_runs/mean": 0.05937500670552254, "rewards/reward_code_runs/std": 0.48315370082855225, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0052083334885537624, "rewards/torch_empty_penalty/std": 0.022336147725582123, "rewards/torch_zeros_reward/mean": 0.01979166641831398, "rewards/torch_zeros_reward/std": 0.04005204886198044, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284232854843, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 925.71875, "completions/mean_terminated_length": 925.71875, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.7620578778135049, "grad_norm": 0.688829243183136, "learning_rate": 7.799999999999999e-07, "loss": 0.0321, "num_tokens": 11874984.0, "reward": 1.1787655353546143, "reward_std": 0.2676607370376587, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.03898598626255989, "rewards/one_code_blob_reward/std": 0.027744870632886887, "rewards/reward_code_runs/mean": 0.23854166269302368, "rewards/reward_code_runs/std": 0.5910952687263489, "rewards/think_reward/mean": 0.19811280071735382, "rewards/think_reward/std": 0.013295148499310017, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.03541666641831398, "rewards/torch_zeros_reward/std": 0.0480770580470562, "rewards/valid_tl_methods_reward/mean": 0.18125002086162567, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 800.3958740234375, "completions/mean_terminated_length": 800.3958740234375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.7717041800643086, "grad_norm": 0.7262923717498779, "learning_rate": 7.9e-07, "loss": 0.0328, "num_tokens": 11992622.0, "reward": 1.040205955505371, "reward_std": 0.33069169521331787, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09270834177732468, "rewards/masks_load_store_reward/std": 0.026136452332139015, "rewards/one_code_blob_reward/mean": 0.05322667583823204, "rewards/one_code_blob_reward/std": 0.026531290262937546, "rewards/reward_code_runs/mean": 0.12343750149011612, "rewards/reward_code_runs/std": 0.5305882096290588, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.008333333767950535, "rewards/torch_zeros_reward/std": 0.027783622965216637, "rewards/valid_tl_methods_reward/mean": 0.16458334028720856, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07291666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 1334.541748046875, "completions/mean_terminated_length": 1117.3482666015625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.7813504823151125, "grad_norm": 0.7282072305679321, "learning_rate": 8e-07, "loss": 0.0188, "num_tokens": 12165150.0, "reward": 0.6720718145370483, "reward_std": 0.21535718441009521, "rewards/constexpr_reward/mean": 0.18125002086162567, "rewards/constexpr_reward/std": 0.05860213562846184, "rewards/imports_decorator_reward/mean": 0.18541665375232697, "rewards/imports_decorator_reward/std": 0.05227290466427803, "rewards/masks_load_store_reward/mean": 0.08020833134651184, "rewards/masks_load_store_reward/std": 0.04005204886198044, "rewards/one_code_blob_reward/mean": 0.015300924889743328, "rewards/one_code_blob_reward/std": 0.0648474469780922, "rewards/reward_code_runs/mean": -0.15156249701976776, "rewards/reward_code_runs/std": 0.1870059221982956, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.01770833320915699, "rewards/torch_empty_penalty/std": 0.03837431222200394, "rewards/torch_zeros_reward/mean": 0.01874999888241291, "rewards/torch_zeros_reward/std": 0.03923613205552101, "rewards/valid_tl_methods_reward/mean": 0.16041666269302368, "rewards/valid_tl_methods_reward/std": 0.08010409772396088, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 805.71875, "completions/mean_terminated_length": 805.71875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.7909967845659164, "grad_norm": 0.6383414268493652, "learning_rate": 8.1e-07, "loss": 0.0475, "num_tokens": 12283503.0, "reward": 0.8771345019340515, "reward_std": 0.24294182658195496, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.08541666716337204, "rewards/masks_load_store_reward/std": 0.03547917678952217, "rewards/one_code_blob_reward/mean": 0.04328025504946709, "rewards/one_code_blob_reward/std": 0.022545376792550087, "rewards/reward_code_runs/mean": -0.06093750521540642, "rewards/reward_code_runs/std": 0.4197244346141815, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0031250000465661287, "rewards/torch_empty_penalty/std": 0.017490599304437637, "rewards/torch_zeros_reward/mean": 0.03749999776482582, "rewards/torch_zeros_reward/std": 0.04866642504930496, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3559.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 956.09375, "completions/mean_terminated_length": 956.09375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8006430868167203, "grad_norm": 1.5789874792099, "learning_rate": 8.199999999999999e-07, "loss": 0.0652, "num_tokens": 12417984.0, "reward": 0.9814224243164062, "reward_std": 0.1926470398902893, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09375, "rewards/masks_load_store_reward/std": 0.02433321252465248, "rewards/one_code_blob_reward/mean": 0.04589207097887993, "rewards/one_code_blob_reward/std": 0.03024989366531372, "rewards/reward_code_runs/mean": 0.02552083134651184, "rewards/reward_code_runs/std": 0.49857667088508606, "rewards/think_reward/mean": 0.19855110347270966, "rewards/think_reward/std": 0.014196311123669147, "rewards/torch_empty_penalty/mean": -0.0020833334419876337, "rewards/torch_empty_penalty/std": 0.01435758825391531, "rewards/torch_zeros_reward/mean": 0.04479166865348816, "rewards/torch_zeros_reward/std": 0.04998903349041939, "rewards/valid_tl_methods_reward/mean": 0.17916667461395264, "rewards/valid_tl_methods_reward/std": 0.06141604110598564, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3734.0, "completions/max_terminated_length": 3734.0, "completions/mean_length": 1361.791748046875, "completions/mean_terminated_length": 1361.791748046875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.8102893890675241, "grad_norm": 0.5281466245651245, "learning_rate": 8.299999999999999e-07, "loss": 0.0101, "num_tokens": 12596164.0, "reward": 0.6960060000419617, "reward_std": 0.1029006764292717, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.08020833134651184, "rewards/masks_load_store_reward/std": 0.04005204886198044, "rewards/one_code_blob_reward/mean": 0.02231888473033905, "rewards/one_code_blob_reward/std": 0.020131021738052368, "rewards/reward_code_runs/mean": -0.21249999105930328, "rewards/reward_code_runs/std": 0.12502631545066833, "rewards/think_reward/mean": 0.19868700206279755, "rewards/think_reward/std": 0.011553033255040646, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.04270833730697632, "rewards/torch_zeros_reward/std": 0.04972512647509575, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686808109283, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2050.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 795.3646240234375, "completions/mean_terminated_length": 795.3646240234375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.819935691318328, "grad_norm": 0.7214101552963257, "learning_rate": 8.399999999999999e-07, "loss": 0.0325, "num_tokens": 12710763.0, "reward": 1.0834388732910156, "reward_std": 0.15569153428077698, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.05166803300380707, "rewards/one_code_blob_reward/std": 0.02839084342122078, "rewards/reward_code_runs/mean": 0.12656250596046448, "rewards/reward_code_runs/std": 0.4989965260028839, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0020833334419876337, "rewards/torch_empty_penalty/std": 0.01435758825391531, "rewards/torch_zeros_reward/mean": 0.02291666716337204, "rewards/torch_zeros_reward/std": 0.04225029796361923, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 856.7396240234375, "completions/mean_terminated_length": 856.7396240234375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.8295819935691319, "grad_norm": 0.7173665165901184, "learning_rate": 8.499999999999999e-07, "loss": 0.0199, "num_tokens": 12834506.0, "reward": 0.7910616397857666, "reward_std": 0.1610206663608551, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.09062501043081284, "rewards/masks_load_store_reward/std": 0.029301069676876068, "rewards/one_code_blob_reward/mean": 0.042103271931409836, "rewards/one_code_blob_reward/std": 0.02515607886016369, "rewards/reward_code_runs/mean": -0.17604166269302368, "rewards/reward_code_runs/std": 0.19654636085033417, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0052083334885537624, "rewards/torch_empty_penalty/std": 0.022336147725582123, "rewards/torch_zeros_reward/mean": 0.05416667088866234, "rewards/torch_zeros_reward/std": 0.050087641924619675, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 1009.5, "completions/mean_terminated_length": 1009.5, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.8392282958199357, "grad_norm": 0.5952999591827393, "learning_rate": 8.599999999999999e-07, "loss": -0.0152, "num_tokens": 12976166.0, "reward": 0.7066213488578796, "reward_std": 0.1371747851371765, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09166666865348816, "rewards/masks_load_store_reward/std": 0.027783622965216637, "rewards/one_code_blob_reward/mean": 0.034746263176202774, "rewards/one_code_blob_reward/std": 0.028374899178743362, "rewards/reward_code_runs/mean": -0.16562499105930328, "rewards/reward_code_runs/std": 0.1765625774860382, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": -0.010416666977107525, "rewards/torch_empty_penalty/std": 0.03070801869034767, "rewards/torch_zeros_reward/mean": 0.01979166641831398, "rewards/torch_zeros_reward/std": 0.04005204886198044, "rewards/valid_tl_methods_reward/mean": 0.14999999105930328, "rewards/valid_tl_methods_reward/std": 0.08705715090036392, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 1109.635498046875, "completions/mean_terminated_length": 1109.635498046875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8488745980707395, "grad_norm": 0.5833906531333923, "learning_rate": 8.699999999999999e-07, "loss": 0.0117, "num_tokens": 13126791.0, "reward": 0.8823820948600769, "reward_std": 0.13785666227340698, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08958333730697632, "rewards/masks_load_store_reward/std": 0.030708016827702522, "rewards/one_code_blob_reward/mean": 0.03290291130542755, "rewards/one_code_blob_reward/std": 0.04009070619940758, "rewards/reward_code_runs/mean": -0.04322915896773338, "rewards/reward_code_runs/std": 0.43320226669311523, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.02916666679084301, "rewards/torch_zeros_reward/std": 0.04569156840443611, "rewards/valid_tl_methods_reward/mean": 0.1833333522081375, "rewards/valid_tl_methods_reward/std": 0.05556724965572357, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 870.3854370117188, "completions/mean_terminated_length": 870.3854370117188, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.8585209003215434, "grad_norm": 0.5769517421722412, "learning_rate": 8.799999999999999e-07, "loss": -0.0023, "num_tokens": 13251808.0, "reward": 0.9388400912284851, "reward_std": 0.26629945635795593, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517837047577, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.03936079144477844, "rewards/one_code_blob_reward/std": 0.034906670451164246, "rewards/reward_code_runs/mean": -0.0005208253860473633, "rewards/reward_code_runs/std": 0.4790612757205963, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.01666666753590107, "rewards/torch_empty_penalty/std": 0.03746343404054642, "rewards/torch_zeros_reward/mean": 0.05416667088866234, "rewards/torch_zeros_reward/std": 0.05008764564990997, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3096.0, "completions/mean_length": 1160.875, "completions/mean_terminated_length": 1129.97900390625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.8681672025723473, "grad_norm": 0.5800571441650391, "learning_rate": 8.9e-07, "loss": 0.0986, "num_tokens": 13405612.0, "reward": 0.8594149351119995, "reward_std": 0.17765943706035614, "rewards/constexpr_reward/mean": 0.18958334624767303, "rewards/constexpr_reward/std": 0.044672295451164246, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09270834177732468, "rewards/masks_load_store_reward/std": 0.026136452332139015, "rewards/one_code_blob_reward/mean": 0.03177860751748085, "rewards/one_code_blob_reward/std": 0.03633953258395195, "rewards/reward_code_runs/mean": -0.05052083358168602, "rewards/reward_code_runs/std": 0.40778568387031555, "rewards/think_reward/mean": 0.19794876873493195, "rewards/think_reward/std": 0.020097941160202026, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05000000074505806, "rewards/torch_zeros_reward/std": 0.05026246979832649, "rewards/valid_tl_methods_reward/mean": 0.14791667461395264, "rewards/valid_tl_methods_reward/std": 0.08823314309120178, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1031.885498046875, "completions/mean_terminated_length": 1031.885498046875, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.8778135048231511, "grad_norm": 0.6327739357948303, "learning_rate": 9e-07, "loss": 0.0262, "num_tokens": 13551725.0, "reward": 0.6960409879684448, "reward_std": 0.13087350130081177, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09062501043081284, "rewards/masks_load_store_reward/std": 0.029301069676876068, "rewards/one_code_blob_reward/mean": 0.026249190792441368, "rewards/one_code_blob_reward/std": 0.01706741750240326, "rewards/reward_code_runs/mean": -0.2135416716337204, "rewards/reward_code_runs/std": 0.16050563752651215, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0364583358168602, "rewards/torch_zeros_reward/std": 0.04838397353887558, "rewards/valid_tl_methods_reward/mean": 0.15833334624767303, "rewards/valid_tl_methods_reward/std": 0.08164966106414795, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1042.375, "completions/mean_terminated_length": 1042.375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.887459807073955, "grad_norm": 0.549281120300293, "learning_rate": 9.1e-07, "loss": 0.0378, "num_tokens": 13699337.0, "reward": 0.8041318655014038, "reward_std": 0.18684861063957214, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09375, "rewards/masks_load_store_reward/std": 0.02433321252465248, "rewards/one_code_blob_reward/mean": 0.02809007279574871, "rewards/one_code_blob_reward/std": 0.0198042131960392, "rewards/reward_code_runs/mean": -0.12187498807907104, "rewards/reward_code_runs/std": 0.27065345644950867, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0072916666977107525, "rewards/torch_empty_penalty/std": 0.026136452332139015, "rewards/torch_zeros_reward/mean": 0.0364583358168602, "rewards/torch_zeros_reward/std": 0.04838397353887558, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809785842896, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 852.375, "completions/mean_terminated_length": 852.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8971061093247589, "grad_norm": 0.6425236463546753, "learning_rate": 9.2e-07, "loss": -0.0237, "num_tokens": 13822505.0, "reward": 1.0936288833618164, "reward_std": 0.28550925850868225, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087528973817825, "rewards/one_code_blob_reward/mean": 0.04154547303915024, "rewards/one_code_blob_reward/std": 0.0370657816529274, "rewards/reward_code_runs/mean": 0.16770833730697632, "rewards/reward_code_runs/std": 0.5830491185188293, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862103641033, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.01458333432674408, "rewards/torch_zeros_reward/std": 0.03547917678952217, "rewards/valid_tl_methods_reward/mean": 0.18124999105930328, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 1155.875, "completions/mean_terminated_length": 1155.875, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.9067524115755627, "grad_norm": 0.560306966304779, "learning_rate": 9.3e-07, "loss": -0.0126, "num_tokens": 13979489.0, "reward": 0.6780275106430054, "reward_std": 0.11005106568336487, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.08229167014360428, "rewards/masks_load_store_reward/std": 0.03837431222200394, "rewards/one_code_blob_reward/mean": 0.020052263513207436, "rewards/one_code_blob_reward/std": 0.028812218457460403, "rewards/reward_code_runs/mean": -0.24531249701976776, "rewards/reward_code_runs/std": 0.04592793434858322, "rewards/think_reward/mean": 0.19912105798721313, "rewards/think_reward/std": 0.008611898869276047, "rewards/torch_empty_penalty/mean": -0.01458333432674408, "rewards/torch_empty_penalty/std": 0.03547917678952217, "rewards/torch_zeros_reward/mean": 0.07395833730697632, "rewards/torch_zeros_reward/std": 0.04411657154560089, "rewards/valid_tl_methods_reward/mean": 0.1666666716337204, "rewards/valid_tl_methods_reward/std": 0.07492686063051224, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 664.3333740234375, "completions/mean_terminated_length": 664.3333740234375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.9163987138263665, "grad_norm": 0.6537352204322815, "learning_rate": 9.399999999999999e-07, "loss": -0.0326, "num_tokens": 14077177.0, "reward": 1.3599352836608887, "reward_std": 0.34259817004203796, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517837047577, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.06201860308647156, "rewards/one_code_blob_reward/std": 0.03674319013953209, "rewards/reward_code_runs/mean": 0.4000000059604645, "rewards/reward_code_runs/std": 0.5574377179145813, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.015625, "rewards/torch_zeros_reward/std": 0.03649982064962387, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3091.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 1097.6875, "completions/mean_terminated_length": 1097.6875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.9260450160771704, "grad_norm": 0.6797481775283813, "learning_rate": 9.499999999999999e-07, "loss": 0.0296, "num_tokens": 14226619.0, "reward": 0.7398571968078613, "reward_std": 0.2273358702659607, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.19166667759418488, "rewards/imports_decorator_reward/std": 0.04017505422234535, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.024232149124145508, "rewards/one_code_blob_reward/std": 0.04735946282744408, "rewards/reward_code_runs/mean": -0.18645833432674408, "rewards/reward_code_runs/std": 0.2141665816307068, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0052083334885537624, "rewards/torch_empty_penalty/std": 0.022336147725582123, "rewards/torch_zeros_reward/mean": 0.0520833320915699, "rewards/torch_zeros_reward/std": 0.050218820571899414, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 997.28125, "completions/mean_terminated_length": 997.28125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.9356913183279743, "grad_norm": 0.5603955984115601, "learning_rate": 9.6e-07, "loss": 0.025, "num_tokens": 14366218.0, "reward": 1.0203857421875, "reward_std": 0.1987045258283615, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.03861479088664055, "rewards/one_code_blob_reward/std": 0.030727189034223557, "rewards/reward_code_runs/mean": 0.07447917014360428, "rewards/reward_code_runs/std": 0.469153493642807, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.01874999888241291, "rewards/torch_zeros_reward/std": 0.03923612833023071, "rewards/valid_tl_methods_reward/mean": 0.19375000894069672, "rewards/valid_tl_methods_reward/std": 0.034981198608875275, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 1014.8541870117188, "completions/mean_terminated_length": 1014.8541870117188, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9453376205787781, "grad_norm": 0.6686782240867615, "learning_rate": 9.7e-07, "loss": -0.0485, "num_tokens": 14510132.0, "reward": 0.7968265414237976, "reward_std": 0.2724398076534271, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.0885416641831398, "rewards/masks_load_store_reward/std": 0.03201904892921448, "rewards/one_code_blob_reward/mean": 0.02547227405011654, "rewards/one_code_blob_reward/std": 0.04091016575694084, "rewards/reward_code_runs/mean": -0.09739583730697632, "rewards/reward_code_runs/std": 0.40205851197242737, "rewards/think_reward/mean": 0.19375000894069672, "rewards/think_reward/std": 0.043072767555713654, "rewards/torch_empty_penalty/mean": -0.010416666977107525, "rewards/torch_empty_penalty/std": 0.03070801869034767, "rewards/torch_zeros_reward/mean": 0.04062500223517418, "rewards/torch_zeros_reward/std": 0.04937104508280754, "rewards/valid_tl_methods_reward/mean": 0.16875000298023224, "rewards/valid_tl_methods_reward/std": 0.07299964129924774, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3915.0, "completions/max_terminated_length": 3915.0, "completions/mean_length": 1276.166748046875, "completions/mean_terminated_length": 1276.166748046875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.954983922829582, "grad_norm": 0.5864729881286621, "learning_rate": 9.8e-07, "loss": -0.023, "num_tokens": 14676672.0, "reward": 0.7543246746063232, "reward_std": 0.1961607187986374, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.03349132835865021, "rewards/one_code_blob_reward/std": 0.0393284372985363, "rewards/reward_code_runs/mean": -0.15729166567325592, "rewards/reward_code_runs/std": 0.2097591608762741, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862103641033, "rewards/torch_empty_penalty/mean": -0.008333333767950535, "rewards/torch_empty_penalty/std": 0.027783624827861786, "rewards/torch_zeros_reward/mean": 0.01666666753590107, "rewards/torch_zeros_reward/std": 0.03746343404054642, "rewards/valid_tl_methods_reward/mean": 0.17916667461395264, "rewards/valid_tl_methods_reward/std": 0.06141603738069534, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 901.5625, "completions/mean_terminated_length": 901.5625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.9646302250803859, "grad_norm": 0.7356076836585999, "learning_rate": 9.9e-07, "loss": 0.0596, "num_tokens": 14810406.0, "reward": 0.7643704414367676, "reward_std": 0.1145286113023758, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09687501192092896, "rewards/masks_load_store_reward/std": 0.017490599304437637, "rewards/one_code_blob_reward/mean": 0.033641304820775986, "rewards/one_code_blob_reward/std": 0.019893698394298553, "rewards/reward_code_runs/mean": -0.16093750298023224, "rewards/reward_code_runs/std": 0.18023422360420227, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0072916666977107525, "rewards/torch_empty_penalty/std": 0.026136448606848717, "rewards/torch_zeros_reward/mean": 0.02083333395421505, "rewards/torch_zeros_reward/std": 0.040824830532073975, "rewards/valid_tl_methods_reward/mean": 0.1875, "rewards/valid_tl_methods_reward/std": 0.04866642504930496, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 800.1979370117188, "completions/mean_terminated_length": 800.1979370117188, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9742765273311897, "grad_norm": 1.3324309587478638, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 14928589.0, "reward": 1.0799825191497803, "reward_std": 0.20546208322048187, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.04612823203206062, "rewards/one_code_blob_reward/std": 0.030145816504955292, "rewards/reward_code_runs/mean": 0.1223958358168602, "rewards/reward_code_runs/std": 0.5407047271728516, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0520833320915699, "rewards/torch_zeros_reward/std": 0.050218820571899414, "rewards/valid_tl_methods_reward/mean": 0.16458334028720856, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 639.2291870117188, "completions/mean_terminated_length": 639.2291870117188, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.9839228295819936, "grad_norm": 0.8389772176742554, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 15028019.0, "reward": 1.0647220611572266, "reward_std": 0.1348954290151596, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09375, "rewards/masks_load_store_reward/std": 0.02433321252465248, "rewards/one_code_blob_reward/mean": 0.06732618808746338, "rewards/one_code_blob_reward/std": 0.02876145765185356, "rewards/reward_code_runs/mean": 0.1067708358168602, "rewards/reward_code_runs/std": 0.5149665474891663, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.012500000186264515, "rewards/torch_empty_penalty/std": 0.033245496451854706, "rewards/torch_zeros_reward/mean": 0.03437500074505806, "rewards/torch_zeros_reward/std": 0.04774520918726921, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 757.4833984375, "completions/mean_terminated_length": 757.4833984375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.9935691318327974, "grad_norm": 0.9760256409645081, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 15145295.0, "reward": 0.9058343768119812, "reward_std": 0.31626075506210327, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505794763565, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05062602832913399, "rewards/one_code_blob_reward/std": 0.02906649000942707, "rewards/reward_code_runs/mean": -0.04270833358168602, "rewards/reward_code_runs/std": 0.36749356985092163, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.02187499962747097, "rewards/torch_zeros_reward/std": 0.04155687242746353, "rewards/valid_tl_methods_reward/mean": 0.18541665375232697, "rewards/valid_tl_methods_reward/std": 0.05227290466427803, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 961.4583740234375, "completions/mean_terminated_length": 961.4583740234375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 1.0096463022508038, "grad_norm": 0.7094870209693909, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 15278767.0, "reward": 1.114925742149353, "reward_std": 0.25318485498428345, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09375, "rewards/masks_load_store_reward/std": 0.02433321252465248, "rewards/one_code_blob_reward/mean": 0.04044640436768532, "rewards/one_code_blob_reward/std": 0.033034007996320724, "rewards/reward_code_runs/mean": 0.19947916269302368, "rewards/reward_code_runs/std": 0.5805847644805908, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.03020833432674408, "rewards/torch_zeros_reward/std": 0.046157147735357285, "rewards/valid_tl_methods_reward/mean": 0.15625, "rewards/valid_tl_methods_reward/std": 0.08311375230550766, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 913.8541870117188, "completions/mean_terminated_length": 913.8541870117188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 1.0192926045016077, "grad_norm": 0.6050485968589783, "learning_rate": 1e-06, "loss": -0.0108, "num_tokens": 15410021.0, "reward": 0.907778263092041, "reward_std": 0.11176653951406479, "rewards/constexpr_reward/mean": 0.19375000894069672, "rewards/constexpr_reward/std": 0.034981194883584976, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09062501043081284, "rewards/masks_load_store_reward/std": 0.02930106781423092, "rewards/one_code_blob_reward/mean": 0.04423652961850166, "rewards/one_code_blob_reward/std": 0.03256293758749962, "rewards/reward_code_runs/mean": -0.06562499701976776, "rewards/reward_code_runs/std": 0.4192921817302704, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0020833334419876337, "rewards/torch_empty_penalty/std": 0.014357589185237885, "rewards/torch_zeros_reward/mean": 0.0572916679084301, "rewards/torch_zeros_reward/std": 0.04972512274980545, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2130.0, "completions/max_terminated_length": 2130.0, "completions/mean_length": 750.1875, "completions/mean_terminated_length": 750.1875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 1.0289389067524115, "grad_norm": 0.7807785868644714, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 15525131.0, "reward": 1.070699691772461, "reward_std": 0.20007085800170898, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517837047577, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.054553885012865067, "rewards/one_code_blob_reward/std": 0.03134790062904358, "rewards/reward_code_runs/mean": 0.10260417312383652, "rewards/reward_code_runs/std": 0.5553268194198608, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.046875, "rewards/torch_zeros_reward/std": 0.05016420781612396, "rewards/valid_tl_methods_reward/mean": 0.17083333432674408, "rewards/valid_tl_methods_reward/std": 0.07095835357904434, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 879.8854370117188, "completions/mean_terminated_length": 879.8854370117188, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 1.0385852090032155, "grad_norm": 0.6552996039390564, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 15652152.0, "reward": 1.0106452703475952, "reward_std": 0.12692387402057648, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.04189518466591835, "rewards/one_code_blob_reward/std": 0.031889814883470535, "rewards/reward_code_runs/mean": 0.05104166641831398, "rewards/reward_code_runs/std": 0.47356316447257996, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.04791666939854622, "rewards/torch_zeros_reward/std": 0.050218820571899414, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 852.1875, "completions/mean_terminated_length": 852.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.0482315112540193, "grad_norm": 0.5427823066711426, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 15776010.0, "reward": 0.9478980898857117, "reward_std": 0.18901358544826508, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.03800217807292938, "rewards/one_code_blob_reward/std": 0.04549255967140198, "rewards/reward_code_runs/mean": 0.0036458373069763184, "rewards/reward_code_runs/std": 0.4326323866844177, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.02604166604578495, "rewards/torch_zeros_reward/std": 0.04411657154560089, "rewards/valid_tl_methods_reward/mean": 0.19166667759418488, "rewards/valid_tl_methods_reward/std": 0.04017505422234535, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 835.4583740234375, "completions/mean_terminated_length": 835.4583740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 1.0578778135048232, "grad_norm": 0.9861233830451965, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 15899906.0, "reward": 0.8864466547966003, "reward_std": 0.15781597793102264, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.043217454105615616, "rewards/one_code_blob_reward/std": 0.02705226093530655, "rewards/reward_code_runs/mean": -0.07864583283662796, "rewards/reward_code_runs/std": 0.40501755475997925, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.046875, "rewards/torch_zeros_reward/std": 0.05016420781612396, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1851.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 789.4271240234375, "completions/mean_terminated_length": 789.4271240234375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.067524115755627, "grad_norm": 0.8766092658042908, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 16018987.0, "reward": 0.7993344068527222, "reward_std": 0.13429999351501465, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.0482926070690155, "rewards/one_code_blob_reward/std": 0.03107845038175583, "rewards/reward_code_runs/mean": -0.18437498807907104, "rewards/reward_code_runs/std": 0.15965628623962402, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06458333134651184, "rewards/torch_zeros_reward/std": 0.0480770580470562, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2100.0, "completions/max_terminated_length": 2100.0, "completions/mean_length": 759.7708740234375, "completions/mean_terminated_length": 759.7708740234375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 1.077170418006431, "grad_norm": 0.789993941783905, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 16133469.0, "reward": 0.9797508120536804, "reward_std": 0.142560213804245, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09687501192092896, "rewards/masks_load_store_reward/std": 0.017490597441792488, "rewards/one_code_blob_reward/mean": 0.056834083050489426, "rewards/one_code_blob_reward/std": 0.031814008951187134, "rewards/reward_code_runs/mean": -0.009374993853271008, "rewards/reward_code_runs/std": 0.42098334431648254, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0572916679084301, "rewards/torch_zeros_reward/std": 0.04972512274980545, "rewards/valid_tl_methods_reward/mean": 0.18124999105930328, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 767.8333740234375, "completions/mean_terminated_length": 767.8333740234375, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 1.0868167202572347, "grad_norm": 0.8064549565315247, "learning_rate": 1e-06, "loss": -0.0186, "num_tokens": 16249697.0, "reward": 0.8837992548942566, "reward_std": 0.17673146724700928, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.046299200505018234, "rewards/one_code_blob_reward/std": 0.03371729701757431, "rewards/reward_code_runs/mean": -0.05416667088866234, "rewards/reward_code_runs/std": 0.39441272616386414, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.02500000037252903, "rewards/torch_zeros_reward/std": 0.04352857545018196, "rewards/valid_tl_methods_reward/mean": 0.17500001192092896, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 805.28125, "completions/mean_terminated_length": 805.28125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 1.0964630225080385, "grad_norm": 0.7702875733375549, "learning_rate": 1e-06, "loss": -0.0468, "num_tokens": 16373240.0, "reward": 0.9052786827087402, "reward_std": 0.2217210829257965, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.04486199840903282, "rewards/one_code_blob_reward/std": 0.039526235312223434, "rewards/reward_code_runs/mean": -0.01666666567325592, "rewards/reward_code_runs/std": 0.39541226625442505, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.02187499962747097, "rewards/torch_zeros_reward/std": 0.04155687242746353, "rewards/valid_tl_methods_reward/mean": 0.16458334028720856, "rewards/valid_tl_methods_reward/std": 0.07674862444400787, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 663.2083740234375, "completions/mean_terminated_length": 663.2083740234375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 1.1061093247588425, "grad_norm": 0.7669295072555542, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 16476820.0, "reward": 1.043578028678894, "reward_std": 0.11954164505004883, "rewards/constexpr_reward/mean": 0.17916667461395264, "rewards/constexpr_reward/std": 0.061416033655405045, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.05659870803356171, "rewards/one_code_blob_reward/std": 0.04853541776537895, "rewards/reward_code_runs/mean": 0.09531249850988388, "rewards/reward_code_runs/std": 0.5377378463745117, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.03541666641831398, "rewards/torch_zeros_reward/std": 0.0480770580470562, "rewards/valid_tl_methods_reward/mean": 0.18125002086162567, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 583.5729370117188, "completions/mean_terminated_length": 583.5729370117188, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 1.1157556270096463, "grad_norm": 0.8292638659477234, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 16570607.0, "reward": 1.4030187129974365, "reward_std": 0.24441388249397278, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.07333113998174667, "rewards/one_code_blob_reward/std": 0.0357937254011631, "rewards/reward_code_runs/mean": 0.40677082538604736, "rewards/reward_code_runs/std": 0.6152713894844055, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.04270833730697632, "rewards/torch_zeros_reward/std": 0.04972512647509575, "rewards/valid_tl_methods_reward/mean": 0.18124999105930328, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 790.4791870117188, "completions/mean_terminated_length": 790.4791870117188, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 1.1254019292604502, "grad_norm": 0.790634274482727, "learning_rate": 1e-06, "loss": -0.0193, "num_tokens": 16690557.0, "reward": 0.8124825358390808, "reward_std": 0.1920197457075119, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09062501043081284, "rewards/masks_load_store_reward/std": 0.029301069676876068, "rewards/one_code_blob_reward/mean": 0.05206575617194176, "rewards/one_code_blob_reward/std": 0.03424833342432976, "rewards/reward_code_runs/mean": -0.15208333730697632, "rewards/reward_code_runs/std": 0.2957521080970764, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05312499776482582, "rewards/torch_zeros_reward/std": 0.05016420781612396, "rewards/valid_tl_methods_reward/mean": 0.1729166954755783, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 724.5, "completions/mean_terminated_length": 724.5, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.135048231511254, "grad_norm": 0.7953296303749084, "learning_rate": 1e-06, "loss": -0.0229, "num_tokens": 16807929.0, "reward": 0.7903190851211548, "reward_std": 0.15812592208385468, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.043444037437438965, "rewards/one_code_blob_reward/std": 0.024329539388418198, "rewards/reward_code_runs/mean": -0.17604167759418488, "rewards/reward_code_runs/std": 0.19654637575149536, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05833333358168602, "rewards/torch_zeros_reward/std": 0.04955946281552315, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 624.9375, "completions/mean_terminated_length": 624.9375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 1.144694533762058, "grad_norm": 0.8499788641929626, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 16908783.0, "reward": 1.0993988513946533, "reward_std": 0.14713454246520996, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06085706874728203, "rewards/one_code_blob_reward/std": 0.030229216441512108, "rewards/reward_code_runs/mean": 0.109375, "rewards/reward_code_runs/std": 0.5343620181083679, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.03750000149011612, "rewards/torch_zeros_reward/std": 0.04866642504930496, "rewards/valid_tl_methods_reward/mean": 0.19166667759418488, "rewards/valid_tl_methods_reward/std": 0.04017505422234535, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1902.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 829.9791870117188, "completions/mean_terminated_length": 829.9791870117188, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 1.1543408360128617, "grad_norm": 0.8699808120727539, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 17031757.0, "reward": 0.9007784128189087, "reward_std": 0.08680151402950287, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.04348668456077576, "rewards/one_code_blob_reward/std": 0.031017502769827843, "rewards/reward_code_runs/mean": -0.06562500447034836, "rewards/reward_code_runs/std": 0.4192921221256256, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.03333333507180214, "rewards/torch_zeros_reward/std": 0.04738790914416313, "rewards/valid_tl_methods_reward/mean": 0.1937500238418579, "rewards/valid_tl_methods_reward/std": 0.034981198608875275, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1994.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 847.6979370117188, "completions/mean_terminated_length": 847.6979370117188, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.1639871382636655, "grad_norm": 0.7712662220001221, "learning_rate": 1e-06, "loss": -0.0355, "num_tokens": 17158976.0, "reward": 0.7087725400924683, "reward_std": 0.09143199771642685, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.03741825371980667, "rewards/one_code_blob_reward/std": 0.03352433815598488, "rewards/reward_code_runs/mean": -0.24531249701976776, "rewards/reward_code_runs/std": 0.04592793434858322, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.046875, "rewards/torch_zeros_reward/std": 0.05016420781612396, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 601.2604370117188, "completions/mean_terminated_length": 601.2604370117188, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 1.1736334405144695, "grad_norm": 0.6979901790618896, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 17256777.0, "reward": 0.9565267562866211, "reward_std": 0.09983557462692261, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06277672201395035, "rewards/one_code_blob_reward/std": 0.02563941292464733, "rewards/reward_code_runs/mean": -0.028124993667006493, "rewards/reward_code_runs/std": 0.4212645888328552, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.04895833134651184, "rewards/torch_zeros_reward/std": 0.050251562148332596, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 720.125, "completions/mean_terminated_length": 720.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 1.1832797427652733, "grad_norm": 1.1423685550689697, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 17369217.0, "reward": 0.9213575124740601, "reward_std": 0.08902470767498016, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.04844085872173309, "rewards/one_code_blob_reward/std": 0.0287802591919899, "rewards/reward_code_runs/mean": -0.046875, "rewards/reward_code_runs/std": 0.4207019507884979, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.03020833432674408, "rewards/torch_zeros_reward/std": 0.046157147735357285, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 791.34375, "completions/mean_terminated_length": 791.34375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 1.1929260450160772, "grad_norm": 0.8820392489433289, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 17487174.0, "reward": 0.9861018061637878, "reward_std": 0.08739300817251205, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.05380998179316521, "rewards/one_code_blob_reward/std": 0.038780417293310165, "rewards/reward_code_runs/mean": -0.01875000260770321, "rewards/reward_code_runs/std": 0.42122939229011536, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05520833656191826, "rewards/torch_zeros_reward/std": 0.04998903349041939, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 1003.7396240234375, "completions/mean_terminated_length": 1003.7396240234375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 1.202572347266881, "grad_norm": 0.719910740852356, "learning_rate": 1e-06, "loss": 0.0595, "num_tokens": 17629193.0, "reward": 0.8898797035217285, "reward_std": 0.11359749734401703, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.035712968558073044, "rewards/one_code_blob_reward/std": 0.03590528666973114, "rewards/reward_code_runs/mean": -0.0625, "rewards/reward_code_runs/std": 0.3800969123840332, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.03958333283662796, "rewards/torch_zeros_reward/std": 0.04915960505604744, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 684.3541870117188, "completions/mean_terminated_length": 684.3541870117188, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 1.212218649517685, "grad_norm": 0.7489015460014343, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 17736483.0, "reward": 1.0232198238372803, "reward_std": 0.21418210864067078, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.0570739321410656, "rewards/one_code_blob_reward/std": 0.03297010809183121, "rewards/reward_code_runs/mean": 0.02239583432674408, "rewards/reward_code_runs/std": 0.43096399307250977, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05520833656191826, "rewards/torch_zeros_reward/std": 0.04998903349041939, "rewards/valid_tl_methods_reward/mean": 0.19583334028720856, "rewards/valid_tl_methods_reward/std": 0.02871517650783062, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 937.75, "completions/mean_terminated_length": 727.2000122070312, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 1.2218649517684887, "grad_norm": 0.9929730892181396, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 17864571.0, "reward": 1.134056806564331, "reward_std": 0.26930850744247437, "rewards/constexpr_reward/mean": 0.1833333522081375, "rewards/constexpr_reward/std": 0.05556724965572357, "rewards/imports_decorator_reward/mean": 0.1833333522081375, "rewards/imports_decorator_reward/std": 0.05556724965572357, "rewards/masks_load_store_reward/mean": 0.09270834177732468, "rewards/masks_load_store_reward/std": 0.026136452332139015, "rewards/one_code_blob_reward/mean": 0.04603588208556175, "rewards/one_code_blob_reward/std": 0.07669390738010406, "rewards/reward_code_runs/mean": 0.19947917759418488, "rewards/reward_code_runs/std": 0.4922822415828705, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.04791666567325592, "rewards/torch_zeros_reward/std": 0.050218820571899414, "rewards/valid_tl_methods_reward/mean": 0.18125002086162567, "rewards/valid_tl_methods_reward/std": 0.058602139353752136, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 612.3333740234375, "completions/mean_terminated_length": 612.3333740234375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 1.2315112540192925, "grad_norm": 0.766198992729187, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 17966759.0, "reward": 1.213801383972168, "reward_std": 0.15725013613700867, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06432215124368668, "rewards/one_code_blob_reward/std": 0.024616621434688568, "rewards/reward_code_runs/mean": 0.19947917759418488, "rewards/reward_code_runs/std": 0.49228227138519287, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05625000596046448, "rewards/torch_zeros_reward/std": 0.04986824467778206, "rewards/valid_tl_methods_reward/mean": 0.1937500238418579, "rewards/valid_tl_methods_reward/std": 0.034981198608875275, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1922.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 714.3646240234375, "completions/mean_terminated_length": 714.3646240234375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.2411575562700965, "grad_norm": 0.7098949551582336, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 18078058.0, "reward": 1.1137888431549072, "reward_std": 0.12167000770568848, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.05753864720463753, "rewards/one_code_blob_reward/std": 0.037805669009685516, "rewards/reward_code_runs/mean": 0.08541666716337204, "rewards/reward_code_runs/std": 0.49936363101005554, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0729166641831398, "rewards/torch_zeros_reward/std": 0.044672295451164246, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 653.8125, "completions/mean_terminated_length": 653.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 1.2508038585209003, "grad_norm": 0.8204249739646912, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 18181300.0, "reward": 0.9050588607788086, "reward_std": 0.05716240406036377, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.054017141461372375, "rewards/one_code_blob_reward/std": 0.02962045557796955, "rewards/reward_code_runs/mean": -0.08437500149011612, "rewards/reward_code_runs/std": 0.4170266091823578, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06666667014360428, "rewards/torch_zeros_reward/std": 0.04738791286945343, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 729.5416870117188, "completions/mean_terminated_length": 729.5416870117188, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 1.2604501607717042, "grad_norm": 0.6134868264198303, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 18298652.0, "reward": 0.8414579629898071, "reward_std": 0.19312511384487152, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09687501192092896, "rewards/masks_load_store_reward/std": 0.017490599304437637, "rewards/one_code_blob_reward/mean": 0.04874952509999275, "rewards/one_code_blob_reward/std": 0.020982850342988968, "rewards/reward_code_runs/mean": -0.1510416716337204, "rewards/reward_code_runs/std": 0.27786585688591003, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05937500298023224, "rewards/torch_zeros_reward/std": 0.04937104508280754, "rewards/valid_tl_methods_reward/mean": 0.19583334028720856, "rewards/valid_tl_methods_reward/std": 0.02871517650783062, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 725.5104370117188, "completions/mean_terminated_length": 725.5104370117188, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.270096463022508, "grad_norm": 0.7332961559295654, "learning_rate": 1e-06, "loss": 0.0498, "num_tokens": 18414645.0, "reward": 0.9153237342834473, "reward_std": 0.17812936007976532, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05386530980467796, "rewards/one_code_blob_reward/std": 0.03133547678589821, "rewards/reward_code_runs/mean": -0.04583333060145378, "rewards/reward_code_runs/std": 0.4080548584461212, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.03333333507180214, "rewards/torch_zeros_reward/std": 0.04738790914416313, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3680.0, "completions/max_terminated_length": 3680.0, "completions/mean_length": 1063.2708740234375, "completions/mean_terminated_length": 1063.2708740234375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.279742765273312, "grad_norm": 0.7587729096412659, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 18562499.0, "reward": 0.8570469617843628, "reward_std": 0.1749621331691742, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.03673437237739563, "rewards/one_code_blob_reward/std": 0.02618933655321598, "rewards/reward_code_runs/mean": -0.11927083134651184, "rewards/reward_code_runs/std": 0.30791059136390686, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05104166641831398, "rewards/torch_zeros_reward/std": 0.0502515584230423, "rewards/valid_tl_methods_reward/mean": 0.19166667759418488, "rewards/valid_tl_methods_reward/std": 0.04017505422234535, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 752.0104370117188, "completions/mean_terminated_length": 752.0104370117188, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 1.2893890675241158, "grad_norm": 0.8954541087150574, "learning_rate": 1e-06, "loss": 0.0442, "num_tokens": 18681312.0, "reward": 0.9138407111167908, "reward_std": 0.18238215148448944, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.050298962742090225, "rewards/one_code_blob_reward/std": 0.02945978380739689, "rewards/reward_code_runs/mean": -0.0520833320915699, "rewards/reward_code_runs/std": 0.3669157922267914, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.02604166604578495, "rewards/torch_zeros_reward/std": 0.04411657154560089, "rewards/valid_tl_methods_reward/mean": 0.19375000894069672, "rewards/valid_tl_methods_reward/std": 0.034981198608875275, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 750.1146240234375, "completions/mean_terminated_length": 750.1146240234375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.2990353697749195, "grad_norm": 0.6723171472549438, "learning_rate": 1e-06, "loss": -0.0271, "num_tokens": 18801695.0, "reward": 1.0500636100769043, "reward_std": 0.190835103392601, "rewards/constexpr_reward/mean": 0.1937500238418579, "rewards/constexpr_reward/std": 0.034981198608875275, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.04589679837226868, "rewards/one_code_blob_reward/std": 0.022632678970694542, "rewards/reward_code_runs/mean": 0.07500001043081284, "rewards/reward_code_runs/std": 0.5119621753692627, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.04583333432674408, "rewards/torch_zeros_reward/std": 0.050087641924619675, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 809.6354370117188, "completions/mean_terminated_length": 809.6354370117188, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 1.3086816720257235, "grad_norm": 0.9398924708366394, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 18924696.0, "reward": 0.9232425093650818, "reward_std": 0.14786306023597717, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.043034106492996216, "rewards/one_code_blob_reward/std": 0.03087618201971054, "rewards/reward_code_runs/mean": -0.09062498807907104, "rewards/reward_code_runs/std": 0.3762217164039612, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08437500149011612, "rewards/torch_zeros_reward/std": 0.03649982064962387, "rewards/valid_tl_methods_reward/mean": 0.1875, "rewards/valid_tl_methods_reward/std": 0.04866642504930496, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 629.28125, "completions/mean_terminated_length": 629.28125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 1.3183279742765273, "grad_norm": 0.9921103715896606, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 19029159.0, "reward": 0.8372518420219421, "reward_std": 0.1289333999156952, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05912676081061363, "rewards/one_code_blob_reward/std": 0.02672174572944641, "rewards/reward_code_runs/mean": -0.16562499105930328, "rewards/reward_code_runs/std": 0.1765625774860382, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05520833656191826, "rewards/torch_zeros_reward/std": 0.049989037215709686, "rewards/valid_tl_methods_reward/mean": 0.19375000894069672, "rewards/valid_tl_methods_reward/std": 0.034981198608875275, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1429.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 610.3854370117188, "completions/mean_terminated_length": 610.3854370117188, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.3279742765273312, "grad_norm": 0.8055822849273682, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 19129276.0, "reward": 1.0758776664733887, "reward_std": 0.13094481825828552, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06285682320594788, "rewards/one_code_blob_reward/std": 0.02713281475007534, "rewards/reward_code_runs/mean": 0.07968749850988388, "rewards/reward_code_runs/std": 0.40792015194892883, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0625, "rewards/torch_zeros_reward/std": 0.04866642504930496, "rewards/valid_tl_methods_reward/mean": 0.17500001192092896, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 663.25, "completions/mean_terminated_length": 663.25, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 1.337620578778135, "grad_norm": 0.9214664697647095, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 19236664.0, "reward": 0.911638617515564, "reward_std": 0.11367520689964294, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.059555213898420334, "rewards/one_code_blob_reward/std": 0.03002886101603508, "rewards/reward_code_runs/mean": -0.10104166716337204, "rewards/reward_code_runs/std": 0.388043612241745, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06041666865348816, "rewards/torch_zeros_reward/std": 0.04915960505604744, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 784.28125, "completions/mean_terminated_length": 784.28125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 1.347266881028939, "grad_norm": 0.7683331370353699, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 19358323.0, "reward": 0.774795413017273, "reward_std": 0.11972075700759888, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.04250364378094673, "rewards/one_code_blob_reward/std": 0.027647629380226135, "rewards/reward_code_runs/mean": -0.22187499701976776, "rewards/reward_code_runs/std": 0.10949946194887161, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0833333358168602, "rewards/torch_zeros_reward/std": 0.03746343031525612, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 651.3646240234375, "completions/mean_terminated_length": 651.3646240234375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.3569131832797428, "grad_norm": 0.8256404399871826, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 19469142.0, "reward": 0.809127926826477, "reward_std": 0.12502001225948334, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.05600285530090332, "rewards/one_code_blob_reward/std": 0.022366967052221298, "rewards/reward_code_runs/mean": -0.17499999701976776, "rewards/reward_code_runs/std": 0.16858543455600739, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0677083358168602, "rewards/torch_zeros_reward/std": 0.047004569321870804, "rewards/valid_tl_methods_reward/mean": 0.16250000894069672, "rewards/valid_tl_methods_reward/std": 0.07847225666046143, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 688.7291870117188, "completions/mean_terminated_length": 688.7291870117188, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 1.3665594855305465, "grad_norm": 0.8053058981895447, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 19581064.0, "reward": 0.9577488899230957, "reward_std": 0.17412030696868896, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.05201972648501396, "rewards/one_code_blob_reward/std": 0.029080655425786972, "rewards/reward_code_runs/mean": -0.048437491059303284, "rewards/reward_code_runs/std": 0.381234347820282, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05833333358168602, "rewards/torch_zeros_reward/std": 0.04955946281552315, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 624.53125, "completions/mean_terminated_length": 624.53125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 1.3762057877813505, "grad_norm": 0.7290429472923279, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 19685803.0, "reward": 1.047572135925293, "reward_std": 0.19656646251678467, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.05798870325088501, "rewards/one_code_blob_reward/std": 0.02330818958580494, "rewards/reward_code_runs/mean": 0.03854166343808174, "rewards/reward_code_runs/std": 0.40356823801994324, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0572916679084301, "rewards/torch_zeros_reward/std": 0.04972511902451515, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 727.40625, "completions/mean_terminated_length": 727.40625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 1.3858520900321543, "grad_norm": 0.7532760500907898, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 19797490.0, "reward": 0.940872311592102, "reward_std": 0.21272242069244385, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05910136178135872, "rewards/one_code_blob_reward/std": 0.036845460534095764, "rewards/reward_code_runs/mean": -0.06822916120290756, "rewards/reward_code_runs/std": 0.3931063711643219, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07604166865348816, "rewards/torch_zeros_reward/std": 0.042906977236270905, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 596.3333740234375, "completions/mean_terminated_length": 596.3333740234375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 1.3954983922829582, "grad_norm": 0.9243506193161011, "learning_rate": 1e-06, "loss": 0.0259, "num_tokens": 19898106.0, "reward": 0.868034839630127, "reward_std": 0.15040241181850433, "rewards/constexpr_reward/mean": 0.17500001192092896, "rewards/constexpr_reward/std": 0.06649099290370941, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.014357589185237885, "rewards/one_code_blob_reward/mean": 0.06282646209001541, "rewards/one_code_blob_reward/std": 0.0359637513756752, "rewards/reward_code_runs/mean": -0.13749998807907104, "rewards/reward_code_runs/std": 0.19587858021259308, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09479167312383652, "rewards/torch_zeros_reward/std": 0.022336147725582123, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 768.75, "completions/mean_terminated_length": 768.75, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 1.405144694533762, "grad_norm": 0.7844826579093933, "learning_rate": 1e-06, "loss": 0.0971, "num_tokens": 20015946.0, "reward": 1.0762923955917358, "reward_std": 0.10215584933757782, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.048167187720537186, "rewards/one_code_blob_reward/std": 0.03205899894237518, "rewards/reward_code_runs/mean": 0.0729166641831398, "rewards/reward_code_runs/std": 0.5325171947479248, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06458333134651184, "rewards/torch_zeros_reward/std": 0.0480770580470562, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 654.3646240234375, "completions/mean_terminated_length": 654.3646240234375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 1.414790996784566, "grad_norm": 0.9075904488563538, "learning_rate": 1e-06, "loss": -0.013, "num_tokens": 20120753.0, "reward": 1.1273137331008911, "reward_std": 0.10572604835033417, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517837047577, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06325113028287888, "rewards/one_code_blob_reward/std": 0.03051825985312462, "rewards/reward_code_runs/mean": 0.10468750447034836, "rewards/reward_code_runs/std": 0.5355311036109924, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06562500447034836, "rewards/torch_zeros_reward/std": 0.04774521291255951, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 592.09375, "completions/mean_terminated_length": 592.09375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 1.4244372990353698, "grad_norm": 0.7394472360610962, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 20224310.0, "reward": 0.8296582698822021, "reward_std": 0.13804888725280762, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.06142908334732056, "rewards/one_code_blob_reward/std": 0.02213943563401699, "rewards/reward_code_runs/mean": -0.17239584028720856, "rewards/reward_code_runs/std": 0.2241791933774948, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0833333358168602, "rewards/torch_zeros_reward/std": 0.03746343404054642, "rewards/valid_tl_methods_reward/mean": 0.16250000894069672, "rewards/valid_tl_methods_reward/std": 0.07847225666046143, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 734.34375, "completions/mean_terminated_length": 734.34375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 1.4340836012861735, "grad_norm": 0.9521190524101257, "learning_rate": 1e-06, "loss": 0.0554, "num_tokens": 20343263.0, "reward": 0.8669583201408386, "reward_std": 0.1845017373561859, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05341659113764763, "rewards/one_code_blob_reward/std": 0.028257377445697784, "rewards/reward_code_runs/mean": -0.16770832240581512, "rewards/reward_code_runs/std": 0.22722342610359192, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09270834177732468, "rewards/torch_zeros_reward/std": 0.026136452332139015, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 720.8854370117188, "completions/mean_terminated_length": 720.8854370117188, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 1.4437299035369775, "grad_norm": 0.9265972375869751, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 20458680.0, "reward": 1.0103106498718262, "reward_std": 0.11800509691238403, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.04728983715176582, "rewards/one_code_blob_reward/std": 0.030017884448170662, "rewards/reward_code_runs/mean": -0.02343749813735485, "rewards/reward_code_runs/std": 0.4212733209133148, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08958333730697632, "rewards/torch_zeros_reward/std": 0.030708016827702522, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 559.2604370117188, "completions/mean_terminated_length": 559.2604370117188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.4533762057877815, "grad_norm": 0.9568269848823547, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 20554105.0, "reward": 0.8368010520935059, "reward_std": 0.12439590692520142, "rewards/constexpr_reward/mean": 0.1666666716337204, "rewards/constexpr_reward/std": 0.07492686808109283, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09375, "rewards/masks_load_store_reward/std": 0.02433321252465248, "rewards/one_code_blob_reward/mean": 0.06961345672607422, "rewards/one_code_blob_reward/std": 0.04036742448806763, "rewards/reward_code_runs/mean": -0.15260417759418488, "rewards/reward_code_runs/std": 0.21267344057559967, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07604166865348816, "rewards/torch_zeros_reward/std": 0.042906977236270905, "rewards/valid_tl_methods_reward/mean": 0.18541665375232697, "rewards/valid_tl_methods_reward/std": 0.05227290466427803, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 655.3229370117188, "completions/mean_terminated_length": 655.3229370117188, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.4630225080385852, "grad_norm": 0.802787721157074, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 20662844.0, "reward": 1.101922869682312, "reward_std": 0.13189242780208588, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.059214454144239426, "rewards/one_code_blob_reward/std": 0.02514045685529709, "rewards/reward_code_runs/mean": 0.06562500447034836, "rewards/reward_code_runs/std": 0.41130807995796204, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07708333432674408, "rewards/torch_zeros_reward/std": 0.04225029796361923, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 690.8229370117188, "completions/mean_terminated_length": 690.8229370117188, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 1.472668810289389, "grad_norm": 1.0255579948425293, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 20775195.0, "reward": 1.0822356939315796, "reward_std": 0.22474662959575653, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.059839725494384766, "rewards/one_code_blob_reward/std": 0.03606778010725975, "rewards/reward_code_runs/mean": 0.04114583507180214, "rewards/reward_code_runs/std": 0.4284608066082001, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08645833283662796, "rewards/torch_zeros_reward/std": 0.034396421164274216, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 561.2604370117188, "completions/mean_terminated_length": 561.2604370117188, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 1.482315112540193, "grad_norm": 0.8129120469093323, "learning_rate": 1e-06, "loss": -0.0138, "num_tokens": 20872636.0, "reward": 1.1856117248535156, "reward_std": 0.33780115842819214, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.066340871155262, "rewards/one_code_blob_reward/std": 0.022852176800370216, "rewards/reward_code_runs/mean": 0.15781249105930328, "rewards/reward_code_runs/std": 0.4548179507255554, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06354167312383652, "rewards/torch_zeros_reward/std": 0.04838397353887558, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 746.875, "completions/mean_terminated_length": 746.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 1.4919614147909968, "grad_norm": 1.1085046529769897, "learning_rate": 1e-06, "loss": -0.0302, "num_tokens": 20986444.0, "reward": 1.183764100074768, "reward_std": 0.3158164620399475, "rewards/constexpr_reward/mean": 0.17291666567325592, "rewards/constexpr_reward/std": 0.06879284977912903, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517837047577, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07751400023698807, "rewards/one_code_blob_reward/std": 0.05076025426387787, "rewards/reward_code_runs/mean": 0.17604167759418488, "rewards/reward_code_runs/std": 0.5024141669273376, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06145833432674408, "rewards/torch_zeros_reward/std": 0.04892484098672867, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 594.2708740234375, "completions/mean_terminated_length": 594.2708740234375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 1.5016077170418005, "grad_norm": 0.7305670380592346, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 21087726.0, "reward": 1.0104423761367798, "reward_std": 0.2103710025548935, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06460893154144287, "rewards/one_code_blob_reward/std": 0.019467337056994438, "rewards/reward_code_runs/mean": -0.007291665766388178, "rewards/reward_code_runs/std": 0.24772429466247559, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08020833134651184, "rewards/torch_zeros_reward/std": 0.04005204886198044, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 592.4583740234375, "completions/mean_terminated_length": 592.4583740234375, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 1.5112540192926045, "grad_norm": 1.0602891445159912, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 21186950.0, "reward": 1.043541669845581, "reward_std": 0.26119792461395264, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06385406851768494, "rewards/one_code_blob_reward/std": 0.027104245498776436, "rewards/reward_code_runs/mean": -0.004687500651925802, "rewards/reward_code_runs/std": 0.2869144380092621, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09479167312383652, "rewards/torch_zeros_reward/std": 0.022336147725582123, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 595.0625, "completions/mean_terminated_length": 595.0625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 1.5209003215434085, "grad_norm": 1.1863574981689453, "learning_rate": 1e-06, "loss": 0.0414, "num_tokens": 21286340.0, "reward": 1.023719310760498, "reward_std": 0.17034493386745453, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.0674692839384079, "rewards/one_code_blob_reward/std": 0.027314169332385063, "rewards/reward_code_runs/mean": -0.02499999850988388, "rewards/reward_code_runs/std": 0.22618110477924347, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08541666716337204, "rewards/torch_zeros_reward/std": 0.03547917678952217, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 593.3125, "completions/mean_terminated_length": 593.3125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 1.5305466237942122, "grad_norm": 0.9769577383995056, "learning_rate": 1e-06, "loss": 0.0688, "num_tokens": 21383450.0, "reward": 1.2167844772338867, "reward_std": 0.10052961111068726, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07355514913797379, "rewards/one_code_blob_reward/std": 0.03204425796866417, "rewards/reward_code_runs/mean": 0.1796875, "rewards/reward_code_runs/std": 0.5113232731819153, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07395833730697632, "rewards/torch_zeros_reward/std": 0.04411657154560089, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 676.1979370117188, "completions/mean_terminated_length": 676.1979370117188, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 1.540192926045016, "grad_norm": 0.8277693390846252, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 21493533.0, "reward": 0.9875229597091675, "reward_std": 0.20137527585029602, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.05991869792342186, "rewards/one_code_blob_reward/std": 0.02951066941022873, "rewards/reward_code_runs/mean": -0.05052083358168602, "rewards/reward_code_runs/std": 0.26749271154403687, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08437500149011612, "rewards/torch_zeros_reward/std": 0.03649982064962387, "rewards/valid_tl_methods_reward/mean": 0.19375000894069672, "rewards/valid_tl_methods_reward/std": 0.034981198608875275, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 711.4271240234375, "completions/mean_terminated_length": 711.4271240234375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 1.54983922829582, "grad_norm": 0.8331584930419922, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 21609662.0, "reward": 0.8818552494049072, "reward_std": 0.14330685138702393, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05477190017700195, "rewards/one_code_blob_reward/std": 0.0240237507969141, "rewards/reward_code_runs/mean": -0.1197916641831398, "rewards/reward_code_runs/std": 0.22935599088668823, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.05833333730697632, "rewards/torch_zeros_reward/std": 0.04955946281552315, "rewards/valid_tl_methods_reward/mean": 0.19583334028720856, "rewards/valid_tl_methods_reward/std": 0.02871517650783062, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 664.8229370117188, "completions/mean_terminated_length": 664.8229370117188, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 1.5594855305466238, "grad_norm": 0.8680635690689087, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 21718005.0, "reward": 0.8824254274368286, "reward_std": 0.18108677864074707, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.05690452456474304, "rewards/one_code_blob_reward/std": 0.02404734678566456, "rewards/reward_code_runs/mean": -0.14427082240581512, "rewards/reward_code_runs/std": 0.24048960208892822, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09270832687616348, "rewards/torch_zeros_reward/std": 0.026136452332139015, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 642.7083740234375, "completions/mean_terminated_length": 642.7083740234375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 1.5691318327974275, "grad_norm": 0.7752233743667603, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 21823205.0, "reward": 1.1916781663894653, "reward_std": 0.2930299937725067, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.061990588903427124, "rewards/one_code_blob_reward/std": 0.027839384973049164, "rewards/reward_code_runs/mean": 0.13697917759418488, "rewards/reward_code_runs/std": 0.48494410514831543, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09479167312383652, "rewards/torch_zeros_reward/std": 0.022336147725582123, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 568.9896240234375, "completions/mean_terminated_length": 568.9896240234375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.5787781350482315, "grad_norm": 0.8281655311584473, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 21919996.0, "reward": 1.1239912509918213, "reward_std": 0.1698978841304779, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06930360198020935, "rewards/one_code_blob_reward/std": 0.02347411774098873, "rewards/reward_code_runs/mean": 0.10885417461395264, "rewards/reward_code_runs/std": 0.49353688955307007, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0729166641831398, "rewards/torch_zeros_reward/std": 0.044672295451164246, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 556.96875, "completions/mean_terminated_length": 556.96875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 1.5884244372990355, "grad_norm": 0.9195791482925415, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 22016113.0, "reward": 1.1228070259094238, "reward_std": 0.25016868114471436, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07384868711233139, "rewards/one_code_blob_reward/std": 0.022700248286128044, "rewards/reward_code_runs/mean": 0.0625000074505806, "rewards/reward_code_runs/std": 0.4486822783946991, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09062499552965164, "rewards/torch_zeros_reward/std": 0.029301069676876068, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 623.90625, "completions/mean_terminated_length": 623.90625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 1.5980707395498392, "grad_norm": 0.8318855166435242, "learning_rate": 1e-06, "loss": -0.0258, "num_tokens": 22118344.0, "reward": 0.9855782985687256, "reward_std": 0.16274294257164001, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.0610990971326828, "rewards/one_code_blob_reward/std": 0.03232499584555626, "rewards/reward_code_runs/mean": -0.05885416269302368, "rewards/reward_code_runs/std": 0.24602040648460388, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0833333358168602, "rewards/torch_zeros_reward/std": 0.03746343404054642, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 696.3854370117188, "completions/mean_terminated_length": 696.3854370117188, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.607717041800643, "grad_norm": 0.8529621362686157, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 22232525.0, "reward": 0.8414582014083862, "reward_std": 0.14661264419555664, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.05343733727931976, "rewards/one_code_blob_reward/std": 0.01874612644314766, "rewards/reward_code_runs/mean": -0.17031247913837433, "rewards/reward_code_runs/std": 0.17268440127372742, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06875000149011612, "rewards/torch_zeros_reward/std": 0.04659455642104149, "rewards/valid_tl_methods_reward/mean": 0.19166667759418488, "rewards/valid_tl_methods_reward/std": 0.04017505422234535, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 612.2083740234375, "completions/mean_terminated_length": 612.2083740234375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 1.617363344051447, "grad_norm": 0.8072406649589539, "learning_rate": 1e-06, "loss": 0.0471, "num_tokens": 22336597.0, "reward": 1.2598329782485962, "reward_std": 0.2021738737821579, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.05931195989251137, "rewards/one_code_blob_reward/std": 0.025900106877088547, "rewards/reward_code_runs/mean": 0.20364584028720856, "rewards/reward_code_runs/std": 0.4453410804271698, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 1458.0, "completions/mean_length": 598.25, "completions/mean_terminated_length": 598.25, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 1.6270096463022508, "grad_norm": 1.389051079750061, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 22434457.0, "reward": 1.1792818307876587, "reward_std": 0.2810722887516022, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.19583334028720856, "rewards/imports_decorator_reward/std": 0.02871517650783062, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.06938587129116058, "rewards/one_code_blob_reward/std": 0.04094384238123894, "rewards/reward_code_runs/mean": 0.12968750298023224, "rewards/reward_code_runs/std": 0.3488762378692627, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08958333730697632, "rewards/torch_zeros_reward/std": 0.03070801869034767, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 571.9896240234375, "completions/mean_terminated_length": 571.9896240234375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 1.6366559485530545, "grad_norm": 0.7332437634468079, "learning_rate": 1e-06, "loss": -0.0358, "num_tokens": 22529976.0, "reward": 1.359358310699463, "reward_std": 0.21025454998016357, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06821238249540329, "rewards/one_code_blob_reward/std": 0.030263084918260574, "rewards/reward_code_runs/mean": 0.29114583134651184, "rewards/reward_code_runs/std": 0.5550246238708496, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 698.34375, "completions/mean_terminated_length": 698.34375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 1.6463022508038585, "grad_norm": 0.8411015272140503, "learning_rate": 1e-06, "loss": 0.0503, "num_tokens": 22642857.0, "reward": 1.1509120464324951, "reward_std": 0.19191358983516693, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.051953624933958054, "rewards/one_code_blob_reward/std": 0.030798546969890594, "rewards/reward_code_runs/mean": 0.10312500596046448, "rewards/reward_code_runs/std": 0.40110453963279724, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09791667014360428, "rewards/torch_zeros_reward/std": 0.01435758825391531, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 575.34375, "completions/mean_terminated_length": 575.34375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 1.6559485530546625, "grad_norm": 0.9720630049705505, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 22742142.0, "reward": 1.0138139724731445, "reward_std": 0.1578996777534485, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.0690222904086113, "rewards/one_code_blob_reward/std": 0.022182492539286613, "rewards/reward_code_runs/mean": -0.05416666343808174, "rewards/reward_code_runs/std": 0.2466263622045517, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0989583358168602, "rewards/torch_zeros_reward/std": 0.010206207633018494, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 618.8229370117188, "completions/mean_terminated_length": 618.8229370117188, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 1.6655948553054662, "grad_norm": 0.9359613060951233, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 22846405.0, "reward": 1.0907633304595947, "reward_std": 0.21883492171764374, "rewards/constexpr_reward/mean": 0.19583334028720856, "rewards/constexpr_reward/std": 0.02871517650783062, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.05586738511919975, "rewards/one_code_blob_reward/std": 0.03974929451942444, "rewards/reward_code_runs/mean": 0.07343750447034836, "rewards/reward_code_runs/std": 0.48045775294303894, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09583333879709244, "rewards/torch_zeros_reward/std": 0.020087527111172676, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 539.7916870117188, "completions/mean_terminated_length": 539.7916870117188, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 1.67524115755627, "grad_norm": 1.1559139490127563, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 22940153.0, "reward": 1.1731069087982178, "reward_std": 0.24236929416656494, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07414843887090683, "rewards/one_code_blob_reward/std": 0.029914140701293945, "rewards/reward_code_runs/mean": 0.11145833134651184, "rewards/reward_code_runs/std": 0.41131874918937683, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08958333730697632, "rewards/torch_zeros_reward/std": 0.030708016827702522, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 534.3229370117188, "completions/mean_terminated_length": 534.3229370117188, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 1.684887459807074, "grad_norm": 0.7538516521453857, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 23032812.0, "reward": 1.4256443977355957, "reward_std": 0.15253810584545135, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07772766798734665, "rewards/one_code_blob_reward/std": 0.02698957547545433, "rewards/reward_code_runs/mean": 0.359375, "rewards/reward_code_runs/std": 0.5299112200737, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09062501043081284, "rewards/torch_zeros_reward/std": 0.02930106781423092, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 592.8646240234375, "completions/mean_terminated_length": 592.8646240234375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 1.694533762057878, "grad_norm": 0.9495415091514587, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 23131895.0, "reward": 1.1976202726364136, "reward_std": 0.254415363073349, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07053681463003159, "rewards/one_code_blob_reward/std": 0.03324928507208824, "rewards/reward_code_runs/mean": 0.15937501192092896, "rewards/reward_code_runs/std": 0.4877074658870697, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.078125, "rewards/torch_zeros_reward/std": 0.04155687242746353, "rewards/valid_tl_methods_reward/mean": 0.19166667759418488, "rewards/valid_tl_methods_reward/std": 0.04017505422234535, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 614.21875, "completions/mean_terminated_length": 614.21875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 1.7041800643086815, "grad_norm": 1.123584508895874, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 23237456.0, "reward": 1.0600738525390625, "reward_std": 0.2565411627292633, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.06371968239545822, "rewards/one_code_blob_reward/std": 0.025978902354836464, "rewards/reward_code_runs/mean": 0.0026041690725833178, "rewards/reward_code_runs/std": 0.3209664523601532, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09687501192092896, "rewards/torch_zeros_reward/std": 0.017490599304437637, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 692.0729370117188, "completions/mean_terminated_length": 692.0729370117188, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 1.7138263665594855, "grad_norm": 0.9737513661384583, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 23353047.0, "reward": 0.899864673614502, "reward_std": 0.2174696922302246, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.047260504215955734, "rewards/one_code_blob_reward/std": 0.015411733649671078, "rewards/reward_code_runs/mean": -0.1171875, "rewards/reward_code_runs/std": 0.2723028361797333, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07187499850988388, "rewards/torch_zeros_reward/std": 0.04519694298505783, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 514.3333740234375, "completions/mean_terminated_length": 514.3333740234375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 1.7234726688102895, "grad_norm": 0.9674484133720398, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 23443451.0, "reward": 1.1143759489059448, "reward_std": 0.2083703726530075, "rewards/constexpr_reward/mean": 0.19166667759418488, "rewards/constexpr_reward/std": 0.04017505422234535, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.07427162677049637, "rewards/one_code_blob_reward/std": 0.04172372817993164, "rewards/reward_code_runs/mean": 0.06822916865348816, "rewards/reward_code_runs/std": 0.43559515476226807, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08749999850988388, "rewards/torch_zeros_reward/std": 0.033245496451854706, "rewards/valid_tl_methods_reward/mean": 0.19583334028720856, "rewards/valid_tl_methods_reward/std": 0.02871517650783062, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 710.8958740234375, "completions/mean_terminated_length": 710.8958740234375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 1.7331189710610932, "grad_norm": 0.9569810032844543, "learning_rate": 1e-06, "loss": 0.0461, "num_tokens": 23556049.0, "reward": 1.02650785446167, "reward_std": 0.23694173991680145, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.059841081500053406, "rewards/one_code_blob_reward/std": 0.03418285399675369, "rewards/reward_code_runs/mean": -0.009375002235174179, "rewards/reward_code_runs/std": 0.28721094131469727, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0989583358168602, "rewards/torch_zeros_reward/std": 0.010206207633018494, "rewards/valid_tl_methods_reward/mean": 0.1770833283662796, "rewards/valid_tl_methods_reward/std": 0.06403809040784836, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 628.15625, "completions/mean_terminated_length": 628.15625, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 1.742765273311897, "grad_norm": 0.8146075010299683, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 23663992.0, "reward": 1.0439410209655762, "reward_std": 0.24557435512542725, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.0606076605618, "rewards/one_code_blob_reward/std": 0.021280352026224136, "rewards/reward_code_runs/mean": 2.4835269396561444e-09, "rewards/reward_code_runs/std": 0.286540150642395, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": -0.0010416667209938169, "rewards/torch_empty_penalty/std": 0.010206207633018494, "rewards/torch_zeros_reward/mean": 0.0989583358168602, "rewards/torch_zeros_reward/std": 0.010206207633018494, "rewards/valid_tl_methods_reward/mean": 0.18541668355464935, "rewards/valid_tl_methods_reward/std": 0.05227290466427803, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 624.0625, "completions/mean_terminated_length": 624.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 1.752411575562701, "grad_norm": 0.9790536761283875, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 23769790.0, "reward": 1.292978286743164, "reward_std": 0.16918937861919403, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.058603256940841675, "rewards/one_code_blob_reward/std": 0.02803383395075798, "rewards/reward_code_runs/mean": 0.24895834922790527, "rewards/reward_code_runs/std": 0.4883365333080292, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08749999850988388, "rewards/torch_zeros_reward/std": 0.033245496451854706, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 633.5729370117188, "completions/mean_terminated_length": 633.5729370117188, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 1.762057877813505, "grad_norm": 0.8213247656822205, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 23873021.0, "reward": 1.1788103580474854, "reward_std": 0.14977888762950897, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06631030887365341, "rewards/one_code_blob_reward/std": 0.030902283266186714, "rewards/reward_code_runs/mean": 0.11249998956918716, "rewards/reward_code_runs/std": 0.3979552984237671, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 575.8958740234375, "completions/mean_terminated_length": 575.8958740234375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 1.7717041800643085, "grad_norm": 0.9315298199653625, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 23970967.0, "reward": 1.3047760725021362, "reward_std": 0.23194283246994019, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06935928016901016, "rewards/one_code_blob_reward/std": 0.035950787365436554, "rewards/reward_code_runs/mean": 0.2541666626930237, "rewards/reward_code_runs/std": 0.5269408226013184, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0833333358168602, "rewards/torch_zeros_reward/std": 0.03746343404054642, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 509.57293701171875, "completions/mean_terminated_length": 509.57293701171875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 1.7813504823151125, "grad_norm": 1.0050220489501953, "learning_rate": 1e-06, "loss": 0.0742, "num_tokens": 24061922.0, "reward": 1.1889946460723877, "reward_std": 0.1527276635169983, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.07649455219507217, "rewards/one_code_blob_reward/std": 0.02742423489689827, "rewards/reward_code_runs/mean": 0.11250000447034836, "rewards/reward_code_runs/std": 0.3979552984237671, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 449.47918701171875, "completions/mean_terminated_length": 449.47918701171875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 1.7909967845659165, "grad_norm": 1.2509444952011108, "learning_rate": 1e-06, "loss": 0.0378, "num_tokens": 24144228.0, "reward": 1.2286624908447266, "reward_std": 0.21664278209209442, "rewards/constexpr_reward/mean": 0.18958334624767303, "rewards/constexpr_reward/std": 0.044672295451164246, "rewards/imports_decorator_reward/mean": 0.18958334624767303, "rewards/imports_decorator_reward/std": 0.044672295451164246, "rewards/masks_load_store_reward/mean": 0.09375, "rewards/masks_load_store_reward/std": 0.02433321252465248, "rewards/one_code_blob_reward/mean": 0.0922040268778801, "rewards/one_code_blob_reward/std": 0.03387906774878502, "rewards/reward_code_runs/mean": 0.20937500894069672, "rewards/reward_code_runs/std": 0.43025773763656616, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.06875000149011612, "rewards/torch_zeros_reward/std": 0.04659455642104149, "rewards/valid_tl_methods_reward/mean": 0.18541668355464935, "rewards/valid_tl_methods_reward/std": 0.05227290466427803, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 421.54168701171875, "completions/mean_terminated_length": 421.54168701171875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 1.8006430868167203, "grad_norm": 1.323587417602539, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 24224920.0, "reward": 1.3313677310943604, "reward_std": 0.0972091406583786, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.09230508655309677, "rewards/one_code_blob_reward/std": 0.022745473310351372, "rewards/reward_code_runs/mean": 0.23906248807907104, "rewards/reward_code_runs/std": 0.3271248936653137, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 513.0729370117188, "completions/mean_terminated_length": 513.0729370117188, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 1.810289389067524, "grad_norm": 0.9576202630996704, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 24314903.0, "reward": 1.1814203262329102, "reward_std": 0.14891770482063293, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09687501192092896, "rewards/masks_load_store_reward/std": 0.017490599304437637, "rewards/one_code_blob_reward/mean": 0.07517031580209732, "rewards/one_code_blob_reward/std": 0.027302829548716545, "rewards/reward_code_runs/mean": 0.12083333730697632, "rewards/reward_code_runs/std": 0.4080548584461212, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0989583358168602, "rewards/torch_zeros_reward/std": 0.010206207633018494, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 566.3541870117188, "completions/mean_terminated_length": 566.3541870117188, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 1.819935691318328, "grad_norm": 1.361865758895874, "learning_rate": 1e-06, "loss": 0.0454, "num_tokens": 24415089.0, "reward": 1.1108605861663818, "reward_std": 0.12311562150716782, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.06815212219953537, "rewards/one_code_blob_reward/std": 0.02729409746825695, "rewards/reward_code_runs/mean": 0.05625000223517418, "rewards/reward_code_runs/std": 0.41328275203704834, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08645833283662796, "rewards/torch_zeros_reward/std": 0.034396424889564514, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 466.7708435058594, "completions/mean_terminated_length": 466.7708435058594, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 1.829581993569132, "grad_norm": 0.9256302714347839, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 24500771.0, "reward": 1.3085904121398926, "reward_std": 0.16580632328987122, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.08827787637710571, "rewards/one_code_blob_reward/std": 0.02642660029232502, "rewards/reward_code_runs/mean": 0.22343750298023224, "rewards/reward_code_runs/std": 0.4222092926502228, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 482.71875, "completions/mean_terminated_length": 482.71875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 1.8392282958199357, "grad_norm": 1.2202610969543457, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 24593192.0, "reward": 1.159076452255249, "reward_std": 0.19445598125457764, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.08043061196804047, "rewards/one_code_blob_reward/std": 0.027151940390467644, "rewards/reward_code_runs/mean": 0.09010416269302368, "rewards/reward_code_runs/std": 0.39186593890190125, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0885416641831398, "rewards/torch_zeros_reward/std": 0.03201904520392418, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 473.8645935058594, "completions/mean_terminated_length": 473.8645935058594, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 1.8488745980707395, "grad_norm": 1.0192776918411255, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 24682363.0, "reward": 0.9869474768638611, "reward_std": 0.13572761416435242, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.08173908293247223, "rewards/one_code_blob_reward/std": 0.034760426729917526, "rewards/reward_code_runs/mean": -0.06354166567325592, "rewards/reward_code_runs/std": 0.24532246589660645, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0989583358168602, "rewards/torch_zeros_reward/std": 0.010206207633018494, "rewards/valid_tl_methods_reward/mean": 0.17499999701976776, "rewards/valid_tl_methods_reward/std": 0.06649099290370941, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 475.13543701171875, "completions/mean_terminated_length": 475.13543701171875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 1.8585209003215435, "grad_norm": 1.1663234233856201, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 24770492.0, "reward": 1.237094521522522, "reward_std": 0.20853112637996674, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.014357589185237885, "rewards/one_code_blob_reward/mean": 0.07928191870450974, "rewards/one_code_blob_reward/std": 0.03703931346535683, "rewards/reward_code_runs/mean": 0.1598958522081375, "rewards/reward_code_runs/std": 0.4301542043685913, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 506.625, "completions/mean_terminated_length": 506.625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 1.8681672025723473, "grad_norm": 1.028404712677002, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 24862640.0, "reward": 1.3358540534973145, "reward_std": 0.20964989066123962, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.08012475818395615, "rewards/one_code_blob_reward/std": 0.028094014152884483, "rewards/reward_code_runs/mean": 0.2588541805744171, "rewards/reward_code_runs/std": 0.42457374930381775, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 483.82293701171875, "completions/mean_terminated_length": 483.82293701171875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 1.877813504823151, "grad_norm": 1.0317487716674805, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 24948291.0, "reward": 1.1043274402618408, "reward_std": 0.20908395946025848, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09583333879709244, "rewards/masks_load_store_reward/std": 0.020087527111172676, "rewards/one_code_blob_reward/mean": 0.08870226889848709, "rewards/one_code_blob_reward/std": 0.023371906951069832, "rewards/reward_code_runs/mean": 0.02187499962747097, "rewards/reward_code_runs/std": 0.22121821343898773, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 513.7291870117188, "completions/mean_terminated_length": 513.7291870117188, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 1.887459807073955, "grad_norm": 1.1328577995300293, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 25046245.0, "reward": 1.0973868370056152, "reward_std": 0.1983201801776886, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.08176174759864807, "rewards/one_code_blob_reward/std": 0.019281970337033272, "rewards/reward_code_runs/mean": 0.03020833432674408, "rewards/reward_code_runs/std": 0.24208298325538635, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08645833283662796, "rewards/torch_zeros_reward/std": 0.034396424889564514, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 499.7708435058594, "completions/mean_terminated_length": 499.7708435058594, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 1.897106109324759, "grad_norm": 1.057507872581482, "learning_rate": 1e-06, "loss": 0.0446, "num_tokens": 25138323.0, "reward": 1.2629551887512207, "reward_std": 0.14245173335075378, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.0827467143535614, "rewards/one_code_blob_reward/std": 0.03230925649404526, "rewards/reward_code_runs/mean": 0.20416666567325592, "rewards/reward_code_runs/std": 0.3813871145248413, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07604166865348816, "rewards/torch_zeros_reward/std": 0.042906977236270905, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 664.9583740234375, "completions/mean_terminated_length": 664.9583740234375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 1.9067524115755627, "grad_norm": 1.36724853515625, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 25251311.0, "reward": 1.0337120294570923, "reward_std": 0.21500375866889954, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09479167312383652, "rewards/masks_load_store_reward/std": 0.022336147725582123, "rewards/one_code_blob_reward/mean": 0.060274433344602585, "rewards/one_code_blob_reward/std": 0.04737605154514313, "rewards/reward_code_runs/mean": 0.01718750037252903, "rewards/reward_code_runs/std": 0.2221696972846985, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0677083358168602, "rewards/torch_zeros_reward/std": 0.047004569321870804, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 465.8645935058594, "completions/mean_terminated_length": 465.8645935058594, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 1.9163987138263665, "grad_norm": 1.077828049659729, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 25338658.0, "reward": 1.3010025024414062, "reward_std": 0.1482926309108734, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.0989583358168602, "rewards/masks_load_store_reward/std": 0.010206207633018494, "rewards/one_code_blob_reward/mean": 0.0885024294257164, "rewards/one_code_blob_reward/std": 0.03680278733372688, "rewards/reward_code_runs/mean": 0.22187499701976776, "rewards/reward_code_runs/std": 0.4921388626098633, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09791667014360428, "rewards/torch_zeros_reward/std": 0.01435758825391531, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 469.71875, "completions/mean_terminated_length": 469.71875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 1.9260450160771705, "grad_norm": 1.516790509223938, "learning_rate": 1e-06, "loss": 0.0507, "num_tokens": 25425919.0, "reward": 1.1425931453704834, "reward_std": 0.15534614026546478, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.09207212179899216, "rewards/one_code_blob_reward/std": 0.03535056114196777, "rewards/reward_code_runs/mean": 0.07968749850988388, "rewards/reward_code_runs/std": 0.4079201817512512, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09791667014360428, "rewards/torch_zeros_reward/std": 0.01435758825391531, "rewards/valid_tl_methods_reward/mean": 0.17291666567325592, "rewards/valid_tl_methods_reward/std": 0.06879284977912903, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 515.4583740234375, "completions/mean_terminated_length": 515.4583740234375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 1.9356913183279743, "grad_norm": 1.4440219402313232, "learning_rate": 1e-06, "loss": 0.0678, "num_tokens": 25519899.0, "reward": 1.2874916791915894, "reward_std": 0.09185895323753357, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.01435758825391531, "rewards/one_code_blob_reward/mean": 0.0786375030875206, "rewards/one_code_blob_reward/std": 0.039403628557920456, "rewards/reward_code_runs/mean": 0.2421875, "rewards/reward_code_runs/std": 0.5983014106750488, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07916666567325592, "rewards/torch_zeros_reward/std": 0.040824830532073975, "rewards/valid_tl_methods_reward/mean": 0.18958334624767303, "rewards/valid_tl_methods_reward/std": 0.044672295451164246, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 550.1458740234375, "completions/mean_terminated_length": 550.1458740234375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 1.945337620578778, "grad_norm": 4.093983173370361, "learning_rate": 1e-06, "loss": 0.0547, "num_tokens": 25618949.0, "reward": 1.142181634902954, "reward_std": 0.13598135113716125, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.08072315901517868, "rewards/one_code_blob_reward/std": 0.026570141315460205, "rewards/reward_code_runs/mean": 0.06874999403953552, "rewards/reward_code_runs/std": 0.20561204850673676, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09791667014360428, "rewards/torch_zeros_reward/std": 0.01435758825391531, "rewards/valid_tl_methods_reward/mean": 0.1979166716337204, "rewards/valid_tl_methods_reward/std": 0.020412415266036987, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 415.19793701171875, "completions/mean_terminated_length": 415.19793701171875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 1.954983922829582, "grad_norm": 1.3485451936721802, "learning_rate": 1e-06, "loss": 0.0558, "num_tokens": 25698672.0, "reward": 1.3339226245880127, "reward_std": 0.17535977065563202, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.10371416807174683, "rewards/one_code_blob_reward/std": 0.03486758470535278, "rewards/reward_code_runs/mean": 0.26250001788139343, "rewards/reward_code_runs/std": 0.4343779683113098, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.07500000298023224, "rewards/torch_zeros_reward/std": 0.04352857545018196, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 475.3645935058594, "completions/mean_terminated_length": 475.3645935058594, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 1.964630225080386, "grad_norm": 2.2553164958953857, "learning_rate": 1e-06, "loss": 0.0669, "num_tokens": 25789811.0, "reward": 1.1456621885299683, "reward_std": 0.17673367261886597, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.08837032318115234, "rewards/one_code_blob_reward/std": 0.033976636826992035, "rewards/reward_code_runs/mean": 0.078125, "rewards/reward_code_runs/std": 0.2010253369808197, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.08749999850988388, "rewards/torch_zeros_reward/std": 0.033245496451854706, "rewards/valid_tl_methods_reward/mean": 0.19166667759418488, "rewards/valid_tl_methods_reward/std": 0.04017505422234535, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 482.88543701171875, "completions/mean_terminated_length": 482.88543701171875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 1.9742765273311897, "grad_norm": 2.0317275524139404, "learning_rate": 1e-06, "loss": 0.0553, "num_tokens": 25880376.0, "reward": 1.2459683418273926, "reward_std": 0.13240359723567963, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.08763498812913895, "rewards/one_code_blob_reward/std": 0.03719012811779976, "rewards/reward_code_runs/mean": 0.1614583432674408, "rewards/reward_code_runs/std": 0.34826478362083435, "rewards/think_reward/mean": 0.19687502086162567, "rewards/think_reward/std": 0.03061862289905548, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.10000000149011612, "rewards/torch_zeros_reward/std": 0.0, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 454.22918701171875, "completions/mean_terminated_length": 454.22918701171875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 1.9839228295819935, "grad_norm": 2.1294896602630615, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 25969666.0, "reward": 1.1490356922149658, "reward_std": 0.12597203254699707, "rewards/constexpr_reward/mean": 0.20000000298023224, "rewards/constexpr_reward/std": 0.0, "rewards/imports_decorator_reward/mean": 0.20000000298023224, "rewards/imports_decorator_reward/std": 0.0, "rewards/masks_load_store_reward/mean": 0.10000000149011612, "rewards/masks_load_store_reward/std": 0.0, "rewards/one_code_blob_reward/mean": 0.09486893564462662, "rewards/one_code_blob_reward/std": 0.03526860475540161, "rewards/reward_code_runs/mean": 0.05937499925494194, "rewards/reward_code_runs/std": 0.20967550575733185, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.09479167312383652, "rewards/torch_zeros_reward/std": 0.022336147725582123, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 484.5333557128906, "completions/mean_terminated_length": 484.5333557128906, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 1.9935691318327975, "grad_norm": 1.774752140045166, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 26056652.0, "reward": 1.3336315155029297, "reward_std": 0.21346089243888855, "rewards/constexpr_reward/mean": 0.1979166716337204, "rewards/constexpr_reward/std": 0.020412415266036987, "rewards/imports_decorator_reward/mean": 0.1979166716337204, "rewards/imports_decorator_reward/std": 0.020412415266036987, "rewards/masks_load_store_reward/mean": 0.09791667014360428, "rewards/masks_load_store_reward/std": 0.014357589185237885, "rewards/one_code_blob_reward/mean": 0.09613153338432312, "rewards/one_code_blob_reward/std": 0.017497001215815544, "rewards/reward_code_runs/mean": 0.2552083432674408, "rewards/reward_code_runs/std": 0.41450533270835876, "rewards/think_reward/mean": 0.20000000298023224, "rewards/think_reward/std": 0.0, "rewards/torch_empty_penalty/mean": 0.0, "rewards/torch_empty_penalty/std": 0.0, "rewards/torch_zeros_reward/mean": 0.0885416641831398, "rewards/torch_zeros_reward/std": 0.03201904892921448, "rewards/valid_tl_methods_reward/mean": 0.20000000298023224, "rewards/valid_tl_methods_reward/std": 0.0, "step": 206 } ], "logging_steps": 1, "max_steps": 515, "num_input_tokens_seen": 26056652, "num_train_epochs": 5, "save_steps": 103, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }