{ "best_metric": 0.34333334282040595, "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/qwen2.5vl-7b-grpo_new_v20_5k/v13-20250325-021847/checkpoint-2475", "epoch": 1.0, "eval_steps": 250, "global_step": 2475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 359.125, "epoch": 0.00040404040404040404, "grad_norm": 1.364031546421686, "kl": 0.0, "learning_rate": 1.6129032258064515e-09, "loss": -0.0474996417760849, "memory(GiB)": 81.93, "response_clip_ratio": 0.0, "reward": 0.2083333432674408, "reward_std": 0.25746434926986694, "rewards/MultiModalAccuracyORM": 0.2083333432674408, "step": 1, "train_speed(iter/s)": 0.005983 }, { "clip_ratio": 0.0, "completion_length": 304.95833945274353, "epoch": 0.00202020202020202, "grad_norm": 1.6130071483346196, "kl": 0.00015279650688171387, "learning_rate": 8.064516129032257e-09, "loss": -0.0010303221642971039, "memory(GiB)": 86.73, "response_clip_ratio": 0.0, "reward": 0.052083334885537624, "reward_std": 0.13339675217866898, "rewards/MultiModalAccuracyORM": 0.052083334885537624, "step": 5, "train_speed(iter/s)": 0.019266 }, { "clip_ratio": 0.0, "completion_length": 297.46667594909667, "epoch": 0.00404040404040404, "grad_norm": 1.760454082663187, "kl": 0.000270843505859375, "learning_rate": 1.6129032258064514e-08, "loss": 0.005405974388122558, "memory(GiB)": 87.09, "response_clip_ratio": 0.0, "reward": 0.14166667312383652, "reward_std": 0.26492767333984374, "rewards/MultiModalAccuracyORM": 0.14166667312383652, "step": 10, "train_speed(iter/s)": 0.026623 }, { "clip_ratio": 0.0, "completion_length": 452.308349609375, "epoch": 0.006060606060606061, "grad_norm": 1.1507264780517972, "kl": 0.0002508640289306641, "learning_rate": 2.4193548387096773e-08, "loss": 0.013352996110916138, "memory(GiB)": 87.09, "response_clip_ratio": 0.02500000074505806, "reward": 0.34166667610406876, "reward_std": 0.36744636595249175, "rewards/MultiModalAccuracyORM": 0.34166667610406876, "step": 15, "train_speed(iter/s)": 0.027725 }, { "clip_ratio": 0.0, "completion_length": 291.9916717529297, "epoch": 0.00808080808080808, "grad_norm": 1.9440298564534324, "kl": 0.00028104782104492186, "learning_rate": 3.225806451612903e-08, "loss": 0.006416285037994384, "memory(GiB)": 87.09, "response_clip_ratio": 0.0, "reward": 0.2833333373069763, "reward_std": 0.2916341096162796, "rewards/MultiModalAccuracyORM": 0.2833333373069763, "step": 20, "train_speed(iter/s)": 0.031051 }, { "clip_ratio": 0.0, "completion_length": 378.5500061035156, "epoch": 0.010101010101010102, "grad_norm": 1.6907685802618988, "kl": 0.0002666950225830078, "learning_rate": 4.032258064516129e-08, "loss": -0.018301564455032348, "memory(GiB)": 87.09, "response_clip_ratio": 0.0, "reward": 0.30833334624767306, "reward_std": 0.3720185041427612, "rewards/MultiModalAccuracyORM": 0.30833334624767306, "step": 25, "train_speed(iter/s)": 0.032339 }, { "clip_ratio": 0.0, "completion_length": 370.2333450317383, "epoch": 0.012121212121212121, "grad_norm": 1.5722363224769262, "kl": 0.0002593994140625, "learning_rate": 4.8387096774193546e-08, "loss": -0.027563482522964478, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000596046446, "reward_std": 0.3226982891559601, "rewards/MultiModalAccuracyORM": 0.25000000596046446, "step": 30, "train_speed(iter/s)": 0.032649 }, { "clip_ratio": 0.0, "completion_length": 398.5916778564453, "epoch": 0.014141414141414142, "grad_norm": 2.304234213678912, "kl": 0.00022954940795898436, "learning_rate": 5.645161290322581e-08, "loss": 0.048061671853065493, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.1416666716337204, "reward_std": 0.3226627051830292, "rewards/MultiModalAccuracyORM": 0.1416666716337204, "step": 35, "train_speed(iter/s)": 0.033014 }, { "clip_ratio": 0.0, "completion_length": 274.97500972747804, "epoch": 0.01616161616161616, "grad_norm": 1.6894032790709004, "kl": 0.0002648591995239258, "learning_rate": 6.451612903225806e-08, "loss": 0.012092837691307068, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666753590107, "reward_std": 0.222271066904068, "rewards/MultiModalAccuracyORM": 0.2666666753590107, "step": 40, "train_speed(iter/s)": 0.034411 }, { "clip_ratio": 0.0, "completion_length": 421.9333435058594, "epoch": 0.01818181818181818, "grad_norm": 1.9171038477045215, "kl": 0.00023059844970703126, "learning_rate": 7.258064516129032e-08, "loss": -0.0132610023021698, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333879709244, "reward_std": 0.2489179015159607, "rewards/MultiModalAccuracyORM": 0.15833333879709244, "step": 45, "train_speed(iter/s)": 0.034702 }, { "clip_ratio": 0.0, "completion_length": 444.20001525878905, "epoch": 0.020202020202020204, "grad_norm": 1.795783985834061, "kl": 0.00021610260009765624, "learning_rate": 8.064516129032257e-08, "loss": 0.055432689189910886, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.13333333730697633, "reward_std": 0.320406436920166, "rewards/MultiModalAccuracyORM": 0.13333333730697633, "step": 50, "train_speed(iter/s)": 0.034713 }, { "clip_ratio": 0.0, "completion_length": 271.8500068664551, "epoch": 0.022222222222222223, "grad_norm": 1.570392013394559, "kl": 0.00024003982543945311, "learning_rate": 8.870967741935484e-08, "loss": 0.0527652382850647, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000968575477, "reward_std": 0.24862808585166932, "rewards/MultiModalAccuracyORM": 0.17500000968575477, "step": 55, "train_speed(iter/s)": 0.035397 }, { "clip_ratio": 0.0, "completion_length": 240.03333892822266, "epoch": 0.024242424242424242, "grad_norm": 1.7404447091659765, "kl": 0.00024061203002929689, "learning_rate": 9.677419354838709e-08, "loss": -0.06867231130599975, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.39166667982935904, "reward_std": 0.33052347004413607, "rewards/MultiModalAccuracyORM": 0.39166667982935904, "step": 60, "train_speed(iter/s)": 0.036121 }, { "clip_ratio": 0.0, "completion_length": 449.5083480834961, "epoch": 0.026262626262626262, "grad_norm": 1.770871195621109, "kl": 0.0002596855163574219, "learning_rate": 1.0483870967741934e-07, "loss": 0.019220371544361115, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.1416666701436043, "reward_std": 0.27753120064735415, "rewards/MultiModalAccuracyORM": 0.1416666701436043, "step": 65, "train_speed(iter/s)": 0.035829 }, { "clip_ratio": 0.0, "completion_length": 307.05834197998047, "epoch": 0.028282828282828285, "grad_norm": 1.1236406922162803, "kl": 0.00025534629821777344, "learning_rate": 1.1290322580645162e-07, "loss": 0.006563323736190796, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833334252238274, "reward_std": 0.18108985424041749, "rewards/MultiModalAccuracyORM": 0.15833334252238274, "step": 70, "train_speed(iter/s)": 0.036273 }, { "clip_ratio": 0.0, "completion_length": 285.05833969116213, "epoch": 0.030303030303030304, "grad_norm": 2.2244576725130276, "kl": 0.00026721954345703124, "learning_rate": 1.2096774193548387e-07, "loss": 0.021188412606716157, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333333805203437, "reward_std": 0.3494287371635437, "rewards/MultiModalAccuracyORM": 0.28333333805203437, "step": 75, "train_speed(iter/s)": 0.036577 }, { "clip_ratio": 0.0, "completion_length": 365.70000381469725, "epoch": 0.03232323232323232, "grad_norm": 2.238393674944575, "kl": 0.00026388168334960936, "learning_rate": 1.2903225806451611e-07, "loss": 0.029351598024368285, "memory(GiB)": 87.45, "response_clip_ratio": 0.01666666716337204, "reward": 0.22500000521540642, "reward_std": 0.279270276427269, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 80, "train_speed(iter/s)": 0.036263 }, { "clip_ratio": 0.0, "completion_length": 245.05000381469728, "epoch": 0.03434343434343434, "grad_norm": 1.5092959560425367, "kl": 0.00028471946716308595, "learning_rate": 1.3709677419354838e-07, "loss": -0.036607831716537476, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333334177732467, "reward_std": 0.39707074165344236, "rewards/MultiModalAccuracyORM": 0.28333334177732467, "step": 85, "train_speed(iter/s)": 0.035112 }, { "clip_ratio": 0.0, "completion_length": 359.3000152587891, "epoch": 0.03636363636363636, "grad_norm": 1.983727747725694, "kl": 0.0002570152282714844, "learning_rate": 1.4516129032258064e-07, "loss": 0.02973529100418091, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.27928483188152314, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 90, "train_speed(iter/s)": 0.035019 }, { "clip_ratio": 0.0, "completion_length": 420.7333511352539, "epoch": 0.03838383838383838, "grad_norm": 1.6243054678942601, "kl": 0.00022783279418945313, "learning_rate": 1.5322580645161288e-07, "loss": -0.030441620945930482, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.23333333805203438, "reward_std": 0.35868159830570223, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 95, "train_speed(iter/s)": 0.035038 }, { "clip_ratio": 0.0, "completion_length": 320.6583419799805, "epoch": 0.04040404040404041, "grad_norm": 1.5278965004190905, "kl": 0.00023970603942871093, "learning_rate": 1.6129032258064515e-07, "loss": 0.014825087785720826, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3833333417773247, "reward_std": 0.24560283720493317, "rewards/MultiModalAccuracyORM": 0.3833333417773247, "step": 100, "train_speed(iter/s)": 0.035336 }, { "clip_ratio": 0.0, "completion_length": 367.6000091552734, "epoch": 0.04242424242424243, "grad_norm": 2.275003739183734, "kl": 0.0002989768981933594, "learning_rate": 1.6935483870967741e-07, "loss": 0.021370184421539307, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333410322666, "reward_std": 0.31520852744579314, "rewards/MultiModalAccuracyORM": 0.3083333410322666, "step": 105, "train_speed(iter/s)": 0.035535 }, { "clip_ratio": 0.0, "completion_length": 375.37500915527346, "epoch": 0.044444444444444446, "grad_norm": 1.3264840189361857, "kl": 0.00028629302978515624, "learning_rate": 1.7741935483870968e-07, "loss": 0.013422733545303345, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1416666701436043, "reward_std": 0.24885829985141755, "rewards/MultiModalAccuracyORM": 0.1416666701436043, "step": 110, "train_speed(iter/s)": 0.035867 }, { "clip_ratio": 0.0, "completion_length": 400.7583488464355, "epoch": 0.046464646464646465, "grad_norm": 0.0068729642108505875, "kl": 0.00022754669189453124, "learning_rate": 1.8548387096774192e-07, "loss": 0.007101482152938843, "memory(GiB)": 87.45, "response_clip_ratio": 0.01666666716337204, "reward": 0.24166667386889457, "reward_std": 0.23854664266109465, "rewards/MultiModalAccuracyORM": 0.24166667386889457, "step": 115, "train_speed(iter/s)": 0.035529 }, { "clip_ratio": 0.0, "completion_length": 358.0500122070313, "epoch": 0.048484848484848485, "grad_norm": 1.666888807483155, "kl": 0.00029277801513671875, "learning_rate": 1.9354838709677418e-07, "loss": -0.013055479526519776, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20833334028720857, "reward_std": 0.24041947722434998, "rewards/MultiModalAccuracyORM": 0.20833334028720857, "step": 120, "train_speed(iter/s)": 0.035843 }, { "clip_ratio": 0.0, "completion_length": 285.0916717529297, "epoch": 0.050505050505050504, "grad_norm": 3.6057797063570765, "kl": 0.00020406246185302734, "learning_rate": 2e-07, "loss": 0.029223644733428956, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2750000052154064, "reward_std": 0.27371591329574585, "rewards/MultiModalAccuracyORM": 0.2750000052154064, "step": 125, "train_speed(iter/s)": 0.036026 }, { "clip_ratio": 0.0, "completion_length": 488.7583526611328, "epoch": 0.052525252525252523, "grad_norm": 1.7900187922950372, "kl": 0.00025043487548828127, "learning_rate": 2e-07, "loss": 0.0551780104637146, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333484828472, "reward_std": 0.3641817569732666, "rewards/MultiModalAccuracyORM": 0.2833333484828472, "step": 130, "train_speed(iter/s)": 0.036075 }, { "clip_ratio": 0.0, "completion_length": 330.5000072479248, "epoch": 0.05454545454545454, "grad_norm": 2.529917707084592, "kl": 0.0002529144287109375, "learning_rate": 2e-07, "loss": 0.02438216805458069, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.3416666708886623, "reward_std": 0.279270276427269, "rewards/MultiModalAccuracyORM": 0.3416666708886623, "step": 135, "train_speed(iter/s)": 0.036092 }, { "clip_ratio": 0.0, "completion_length": 373.6333465576172, "epoch": 0.05656565656565657, "grad_norm": 1.3049814649570146, "kl": 0.0002875804901123047, "learning_rate": 2e-07, "loss": -0.022501662373542786, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3833333469927311, "reward_std": 0.34958777129650115, "rewards/MultiModalAccuracyORM": 0.3833333469927311, "step": 140, "train_speed(iter/s)": 0.036095 }, { "clip_ratio": 0.0, "completion_length": 345.41668395996095, "epoch": 0.05858585858585859, "grad_norm": 1.8437868971897566, "kl": 0.00023627281188964844, "learning_rate": 2e-07, "loss": 0.06273630857467652, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.3666666768491268, "reward_std": 0.3914994150400162, "rewards/MultiModalAccuracyORM": 0.3666666768491268, "step": 145, "train_speed(iter/s)": 0.036226 }, { "clip_ratio": 0.0, "completion_length": 266.51667098999025, "epoch": 0.06060606060606061, "grad_norm": 1.0785517011291799, "kl": 0.00021938085556030273, "learning_rate": 2e-07, "loss": 0.02771698534488678, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.400000012665987, "reward_std": 0.3516494154930115, "rewards/MultiModalAccuracyORM": 0.400000012665987, "step": 150, "train_speed(iter/s)": 0.036427 }, { "clip_ratio": 0.0, "completion_length": 333.3500152587891, "epoch": 0.06262626262626263, "grad_norm": 12.619972342482905, "kl": 0.00030460357666015623, "learning_rate": 2e-07, "loss": -0.06058757305145264, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2250000074505806, "reward_std": 0.37600439190864565, "rewards/MultiModalAccuracyORM": 0.2250000074505806, "step": 155, "train_speed(iter/s)": 0.036609 }, { "clip_ratio": 0.0, "completion_length": 358.7416732788086, "epoch": 0.06464646464646465, "grad_norm": 1.306377595968382, "kl": 0.00027475357055664065, "learning_rate": 2e-07, "loss": -0.00979010909795761, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.15833333432674407, "reward_std": 0.28456337153911593, "rewards/MultiModalAccuracyORM": 0.15833333432674407, "step": 160, "train_speed(iter/s)": 0.036431 }, { "clip_ratio": 0.0, "completion_length": 324.9916763305664, "epoch": 0.06666666666666667, "grad_norm": 0.9830762972924579, "kl": 0.00030498504638671876, "learning_rate": 2e-07, "loss": -0.008201467990875243, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.10000000149011612, "reward_std": 0.2260383188724518, "rewards/MultiModalAccuracyORM": 0.10000000149011612, "step": 165, "train_speed(iter/s)": 0.036665 }, { "clip_ratio": 0.0, "completion_length": 249.37500610351563, "epoch": 0.06868686868686869, "grad_norm": 2.1917101699979287, "kl": 0.00025620460510253904, "learning_rate": 2e-07, "loss": 0.016992685198783875, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000037252903, "reward_std": 0.330559054017067, "rewards/MultiModalAccuracyORM": 0.3500000037252903, "step": 170, "train_speed(iter/s)": 0.036951 }, { "clip_ratio": 0.0, "completion_length": 358.87500762939453, "epoch": 0.0707070707070707, "grad_norm": 1.0748542635448965, "kl": 0.0002711296081542969, "learning_rate": 2e-07, "loss": 0.010954010486602783, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.22400068640708923, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 175, "train_speed(iter/s)": 0.037203 }, { "clip_ratio": 0.0, "completion_length": 313.62500762939453, "epoch": 0.07272727272727272, "grad_norm": 2.2725379948331543, "kl": 0.00025653839111328125, "learning_rate": 2e-07, "loss": 0.03469780087471008, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.16666667237877847, "reward_std": 0.3332285821437836, "rewards/MultiModalAccuracyORM": 0.16666667237877847, "step": 180, "train_speed(iter/s)": 0.037355 }, { "clip_ratio": 0.0, "completion_length": 271.96667327880857, "epoch": 0.07474747474747474, "grad_norm": 1.4486054691502512, "kl": 0.0002918243408203125, "learning_rate": 2e-07, "loss": -0.009673595428466797, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333373069763, "reward_std": 0.102961727976799, "rewards/MultiModalAccuracyORM": 0.3083333373069763, "step": 185, "train_speed(iter/s)": 0.037564 }, { "clip_ratio": 0.0, "completion_length": 403.25834197998046, "epoch": 0.07676767676767676, "grad_norm": 3.170971594101629, "kl": 0.00025038719177246095, "learning_rate": 2e-07, "loss": 0.0012440800666809082, "memory(GiB)": 87.45, "response_clip_ratio": 0.01666666716337204, "reward": 0.24166667014360427, "reward_std": 0.30789810717105864, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 190, "train_speed(iter/s)": 0.037415 }, { "clip_ratio": 0.0, "completion_length": 294.62500991821287, "epoch": 0.07878787878787878, "grad_norm": 1.98318367969525, "kl": 0.00029687881469726564, "learning_rate": 2e-07, "loss": 0.008435648679733277, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.33333334028720857, "reward_std": 0.24741607010364533, "rewards/MultiModalAccuracyORM": 0.33333334028720857, "step": 195, "train_speed(iter/s)": 0.037342 }, { "clip_ratio": 0.0, "completion_length": 374.6333465576172, "epoch": 0.08080808080808081, "grad_norm": 1.503273341785427, "kl": 0.000333404541015625, "learning_rate": 2e-07, "loss": 0.005708768963813782, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.26666667237877845, "reward_std": 0.3603756338357925, "rewards/MultiModalAccuracyORM": 0.26666667237877845, "step": 200, "train_speed(iter/s)": 0.037521 }, { "clip_ratio": 0.0, "completion_length": 379.68334407806395, "epoch": 0.08282828282828283, "grad_norm": 0.5199716532978094, "kl": 0.0004832744598388672, "learning_rate": 2e-07, "loss": -0.014856468141078948, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.33937130570411683, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 205, "train_speed(iter/s)": 0.037585 }, { "clip_ratio": 0.0, "completion_length": 305.3916732788086, "epoch": 0.08484848484848485, "grad_norm": 2.1287930828371358, "kl": 0.000292205810546875, "learning_rate": 2e-07, "loss": 0.001297689974308014, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667386889457, "reward_std": 0.32771685123443606, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 210, "train_speed(iter/s)": 0.037751 }, { "clip_ratio": 0.0, "completion_length": 345.49167213439944, "epoch": 0.08686868686868687, "grad_norm": 1.7796242872827708, "kl": 0.00042543411254882815, "learning_rate": 2e-07, "loss": -0.006988461315631867, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333333656191826, "reward_std": 0.2692273885011673, "rewards/MultiModalAccuracyORM": 0.23333333656191826, "step": 215, "train_speed(iter/s)": 0.037754 }, { "clip_ratio": 0.0, "completion_length": 314.81667442321776, "epoch": 0.08888888888888889, "grad_norm": 1.7638027896241226, "kl": 0.0006679534912109375, "learning_rate": 2e-07, "loss": 0.006352822482585907, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333507180214, "reward_std": 0.25008893609046934, "rewards/MultiModalAccuracyORM": 0.15833333507180214, "step": 220, "train_speed(iter/s)": 0.03785 }, { "clip_ratio": 0.0, "completion_length": 311.2750076293945, "epoch": 0.09090909090909091, "grad_norm": 0.012708836578688367, "kl": 0.00029745101928710935, "learning_rate": 2e-07, "loss": 0.0504034161567688, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30000000149011613, "reward_std": 0.3164917230606079, "rewards/MultiModalAccuracyORM": 0.30000000149011613, "step": 225, "train_speed(iter/s)": 0.038015 }, { "clip_ratio": 0.0, "completion_length": 265.6750061035156, "epoch": 0.09292929292929293, "grad_norm": 2.064611776487197, "kl": 0.000385284423828125, "learning_rate": 2e-07, "loss": 0.07023286819458008, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15000000298023225, "reward_std": 0.2650228708982468, "rewards/MultiModalAccuracyORM": 0.15000000298023225, "step": 230, "train_speed(iter/s)": 0.03818 }, { "clip_ratio": 0.0, "completion_length": 371.5416793823242, "epoch": 0.09494949494949495, "grad_norm": 1.949431436305181, "kl": 0.0002506256103515625, "learning_rate": 2e-07, "loss": 0.01011454164981842, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3333333469927311, "reward_std": 0.3637147039175034, "rewards/MultiModalAccuracyORM": 0.3333333469927311, "step": 235, "train_speed(iter/s)": 0.03819 }, { "clip_ratio": 0.0, "completion_length": 360.533341217041, "epoch": 0.09696969696969697, "grad_norm": 0.5471178347466235, "kl": 0.0010341405868530273, "learning_rate": 2e-07, "loss": -0.0015352100133895874, "memory(GiB)": 87.45, "response_clip_ratio": 0.01666666716337204, "reward": 0.28333333805203437, "reward_std": 0.3511823683977127, "rewards/MultiModalAccuracyORM": 0.28333333805203437, "step": 240, "train_speed(iter/s)": 0.037977 }, { "clip_ratio": 0.0, "completion_length": 336.3000129699707, "epoch": 0.09898989898989899, "grad_norm": 2.3165413137247333, "kl": 0.00027217864990234373, "learning_rate": 2e-07, "loss": 0.0210051491856575, "memory(GiB)": 87.45, "response_clip_ratio": 0.00833333358168602, "reward": 0.32500000968575476, "reward_std": 0.38450039029121397, "rewards/MultiModalAccuracyORM": 0.32500000968575476, "step": 245, "train_speed(iter/s)": 0.037993 }, { "epoch": 0.10101010101010101, "grad_norm": 2.645674704495033, "learning_rate": 2e-07, "loss": -0.03384391665458679, "memory(GiB)": 87.45, "step": 250, "train_speed(iter/s)": 0.038032 }, { "epoch": 0.10101010101010101, "eval_clip_ratio": 0.0, "eval_completion_length": 334.34500762939456, "eval_kl": 0.0004983329772949218, "eval_loss": 0.023834386840462685, "eval_response_clip_ratio": 0.003333333432674408, "eval_reward": 0.24666667267680167, "eval_reward_std": 0.30061395645141603, "eval_rewards/MultiModalAccuracyORM": 0.24666667267680167, "eval_runtime": 585.2435, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.009, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 369.79583778381345, "epoch": 0.10303030303030303, "grad_norm": 1.5910045148895993, "kl": 0.0006116151809692383, "learning_rate": 2e-07, "loss": -0.05511324405670166, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.34166667647659776, "reward_std": 0.3701108664274216, "rewards/MultiModalAccuracyORM": 0.34166667647659776, "step": 255, "train_speed(iter/s)": 0.03329 }, { "clip_ratio": 0.0, "completion_length": 287.85, "epoch": 0.10505050505050505, "grad_norm": 1.8789057522234565, "kl": 0.0006687164306640625, "learning_rate": 2e-07, "loss": 0.08147464394569397, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000037252903, "reward_std": 0.3494287371635437, "rewards/MultiModalAccuracyORM": 0.3500000037252903, "step": 260, "train_speed(iter/s)": 0.033421 }, { "clip_ratio": 0.0, "completion_length": 327.0, "epoch": 0.10707070707070707, "grad_norm": 1.685788699755795, "kl": 0.00030879974365234376, "learning_rate": 2e-07, "loss": 0.0021983295679092406, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2083333358168602, "reward_std": 0.3010816007852554, "rewards/MultiModalAccuracyORM": 0.2083333358168602, "step": 265, "train_speed(iter/s)": 0.033374 }, { "clip_ratio": 0.0, "completion_length": 380.5, "epoch": 0.10909090909090909, "grad_norm": 2.9700739773322695, "kl": 0.00040111541748046877, "learning_rate": 2e-07, "loss": -0.004064649343490601, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333730697632, "reward_std": 0.33526621460914613, "rewards/MultiModalAccuracyORM": 0.15833333730697632, "step": 270, "train_speed(iter/s)": 0.033364 }, { "clip_ratio": 0.0, "completion_length": 324.25, "epoch": 0.1111111111111111, "grad_norm": 1.5939506920216808, "kl": 0.00045032501220703124, "learning_rate": 2e-07, "loss": 0.026332959532737732, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.2526735752820969, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 275, "train_speed(iter/s)": 0.033468 }, { "clip_ratio": 0.0, "completion_length": 496.5, "epoch": 0.11313131313131314, "grad_norm": 1.3058289755881347, "kl": 0.000375831127166748, "learning_rate": 2e-07, "loss": 0.027166426181793213, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.32500001341104506, "reward_std": 0.37195890247821806, "rewards/MultiModalAccuracyORM": 0.32500001341104506, "step": 280, "train_speed(iter/s)": 0.033383 }, { "clip_ratio": 0.0, "completion_length": 361.05, "epoch": 0.11515151515151516, "grad_norm": 0.5211592745612927, "kl": 0.0004334449768066406, "learning_rate": 2e-07, "loss": -0.001045474410057068, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000149011612, "reward_std": 0.1808116167783737, "rewards/MultiModalAccuracyORM": 0.22500000149011612, "step": 285, "train_speed(iter/s)": 0.03333 }, { "clip_ratio": 0.0, "completion_length": 345.25, "epoch": 0.11717171717171718, "grad_norm": 1.9995357461573446, "kl": 0.0005333900451660156, "learning_rate": 2e-07, "loss": -0.00281745046377182, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333333656191825, "reward_std": 0.3385071337223053, "rewards/MultiModalAccuracyORM": 0.18333333656191825, "step": 290, "train_speed(iter/s)": 0.033413 }, { "clip_ratio": 0.0, "completion_length": 387.1, "epoch": 0.1191919191919192, "grad_norm": 3.694756818436622, "kl": 0.0010143280029296874, "learning_rate": 2e-07, "loss": -0.003062787652015686, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333879709244, "reward_std": 0.314164274930954, "rewards/MultiModalAccuracyORM": 0.15833333879709244, "step": 295, "train_speed(iter/s)": 0.03345 }, { "clip_ratio": 0.0, "completion_length": 393.25, "epoch": 0.12121212121212122, "grad_norm": 1.5577866137872902, "kl": 0.00044269561767578124, "learning_rate": 2e-07, "loss": -0.022827643156051635, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2583333447575569, "reward_std": 0.3393001317977905, "rewards/MultiModalAccuracyORM": 0.2583333447575569, "step": 300, "train_speed(iter/s)": 0.033327 }, { "clip_ratio": 0.0, "completion_length": 416.25, "epoch": 0.12323232323232323, "grad_norm": 0.8793802822161716, "kl": 0.00045299530029296875, "learning_rate": 2e-07, "loss": 0.039026769995689395, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2333333395421505, "reward_std": 0.33277973234653474, "rewards/MultiModalAccuracyORM": 0.2333333395421505, "step": 305, "train_speed(iter/s)": 0.032887 }, { "clip_ratio": 0.0, "completion_length": 334.3, "epoch": 0.12525252525252525, "grad_norm": 1.9841151826732792, "kl": 0.0006313323974609375, "learning_rate": 2e-07, "loss": -0.006224775314331054, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333334252238275, "reward_std": 0.31441850066184995, "rewards/MultiModalAccuracyORM": 0.23333334252238275, "step": 310, "train_speed(iter/s)": 0.032913 }, { "clip_ratio": 0.0, "completion_length": 537.7, "epoch": 0.12727272727272726, "grad_norm": 1.2729907719968943, "kl": 0.0007027626037597656, "learning_rate": 2e-07, "loss": 0.014832744002342224, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.11666666939854622, "reward_std": 0.25891573131084444, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 315, "train_speed(iter/s)": 0.032886 }, { "clip_ratio": 0.0, "completion_length": 282.8, "epoch": 0.1292929292929293, "grad_norm": 0.9148877498687834, "kl": 0.000760650634765625, "learning_rate": 2e-07, "loss": 0.06303757429122925, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 320, "train_speed(iter/s)": 0.032974 }, { "clip_ratio": 0.0, "completion_length": 404.8, "epoch": 0.13131313131313133, "grad_norm": 2.00474803214382, "kl": 0.0007790565490722656, "learning_rate": 2e-07, "loss": 0.02660681903362274, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.13333333656191826, "reward_std": 0.2486636757850647, "rewards/MultiModalAccuracyORM": 0.13333333656191826, "step": 325, "train_speed(iter/s)": 0.033068 }, { "clip_ratio": 0.0, "completion_length": 333.8, "epoch": 0.13333333333333333, "grad_norm": 1.6448765146368245, "kl": 0.0005625724792480469, "learning_rate": 2e-07, "loss": 0.024477413296699523, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.19166667237877846, "reward_std": 0.2629852324724197, "rewards/MultiModalAccuracyORM": 0.19166667237877846, "step": 330, "train_speed(iter/s)": 0.0332 }, { "clip_ratio": 0.0, "completion_length": 330.25, "epoch": 0.13535353535353536, "grad_norm": 2.2001765187520776, "kl": 0.0006697654724121093, "learning_rate": 2e-07, "loss": 0.07480921745300292, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500001043081285, "reward_std": 0.37195890247821806, "rewards/MultiModalAccuracyORM": 0.27500001043081285, "step": 335, "train_speed(iter/s)": 0.033276 }, { "clip_ratio": 0.0, "completion_length": 386.1, "epoch": 0.13737373737373737, "grad_norm": 0.6836764259374134, "kl": 0.0006744384765625, "learning_rate": 2e-07, "loss": 0.050872421264648436, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667386889457, "reward_std": 0.25897533297538755, "rewards/MultiModalAccuracyORM": 0.36666667386889457, "step": 340, "train_speed(iter/s)": 0.033397 }, { "clip_ratio": 0.0, "completion_length": 430.0, "epoch": 0.1393939393939394, "grad_norm": 0.02974363962833146, "kl": 0.0007775306701660156, "learning_rate": 2e-07, "loss": -0.00942653715610504, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1500000037252903, "reward_std": 0.1933199405670166, "rewards/MultiModalAccuracyORM": 0.1500000037252903, "step": 345, "train_speed(iter/s)": 0.033409 }, { "clip_ratio": 0.0, "completion_length": 406.15, "epoch": 0.1414141414141414, "grad_norm": 2.153809687333121, "kl": 0.00106048583984375, "learning_rate": 2e-07, "loss": -0.04788823127746582, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3833333425223827, "reward_std": 0.3908641755580902, "rewards/MultiModalAccuracyORM": 0.3833333425223827, "step": 350, "train_speed(iter/s)": 0.033484 }, { "clip_ratio": 0.0, "completion_length": 273.95, "epoch": 0.14343434343434344, "grad_norm": 2.9003800421035084, "kl": 0.001187896728515625, "learning_rate": 2e-07, "loss": -0.025590839982032775, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1500000014901161, "reward_std": 0.24484840035438538, "rewards/MultiModalAccuracyORM": 0.1500000014901161, "step": 355, "train_speed(iter/s)": 0.033613 }, { "clip_ratio": 0.0, "completion_length": 258.15, "epoch": 0.14545454545454545, "grad_norm": 1.3041121484800926, "kl": 0.001438140869140625, "learning_rate": 2e-07, "loss": 0.10738253593444824, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333333656191825, "reward_std": 0.3196970522403717, "rewards/MultiModalAccuracyORM": 0.18333333656191825, "step": 360, "train_speed(iter/s)": 0.033727 }, { "clip_ratio": 0.0, "completion_length": 380.15, "epoch": 0.14747474747474748, "grad_norm": 0.8360441109730193, "kl": 0.00127105712890625, "learning_rate": 2e-07, "loss": -0.003975853323936462, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.05000000149011612, "reward_std": 0.13558491468429565, "rewards/MultiModalAccuracyORM": 0.05000000149011612, "step": 365, "train_speed(iter/s)": 0.033745 }, { "clip_ratio": 0.0, "completion_length": 296.6, "epoch": 0.1494949494949495, "grad_norm": 2.3979328705343153, "kl": 0.001323699951171875, "learning_rate": 2e-07, "loss": -0.048431962728500366, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000521540644, "reward_std": 0.35312480926513673, "rewards/MultiModalAccuracyORM": 0.25000000521540644, "step": 370, "train_speed(iter/s)": 0.033877 }, { "clip_ratio": 0.0, "completion_length": 345.2, "epoch": 0.15151515151515152, "grad_norm": 1.5241819642025198, "kl": 0.0015224456787109376, "learning_rate": 2e-07, "loss": 0.08156558275222778, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666768491268, "reward_std": 0.30183603167533873, "rewards/MultiModalAccuracyORM": 0.3916666768491268, "step": 375, "train_speed(iter/s)": 0.033941 }, { "clip_ratio": 0.0, "completion_length": 318.7, "epoch": 0.15353535353535352, "grad_norm": 1.4091270455051919, "kl": 0.0014804840087890626, "learning_rate": 2e-07, "loss": -0.005422207713127136, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.29863070249557494, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 380, "train_speed(iter/s)": 0.03401 }, { "clip_ratio": 0.0, "completion_length": 332.4, "epoch": 0.15555555555555556, "grad_norm": 1.7741695775671322, "kl": 0.0017261505126953125, "learning_rate": 2e-07, "loss": 0.013069793581962585, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.35833334028720853, "reward_std": 0.41791602969169617, "rewards/MultiModalAccuracyORM": 0.35833334028720853, "step": 385, "train_speed(iter/s)": 0.034132 }, { "clip_ratio": 0.0, "completion_length": 325.15, "epoch": 0.15757575757575756, "grad_norm": 2.1621073881433954, "kl": 0.001946258544921875, "learning_rate": 2e-07, "loss": 0.018825350701808928, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2000000074505806, "reward_std": 0.329024064540863, "rewards/MultiModalAccuracyORM": 0.2000000074505806, "step": 390, "train_speed(iter/s)": 0.034186 }, { "clip_ratio": 0.0, "completion_length": 393.7, "epoch": 0.1595959595959596, "grad_norm": 1.8573956206789706, "kl": 0.0013622283935546876, "learning_rate": 2e-07, "loss": 0.01834181547164917, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.33226497769355773, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 395, "train_speed(iter/s)": 0.034168 }, { "clip_ratio": 0.0, "completion_length": 319.05, "epoch": 0.16161616161616163, "grad_norm": 2.2110728171395646, "kl": 0.0019084930419921875, "learning_rate": 2e-07, "loss": 0.019550779461860658, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000521540642, "reward_std": 0.3008869707584381, "rewards/MultiModalAccuracyORM": 0.20000000521540642, "step": 400, "train_speed(iter/s)": 0.034255 }, { "clip_ratio": 0.0, "completion_length": 263.5, "epoch": 0.16363636363636364, "grad_norm": 2.2884019112467, "kl": 0.00233917236328125, "learning_rate": 2e-07, "loss": 0.00730045884847641, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30000000819563866, "reward_std": 0.32297652661800386, "rewards/MultiModalAccuracyORM": 0.30000000819563866, "step": 405, "train_speed(iter/s)": 0.034354 }, { "clip_ratio": 0.0, "completion_length": 366.95, "epoch": 0.16565656565656567, "grad_norm": 3.384921120442682, "kl": 0.001834869384765625, "learning_rate": 2e-07, "loss": 0.02867870032787323, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30833334401249884, "reward_std": 0.3604020655155182, "rewards/MultiModalAccuracyORM": 0.30833334401249884, "step": 410, "train_speed(iter/s)": 0.034303 }, { "clip_ratio": 0.0, "completion_length": 380.85, "epoch": 0.16767676767676767, "grad_norm": 2.578682884841481, "kl": 0.0019824981689453127, "learning_rate": 2e-07, "loss": 0.007520823180675507, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.24105713665485382, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 415, "train_speed(iter/s)": 0.034306 }, { "clip_ratio": 0.0, "completion_length": 223.9, "epoch": 0.1696969696969697, "grad_norm": 2.841135168153006, "kl": 0.003629302978515625, "learning_rate": 2e-07, "loss": 0.008403807878494263, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4166666746139526, "reward_std": 0.31846399009227755, "rewards/MultiModalAccuracyORM": 0.4166666746139526, "step": 420, "train_speed(iter/s)": 0.034394 }, { "clip_ratio": 0.0, "completion_length": 406.0, "epoch": 0.1717171717171717, "grad_norm": 1.3952154788825455, "kl": 0.0026947021484375, "learning_rate": 2e-07, "loss": 0.016321972012519836, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3750000104308128, "reward_std": 0.3541358977556229, "rewards/MultiModalAccuracyORM": 0.3750000104308128, "step": 425, "train_speed(iter/s)": 0.034427 }, { "clip_ratio": 0.0, "completion_length": 426.85, "epoch": 0.17373737373737375, "grad_norm": 2.642228792263709, "kl": 0.0035511016845703124, "learning_rate": 2e-07, "loss": 0.04757256805896759, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000059604645, "reward_std": 0.27122943103313446, "rewards/MultiModalAccuracyORM": 0.3000000059604645, "step": 430, "train_speed(iter/s)": 0.034492 }, { "clip_ratio": 0.0, "completion_length": 357.9, "epoch": 0.17575757575757575, "grad_norm": 2.3061110590781433, "kl": 0.0025909423828125, "learning_rate": 2e-07, "loss": -0.02955559492111206, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000819563863, "reward_std": 0.42218015491962435, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 435, "train_speed(iter/s)": 0.034539 }, { "clip_ratio": 0.0, "completion_length": 353.9, "epoch": 0.17777777777777778, "grad_norm": 0.03487250614691778, "kl": 0.00295562744140625, "learning_rate": 2e-07, "loss": 0.03084596395492554, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333432674407, "reward_std": 0.2657532900571823, "rewards/MultiModalAccuracyORM": 0.15833333432674407, "step": 440, "train_speed(iter/s)": 0.034613 }, { "clip_ratio": 0.0, "completion_length": 357.5, "epoch": 0.1797979797979798, "grad_norm": 1.8186333166660678, "kl": 0.0029296875, "learning_rate": 2e-07, "loss": -0.008677978813648225, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333358168602, "reward_std": 0.23004821836948394, "rewards/MultiModalAccuracyORM": 0.3083333358168602, "step": 445, "train_speed(iter/s)": 0.034594 }, { "clip_ratio": 0.0, "completion_length": 277.7, "epoch": 0.18181818181818182, "grad_norm": 1.5483724144717876, "kl": 0.003802490234375, "learning_rate": 2e-07, "loss": -0.010931169986724854, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667461395264, "reward_std": 0.36794900298118594, "rewards/MultiModalAccuracyORM": 0.21666667461395264, "step": 450, "train_speed(iter/s)": 0.034617 }, { "clip_ratio": 0.0, "completion_length": 442.1, "epoch": 0.18383838383838383, "grad_norm": 0.8802169915779423, "kl": 0.00302734375, "learning_rate": 2e-07, "loss": -0.04651644229888916, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15000000596046448, "reward_std": 0.2963056802749634, "rewards/MultiModalAccuracyORM": 0.15000000596046448, "step": 455, "train_speed(iter/s)": 0.034674 }, { "clip_ratio": 0.0, "completion_length": 329.5, "epoch": 0.18585858585858586, "grad_norm": 1.6049021687383316, "kl": 0.00660247802734375, "learning_rate": 2e-07, "loss": 0.008616116642951966, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.25741389989852903, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 460, "train_speed(iter/s)": 0.034754 }, { "clip_ratio": 0.0, "completion_length": 316.5, "epoch": 0.18787878787878787, "grad_norm": 2.893110887441056, "kl": 0.002629852294921875, "learning_rate": 2e-07, "loss": 0.0028022266924381256, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000001415610315, "reward_std": 0.4707459330558777, "rewards/MultiModalAccuracyORM": 0.40000001415610315, "step": 465, "train_speed(iter/s)": 0.034821 }, { "clip_ratio": 0.0, "completion_length": 279.5, "epoch": 0.1898989898989899, "grad_norm": 2.1102869760511584, "kl": 0.0035003662109375, "learning_rate": 2e-07, "loss": 0.0047733023762702945, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000670552254, "reward_std": 0.3082119345664978, "rewards/MultiModalAccuracyORM": 0.20000000670552254, "step": 470, "train_speed(iter/s)": 0.034862 }, { "clip_ratio": 0.0, "completion_length": 312.1, "epoch": 0.1919191919191919, "grad_norm": 2.403767582762209, "kl": 0.00347442626953125, "learning_rate": 2e-07, "loss": 0.0637534499168396, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.5083333484828472, "reward_std": 0.34557787179946897, "rewards/MultiModalAccuracyORM": 0.5083333484828472, "step": 475, "train_speed(iter/s)": 0.03495 }, { "clip_ratio": 0.0, "completion_length": 348.65, "epoch": 0.19393939393939394, "grad_norm": 0.6979791277265925, "kl": 0.00365142822265625, "learning_rate": 2e-07, "loss": -0.04180996119976044, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.26666667088866236, "reward_std": 0.32826719582080843, "rewards/MultiModalAccuracyORM": 0.26666667088866236, "step": 480, "train_speed(iter/s)": 0.034951 }, { "clip_ratio": 0.0, "completion_length": 269.85, "epoch": 0.19595959595959597, "grad_norm": 0.0525932465492366, "kl": 0.00377197265625, "learning_rate": 2e-07, "loss": -0.014869007468223571, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4000000052154064, "reward_std": 0.20967912971973418, "rewards/MultiModalAccuracyORM": 0.4000000052154064, "step": 485, "train_speed(iter/s)": 0.035021 }, { "clip_ratio": 0.0, "completion_length": 294.1, "epoch": 0.19797979797979798, "grad_norm": 1.6281647114218305, "kl": 0.004177093505859375, "learning_rate": 2e-07, "loss": 0.015925824642181396, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.3227578908205032, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 490, "train_speed(iter/s)": 0.035088 }, { "clip_ratio": 0.0, "completion_length": 329.75, "epoch": 0.2, "grad_norm": 1.984961473458151, "kl": 0.00326995849609375, "learning_rate": 2e-07, "loss": -0.0037449508905410766, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000819563863, "reward_std": 0.2855509877204895, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 495, "train_speed(iter/s)": 0.035113 }, { "epoch": 0.20202020202020202, "grad_norm": 0.6734714455829673, "learning_rate": 2e-07, "loss": -0.013085539638996124, "memory(GiB)": 87.45, "step": 500, "train_speed(iter/s)": 0.035182 }, { "epoch": 0.20202020202020202, "eval_clip_ratio": 0.0, "eval_completion_length": 363.1450085449219, "eval_kl": 0.003147125244140625, "eval_loss": 0.024374496191740036, "eval_response_clip_ratio": 0.003333333432674408, "eval_reward": 0.26666667237877845, "eval_reward_std": 0.28797652542591096, "eval_rewards/MultiModalAccuracyORM": 0.26666667237877845, "eval_runtime": 597.4581, "eval_samples_per_second": 0.084, "eval_steps_per_second": 0.008, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 345.4, "epoch": 0.20404040404040405, "grad_norm": 2.0097245676314053, "kl": 0.002962684631347656, "learning_rate": 2e-07, "loss": 0.008341678977012634, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22916666902601718, "reward_std": 0.28844616413116453, "rewards/MultiModalAccuracyORM": 0.22916666902601718, "step": 505, "train_speed(iter/s)": 0.033026 }, { "clip_ratio": 0.0, "completion_length": 478.15, "epoch": 0.20606060606060606, "grad_norm": 0.04671524557136776, "kl": 0.004395294189453125, "learning_rate": 2e-07, "loss": 0.019101715087890624, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.22704698145389557, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 510, "train_speed(iter/s)": 0.033029 }, { "clip_ratio": 0.0, "completion_length": 390.65, "epoch": 0.2080808080808081, "grad_norm": 1.7656462373703843, "kl": 0.003029632568359375, "learning_rate": 2e-07, "loss": 0.04230659604072571, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000521540642, "reward_std": 0.248858305811882, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 515, "train_speed(iter/s)": 0.032925 }, { "clip_ratio": 0.0, "completion_length": 313.8, "epoch": 0.2101010101010101, "grad_norm": 1.2593604182587, "kl": 0.0040802001953125, "learning_rate": 2e-07, "loss": -0.0020169973373413085, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4583333432674408, "reward_std": 0.4390155434608459, "rewards/MultiModalAccuracyORM": 0.4583333432674408, "step": 520, "train_speed(iter/s)": 0.032885 }, { "clip_ratio": 0.0, "completion_length": 410.7, "epoch": 0.21212121212121213, "grad_norm": 10.635733115288671, "kl": 0.006873321533203125, "learning_rate": 2e-07, "loss": 0.013639546930789948, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000447034837, "reward_std": 0.29108133912086487, "rewards/MultiModalAccuracyORM": 0.25000000447034837, "step": 525, "train_speed(iter/s)": 0.032731 }, { "clip_ratio": 0.0, "completion_length": 335.65, "epoch": 0.21414141414141413, "grad_norm": 2.2605304578434664, "kl": 0.00481109619140625, "learning_rate": 2e-07, "loss": 0.029361778497695924, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667610406877, "reward_std": 0.3948740750551224, "rewards/MultiModalAccuracyORM": 0.29166667610406877, "step": 530, "train_speed(iter/s)": 0.032671 }, { "clip_ratio": 0.0, "completion_length": 300.95, "epoch": 0.21616161616161617, "grad_norm": 3.233553935601456, "kl": 0.005239105224609375, "learning_rate": 2e-07, "loss": -0.02358839809894562, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.32500000670552254, "reward_std": 0.39305841624736787, "rewards/MultiModalAccuracyORM": 0.32500000670552254, "step": 535, "train_speed(iter/s)": 0.032679 }, { "clip_ratio": 0.0, "completion_length": 347.15, "epoch": 0.21818181818181817, "grad_norm": 1.4435208932830024, "kl": 0.0038543701171875, "learning_rate": 2e-07, "loss": 0.012015002965927123, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.14166666939854622, "reward_std": 0.2184889554977417, "rewards/MultiModalAccuracyORM": 0.14166666939854622, "step": 540, "train_speed(iter/s)": 0.032663 }, { "clip_ratio": 0.0, "completion_length": 280.7, "epoch": 0.2202020202020202, "grad_norm": 2.124111886424564, "kl": 0.00633544921875, "learning_rate": 2e-07, "loss": 0.016453295946121216, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.43333334773778914, "reward_std": 0.40082641541957853, "rewards/MultiModalAccuracyORM": 0.43333334773778914, "step": 545, "train_speed(iter/s)": 0.032702 }, { "clip_ratio": 0.0, "completion_length": 405.4, "epoch": 0.2222222222222222, "grad_norm": 2.528384017814939, "kl": 0.004555511474609375, "learning_rate": 2e-07, "loss": -0.013006833195686341, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2333333410322666, "reward_std": 0.3478317141532898, "rewards/MultiModalAccuracyORM": 0.2333333410322666, "step": 550, "train_speed(iter/s)": 0.032503 }, { "clip_ratio": 0.0, "completion_length": 363.4, "epoch": 0.22424242424242424, "grad_norm": 2.5915001907307977, "kl": 0.00524749755859375, "learning_rate": 2e-07, "loss": 0.02111098766326904, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2916666753590107, "reward_std": 0.3644451290369034, "rewards/MultiModalAccuracyORM": 0.2916666753590107, "step": 555, "train_speed(iter/s)": 0.032469 }, { "clip_ratio": 0.0, "completion_length": 419.65, "epoch": 0.22626262626262628, "grad_norm": 1.5712795723400375, "kl": 0.004864501953125, "learning_rate": 2e-07, "loss": 0.06747217178344726, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667386889457, "reward_std": 0.30639870166778566, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 560, "train_speed(iter/s)": 0.032374 }, { "clip_ratio": 0.0, "completion_length": 330.25, "epoch": 0.22828282828282828, "grad_norm": 2.1872516406963483, "kl": 0.0059844970703125, "learning_rate": 2e-07, "loss": -0.01907222718000412, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333333656191825, "reward_std": 0.27402731478214265, "rewards/MultiModalAccuracyORM": 0.18333333656191825, "step": 565, "train_speed(iter/s)": 0.03236 }, { "clip_ratio": 0.0, "completion_length": 252.8, "epoch": 0.23030303030303031, "grad_norm": 1.9388301349526922, "kl": 0.00828857421875, "learning_rate": 2e-07, "loss": 0.0710361123085022, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333333507180214, "reward_std": 0.26670235097408296, "rewards/MultiModalAccuracyORM": 0.28333333507180214, "step": 570, "train_speed(iter/s)": 0.032388 }, { "clip_ratio": 0.0, "completion_length": 475.4, "epoch": 0.23232323232323232, "grad_norm": 2.0643763651689424, "kl": 0.0043544769287109375, "learning_rate": 2e-07, "loss": 0.038624811172485354, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667386889455, "reward_std": 0.38400964736938475, "rewards/MultiModalAccuracyORM": 0.29166667386889455, "step": 575, "train_speed(iter/s)": 0.032305 }, { "clip_ratio": 0.0, "completion_length": 366.45, "epoch": 0.23434343434343435, "grad_norm": 2.5185952971698566, "kl": 0.00495452880859375, "learning_rate": 2e-07, "loss": 0.02923307418823242, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667461395264, "reward_std": 0.35012357234954833, "rewards/MultiModalAccuracyORM": 0.36666667461395264, "step": 580, "train_speed(iter/s)": 0.032206 }, { "clip_ratio": 0.0, "completion_length": 419.9, "epoch": 0.23636363636363636, "grad_norm": 1.8128917450324007, "kl": 0.0055450439453125, "learning_rate": 2e-07, "loss": 0.013245610892772675, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333333879709244, "reward_std": 0.23710441291332246, "rewards/MultiModalAccuracyORM": 0.28333333879709244, "step": 585, "train_speed(iter/s)": 0.032276 }, { "clip_ratio": 0.0, "completion_length": 355.4, "epoch": 0.2383838383838384, "grad_norm": 4.329439973170006, "kl": 0.00757293701171875, "learning_rate": 2e-07, "loss": -0.0028860807418823243, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000819563867, "reward_std": 0.30661733746528624, "rewards/MultiModalAccuracyORM": 0.25000000819563867, "step": 590, "train_speed(iter/s)": 0.032341 }, { "clip_ratio": 0.0, "completion_length": 411.8, "epoch": 0.2404040404040404, "grad_norm": 1.8156019329792383, "kl": 0.005291748046875, "learning_rate": 2e-07, "loss": -0.004809608310461044, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4166666753590107, "reward_std": 0.40967183113098143, "rewards/MultiModalAccuracyORM": 0.4166666753590107, "step": 595, "train_speed(iter/s)": 0.032425 }, { "clip_ratio": 0.0, "completion_length": 518.65, "epoch": 0.24242424242424243, "grad_norm": 1.6812944635615767, "kl": 0.0045440673828125, "learning_rate": 2e-07, "loss": 0.0016623079776763917, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667312383653, "reward_std": 0.35134140253067014, "rewards/MultiModalAccuracyORM": 0.21666667312383653, "step": 600, "train_speed(iter/s)": 0.032411 }, { "clip_ratio": 0.0, "completion_length": 344.4, "epoch": 0.24444444444444444, "grad_norm": 2.089820121690527, "kl": 0.00710906982421875, "learning_rate": 2e-07, "loss": -0.028999322652816774, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666738688946, "reward_std": 0.33552044034004214, "rewards/MultiModalAccuracyORM": 0.2666666738688946, "step": 605, "train_speed(iter/s)": 0.032494 }, { "clip_ratio": 0.0, "completion_length": 246.75, "epoch": 0.24646464646464647, "grad_norm": 2.728310100204588, "kl": 0.00543060302734375, "learning_rate": 2e-07, "loss": 0.03924176394939423, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1916666716337204, "reward_std": 0.27078639566898344, "rewards/MultiModalAccuracyORM": 0.1916666716337204, "step": 610, "train_speed(iter/s)": 0.032569 }, { "clip_ratio": 0.0, "completion_length": 403.2, "epoch": 0.24848484848484848, "grad_norm": 1.3175052417192106, "kl": 0.00468902587890625, "learning_rate": 2e-07, "loss": 0.038245481252670285, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000067055225, "reward_std": 0.4048719048500061, "rewards/MultiModalAccuracyORM": 0.2500000067055225, "step": 615, "train_speed(iter/s)": 0.032501 }, { "clip_ratio": 0.0, "completion_length": 344.7, "epoch": 0.2505050505050505, "grad_norm": 1.9529912685373527, "kl": 0.00550537109375, "learning_rate": 2e-07, "loss": 0.011770330369472504, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666753590107, "reward_std": 0.2895964771509171, "rewards/MultiModalAccuracyORM": 0.3916666753590107, "step": 620, "train_speed(iter/s)": 0.032477 }, { "clip_ratio": 0.0, "completion_length": 382.75, "epoch": 0.25252525252525254, "grad_norm": 0.05113023046556139, "kl": 0.00566864013671875, "learning_rate": 2e-07, "loss": 0.01361861228942871, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2333333358168602, "reward_std": 0.275274920463562, "rewards/MultiModalAccuracyORM": 0.2333333358168602, "step": 625, "train_speed(iter/s)": 0.032463 }, { "clip_ratio": 0.0, "completion_length": 307.15, "epoch": 0.2545454545454545, "grad_norm": 2.556743977258531, "kl": 0.005108642578125, "learning_rate": 2e-07, "loss": 0.014950770139694213, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3333333425223827, "reward_std": 0.34713688492774963, "rewards/MultiModalAccuracyORM": 0.3333333425223827, "step": 630, "train_speed(iter/s)": 0.032484 }, { "clip_ratio": 0.0, "completion_length": 406.15, "epoch": 0.25656565656565655, "grad_norm": 2.2423462644187624, "kl": 0.004283905029296875, "learning_rate": 2e-07, "loss": 0.008650130033493042, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1833333395421505, "reward_std": 0.28752902448177337, "rewards/MultiModalAccuracyORM": 0.1833333395421505, "step": 635, "train_speed(iter/s)": 0.032416 }, { "clip_ratio": 0.0, "completion_length": 347.15, "epoch": 0.2585858585858586, "grad_norm": 2.7318256637713327, "kl": 0.0051483154296875, "learning_rate": 2e-07, "loss": 0.021026265621185303, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000111758709, "reward_std": 0.29385479390621183, "rewards/MultiModalAccuracyORM": 0.2500000111758709, "step": 640, "train_speed(iter/s)": 0.032398 }, { "clip_ratio": 0.0, "completion_length": 363.9, "epoch": 0.2606060606060606, "grad_norm": 0.04170508484645814, "kl": 0.00531463623046875, "learning_rate": 2e-07, "loss": -0.04355872869491577, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000819563867, "reward_std": 0.3089067697525024, "rewards/MultiModalAccuracyORM": 0.25000000819563867, "step": 645, "train_speed(iter/s)": 0.032375 }, { "clip_ratio": 0.0, "completion_length": 407.55, "epoch": 0.26262626262626265, "grad_norm": 1.2451580073322923, "kl": 0.003839111328125, "learning_rate": 2e-07, "loss": 0.00021180734038352966, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.26666667461395266, "reward_std": 0.2676923930644989, "rewards/MultiModalAccuracyORM": 0.26666667461395266, "step": 650, "train_speed(iter/s)": 0.032327 }, { "clip_ratio": 0.0, "completion_length": 432.75, "epoch": 0.26464646464646463, "grad_norm": 1.9808716749773743, "kl": 0.00391082763671875, "learning_rate": 2e-07, "loss": 0.026480630040168762, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000670552253, "reward_std": 0.2817953139543533, "rewards/MultiModalAccuracyORM": 0.22500000670552253, "step": 655, "train_speed(iter/s)": 0.032322 }, { "clip_ratio": 0.0, "completion_length": 463.85, "epoch": 0.26666666666666666, "grad_norm": 1.1399233339835215, "kl": 0.004100799560546875, "learning_rate": 2e-07, "loss": -0.02441052794456482, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1833333380520344, "reward_std": 0.25897533297538755, "rewards/MultiModalAccuracyORM": 0.1833333380520344, "step": 660, "train_speed(iter/s)": 0.032385 }, { "clip_ratio": 0.0, "completion_length": 254.3, "epoch": 0.2686868686868687, "grad_norm": 2.4222117834215964, "kl": 0.0057952880859375, "learning_rate": 2e-07, "loss": 0.01856023073196411, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2166666716337204, "reward_std": 0.3348231792449951, "rewards/MultiModalAccuracyORM": 0.2166666716337204, "step": 665, "train_speed(iter/s)": 0.032448 }, { "clip_ratio": 0.0, "completion_length": 360.55, "epoch": 0.27070707070707073, "grad_norm": 2.596880019981878, "kl": 0.0034820556640625, "learning_rate": 2e-07, "loss": -0.004870015382766724, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000001192092893, "reward_std": 0.3786772578954697, "rewards/MultiModalAccuracyORM": 0.40000001192092893, "step": 670, "train_speed(iter/s)": 0.032507 }, { "clip_ratio": 0.0, "completion_length": 547.2, "epoch": 0.2727272727272727, "grad_norm": 1.261892143617939, "kl": 0.003546142578125, "learning_rate": 2e-07, "loss": 0.018378911912441252, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.25365822613239286, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 675, "train_speed(iter/s)": 0.032509 }, { "clip_ratio": 0.0, "completion_length": 389.15, "epoch": 0.27474747474747474, "grad_norm": 1.5125590979703638, "kl": 0.00487823486328125, "learning_rate": 2e-07, "loss": -0.004463189840316772, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000596046446, "reward_std": 0.2488823115825653, "rewards/MultiModalAccuracyORM": 0.25000000596046446, "step": 680, "train_speed(iter/s)": 0.03256 }, { "clip_ratio": 0.0, "completion_length": 461.15, "epoch": 0.2767676767676768, "grad_norm": 0.0206379809755319, "kl": 0.00426177978515625, "learning_rate": 2e-07, "loss": 0.021875476837158202, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.11666666865348815, "reward_std": 0.26496326327323916, "rewards/MultiModalAccuracyORM": 0.11666666865348815, "step": 685, "train_speed(iter/s)": 0.032514 }, { "clip_ratio": 0.0, "completion_length": 333.5, "epoch": 0.2787878787878788, "grad_norm": 2.5475669372401737, "kl": 0.00420379638671875, "learning_rate": 2e-07, "loss": 0.004043090343475342, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15000000447034836, "reward_std": 0.30210480093955994, "rewards/MultiModalAccuracyORM": 0.15000000447034836, "step": 690, "train_speed(iter/s)": 0.032521 }, { "clip_ratio": 0.0, "completion_length": 416.7, "epoch": 0.2808080808080808, "grad_norm": 1.5500150159182102, "kl": 0.00518798828125, "learning_rate": 2e-07, "loss": -0.023865307867527007, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1833333343267441, "reward_std": 0.1683032989501953, "rewards/MultiModalAccuracyORM": 0.1833333343267441, "step": 695, "train_speed(iter/s)": 0.032416 }, { "clip_ratio": 0.0, "completion_length": 372.35, "epoch": 0.2828282828282828, "grad_norm": 1.9962407432487237, "kl": 0.005457305908203125, "learning_rate": 2e-07, "loss": -0.028327393531799316, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.2250000037252903, "reward_std": 0.2099333554506302, "rewards/MultiModalAccuracyORM": 0.2250000037252903, "step": 700, "train_speed(iter/s)": 0.032361 }, { "clip_ratio": 0.0, "completion_length": 313.4, "epoch": 0.28484848484848485, "grad_norm": 1.6074003724487615, "kl": 0.00528717041015625, "learning_rate": 2e-07, "loss": 0.014926820993423462, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333410322666, "reward_std": 0.27223809361457824, "rewards/MultiModalAccuracyORM": 0.3083333410322666, "step": 705, "train_speed(iter/s)": 0.032343 }, { "clip_ratio": 0.0, "completion_length": 286.5, "epoch": 0.2868686868686869, "grad_norm": 1.6995014935336248, "kl": 0.0051483154296875, "learning_rate": 2e-07, "loss": -0.019916635751724244, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.24885829985141755, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 710, "train_speed(iter/s)": 0.032338 }, { "clip_ratio": 0.0, "completion_length": 414.3, "epoch": 0.28888888888888886, "grad_norm": 2.5308810289000134, "kl": 0.00496978759765625, "learning_rate": 2e-07, "loss": 0.01712719202041626, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000000447034834, "reward_std": 0.3906455457210541, "rewards/MultiModalAccuracyORM": 0.40000000447034834, "step": 715, "train_speed(iter/s)": 0.03227 }, { "clip_ratio": 0.0, "completion_length": 464.2, "epoch": 0.2909090909090909, "grad_norm": 3.1179537828506865, "kl": 0.00511016845703125, "learning_rate": 2e-07, "loss": -0.0032517150044441222, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500001043081283, "reward_std": 0.3038526177406311, "rewards/MultiModalAccuracyORM": 0.22500001043081283, "step": 720, "train_speed(iter/s)": 0.032189 }, { "clip_ratio": 0.0, "completion_length": 339.9, "epoch": 0.29292929292929293, "grad_norm": 1.3264200657485663, "kl": 0.0060546875, "learning_rate": 2e-07, "loss": 0.005654716491699218, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2250000059604645, "reward_std": 0.2988493382930756, "rewards/MultiModalAccuracyORM": 0.2250000059604645, "step": 725, "train_speed(iter/s)": 0.032186 }, { "clip_ratio": 0.0, "completion_length": 471.65, "epoch": 0.29494949494949496, "grad_norm": 0.5240042260688945, "kl": 0.005621719360351563, "learning_rate": 2e-07, "loss": 0.010572614520788193, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1750000037252903, "reward_std": 0.2945852130651474, "rewards/MultiModalAccuracyORM": 0.1750000037252903, "step": 730, "train_speed(iter/s)": 0.032129 }, { "clip_ratio": 0.0, "completion_length": 445.2, "epoch": 0.296969696969697, "grad_norm": 2.049661779713074, "kl": 0.00519866943359375, "learning_rate": 2e-07, "loss": 0.022058649361133574, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333410322666, "reward_std": 0.38726511001586916, "rewards/MultiModalAccuracyORM": 0.2833333410322666, "step": 735, "train_speed(iter/s)": 0.032089 }, { "clip_ratio": 0.0, "completion_length": 391.25, "epoch": 0.298989898989899, "grad_norm": 0.962602559613357, "kl": 0.0046844482421875, "learning_rate": 2e-07, "loss": -0.0028517723083496095, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.23328913748264313, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 740, "train_speed(iter/s)": 0.032072 }, { "clip_ratio": 0.0, "completion_length": 364.5, "epoch": 0.301010101010101, "grad_norm": 2.0529334461639337, "kl": 0.00500030517578125, "learning_rate": 2e-07, "loss": 0.0314439594745636, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333334177732468, "reward_std": 0.3212204694747925, "rewards/MultiModalAccuracyORM": 0.23333334177732468, "step": 745, "train_speed(iter/s)": 0.032037 }, { "epoch": 0.30303030303030304, "grad_norm": 1.3580773911974338, "learning_rate": 2e-07, "loss": -0.007335931062698364, "memory(GiB)": 87.45, "step": 750, "train_speed(iter/s)": 0.032014 }, { "epoch": 0.30303030303030304, "eval_clip_ratio": 0.0, "eval_completion_length": 352.49667709350587, "eval_kl": 0.00640625, "eval_loss": 0.002320815809071064, "eval_response_clip_ratio": 0.0, "eval_reward": 0.2716666729748249, "eval_reward_std": 0.33371097803115846, "eval_rewards/MultiModalAccuracyORM": 0.2716666729748249, "eval_runtime": 876.1057, "eval_samples_per_second": 0.057, "eval_steps_per_second": 0.006, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 392.55, "epoch": 0.30505050505050507, "grad_norm": 2.1426610619194815, "kl": 0.00631256103515625, "learning_rate": 2e-07, "loss": -0.040098315477371214, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.13333333656191826, "reward_std": 0.22312387079000473, "rewards/MultiModalAccuracyORM": 0.13333333656191826, "step": 755, "train_speed(iter/s)": 0.029206 }, { "clip_ratio": 0.0, "completion_length": 468.4, "epoch": 0.30707070707070705, "grad_norm": 0.8717248302301553, "kl": 0.00636749267578125, "learning_rate": 2e-07, "loss": 0.015009742975234986, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333358168602, "reward_std": 0.2940850019454956, "rewards/MultiModalAccuracyORM": 0.2833333358168602, "step": 760, "train_speed(iter/s)": 0.029162 }, { "clip_ratio": 0.0, "completion_length": 307.8, "epoch": 0.3090909090909091, "grad_norm": 2.4403464428155925, "kl": 0.0062957763671875, "learning_rate": 2e-07, "loss": 0.019652032852172853, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.45000001043081284, "reward_std": 0.3222792655229568, "rewards/MultiModalAccuracyORM": 0.45000001043081284, "step": 765, "train_speed(iter/s)": 0.029211 }, { "clip_ratio": 0.0, "completion_length": 448.35, "epoch": 0.3111111111111111, "grad_norm": 1.6980769345505524, "kl": 0.0074066162109375, "learning_rate": 2e-07, "loss": 0.018609333038330077, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667833924295, "reward_std": 0.4026396483182907, "rewards/MultiModalAccuracyORM": 0.31666667833924295, "step": 770, "train_speed(iter/s)": 0.029164 }, { "clip_ratio": 0.0, "completion_length": 406.35, "epoch": 0.31313131313131315, "grad_norm": 1.4345330108808567, "kl": 0.00540924072265625, "learning_rate": 2e-07, "loss": 0.034766983985900876, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.26666667088866236, "reward_std": 0.3167103588581085, "rewards/MultiModalAccuracyORM": 0.26666667088866236, "step": 775, "train_speed(iter/s)": 0.029127 }, { "clip_ratio": 0.0, "completion_length": 441.5, "epoch": 0.3151515151515151, "grad_norm": 1.0920815430357467, "kl": 0.0054931640625, "learning_rate": 2e-07, "loss": -7.512569427490235e-05, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.10833333656191826, "reward_std": 0.22400068640708923, "rewards/MultiModalAccuracyORM": 0.10833333656191826, "step": 780, "train_speed(iter/s)": 0.029106 }, { "clip_ratio": 0.0, "completion_length": 435.7, "epoch": 0.31717171717171716, "grad_norm": 1.3732918705207908, "kl": 0.00477752685546875, "learning_rate": 2e-07, "loss": 0.015651023387908934, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2416666679084301, "reward_std": 0.23479096889495848, "rewards/MultiModalAccuracyORM": 0.2416666679084301, "step": 785, "train_speed(iter/s)": 0.029052 }, { "clip_ratio": 0.0, "completion_length": 453.35, "epoch": 0.3191919191919192, "grad_norm": 2.1057593122144005, "kl": 0.00804595947265625, "learning_rate": 2e-07, "loss": -0.0006304442882537842, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.3619014710187912, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 790, "train_speed(iter/s)": 0.029057 }, { "clip_ratio": 0.0, "completion_length": 408.2, "epoch": 0.3212121212121212, "grad_norm": 2.3354800713445654, "kl": 0.0078216552734375, "learning_rate": 2e-07, "loss": 0.0310418963432312, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667759418486, "reward_std": 0.40556674003601073, "rewards/MultiModalAccuracyORM": 0.36666667759418486, "step": 795, "train_speed(iter/s)": 0.029005 }, { "clip_ratio": 0.0, "completion_length": 409.45, "epoch": 0.32323232323232326, "grad_norm": 2.4825567652901444, "kl": 0.0077880859375, "learning_rate": 2e-07, "loss": 0.021943604946136473, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3666666761040688, "reward_std": 0.3470627337694168, "rewards/MultiModalAccuracyORM": 0.3666666761040688, "step": 800, "train_speed(iter/s)": 0.028978 }, { "clip_ratio": 0.0, "completion_length": 324.9, "epoch": 0.32525252525252524, "grad_norm": 3.589672291824819, "kl": 0.00778350830078125, "learning_rate": 2e-07, "loss": 0.008873769640922546, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.44790194034576414, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 805, "train_speed(iter/s)": 0.028982 }, { "clip_ratio": 0.0, "completion_length": 287.2, "epoch": 0.32727272727272727, "grad_norm": 2.1262920297539925, "kl": 0.0073883056640625, "learning_rate": 2e-07, "loss": -0.04254024624824524, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333447575569, "reward_std": 0.3589002341032028, "rewards/MultiModalAccuracyORM": 0.2833333447575569, "step": 810, "train_speed(iter/s)": 0.029003 }, { "clip_ratio": 0.0, "completion_length": 323.9, "epoch": 0.3292929292929293, "grad_norm": 2.6338345195445965, "kl": 0.0073974609375, "learning_rate": 2e-07, "loss": 0.008789122104644775, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.35833333879709245, "reward_std": 0.3563657283782959, "rewards/MultiModalAccuracyORM": 0.35833333879709245, "step": 815, "train_speed(iter/s)": 0.028993 }, { "clip_ratio": 0.0, "completion_length": 287.85, "epoch": 0.33131313131313134, "grad_norm": 2.540831778543349, "kl": 0.0085357666015625, "learning_rate": 2e-07, "loss": 0.0017573148012161254, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.33755565285682676, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 820, "train_speed(iter/s)": 0.02904 }, { "clip_ratio": 0.0, "completion_length": 363.3, "epoch": 0.3333333333333333, "grad_norm": 2.280326105508933, "kl": 0.011834716796875, "learning_rate": 2e-07, "loss": -0.016002975404262543, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000596046447, "reward_std": 0.24662604331970214, "rewards/MultiModalAccuracyORM": 0.17500000596046447, "step": 825, "train_speed(iter/s)": 0.02907 }, { "clip_ratio": 0.0, "completion_length": 334.05, "epoch": 0.33535353535353535, "grad_norm": 1.64256260222623, "kl": 0.00889892578125, "learning_rate": 2e-07, "loss": -0.008859094977378846, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000149011612, "reward_std": 0.3164917230606079, "rewards/MultiModalAccuracyORM": 0.20000000149011612, "step": 830, "train_speed(iter/s)": 0.029109 }, { "clip_ratio": 0.0, "completion_length": 347.15, "epoch": 0.3373737373737374, "grad_norm": 0.09646041600368084, "kl": 0.0067840576171875, "learning_rate": 2e-07, "loss": 0.02341327965259552, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2750000096857548, "reward_std": 0.3184880018234253, "rewards/MultiModalAccuracyORM": 0.2750000096857548, "step": 835, "train_speed(iter/s)": 0.028908 }, { "clip_ratio": 0.0, "completion_length": 434.35, "epoch": 0.3393939393939394, "grad_norm": 0.886588445568382, "kl": 0.0066741943359375, "learning_rate": 2e-07, "loss": 0.011455638706684113, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.34166667312383653, "reward_std": 0.3142238825559616, "rewards/MultiModalAccuracyORM": 0.34166667312383653, "step": 840, "train_speed(iter/s)": 0.02878 }, { "clip_ratio": 0.0, "completion_length": 448.75, "epoch": 0.3414141414141414, "grad_norm": 0.0732846157739433, "kl": 0.00753173828125, "learning_rate": 2e-07, "loss": 0.010994693636894226, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20833333507180213, "reward_std": 0.19786564111709595, "rewards/MultiModalAccuracyORM": 0.20833333507180213, "step": 845, "train_speed(iter/s)": 0.028759 }, { "clip_ratio": 0.0, "completion_length": 325.0, "epoch": 0.3434343434343434, "grad_norm": 2.016101823545884, "kl": 0.00940399169921875, "learning_rate": 2e-07, "loss": 0.0015551522374153137, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667386889457, "reward_std": 0.37221312820911406, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 850, "train_speed(iter/s)": 0.028759 }, { "clip_ratio": 0.0, "completion_length": 367.75, "epoch": 0.34545454545454546, "grad_norm": 1.4804689213107514, "kl": 0.0074249267578125, "learning_rate": 2e-07, "loss": 0.008444362878799438, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.37500000521540644, "reward_std": 0.33937130570411683, "rewards/MultiModalAccuracyORM": 0.37500000521540644, "step": 855, "train_speed(iter/s)": 0.02876 }, { "clip_ratio": 0.0, "completion_length": 317.1, "epoch": 0.3474747474747475, "grad_norm": 2.368905519842238, "kl": 0.008038330078125, "learning_rate": 2e-07, "loss": 0.026756054162979125, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000111758709, "reward_std": 0.44455128610134126, "rewards/MultiModalAccuracyORM": 0.3500000111758709, "step": 860, "train_speed(iter/s)": 0.028787 }, { "clip_ratio": 0.0, "completion_length": 276.85, "epoch": 0.34949494949494947, "grad_norm": 2.3043935598394203, "kl": 0.0070343017578125, "learning_rate": 2e-07, "loss": 0.059600555896759035, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333432674408, "reward_std": 0.3885723173618317, "rewards/MultiModalAccuracyORM": 0.2833333432674408, "step": 865, "train_speed(iter/s)": 0.028814 }, { "clip_ratio": 0.0, "completion_length": 417.95, "epoch": 0.3515151515151515, "grad_norm": 1.9471040249213727, "kl": 0.0069305419921875, "learning_rate": 2e-07, "loss": 0.028457581996917725, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000596046447, "reward_std": 0.2940494120121002, "rewards/MultiModalAccuracyORM": 0.17500000596046447, "step": 870, "train_speed(iter/s)": 0.028708 }, { "clip_ratio": 0.0, "completion_length": 363.5, "epoch": 0.35353535353535354, "grad_norm": 2.196604109706096, "kl": 0.0058319091796875, "learning_rate": 2e-07, "loss": 0.04532061517238617, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666775941849, "reward_std": 0.42524099349975586, "rewards/MultiModalAccuracyORM": 0.3916666775941849, "step": 875, "train_speed(iter/s)": 0.028627 }, { "clip_ratio": 0.0, "completion_length": 349.05, "epoch": 0.35555555555555557, "grad_norm": 1.9101064459839039, "kl": 0.0102691650390625, "learning_rate": 2e-07, "loss": 0.04224415421485901, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30000000521540643, "reward_std": 0.3391170799732208, "rewards/MultiModalAccuracyORM": 0.30000000521540643, "step": 880, "train_speed(iter/s)": 0.028551 }, { "clip_ratio": 0.0, "completion_length": 317.75, "epoch": 0.3575757575757576, "grad_norm": 1.7650856984522036, "kl": 0.0097930908203125, "learning_rate": 2e-07, "loss": 0.031351178884506226, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3333333425223827, "reward_std": 0.27555315792560575, "rewards/MultiModalAccuracyORM": 0.3333333425223827, "step": 885, "train_speed(iter/s)": 0.028585 }, { "clip_ratio": 0.0, "completion_length": 287.05, "epoch": 0.3595959595959596, "grad_norm": 2.4394117877960615, "kl": 0.0123748779296875, "learning_rate": 2e-07, "loss": 0.01872892677783966, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000521540642, "reward_std": 0.30489687621593475, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 890, "train_speed(iter/s)": 0.028637 }, { "clip_ratio": 0.0, "completion_length": 476.05, "epoch": 0.3616161616161616, "grad_norm": 2.3682785721081854, "kl": 0.00737762451171875, "learning_rate": 2e-07, "loss": 0.02124558687210083, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000089406967, "reward_std": 0.41817026138305663, "rewards/MultiModalAccuracyORM": 0.3500000089406967, "step": 895, "train_speed(iter/s)": 0.028688 }, { "clip_ratio": 0.0, "completion_length": 429.85, "epoch": 0.36363636363636365, "grad_norm": 1.3234500775547358, "kl": 0.007550048828125, "learning_rate": 2e-07, "loss": 0.025475236773490905, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333333656191825, "reward_std": 0.2260383188724518, "rewards/MultiModalAccuracyORM": 0.18333333656191825, "step": 900, "train_speed(iter/s)": 0.028712 }, { "clip_ratio": 0.0, "completion_length": 439.75, "epoch": 0.3656565656565657, "grad_norm": 3.0802331121314785, "kl": 0.0105621337890625, "learning_rate": 2e-07, "loss": 0.06260026693344116, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166666865348815, "reward_std": 0.3003867596387863, "rewards/MultiModalAccuracyORM": 0.24166666865348815, "step": 905, "train_speed(iter/s)": 0.028645 }, { "clip_ratio": 0.0, "completion_length": 287.9, "epoch": 0.36767676767676766, "grad_norm": 3.596137864021678, "kl": 0.01011199951171875, "learning_rate": 2e-07, "loss": 0.007353886961936951, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4833333432674408, "reward_std": 0.38523324131965636, "rewards/MultiModalAccuracyORM": 0.4833333432674408, "step": 910, "train_speed(iter/s)": 0.028662 }, { "clip_ratio": 0.0, "completion_length": 296.65, "epoch": 0.3696969696969697, "grad_norm": 1.4417889638729746, "kl": 0.01177978515625, "learning_rate": 2e-07, "loss": -0.006625932455062866, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3416666783392429, "reward_std": 0.37195890843868257, "rewards/MultiModalAccuracyORM": 0.3416666783392429, "step": 915, "train_speed(iter/s)": 0.028677 }, { "clip_ratio": 0.0, "completion_length": 431.05, "epoch": 0.3717171717171717, "grad_norm": 2.8875811253312333, "kl": 0.01148529052734375, "learning_rate": 2e-07, "loss": -3.943443298339844e-05, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667237877847, "reward_std": 0.3604352355003357, "rewards/MultiModalAccuracyORM": 0.41666667237877847, "step": 920, "train_speed(iter/s)": 0.028643 }, { "clip_ratio": 0.0, "completion_length": 330.3, "epoch": 0.37373737373737376, "grad_norm": 1.8636332228250176, "kl": 0.0091461181640625, "learning_rate": 2e-07, "loss": 0.004881632328033447, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.3440760403871536, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 925, "train_speed(iter/s)": 0.028644 }, { "clip_ratio": 0.0, "completion_length": 357.4, "epoch": 0.37575757575757573, "grad_norm": 2.1407505535783242, "kl": 0.00869598388671875, "learning_rate": 2e-07, "loss": 0.05731675624847412, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833334103226663, "reward_std": 0.41186849772930145, "rewards/MultiModalAccuracyORM": 0.25833334103226663, "step": 930, "train_speed(iter/s)": 0.028644 }, { "clip_ratio": 0.0, "completion_length": 322.9, "epoch": 0.37777777777777777, "grad_norm": 3.79021329286614, "kl": 0.009942626953125, "learning_rate": 2e-07, "loss": 0.0477484941482544, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666716337204, "reward_std": 0.2674737572669983, "rewards/MultiModalAccuracyORM": 0.3166666716337204, "step": 935, "train_speed(iter/s)": 0.028648 }, { "clip_ratio": 0.0, "completion_length": 283.45, "epoch": 0.3797979797979798, "grad_norm": 2.2451102482111724, "kl": 0.012542724609375, "learning_rate": 2e-07, "loss": -1.335442066192627e-05, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000052154064, "reward_std": 0.25591449439525604, "rewards/MultiModalAccuracyORM": 0.3500000052154064, "step": 940, "train_speed(iter/s)": 0.028624 }, { "clip_ratio": 0.0, "completion_length": 346.15, "epoch": 0.38181818181818183, "grad_norm": 1.4018775780145751, "kl": 0.0094390869140625, "learning_rate": 2e-07, "loss": -0.003527042269706726, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.31088480055332185, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 945, "train_speed(iter/s)": 0.028624 }, { "clip_ratio": 0.0, "completion_length": 310.7, "epoch": 0.3838383838383838, "grad_norm": 3.8112599620979117, "kl": 0.01011962890625, "learning_rate": 2e-07, "loss": 0.01941452920436859, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.34166667610406876, "reward_std": 0.34228681921958926, "rewards/MultiModalAccuracyORM": 0.34166667610406876, "step": 950, "train_speed(iter/s)": 0.028616 }, { "clip_ratio": 0.0, "completion_length": 260.35, "epoch": 0.38585858585858585, "grad_norm": 1.8716114512263384, "kl": 0.0135040283203125, "learning_rate": 2e-07, "loss": 0.01583598256111145, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666701436043, "reward_std": 0.26291108727455137, "rewards/MultiModalAccuracyORM": 0.3916666701436043, "step": 955, "train_speed(iter/s)": 0.028656 }, { "clip_ratio": 0.0, "completion_length": 310.0, "epoch": 0.3878787878787879, "grad_norm": 2.6882447296010508, "kl": 0.0098663330078125, "learning_rate": 2e-07, "loss": 0.008884111046791076, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333410322666, "reward_std": 0.3393357157707214, "rewards/MultiModalAccuracyORM": 0.2833333410322666, "step": 960, "train_speed(iter/s)": 0.02864 }, { "clip_ratio": 0.0, "completion_length": 288.3, "epoch": 0.3898989898989899, "grad_norm": 2.477942143166408, "kl": 0.013421630859375, "learning_rate": 2e-07, "loss": 0.013846510648727417, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.39166667088866236, "reward_std": 0.3041278898715973, "rewards/MultiModalAccuracyORM": 0.39166667088866236, "step": 965, "train_speed(iter/s)": 0.028659 }, { "clip_ratio": 0.0, "completion_length": 288.35, "epoch": 0.39191919191919194, "grad_norm": 1.7487986972843892, "kl": 0.008868408203125, "learning_rate": 2e-07, "loss": 0.041995507478713986, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3250000037252903, "reward_std": 0.3277524411678314, "rewards/MultiModalAccuracyORM": 0.3250000037252903, "step": 970, "train_speed(iter/s)": 0.028675 }, { "clip_ratio": 0.0, "completion_length": 326.9, "epoch": 0.3939393939393939, "grad_norm": 1.040945452450775, "kl": 0.00943603515625, "learning_rate": 2e-07, "loss": 0.004313239455223083, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333333656191825, "reward_std": 0.2722140818834305, "rewards/MultiModalAccuracyORM": 0.18333333656191825, "step": 975, "train_speed(iter/s)": 0.028693 }, { "clip_ratio": 0.0, "completion_length": 298.5, "epoch": 0.39595959595959596, "grad_norm": 1.987178230745996, "kl": 0.0092681884765625, "learning_rate": 2e-07, "loss": 0.01756379157304764, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1833333358168602, "reward_std": 0.3274982154369354, "rewards/MultiModalAccuracyORM": 0.1833333358168602, "step": 980, "train_speed(iter/s)": 0.028714 }, { "clip_ratio": 0.0, "completion_length": 297.4, "epoch": 0.397979797979798, "grad_norm": 1.9999919818314047, "kl": 0.012908935546875, "learning_rate": 2e-07, "loss": 0.04084535539150238, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.32500001341104506, "reward_std": 0.2752989321947098, "rewards/MultiModalAccuracyORM": 0.32500001341104506, "step": 985, "train_speed(iter/s)": 0.028744 }, { "clip_ratio": 0.0, "completion_length": 506.15, "epoch": 0.4, "grad_norm": 0.038170370656060805, "kl": 0.010888671875, "learning_rate": 2e-07, "loss": 0.07128549218177796, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000819563867, "reward_std": 0.30416645109653473, "rewards/MultiModalAccuracyORM": 0.25000000819563867, "step": 990, "train_speed(iter/s)": 0.028708 }, { "clip_ratio": 0.0, "completion_length": 433.45, "epoch": 0.402020202020202, "grad_norm": 2.632502419980814, "kl": 0.0100616455078125, "learning_rate": 2e-07, "loss": 0.016613197326660157, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.39166667610406875, "reward_std": 0.37174026668071747, "rewards/MultiModalAccuracyORM": 0.39166667610406875, "step": 995, "train_speed(iter/s)": 0.028687 }, { "epoch": 0.40404040404040403, "grad_norm": 0.07099216395354724, "learning_rate": 2e-07, "loss": 0.02232474982738495, "memory(GiB)": 87.45, "step": 1000, "train_speed(iter/s)": 0.028672 }, { "epoch": 0.40404040404040403, "eval_clip_ratio": 0.0, "eval_completion_length": 346.9533413696289, "eval_kl": 0.013145751953125, "eval_loss": -0.00028896695584990084, "eval_response_clip_ratio": 0.0, "eval_reward": 0.281666671782732, "eval_reward_std": 0.3010890519618988, "eval_rewards/MultiModalAccuracyORM": 0.281666671782732, "eval_runtime": 1406.863, "eval_samples_per_second": 0.036, "eval_steps_per_second": 0.004, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 415.175, "epoch": 0.40606060606060607, "grad_norm": 1.905945440484278, "kl": 0.009429931640625, "learning_rate": 2e-07, "loss": -0.0033631980419158935, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2708333387970924, "reward_std": 0.24963780641555786, "rewards/MultiModalAccuracyORM": 0.2708333387970924, "step": 1005, "train_speed(iter/s)": 0.027262 }, { "clip_ratio": 0.0, "completion_length": 341.5, "epoch": 0.4080808080808081, "grad_norm": 1.6755020591769207, "kl": 0.0134246826171875, "learning_rate": 2e-07, "loss": 0.05349223613739014, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000037252903, "reward_std": 0.3494287371635437, "rewards/MultiModalAccuracyORM": 0.2500000037252903, "step": 1010, "train_speed(iter/s)": 0.027279 }, { "clip_ratio": 0.0, "completion_length": 351.25, "epoch": 0.4101010101010101, "grad_norm": 2.8913380726136872, "kl": 0.0107147216796875, "learning_rate": 2e-07, "loss": -0.02667723298072815, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.38333334103226663, "reward_std": 0.4211569488048553, "rewards/MultiModalAccuracyORM": 0.38333334103226663, "step": 1015, "train_speed(iter/s)": 0.027304 }, { "clip_ratio": 0.0, "completion_length": 339.65, "epoch": 0.4121212121212121, "grad_norm": 4.180952848080379, "kl": 0.0100433349609375, "learning_rate": 2e-07, "loss": 0.00991852581501007, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2083333447575569, "reward_std": 0.3088736057281494, "rewards/MultiModalAccuracyORM": 0.2083333447575569, "step": 1020, "train_speed(iter/s)": 0.027315 }, { "clip_ratio": 0.0, "completion_length": 367.55, "epoch": 0.41414141414141414, "grad_norm": 1.9667254904423306, "kl": 0.0121246337890625, "learning_rate": 2e-07, "loss": 0.01899299621582031, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333507180214, "reward_std": 0.3071291267871857, "rewards/MultiModalAccuracyORM": 0.15833333507180214, "step": 1025, "train_speed(iter/s)": 0.027329 }, { "clip_ratio": 0.0, "completion_length": 435.4, "epoch": 0.4161616161616162, "grad_norm": 1.7062594547415575, "kl": 0.0100616455078125, "learning_rate": 2e-07, "loss": 0.004674983024597168, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500001043081283, "reward_std": 0.3679134130477905, "rewards/MultiModalAccuracyORM": 0.22500001043081283, "step": 1030, "train_speed(iter/s)": 0.027298 }, { "clip_ratio": 0.0, "completion_length": 350.0, "epoch": 0.41818181818181815, "grad_norm": 72.23734764401382, "kl": 0.011712646484375, "learning_rate": 2e-07, "loss": 0.05118045210838318, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666775941849, "reward_std": 0.34735551476478577, "rewards/MultiModalAccuracyORM": 0.2666666775941849, "step": 1035, "train_speed(iter/s)": 0.027303 }, { "clip_ratio": 0.0, "completion_length": 311.15, "epoch": 0.4202020202020202, "grad_norm": 1.6715902563969363, "kl": 0.0135772705078125, "learning_rate": 2e-07, "loss": 0.045872822403907776, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2416666716337204, "reward_std": 0.287842845916748, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 1040, "train_speed(iter/s)": 0.027298 }, { "clip_ratio": 0.0, "completion_length": 353.15, "epoch": 0.4222222222222222, "grad_norm": 2.734745023688755, "kl": 0.012158203125, "learning_rate": 2e-07, "loss": 0.05562522411346436, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667237877844, "reward_std": 0.4314686059951782, "rewards/MultiModalAccuracyORM": 0.31666667237877844, "step": 1045, "train_speed(iter/s)": 0.027328 }, { "clip_ratio": 0.0, "completion_length": 359.3, "epoch": 0.42424242424242425, "grad_norm": 0.07598134741536419, "kl": 0.009765625, "learning_rate": 2e-07, "loss": 0.008748695254325867, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.18326250910758973, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 1050, "train_speed(iter/s)": 0.027308 }, { "clip_ratio": 0.0, "completion_length": 392.2, "epoch": 0.4262626262626263, "grad_norm": 9.627726509942965, "kl": 0.0136199951171875, "learning_rate": 2e-07, "loss": 0.03634963035583496, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666768491268, "reward_std": 0.36670139729976653, "rewards/MultiModalAccuracyORM": 0.3166666768491268, "step": 1055, "train_speed(iter/s)": 0.027311 }, { "clip_ratio": 0.0, "completion_length": 289.7, "epoch": 0.42828282828282827, "grad_norm": 1.2371668114044378, "kl": 0.0134979248046875, "learning_rate": 2e-07, "loss": 0.04366698265075684, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2083333373069763, "reward_std": 0.3498693466186523, "rewards/MultiModalAccuracyORM": 0.2083333373069763, "step": 1060, "train_speed(iter/s)": 0.027334 }, { "clip_ratio": 0.0, "completion_length": 321.05, "epoch": 0.4303030303030303, "grad_norm": 2.52858518092475, "kl": 0.0135711669921875, "learning_rate": 2e-07, "loss": 0.065219247341156, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000111758709, "reward_std": 0.37853889763355253, "rewards/MultiModalAccuracyORM": 0.3000000111758709, "step": 1065, "train_speed(iter/s)": 0.027352 }, { "clip_ratio": 0.0, "completion_length": 287.5, "epoch": 0.43232323232323233, "grad_norm": 2.3424705728855995, "kl": 0.0116546630859375, "learning_rate": 2e-07, "loss": 0.03819225430488586, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.3227578908205032, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 1070, "train_speed(iter/s)": 0.027305 }, { "clip_ratio": 0.0, "completion_length": 345.55, "epoch": 0.43434343434343436, "grad_norm": 2.798437729299758, "kl": 0.014569091796875, "learning_rate": 2e-07, "loss": 0.004848736524581909, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000819563863, "reward_std": 0.31416428089141846, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 1075, "train_speed(iter/s)": 0.027334 }, { "clip_ratio": 0.0, "completion_length": 300.8, "epoch": 0.43636363636363634, "grad_norm": 1.7741031757506147, "kl": 0.0157135009765625, "learning_rate": 2e-07, "loss": 0.00888105109333992, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000447034834, "reward_std": 0.312698033452034, "rewards/MultiModalAccuracyORM": 0.27500000447034834, "step": 1080, "train_speed(iter/s)": 0.027339 }, { "clip_ratio": 0.0, "completion_length": 308.6, "epoch": 0.4383838383838384, "grad_norm": 2.06880703867489, "kl": 0.0158050537109375, "learning_rate": 2e-07, "loss": -0.05194641947746277, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.22603832483291625, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 1085, "train_speed(iter/s)": 0.027329 }, { "clip_ratio": 0.0, "completion_length": 223.4, "epoch": 0.4404040404040404, "grad_norm": 2.4630209071132656, "kl": 0.015411376953125, "learning_rate": 2e-07, "loss": -0.018011474609375, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20833333507180213, "reward_std": 0.3071291267871857, "rewards/MultiModalAccuracyORM": 0.20833333507180213, "step": 1090, "train_speed(iter/s)": 0.027372 }, { "clip_ratio": 0.0, "completion_length": 264.2, "epoch": 0.44242424242424244, "grad_norm": 2.265643619288025, "kl": 0.01461181640625, "learning_rate": 2e-07, "loss": 0.04221695959568024, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666701436043, "reward_std": 0.3329358011484146, "rewards/MultiModalAccuracyORM": 0.2666666701436043, "step": 1095, "train_speed(iter/s)": 0.027407 }, { "clip_ratio": 0.0, "completion_length": 357.95, "epoch": 0.4444444444444444, "grad_norm": 2.894324596003934, "kl": 0.009808349609375, "learning_rate": 2e-07, "loss": 0.02248055934906006, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4166666828095913, "reward_std": 0.44607712924480436, "rewards/MultiModalAccuracyORM": 0.4166666828095913, "step": 1100, "train_speed(iter/s)": 0.027442 }, { "clip_ratio": 0.0, "completion_length": 247.95, "epoch": 0.44646464646464645, "grad_norm": 0.9507289625656876, "kl": 0.0140777587890625, "learning_rate": 2e-07, "loss": -0.0001364484429359436, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.46666667237877846, "reward_std": 0.24261614382267, "rewards/MultiModalAccuracyORM": 0.46666667237877846, "step": 1105, "train_speed(iter/s)": 0.027471 }, { "clip_ratio": 0.0, "completion_length": 238.75, "epoch": 0.4484848484848485, "grad_norm": 4.493560880958603, "kl": 0.01422119140625, "learning_rate": 2e-07, "loss": 0.00024300813674926758, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4500000149011612, "reward_std": 0.345323646068573, "rewards/MultiModalAccuracyORM": 0.4500000149011612, "step": 1110, "train_speed(iter/s)": 0.027312 }, { "clip_ratio": 0.0, "completion_length": 368.15, "epoch": 0.4505050505050505, "grad_norm": 1.866809698039603, "kl": 0.0131317138671875, "learning_rate": 2e-07, "loss": -0.007444334030151367, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000298023224, "reward_std": 0.3281930506229401, "rewards/MultiModalAccuracyORM": 0.20000000298023224, "step": 1115, "train_speed(iter/s)": 0.027296 }, { "clip_ratio": 0.0, "completion_length": 386.45, "epoch": 0.45252525252525255, "grad_norm": 0.04083454065583723, "kl": 0.0086578369140625, "learning_rate": 2e-07, "loss": 0.009036242961883545, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.24710224866867064, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 1120, "train_speed(iter/s)": 0.027256 }, { "clip_ratio": 0.0, "completion_length": 299.75, "epoch": 0.45454545454545453, "grad_norm": 2.1257862237671588, "kl": 0.01603851318359375, "learning_rate": 2e-07, "loss": -0.014222325384616851, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4333333432674408, "reward_std": 0.4078585982322693, "rewards/MultiModalAccuracyORM": 0.4333333432674408, "step": 1125, "train_speed(iter/s)": 0.027299 }, { "clip_ratio": 0.0, "completion_length": 338.15, "epoch": 0.45656565656565656, "grad_norm": 48.10712707725128, "kl": 0.0124542236328125, "learning_rate": 2e-07, "loss": 0.009453803300857544, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30833333879709246, "reward_std": 0.32858102321624755, "rewards/MultiModalAccuracyORM": 0.30833333879709246, "step": 1130, "train_speed(iter/s)": 0.027339 }, { "clip_ratio": 0.0, "completion_length": 434.6, "epoch": 0.4585858585858586, "grad_norm": 0.8869001794016839, "kl": 0.01041259765625, "learning_rate": 2e-07, "loss": -0.002349555492401123, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.13333333730697633, "reward_std": 0.29003951847553255, "rewards/MultiModalAccuracyORM": 0.13333333730697633, "step": 1135, "train_speed(iter/s)": 0.027364 }, { "clip_ratio": 0.0, "completion_length": 287.0, "epoch": 0.46060606060606063, "grad_norm": 2.2315283680448346, "kl": 0.0132476806640625, "learning_rate": 2e-07, "loss": -0.010060985386371613, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000745058059, "reward_std": 0.3043610692024231, "rewards/MultiModalAccuracyORM": 0.17500000745058059, "step": 1140, "train_speed(iter/s)": 0.027393 }, { "clip_ratio": 0.0, "completion_length": 449.2, "epoch": 0.4626262626262626, "grad_norm": 0.04850876090724914, "kl": 0.0081451416015625, "learning_rate": 2e-07, "loss": -0.022587394714355467, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.10833333656191826, "reward_std": 0.20343697369098662, "rewards/MultiModalAccuracyORM": 0.10833333656191826, "step": 1145, "train_speed(iter/s)": 0.027421 }, { "clip_ratio": 0.0, "completion_length": 376.05, "epoch": 0.46464646464646464, "grad_norm": 2.2096178690715, "kl": 0.0104400634765625, "learning_rate": 2e-07, "loss": 0.01734369993209839, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3250000074505806, "reward_std": 0.33700530230998993, "rewards/MultiModalAccuracyORM": 0.3250000074505806, "step": 1150, "train_speed(iter/s)": 0.027419 }, { "clip_ratio": 0.0, "completion_length": 308.65, "epoch": 0.4666666666666667, "grad_norm": 1.3995623416059861, "kl": 0.020782470703125, "learning_rate": 2e-07, "loss": 0.004217700660228729, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000447034834, "reward_std": 0.20594746768474578, "rewards/MultiModalAccuracyORM": 0.27500000447034834, "step": 1155, "train_speed(iter/s)": 0.027419 }, { "clip_ratio": 0.0, "completion_length": 229.45, "epoch": 0.4686868686868687, "grad_norm": 7.604841869136694, "kl": 0.017425537109375, "learning_rate": 2e-07, "loss": 0.04910666048526764, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000029802322, "reward_std": 0.3408561676740646, "rewards/MultiModalAccuracyORM": 0.3000000029802322, "step": 1160, "train_speed(iter/s)": 0.02739 }, { "clip_ratio": 0.0, "completion_length": 279.1, "epoch": 0.4707070707070707, "grad_norm": 1.7338556861412973, "kl": 0.009881591796875, "learning_rate": 2e-07, "loss": -0.02307046055793762, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000149011613, "reward_std": 0.18081162869930267, "rewards/MultiModalAccuracyORM": 0.17500000149011613, "step": 1165, "train_speed(iter/s)": 0.027388 }, { "clip_ratio": 0.0, "completion_length": 352.15, "epoch": 0.4727272727272727, "grad_norm": 1.2587552234540058, "kl": 0.0092010498046875, "learning_rate": 2e-07, "loss": -0.05895323753356933, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666668429970743, "reward_std": 0.40890581607818605, "rewards/MultiModalAccuracyORM": 0.41666668429970743, "step": 1170, "train_speed(iter/s)": 0.027373 }, { "clip_ratio": 0.0, "completion_length": 381.2, "epoch": 0.47474747474747475, "grad_norm": 0.06683334066144007, "kl": 0.01002349853515625, "learning_rate": 2e-07, "loss": 0.02935360074043274, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2416666716337204, "reward_std": 0.27523933053016664, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 1175, "train_speed(iter/s)": 0.027312 }, { "clip_ratio": 0.0, "completion_length": 443.15, "epoch": 0.4767676767676768, "grad_norm": 27.070556493942583, "kl": 0.00930938720703125, "learning_rate": 2e-07, "loss": 0.0851466953754425, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000223517416, "reward_std": 0.3342405825853348, "rewards/MultiModalAccuracyORM": 0.25000000223517416, "step": 1180, "train_speed(iter/s)": 0.027331 }, { "clip_ratio": 0.0, "completion_length": 403.55, "epoch": 0.47878787878787876, "grad_norm": 1.5534177345271625, "kl": 0.0102996826171875, "learning_rate": 2e-07, "loss": 0.028819066286087037, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.266666679084301, "reward_std": 0.3129431068897247, "rewards/MultiModalAccuracyORM": 0.266666679084301, "step": 1185, "train_speed(iter/s)": 0.027335 }, { "clip_ratio": 0.0, "completion_length": 327.65, "epoch": 0.4808080808080808, "grad_norm": 2.8838868478156816, "kl": 0.02685546875, "learning_rate": 2e-07, "loss": 0.006991004943847657, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.2323400765657425, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 1190, "train_speed(iter/s)": 0.027082 }, { "clip_ratio": 0.0, "completion_length": 500.2, "epoch": 0.48282828282828283, "grad_norm": 2.6317167816627993, "kl": 0.014031982421875, "learning_rate": 2e-07, "loss": -0.003238886594772339, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.20000000596046447, "reward_std": 0.30388820767402647, "rewards/MultiModalAccuracyORM": 0.20000000596046447, "step": 1195, "train_speed(iter/s)": 0.026955 }, { "clip_ratio": 0.0, "completion_length": 259.35, "epoch": 0.48484848484848486, "grad_norm": 53.95756362621299, "kl": 0.0124114990234375, "learning_rate": 2e-07, "loss": -0.00888831913471222, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000029802322, "reward_std": 0.29782613217830656, "rewards/MultiModalAccuracyORM": 0.3000000029802322, "step": 1200, "train_speed(iter/s)": 0.026995 }, { "clip_ratio": 0.0, "completion_length": 287.0, "epoch": 0.4868686868686869, "grad_norm": 1.8840812265683782, "kl": 0.016448974609375, "learning_rate": 2e-07, "loss": 0.024408812820911407, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.34166667610406876, "reward_std": 0.4253006011247635, "rewards/MultiModalAccuracyORM": 0.34166667610406876, "step": 1205, "train_speed(iter/s)": 0.02702 }, { "clip_ratio": 0.0, "completion_length": 263.2, "epoch": 0.4888888888888889, "grad_norm": 2.267475237086073, "kl": 0.01165771484375, "learning_rate": 2e-07, "loss": -0.02959960699081421, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4416666813194752, "reward_std": 0.3111630380153656, "rewards/MultiModalAccuracyORM": 0.4416666813194752, "step": 1210, "train_speed(iter/s)": 0.027058 }, { "clip_ratio": 0.0, "completion_length": 341.55, "epoch": 0.4909090909090909, "grad_norm": 1.53249738300366, "kl": 0.01207275390625, "learning_rate": 2e-07, "loss": 0.01664416640996933, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000089406967, "reward_std": 0.39155901670455934, "rewards/MultiModalAccuracyORM": 0.3500000089406967, "step": 1215, "train_speed(iter/s)": 0.027075 }, { "clip_ratio": 0.0, "completion_length": 346.35, "epoch": 0.49292929292929294, "grad_norm": 2.838473944184638, "kl": 0.0138153076171875, "learning_rate": 2e-07, "loss": 0.011857110261917114, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.16666667237877847, "reward_std": 0.32422170639038084, "rewards/MultiModalAccuracyORM": 0.16666667237877847, "step": 1220, "train_speed(iter/s)": 0.027075 }, { "clip_ratio": 0.0, "completion_length": 340.2, "epoch": 0.494949494949495, "grad_norm": 2.239419757076915, "kl": 0.0130462646484375, "learning_rate": 2e-07, "loss": 0.03971967101097107, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667088866234, "reward_std": 0.23224489092826844, "rewards/MultiModalAccuracyORM": 0.36666667088866234, "step": 1225, "train_speed(iter/s)": 0.027083 }, { "clip_ratio": 0.0, "completion_length": 244.8, "epoch": 0.49696969696969695, "grad_norm": 2.1763944900135637, "kl": 0.0342437744140625, "learning_rate": 2e-07, "loss": -0.010297659039497375, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3583333410322666, "reward_std": 0.3408351272344589, "rewards/MultiModalAccuracyORM": 0.3583333410322666, "step": 1230, "train_speed(iter/s)": 0.027096 }, { "clip_ratio": 0.0, "completion_length": 293.45, "epoch": 0.498989898989899, "grad_norm": 6.002103596814289, "kl": 0.020233154296875, "learning_rate": 2e-07, "loss": 0.08779069185256957, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000000670552255, "reward_std": 0.35311026573181153, "rewards/MultiModalAccuracyORM": 0.40000000670552255, "step": 1235, "train_speed(iter/s)": 0.027106 }, { "clip_ratio": 0.0, "completion_length": 468.45, "epoch": 0.501010101010101, "grad_norm": 1.7067044601090864, "kl": 0.00786285400390625, "learning_rate": 2e-07, "loss": 0.05108952522277832, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000089406967, "reward_std": 0.3033378630876541, "rewards/MultiModalAccuracyORM": 0.3500000089406967, "step": 1240, "train_speed(iter/s)": 0.027091 }, { "clip_ratio": 0.0, "completion_length": 397.4, "epoch": 0.503030303030303, "grad_norm": 0.8938521798548926, "kl": 0.009466552734375, "learning_rate": 2e-07, "loss": -0.01685338616371155, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.13333333656191826, "reward_std": 0.2292436480522156, "rewards/MultiModalAccuracyORM": 0.13333333656191826, "step": 1245, "train_speed(iter/s)": 0.02707 }, { "epoch": 0.5050505050505051, "grad_norm": 3.702370322108623, "learning_rate": 2e-07, "loss": 0.036279809474945066, "memory(GiB)": 87.45, "step": 1250, "train_speed(iter/s)": 0.027086 }, { "epoch": 0.5050505050505051, "eval_clip_ratio": 0.0, "eval_completion_length": 321.4716763305664, "eval_kl": 0.015718994140625, "eval_loss": 0.013520264066755772, "eval_response_clip_ratio": 0.0, "eval_reward": 0.3033333399891853, "eval_reward_std": 0.3383384072780609, "eval_rewards/MultiModalAccuracyORM": 0.3033333399891853, "eval_runtime": 765.5729, "eval_samples_per_second": 0.065, "eval_steps_per_second": 0.007, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 349.475, "epoch": 0.5070707070707071, "grad_norm": 1.4811421198816048, "kl": 0.01293487548828125, "learning_rate": 2e-07, "loss": 0.03056705594062805, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.35833334140479567, "reward_std": 0.38048321902751925, "rewards/MultiModalAccuracyORM": 0.35833334140479567, "step": 1255, "train_speed(iter/s)": 0.026435 }, { "clip_ratio": 0.0, "completion_length": 209.15, "epoch": 0.509090909090909, "grad_norm": 2.0552411044504764, "kl": 0.0252899169921875, "learning_rate": 2e-07, "loss": 0.028329643607139587, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333334177732465, "reward_std": 0.281466943025589, "rewards/MultiModalAccuracyORM": 0.33333334177732465, "step": 1260, "train_speed(iter/s)": 0.026464 }, { "clip_ratio": 0.0, "completion_length": 380.4, "epoch": 0.5111111111111111, "grad_norm": 2.615766039038286, "kl": 0.01002197265625, "learning_rate": 2e-07, "loss": 0.002955615520477295, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000521540644, "reward_std": 0.2292436480522156, "rewards/MultiModalAccuracyORM": 0.25000000521540644, "step": 1265, "train_speed(iter/s)": 0.02646 }, { "clip_ratio": 0.0, "completion_length": 338.85, "epoch": 0.5131313131313131, "grad_norm": 1.9893529067484352, "kl": 0.011163330078125, "learning_rate": 2e-07, "loss": 0.018701747059822083, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.300000012665987, "reward_std": 0.3127244770526886, "rewards/MultiModalAccuracyORM": 0.300000012665987, "step": 1270, "train_speed(iter/s)": 0.026466 }, { "clip_ratio": 0.0, "completion_length": 253.65, "epoch": 0.5151515151515151, "grad_norm": 1.6843559930041148, "kl": 0.0115509033203125, "learning_rate": 2e-07, "loss": 0.012320590019226075, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.5000000111758709, "reward_std": 0.345323646068573, "rewards/MultiModalAccuracyORM": 0.5000000111758709, "step": 1275, "train_speed(iter/s)": 0.026464 }, { "clip_ratio": 0.0, "completion_length": 308.25, "epoch": 0.5171717171717172, "grad_norm": 3.0894548096911407, "kl": 0.010302734375, "learning_rate": 2e-07, "loss": -0.02475722283124924, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 1280, "train_speed(iter/s)": 0.026449 }, { "clip_ratio": 0.0, "completion_length": 286.45, "epoch": 0.5191919191919192, "grad_norm": 0.056162470903676515, "kl": 0.010772705078125, "learning_rate": 2e-07, "loss": -0.0004087850451469421, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667386889457, "reward_std": 0.23105688095092775, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 1285, "train_speed(iter/s)": 0.026422 }, { "clip_ratio": 0.0, "completion_length": 285.55, "epoch": 0.5212121212121212, "grad_norm": 1.7176303706462466, "kl": 0.011578369140625, "learning_rate": 2e-07, "loss": 0.023639577627182006, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666693985462, "reward_std": 0.28077210783958434, "rewards/MultiModalAccuracyORM": 0.2666666693985462, "step": 1290, "train_speed(iter/s)": 0.026422 }, { "clip_ratio": 0.0, "completion_length": 285.65, "epoch": 0.5232323232323233, "grad_norm": 1.244445708488179, "kl": 0.0103790283203125, "learning_rate": 2e-07, "loss": -0.017145507037639618, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3833333395421505, "reward_std": 0.4086130350828171, "rewards/MultiModalAccuracyORM": 0.3833333395421505, "step": 1295, "train_speed(iter/s)": 0.026442 }, { "clip_ratio": 0.0, "completion_length": 321.2, "epoch": 0.5252525252525253, "grad_norm": 1.7914388567184454, "kl": 0.0092559814453125, "learning_rate": 2e-07, "loss": 0.054825717210769655, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333417773247, "reward_std": 0.3536572724580765, "rewards/MultiModalAccuracyORM": 0.3083333417773247, "step": 1300, "train_speed(iter/s)": 0.026415 }, { "clip_ratio": 0.0, "completion_length": 251.9, "epoch": 0.5272727272727272, "grad_norm": 2.6174114359405976, "kl": 0.010308837890625, "learning_rate": 2e-07, "loss": -0.019986753165721894, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4500000074505806, "reward_std": 0.3099655658006668, "rewards/MultiModalAccuracyORM": 0.4500000074505806, "step": 1305, "train_speed(iter/s)": 0.026387 }, { "clip_ratio": 0.0, "completion_length": 264.9, "epoch": 0.5292929292929293, "grad_norm": 32.625329420627345, "kl": 0.00882568359375, "learning_rate": 2e-07, "loss": 0.008027985692024231, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667312383654, "reward_std": 0.40485736131668093, "rewards/MultiModalAccuracyORM": 0.41666667312383654, "step": 1310, "train_speed(iter/s)": 0.026366 }, { "clip_ratio": 0.0, "completion_length": 356.3, "epoch": 0.5313131313131313, "grad_norm": 1.6706902692989012, "kl": 0.0086761474609375, "learning_rate": 2e-07, "loss": 0.028931498527526855, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333334624767303, "reward_std": 0.3558539390563965, "rewards/MultiModalAccuracyORM": 0.28333334624767303, "step": 1315, "train_speed(iter/s)": 0.026338 }, { "clip_ratio": 0.0, "completion_length": 274.25, "epoch": 0.5333333333333333, "grad_norm": 1.8800912459209826, "kl": 0.0249176025390625, "learning_rate": 2e-07, "loss": 0.048329290747642514, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667237877844, "reward_std": 0.25897533297538755, "rewards/MultiModalAccuracyORM": 0.31666667237877844, "step": 1320, "train_speed(iter/s)": 0.026308 }, { "clip_ratio": 0.0, "completion_length": 374.3, "epoch": 0.5353535353535354, "grad_norm": 3.1086990293234904, "kl": 0.01292724609375, "learning_rate": 2e-07, "loss": 0.006182897090911865, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.34166667312383653, "reward_std": 0.3867922484874725, "rewards/MultiModalAccuracyORM": 0.34166667312383653, "step": 1325, "train_speed(iter/s)": 0.026274 }, { "clip_ratio": 0.0, "completion_length": 404.3, "epoch": 0.5373737373737374, "grad_norm": 0.08070215671871471, "kl": 0.0099578857421875, "learning_rate": 2e-07, "loss": 0.062343114614486696, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000059604645, "reward_std": 0.22625695466995238, "rewards/MultiModalAccuracyORM": 0.3000000059604645, "step": 1330, "train_speed(iter/s)": 0.026241 }, { "clip_ratio": 0.0, "completion_length": 360.75, "epoch": 0.5393939393939394, "grad_norm": 3.4146119265895893, "kl": 0.0290008544921875, "learning_rate": 2e-07, "loss": -0.02337663769721985, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000000819563863, "reward_std": 0.31852359175682066, "rewards/MultiModalAccuracyORM": 0.40000000819563863, "step": 1335, "train_speed(iter/s)": 0.026231 }, { "clip_ratio": 0.0, "completion_length": 300.85, "epoch": 0.5414141414141415, "grad_norm": 1.014030648475331, "kl": 0.0152801513671875, "learning_rate": 2e-07, "loss": 0.03424631953239441, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3333333425223827, "reward_std": 0.22807018756866454, "rewards/MultiModalAccuracyORM": 0.3333333425223827, "step": 1340, "train_speed(iter/s)": 0.026218 }, { "clip_ratio": 0.0, "completion_length": 297.5, "epoch": 0.5434343434343434, "grad_norm": 2.579076344272663, "kl": 0.0294189453125, "learning_rate": 2e-07, "loss": -0.004431784152984619, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.425000012665987, "reward_std": 0.3433456152677536, "rewards/MultiModalAccuracyORM": 0.425000012665987, "step": 1345, "train_speed(iter/s)": 0.026212 }, { "clip_ratio": 0.0, "completion_length": 365.2, "epoch": 0.5454545454545454, "grad_norm": 0.09604007460689165, "kl": 0.0132415771484375, "learning_rate": 2e-07, "loss": 0.011541323363780975, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333730697632, "reward_std": 0.24961273670196532, "rewards/MultiModalAccuracyORM": 0.15833333730697632, "step": 1350, "train_speed(iter/s)": 0.026199 }, { "clip_ratio": 0.0, "completion_length": 294.2, "epoch": 0.5474747474747474, "grad_norm": 2.8630066616840306, "kl": 0.0131500244140625, "learning_rate": 2e-07, "loss": 0.0038095355033874513, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4083333484828472, "reward_std": 0.371958914399147, "rewards/MultiModalAccuracyORM": 0.4083333484828472, "step": 1355, "train_speed(iter/s)": 0.026195 }, { "clip_ratio": 0.0, "completion_length": 290.35, "epoch": 0.5494949494949495, "grad_norm": 2.8462264230542202, "kl": 0.0113922119140625, "learning_rate": 2e-07, "loss": -0.013850301504135132, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.13333333879709244, "reward_std": 0.23857065439224243, "rewards/MultiModalAccuracyORM": 0.13333333879709244, "step": 1360, "train_speed(iter/s)": 0.026178 }, { "clip_ratio": 0.0, "completion_length": 352.8, "epoch": 0.5515151515151515, "grad_norm": 1.9037157526983224, "kl": 0.0115966796875, "learning_rate": 2e-07, "loss": 0.061475354433059695, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333334252238274, "reward_std": 0.37644500732421876, "rewards/MultiModalAccuracyORM": 0.28333334252238274, "step": 1365, "train_speed(iter/s)": 0.026169 }, { "clip_ratio": 0.0, "completion_length": 290.25, "epoch": 0.5535353535353535, "grad_norm": 1.5230914677267515, "kl": 0.012347412109375, "learning_rate": 2e-07, "loss": 0.02505878210067749, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667386889457, "reward_std": 0.26496326327323916, "rewards/MultiModalAccuracyORM": 0.36666667386889457, "step": 1370, "train_speed(iter/s)": 0.026162 }, { "clip_ratio": 0.0, "completion_length": 304.7, "epoch": 0.5555555555555556, "grad_norm": 1.9879722073308892, "kl": 0.0135223388671875, "learning_rate": 2e-07, "loss": 0.010433109104633331, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000447034834, "reward_std": 0.18332211077213287, "rewards/MultiModalAccuracyORM": 0.27500000447034834, "step": 1375, "train_speed(iter/s)": 0.026157 }, { "clip_ratio": 0.0, "completion_length": 319.9, "epoch": 0.5575757575757576, "grad_norm": 2.649637312336083, "kl": 0.012469482421875, "learning_rate": 2e-07, "loss": 0.009650683403015137, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.3890485167503357, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 1380, "train_speed(iter/s)": 0.02615 }, { "clip_ratio": 0.0, "completion_length": 355.05, "epoch": 0.5595959595959596, "grad_norm": 0.05006149717815439, "kl": 0.016656494140625, "learning_rate": 2e-07, "loss": -0.007993972301483155, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667386889455, "reward_std": 0.3541334718465805, "rewards/MultiModalAccuracyORM": 0.29166667386889455, "step": 1385, "train_speed(iter/s)": 0.026129 }, { "clip_ratio": 0.0, "completion_length": 155.65, "epoch": 0.5616161616161616, "grad_norm": 0.08079407011077554, "kl": 0.01981201171875, "learning_rate": 2e-07, "loss": 0.03422499895095825, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833334103226663, "reward_std": 0.3597048044204712, "rewards/MultiModalAccuracyORM": 0.25833334103226663, "step": 1390, "train_speed(iter/s)": 0.026124 }, { "clip_ratio": 0.0, "completion_length": 411.3, "epoch": 0.5636363636363636, "grad_norm": 2.595093461800728, "kl": 0.016748046875, "learning_rate": 2e-07, "loss": 0.0661674439907074, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667610406877, "reward_std": 0.41412476599216463, "rewards/MultiModalAccuracyORM": 0.41666667610406877, "step": 1395, "train_speed(iter/s)": 0.02611 }, { "clip_ratio": 0.0, "completion_length": 312.3, "epoch": 0.5656565656565656, "grad_norm": 1.8524460034780388, "kl": 0.0219970703125, "learning_rate": 2e-07, "loss": 0.0748141050338745, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333410322666, "reward_std": 0.3222051203250885, "rewards/MultiModalAccuracyORM": 0.2833333410322666, "step": 1400, "train_speed(iter/s)": 0.026104 }, { "clip_ratio": 0.0, "completion_length": 284.7, "epoch": 0.5676767676767677, "grad_norm": 1.8645433263018287, "kl": 0.020556640625, "learning_rate": 2e-07, "loss": -0.019703832268714905, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.23230449855327606, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 1405, "train_speed(iter/s)": 0.026096 }, { "clip_ratio": 0.0, "completion_length": 301.15, "epoch": 0.5696969696969697, "grad_norm": 2.007508731899272, "kl": 0.014324951171875, "learning_rate": 2e-07, "loss": 0.026613450050354003, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000000819563863, "reward_std": 0.26928699016571045, "rewards/MultiModalAccuracyORM": 0.40000000819563863, "step": 1410, "train_speed(iter/s)": 0.026082 }, { "clip_ratio": 0.0, "completion_length": 384.6, "epoch": 0.5717171717171717, "grad_norm": 1.3049808616717113, "kl": 0.0161651611328125, "learning_rate": 2e-07, "loss": -0.019157709181308748, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833334103226663, "reward_std": 0.3352662205696106, "rewards/MultiModalAccuracyORM": 0.25833334103226663, "step": 1415, "train_speed(iter/s)": 0.026066 }, { "clip_ratio": 0.0, "completion_length": 356.4, "epoch": 0.5737373737373738, "grad_norm": 1.7990652267186868, "kl": 0.021240234375, "learning_rate": 2e-07, "loss": 0.043132427334785464, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000298023224, "reward_std": 0.2159808874130249, "rewards/MultiModalAccuracyORM": 0.17500000298023224, "step": 1420, "train_speed(iter/s)": 0.026059 }, { "clip_ratio": 0.0, "completion_length": 288.85, "epoch": 0.5757575757575758, "grad_norm": 1.3873829792776142, "kl": 0.017431640625, "learning_rate": 2e-07, "loss": 0.010021258890628815, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.45833334475755694, "reward_std": 0.2770525634288788, "rewards/MultiModalAccuracyORM": 0.45833334475755694, "step": 1425, "train_speed(iter/s)": 0.026059 }, { "clip_ratio": 0.0, "completion_length": 296.05, "epoch": 0.5777777777777777, "grad_norm": 1.6565432442769377, "kl": 0.0139190673828125, "learning_rate": 2e-07, "loss": 0.016829773783683777, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667312383654, "reward_std": 0.40086200535297395, "rewards/MultiModalAccuracyORM": 0.29166667312383654, "step": 1430, "train_speed(iter/s)": 0.026063 }, { "clip_ratio": 0.0, "completion_length": 370.7, "epoch": 0.5797979797979798, "grad_norm": 1.2410328295318487, "kl": 0.015863037109375, "learning_rate": 2e-07, "loss": -0.04091094434261322, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666731238365, "reward_std": 0.3603756338357925, "rewards/MultiModalAccuracyORM": 0.3166666731238365, "step": 1435, "train_speed(iter/s)": 0.026053 }, { "clip_ratio": 0.0, "completion_length": 305.7, "epoch": 0.5818181818181818, "grad_norm": 2.659138324217993, "kl": 0.01724853515625, "learning_rate": 2e-07, "loss": 0.08770001530647278, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3333333425223827, "reward_std": 0.4456100821495056, "rewards/MultiModalAccuracyORM": 0.3333333425223827, "step": 1440, "train_speed(iter/s)": 0.026045 }, { "clip_ratio": 0.0, "completion_length": 321.15, "epoch": 0.5838383838383838, "grad_norm": 2.6855533659279462, "kl": 0.015350341796875, "learning_rate": 2e-07, "loss": -0.03101794719696045, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1666666716337204, "reward_std": 0.2644129186868668, "rewards/MultiModalAccuracyORM": 0.1666666716337204, "step": 1445, "train_speed(iter/s)": 0.026039 }, { "clip_ratio": 0.0, "completion_length": 353.85, "epoch": 0.5858585858585859, "grad_norm": 0.8787033948980154, "kl": 0.018048095703125, "learning_rate": 2e-07, "loss": 0.021743962168693544, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1500000014901161, "reward_std": 0.2496483266353607, "rewards/MultiModalAccuracyORM": 0.1500000014901161, "step": 1450, "train_speed(iter/s)": 0.026027 }, { "clip_ratio": 0.0, "completion_length": 329.65, "epoch": 0.5878787878787879, "grad_norm": 2.6089377973235917, "kl": 0.0154541015625, "learning_rate": 2e-07, "loss": -0.0126606285572052, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833333730697633, "reward_std": 0.287842845916748, "rewards/MultiModalAccuracyORM": 0.25833333730697633, "step": 1455, "train_speed(iter/s)": 0.026012 }, { "clip_ratio": 0.0, "completion_length": 353.55, "epoch": 0.5898989898989899, "grad_norm": 3.1599228273908895, "kl": 0.017535400390625, "learning_rate": 2e-07, "loss": 0.03227808475494385, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000089406967, "reward_std": 0.2754935622215271, "rewards/MultiModalAccuracyORM": 0.3500000089406967, "step": 1460, "train_speed(iter/s)": 0.025992 }, { "clip_ratio": 0.0, "completion_length": 247.35, "epoch": 0.591919191919192, "grad_norm": 3.772779516485284, "kl": 0.016162109375, "learning_rate": 2e-07, "loss": -0.006427288055419922, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.3214506834745407, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 1465, "train_speed(iter/s)": 0.02598 }, { "clip_ratio": 0.0, "completion_length": 300.7, "epoch": 0.593939393939394, "grad_norm": 1.9048234622524929, "kl": 0.019964599609375, "learning_rate": 2e-07, "loss": 0.02089463174343109, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333334550261495, "reward_std": 0.39707074165344236, "rewards/MultiModalAccuracyORM": 0.33333334550261495, "step": 1470, "train_speed(iter/s)": 0.025963 }, { "clip_ratio": 0.0, "completion_length": 372.2, "epoch": 0.5959595959595959, "grad_norm": 1.7167051608215667, "kl": 0.0126953125, "learning_rate": 2e-07, "loss": 0.0002398371696472168, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.3485645651817322, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 1475, "train_speed(iter/s)": 0.025929 }, { "clip_ratio": 0.0, "completion_length": 286.9, "epoch": 0.597979797979798, "grad_norm": 2.018355689891589, "kl": 0.014324951171875, "learning_rate": 2e-07, "loss": 0.025476664304733276, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.5250000104308128, "reward_std": 0.3463323086500168, "rewards/MultiModalAccuracyORM": 0.5250000104308128, "step": 1480, "train_speed(iter/s)": 0.025899 }, { "clip_ratio": 0.0, "completion_length": 286.35, "epoch": 0.6, "grad_norm": 1.9564498539046626, "kl": 0.013104248046875, "learning_rate": 2e-07, "loss": -0.0017219483852386475, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666753590107, "reward_std": 0.3392761141061783, "rewards/MultiModalAccuracyORM": 0.2666666753590107, "step": 1485, "train_speed(iter/s)": 0.025884 }, { "clip_ratio": 0.0, "completion_length": 371.55, "epoch": 0.602020202020202, "grad_norm": 3.3586873596373836, "kl": 0.0190948486328125, "learning_rate": 2e-07, "loss": -0.015026980638504028, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333334550261495, "reward_std": 0.43529842495918275, "rewards/MultiModalAccuracyORM": 0.33333334550261495, "step": 1490, "train_speed(iter/s)": 0.02585 }, { "clip_ratio": 0.0, "completion_length": 352.1, "epoch": 0.604040404040404, "grad_norm": 1.5566031738878978, "kl": 0.0152313232421875, "learning_rate": 2e-07, "loss": 0.05221402645111084, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4666666716337204, "reward_std": 0.3853524446487427, "rewards/MultiModalAccuracyORM": 0.4666666716337204, "step": 1495, "train_speed(iter/s)": 0.025826 }, { "epoch": 0.6060606060606061, "grad_norm": 1.092725055214899, "learning_rate": 2e-07, "loss": 0.044440290331840514, "memory(GiB)": 87.45, "step": 1500, "train_speed(iter/s)": 0.025794 }, { "epoch": 0.6060606060606061, "eval_clip_ratio": 0.0, "eval_completion_length": 332.07667766571046, "eval_kl": 0.03210205078125, "eval_loss": 0.03433879837393761, "eval_response_clip_ratio": 0.0, "eval_reward": 0.32333334147930143, "eval_reward_std": 0.34949765503406527, "eval_rewards/MultiModalAccuracyORM": 0.32333334147930143, "eval_runtime": 946.9078, "eval_samples_per_second": 0.053, "eval_steps_per_second": 0.005, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 307.3, "epoch": 0.6080808080808081, "grad_norm": 1.594406200527781, "kl": 0.01402130126953125, "learning_rate": 2e-07, "loss": 0.011821150779724121, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30000000447034836, "reward_std": 0.29021300822496415, "rewards/MultiModalAccuracyORM": 0.30000000447034836, "step": 1505, "train_speed(iter/s)": 0.02519 }, { "clip_ratio": 0.0, "completion_length": 310.35, "epoch": 0.6101010101010101, "grad_norm": 1.872354266566466, "kl": 0.01295166015625, "learning_rate": 2e-07, "loss": 0.040472963452339174, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.39166667237877845, "reward_std": 0.24481281042098998, "rewards/MultiModalAccuracyORM": 0.39166667237877845, "step": 1510, "train_speed(iter/s)": 0.025138 }, { "clip_ratio": 0.0, "completion_length": 324.25, "epoch": 0.6121212121212121, "grad_norm": 2.2298458448624032, "kl": 0.017498779296875, "learning_rate": 2e-07, "loss": -0.003679761290550232, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666753590107, "reward_std": 0.33752005696296694, "rewards/MultiModalAccuracyORM": 0.2666666753590107, "step": 1515, "train_speed(iter/s)": 0.02512 }, { "clip_ratio": 0.0, "completion_length": 495.3, "epoch": 0.6141414141414141, "grad_norm": 2.1057358539094637, "kl": 0.013360595703125, "learning_rate": 2e-07, "loss": -0.040804427862167356, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.11666667088866234, "reward_std": 0.22625695466995238, "rewards/MultiModalAccuracyORM": 0.11666667088866234, "step": 1520, "train_speed(iter/s)": 0.025038 }, { "clip_ratio": 0.0, "completion_length": 355.45, "epoch": 0.6161616161616161, "grad_norm": 1.7271901034384924, "kl": 0.01778564453125, "learning_rate": 2e-07, "loss": 0.04612007737159729, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667759418485, "reward_std": 0.385197651386261, "rewards/MultiModalAccuracyORM": 0.29166667759418485, "step": 1525, "train_speed(iter/s)": 0.025 }, { "clip_ratio": 0.0, "completion_length": 331.4, "epoch": 0.6181818181818182, "grad_norm": 2.251271699623951, "kl": 0.015338134765625, "learning_rate": 2e-07, "loss": 0.07724932432174683, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40833334177732467, "reward_std": 0.39786076843738555, "rewards/MultiModalAccuracyORM": 0.40833334177732467, "step": 1530, "train_speed(iter/s)": 0.024971 }, { "clip_ratio": 0.0, "completion_length": 469.35, "epoch": 0.6202020202020202, "grad_norm": 3.517799255266591, "kl": 0.021319580078125, "learning_rate": 2e-07, "loss": -0.042039293050765994, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2000000074505806, "reward_std": 0.27122943103313446, "rewards/MultiModalAccuracyORM": 0.2000000074505806, "step": 1535, "train_speed(iter/s)": 0.02492 }, { "clip_ratio": 0.0, "completion_length": 414.3, "epoch": 0.6222222222222222, "grad_norm": 2.5032184616862736, "kl": 0.023309326171875, "learning_rate": 2e-07, "loss": 0.004111546277999878, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667386889457, "reward_std": 0.36037562787532806, "rewards/MultiModalAccuracyORM": 0.36666667386889457, "step": 1540, "train_speed(iter/s)": 0.024886 }, { "clip_ratio": 0.0, "completion_length": 379.65, "epoch": 0.6242424242424243, "grad_norm": 1.3788944987112297, "kl": 0.018865966796875, "learning_rate": 2e-07, "loss": 0.03875549137592316, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.14166666939854622, "reward_std": 0.275529146194458, "rewards/MultiModalAccuracyORM": 0.14166666939854622, "step": 1545, "train_speed(iter/s)": 0.02484 }, { "clip_ratio": 0.0, "completion_length": 418.9, "epoch": 0.6262626262626263, "grad_norm": 1.8495513561932837, "kl": 0.02667236328125, "learning_rate": 2e-07, "loss": 0.006523740291595459, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333656191825, "reward_std": 0.3144540905952454, "rewards/MultiModalAccuracyORM": 0.15833333656191825, "step": 1550, "train_speed(iter/s)": 0.024776 }, { "clip_ratio": 0.0, "completion_length": 346.2, "epoch": 0.6282828282828283, "grad_norm": 1.753463603338966, "kl": 0.030621337890625, "learning_rate": 2e-07, "loss": -0.08293852806091309, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000149011614, "reward_std": 0.23083824515342713, "rewards/MultiModalAccuracyORM": 0.25000000149011614, "step": 1555, "train_speed(iter/s)": 0.02475 }, { "clip_ratio": 0.0, "completion_length": 382.7, "epoch": 0.6303030303030303, "grad_norm": 2.663595112199716, "kl": 0.0219970703125, "learning_rate": 2e-07, "loss": -0.002608485519886017, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.35000000447034835, "reward_std": 0.27756678462028506, "rewards/MultiModalAccuracyORM": 0.35000000447034835, "step": 1560, "train_speed(iter/s)": 0.024719 }, { "clip_ratio": 0.0, "completion_length": 285.55, "epoch": 0.6323232323232323, "grad_norm": 1.803682568463378, "kl": 0.02052001953125, "learning_rate": 2e-07, "loss": -0.031521540880203244, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333440124989, "reward_std": 0.24935851097106934, "rewards/MultiModalAccuracyORM": 0.2833333440124989, "step": 1565, "train_speed(iter/s)": 0.024694 }, { "clip_ratio": 0.0, "completion_length": 464.5, "epoch": 0.6343434343434343, "grad_norm": 1.9551331787297712, "kl": 0.012725830078125, "learning_rate": 2e-07, "loss": 0.016904991865158082, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2250000059604645, "reward_std": 0.3863160490989685, "rewards/MultiModalAccuracyORM": 0.2250000059604645, "step": 1570, "train_speed(iter/s)": 0.024625 }, { "clip_ratio": 0.0, "completion_length": 280.85, "epoch": 0.6363636363636364, "grad_norm": 2.19696821448914, "kl": 0.016156005859375, "learning_rate": 2e-07, "loss": 0.00793578326702118, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.38333333656191826, "reward_std": 0.2260383188724518, "rewards/MultiModalAccuracyORM": 0.38333333656191826, "step": 1575, "train_speed(iter/s)": 0.024598 }, { "clip_ratio": 0.0, "completion_length": 346.35, "epoch": 0.6383838383838384, "grad_norm": 0.10124868688137513, "kl": 0.016912841796875, "learning_rate": 2e-07, "loss": -0.007649339735507965, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.26666666865348815, "reward_std": 0.23634997606277466, "rewards/MultiModalAccuracyORM": 0.26666666865348815, "step": 1580, "train_speed(iter/s)": 0.024579 }, { "clip_ratio": 0.0, "completion_length": 409.0, "epoch": 0.6404040404040404, "grad_norm": 1.6301877045933517, "kl": 0.012744140625, "learning_rate": 2e-07, "loss": 0.013163220882415772, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3250000052154064, "reward_std": 0.3906099498271942, "rewards/MultiModalAccuracyORM": 0.3250000052154064, "step": 1585, "train_speed(iter/s)": 0.024565 }, { "clip_ratio": 0.0, "completion_length": 359.05, "epoch": 0.6424242424242425, "grad_norm": 2.155746940066879, "kl": 0.016387939453125, "learning_rate": 2e-07, "loss": -0.006454774737358093, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2333333373069763, "reward_std": 0.2855865776538849, "rewards/MultiModalAccuracyORM": 0.2333333373069763, "step": 1590, "train_speed(iter/s)": 0.024526 }, { "clip_ratio": 0.0, "completion_length": 369.0, "epoch": 0.6444444444444445, "grad_norm": 2.831254989761031, "kl": 0.0135467529296875, "learning_rate": 2e-07, "loss": 0.04445863664150238, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000149011612, "reward_std": 0.28787843585014344, "rewards/MultiModalAccuracyORM": 0.20000000149011612, "step": 1595, "train_speed(iter/s)": 0.024495 }, { "clip_ratio": 0.0, "completion_length": 279.45, "epoch": 0.6464646464646465, "grad_norm": 1.4752518445027274, "kl": 0.017083740234375, "learning_rate": 2e-07, "loss": 0.03578461408615112, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666668131947516, "reward_std": 0.33704385757446287, "rewards/MultiModalAccuracyORM": 0.36666668131947516, "step": 1600, "train_speed(iter/s)": 0.024476 }, { "clip_ratio": 0.0, "completion_length": 355.65, "epoch": 0.6484848484848484, "grad_norm": 0.9187218241799472, "kl": 0.016937255859375, "learning_rate": 2e-07, "loss": 0.02192138433456421, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000111758709, "reward_std": 0.3222196638584137, "rewards/MultiModalAccuracyORM": 0.3500000111758709, "step": 1605, "train_speed(iter/s)": 0.024426 }, { "clip_ratio": 0.0, "completion_length": 388.35, "epoch": 0.6505050505050505, "grad_norm": 1.7973159194566164, "kl": 0.0144775390625, "learning_rate": 2e-07, "loss": 0.01784837543964386, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.12500000298023223, "reward_std": 0.2689731627702713, "rewards/MultiModalAccuracyORM": 0.12500000298023223, "step": 1610, "train_speed(iter/s)": 0.024388 }, { "clip_ratio": 0.0, "completion_length": 397.55, "epoch": 0.6525252525252525, "grad_norm": 2.0318711993448617, "kl": 0.018182373046875, "learning_rate": 2e-07, "loss": -0.02051687240600586, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4333333380520344, "reward_std": 0.261207589507103, "rewards/MultiModalAccuracyORM": 0.4333333380520344, "step": 1615, "train_speed(iter/s)": 0.024346 }, { "clip_ratio": 0.0, "completion_length": 339.3, "epoch": 0.6545454545454545, "grad_norm": 1.9030819605130962, "kl": 0.0175079345703125, "learning_rate": 2e-07, "loss": 0.06623161435127259, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.37500000596046446, "reward_std": 0.24885829985141755, "rewards/MultiModalAccuracyORM": 0.37500000596046446, "step": 1620, "train_speed(iter/s)": 0.024315 }, { "clip_ratio": 0.0, "completion_length": 280.4, "epoch": 0.6565656565656566, "grad_norm": 2.08045815446475, "kl": 0.0169708251953125, "learning_rate": 2e-07, "loss": -0.013642898201942444, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000029802322, "reward_std": 0.3378098726272583, "rewards/MultiModalAccuracyORM": 0.2500000029802322, "step": 1625, "train_speed(iter/s)": 0.024289 }, { "clip_ratio": 0.0, "completion_length": 400.75, "epoch": 0.6585858585858586, "grad_norm": 1.436661872799103, "kl": 0.0193359375, "learning_rate": 2e-07, "loss": 0.02239292562007904, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333805203437, "reward_std": 0.2629852324724197, "rewards/MultiModalAccuracyORM": 0.15833333805203437, "step": 1630, "train_speed(iter/s)": 0.024248 }, { "clip_ratio": 0.0, "completion_length": 270.4, "epoch": 0.6606060606060606, "grad_norm": 2.5008411774286494, "kl": 0.020758056640625, "learning_rate": 2e-07, "loss": 0.02127687931060791, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4083333432674408, "reward_std": 0.3023863762617111, "rewards/MultiModalAccuracyORM": 0.4083333432674408, "step": 1635, "train_speed(iter/s)": 0.024227 }, { "clip_ratio": 0.0, "completion_length": 324.8, "epoch": 0.6626262626262627, "grad_norm": 2.6410537415459125, "kl": 0.02030029296875, "learning_rate": 2e-07, "loss": 0.05219934582710266, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667684912684, "reward_std": 0.35006397068500517, "rewards/MultiModalAccuracyORM": 0.41666667684912684, "step": 1640, "train_speed(iter/s)": 0.024199 }, { "clip_ratio": 0.0, "completion_length": 356.0, "epoch": 0.6646464646464646, "grad_norm": 2.4569826375450914, "kl": 0.01795654296875, "learning_rate": 2e-07, "loss": 0.013086378574371338, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.45000001341104506, "reward_std": 0.3337643891572952, "rewards/MultiModalAccuracyORM": 0.45000001341104506, "step": 1645, "train_speed(iter/s)": 0.024168 }, { "clip_ratio": 0.0, "completion_length": 304.8, "epoch": 0.6666666666666666, "grad_norm": 1.9280627341583514, "kl": 0.015191650390625, "learning_rate": 2e-07, "loss": 0.01907120943069458, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15000000447034836, "reward_std": 0.30035116970539094, "rewards/MultiModalAccuracyORM": 0.15000000447034836, "step": 1650, "train_speed(iter/s)": 0.024141 }, { "clip_ratio": 0.0, "completion_length": 419.0, "epoch": 0.6686868686868687, "grad_norm": 2.6312715310589687, "kl": 0.015863037109375, "learning_rate": 2e-07, "loss": -0.04063203632831573, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333656191825, "reward_std": 0.25741389989852903, "rewards/MultiModalAccuracyORM": 0.15833333656191825, "step": 1655, "train_speed(iter/s)": 0.024076 }, { "clip_ratio": 0.0, "completion_length": 394.05, "epoch": 0.6707070707070707, "grad_norm": 0.9566291807644657, "kl": 0.015057373046875, "learning_rate": 2e-07, "loss": 0.018163633346557618, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.26666667610406875, "reward_std": 0.28446817994117735, "rewards/MultiModalAccuracyORM": 0.26666667610406875, "step": 1660, "train_speed(iter/s)": 0.024043 }, { "clip_ratio": 0.0, "completion_length": 326.85, "epoch": 0.6727272727272727, "grad_norm": 1.9521868347750622, "kl": 0.019769287109375, "learning_rate": 2e-07, "loss": -5.202591419219971e-05, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667535901069, "reward_std": 0.23481498062610626, "rewards/MultiModalAccuracyORM": 0.21666667535901069, "step": 1665, "train_speed(iter/s)": 0.024026 }, { "clip_ratio": 0.0, "completion_length": 316.4, "epoch": 0.6747474747474748, "grad_norm": 2.1472683375029757, "kl": 0.01842041015625, "learning_rate": 2e-07, "loss": 0.08016844987869262, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667461395264, "reward_std": 0.29655990600585935, "rewards/MultiModalAccuracyORM": 0.24166667461395264, "step": 1670, "train_speed(iter/s)": 0.024002 }, { "clip_ratio": 0.0, "completion_length": 294.15, "epoch": 0.6767676767676768, "grad_norm": 2.136669782149022, "kl": 0.012603759765625, "learning_rate": 2e-07, "loss": 0.03559441566467285, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333334103226662, "reward_std": 0.31266487538814547, "rewards/MultiModalAccuracyORM": 0.18333334103226662, "step": 1675, "train_speed(iter/s)": 0.023983 }, { "clip_ratio": 0.0, "completion_length": 349.7, "epoch": 0.6787878787878788, "grad_norm": 2.5120224393696056, "kl": 0.033984375, "learning_rate": 2e-07, "loss": -0.02109343409538269, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2083333410322666, "reward_std": 0.2629852324724197, "rewards/MultiModalAccuracyORM": 0.2083333410322666, "step": 1680, "train_speed(iter/s)": 0.02395 }, { "clip_ratio": 0.0, "completion_length": 224.85, "epoch": 0.6808080808080809, "grad_norm": 2.7291188101039268, "kl": 0.0185638427734375, "learning_rate": 2e-07, "loss": 0.06400806307792664, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.441666679084301, "reward_std": 0.3586460083723068, "rewards/MultiModalAccuracyORM": 0.441666679084301, "step": 1685, "train_speed(iter/s)": 0.023931 }, { "clip_ratio": 0.0, "completion_length": 204.25, "epoch": 0.6828282828282828, "grad_norm": 2.473418035792826, "kl": 0.03394775390625, "learning_rate": 2e-07, "loss": 0.042749062180519104, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3250000044703484, "reward_std": 0.30718872845172884, "rewards/MultiModalAccuracyORM": 0.3250000044703484, "step": 1690, "train_speed(iter/s)": 0.023921 }, { "clip_ratio": 0.0, "completion_length": 321.45, "epoch": 0.6848484848484848, "grad_norm": 1.4363881715878042, "kl": 0.023870849609375, "learning_rate": 2e-07, "loss": 0.007241478562355042, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000022351742, "reward_std": 0.3244373768568039, "rewards/MultiModalAccuracyORM": 0.3500000022351742, "step": 1695, "train_speed(iter/s)": 0.023909 }, { "clip_ratio": 0.0, "completion_length": 315.0, "epoch": 0.6868686868686869, "grad_norm": 2.953319073134284, "kl": 0.023052978515625, "learning_rate": 2e-07, "loss": -0.010269761085510254, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.32500000596046447, "reward_std": 0.3144540905952454, "rewards/MultiModalAccuracyORM": 0.32500000596046447, "step": 1700, "train_speed(iter/s)": 0.023894 }, { "clip_ratio": 0.0, "completion_length": 281.35, "epoch": 0.6888888888888889, "grad_norm": 2.565868939994401, "kl": 0.02255859375, "learning_rate": 2e-07, "loss": 0.018953490257263183, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3583333417773247, "reward_std": 0.30840655863285066, "rewards/MultiModalAccuracyORM": 0.3583333417773247, "step": 1705, "train_speed(iter/s)": 0.023885 }, { "clip_ratio": 0.0, "completion_length": 375.9, "epoch": 0.6909090909090909, "grad_norm": 0.6694533298035624, "kl": 0.01932373046875, "learning_rate": 2e-07, "loss": 0.008337923884391784, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667535901069, "reward_std": 0.2652415007352829, "rewards/MultiModalAccuracyORM": 0.21666667535901069, "step": 1710, "train_speed(iter/s)": 0.02387 }, { "clip_ratio": 0.0, "completion_length": 275.95, "epoch": 0.692929292929293, "grad_norm": 1.567189433294113, "kl": 0.030279541015625, "learning_rate": 2e-07, "loss": 0.03896563053131104, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667759418485, "reward_std": 0.35563530325889586, "rewards/MultiModalAccuracyORM": 0.41666667759418485, "step": 1715, "train_speed(iter/s)": 0.023857 }, { "clip_ratio": 0.0, "completion_length": 263.45, "epoch": 0.694949494949495, "grad_norm": 1.8167696383064045, "kl": 0.0214141845703125, "learning_rate": 2e-07, "loss": 0.020650827884674074, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4333333484828472, "reward_std": 0.39936017990112305, "rewards/MultiModalAccuracyORM": 0.4333333484828472, "step": 1720, "train_speed(iter/s)": 0.023843 }, { "clip_ratio": 0.0, "completion_length": 363.55, "epoch": 0.696969696969697, "grad_norm": 2.213186558232037, "kl": 0.0275634765625, "learning_rate": 2e-07, "loss": -0.008746334910392761, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.341666679084301, "reward_std": 0.35490245223045347, "rewards/MultiModalAccuracyORM": 0.341666679084301, "step": 1725, "train_speed(iter/s)": 0.023835 }, { "clip_ratio": 0.0, "completion_length": 343.9, "epoch": 0.6989898989898989, "grad_norm": 2.601045176615316, "kl": 0.021826171875, "learning_rate": 2e-07, "loss": -0.03737230598926544, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667461395265, "reward_std": 0.38179769814014436, "rewards/MultiModalAccuracyORM": 0.31666667461395265, "step": 1730, "train_speed(iter/s)": 0.023822 }, { "clip_ratio": 0.0, "completion_length": 356.3, "epoch": 0.701010101010101, "grad_norm": 0.9407841462948962, "kl": 0.0240234375, "learning_rate": 2e-07, "loss": -0.0031855762004852294, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2750000089406967, "reward_std": 0.2511385798454285, "rewards/MultiModalAccuracyORM": 0.2750000089406967, "step": 1735, "train_speed(iter/s)": 0.023807 }, { "clip_ratio": 0.0, "completion_length": 262.95, "epoch": 0.703030303030303, "grad_norm": 2.6759259468484413, "kl": 0.0215087890625, "learning_rate": 2e-07, "loss": 0.025629484653472902, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.34166667312383653, "reward_std": 0.3390218883752823, "rewards/MultiModalAccuracyORM": 0.34166667312383653, "step": 1740, "train_speed(iter/s)": 0.023819 }, { "clip_ratio": 0.0, "completion_length": 280.85, "epoch": 0.705050505050505, "grad_norm": 1.6215662631256935, "kl": 0.043084716796875, "learning_rate": 2e-07, "loss": 0.01873619556427002, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333395421505, "reward_std": 0.23631438612937927, "rewards/MultiModalAccuracyORM": 0.3083333395421505, "step": 1745, "train_speed(iter/s)": 0.023814 }, { "epoch": 0.7070707070707071, "grad_norm": 3.313730122510265, "learning_rate": 2e-07, "loss": -0.041856271028518674, "memory(GiB)": 87.45, "step": 1750, "train_speed(iter/s)": 0.023773 }, { "epoch": 0.7070707070707071, "eval_clip_ratio": 0.0, "eval_completion_length": 318.58167419433596, "eval_kl": 0.0221929931640625, "eval_loss": 0.0349855050444603, "eval_response_clip_ratio": 0.001666666716337204, "eval_reward": 0.2950000064074993, "eval_reward_std": 0.3137217426300049, "eval_rewards/MultiModalAccuracyORM": 0.2950000064074993, "eval_runtime": 782.5117, "eval_samples_per_second": 0.064, "eval_steps_per_second": 0.006, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 379.775, "epoch": 0.7090909090909091, "grad_norm": 1.446054468361364, "kl": 0.0215576171875, "learning_rate": 2e-07, "loss": 0.013345304131507873, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.3380433991551399, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 1755, "train_speed(iter/s)": 0.023419 }, { "clip_ratio": 0.0, "completion_length": 307.45, "epoch": 0.7111111111111111, "grad_norm": 1.3947630704883345, "kl": 0.018634033203125, "learning_rate": 2e-07, "loss": 0.010007500648498535, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000521540643, "reward_std": 0.27148365676403047, "rewards/MultiModalAccuracyORM": 0.17500000521540643, "step": 1760, "train_speed(iter/s)": 0.023454 }, { "clip_ratio": 0.0, "completion_length": 283.5, "epoch": 0.7131313131313132, "grad_norm": 2.218781010019711, "kl": 0.021832275390625, "learning_rate": 2e-07, "loss": -0.013157431781291962, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333656191825, "reward_std": 0.2652770906686783, "rewards/MultiModalAccuracyORM": 0.15833333656191825, "step": 1765, "train_speed(iter/s)": 0.023491 }, { "clip_ratio": 0.0, "completion_length": 256.75, "epoch": 0.7151515151515152, "grad_norm": 1.7430710535513718, "kl": 0.01793212890625, "learning_rate": 2e-07, "loss": 0.021530145406723024, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.26666667610406875, "reward_std": 0.3066769391298294, "rewards/MultiModalAccuracyORM": 0.26666667610406875, "step": 1770, "train_speed(iter/s)": 0.023528 }, { "clip_ratio": 0.0, "completion_length": 265.35, "epoch": 0.7171717171717171, "grad_norm": 1.7339756470338048, "kl": 0.014569091796875, "learning_rate": 2e-07, "loss": -0.058446085453033446, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667237877844, "reward_std": 0.2820172876119614, "rewards/MultiModalAccuracyORM": 0.31666667237877844, "step": 1775, "train_speed(iter/s)": 0.023563 }, { "clip_ratio": 0.0, "completion_length": 525.0, "epoch": 0.7191919191919192, "grad_norm": 1.6384172396752068, "kl": 0.0145233154296875, "learning_rate": 2e-07, "loss": -0.00234740674495697, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.33333334177732465, "reward_std": 0.3890485167503357, "rewards/MultiModalAccuracyORM": 0.33333334177732465, "step": 1780, "train_speed(iter/s)": 0.02359 }, { "clip_ratio": 0.0, "completion_length": 391.6, "epoch": 0.7212121212121212, "grad_norm": 2.6878660022854333, "kl": 0.016748046875, "learning_rate": 2e-07, "loss": 0.03554516434669495, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333395421505, "reward_std": 0.35974039435386657, "rewards/MultiModalAccuracyORM": 0.2833333395421505, "step": 1785, "train_speed(iter/s)": 0.023622 }, { "clip_ratio": 0.0, "completion_length": 325.25, "epoch": 0.7232323232323232, "grad_norm": 2.4324428426946834, "kl": 0.0128204345703125, "learning_rate": 2e-07, "loss": -0.047456872463226316, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833333730697633, "reward_std": 0.2970361053943634, "rewards/MultiModalAccuracyORM": 0.25833333730697633, "step": 1790, "train_speed(iter/s)": 0.023655 }, { "clip_ratio": 0.0, "completion_length": 343.65, "epoch": 0.7252525252525253, "grad_norm": 1.8618904482502028, "kl": 0.0149169921875, "learning_rate": 2e-07, "loss": 0.009033694863319397, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2583333387970924, "reward_std": 0.21750431060791015, "rewards/MultiModalAccuracyORM": 0.2583333387970924, "step": 1795, "train_speed(iter/s)": 0.023686 }, { "clip_ratio": 0.0, "completion_length": 369.05, "epoch": 0.7272727272727273, "grad_norm": 3.36471551001556, "kl": 0.02044677734375, "learning_rate": 2e-07, "loss": 0.010516098141670227, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666731238365, "reward_std": 0.21218962371349334, "rewards/MultiModalAccuracyORM": 0.3166666731238365, "step": 1800, "train_speed(iter/s)": 0.023721 }, { "clip_ratio": 0.0, "completion_length": 351.05, "epoch": 0.7292929292929293, "grad_norm": 3.723751882855137, "kl": 0.023046875, "learning_rate": 2e-07, "loss": -0.02001919746398926, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4083333447575569, "reward_std": 0.28128686249256135, "rewards/MultiModalAccuracyORM": 0.4083333447575569, "step": 1805, "train_speed(iter/s)": 0.023755 }, { "clip_ratio": 0.0, "completion_length": 335.35, "epoch": 0.7313131313131314, "grad_norm": 54.701999328620005, "kl": 0.02723388671875, "learning_rate": 2e-07, "loss": 0.03721327781677246, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2416666753590107, "reward_std": 0.2910481750965118, "rewards/MultiModalAccuracyORM": 0.2416666753590107, "step": 1810, "train_speed(iter/s)": 0.02379 }, { "clip_ratio": 0.0, "completion_length": 225.45, "epoch": 0.7333333333333333, "grad_norm": 3.0855092667576733, "kl": 0.015704345703125, "learning_rate": 2e-07, "loss": -0.037659955024719236, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667759418486, "reward_std": 0.36648276150226594, "rewards/MultiModalAccuracyORM": 0.36666667759418486, "step": 1815, "train_speed(iter/s)": 0.023829 }, { "clip_ratio": 0.0, "completion_length": 306.3, "epoch": 0.7353535353535353, "grad_norm": 2.1896027058768217, "kl": 0.01336669921875, "learning_rate": 2e-07, "loss": 0.02186403125524521, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4416666753590107, "reward_std": 0.2956440091133118, "rewards/MultiModalAccuracyORM": 0.4416666753590107, "step": 1820, "train_speed(iter/s)": 0.023865 }, { "clip_ratio": 0.0, "completion_length": 297.95, "epoch": 0.7373737373737373, "grad_norm": 1.540468825830471, "kl": 0.010992431640625, "learning_rate": 2e-07, "loss": 0.03888830542564392, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1583333395421505, "reward_std": 0.21368902921676636, "rewards/MultiModalAccuracyORM": 0.1583333395421505, "step": 1825, "train_speed(iter/s)": 0.023899 }, { "clip_ratio": 0.0, "completion_length": 366.15, "epoch": 0.7393939393939394, "grad_norm": 49.26742721312377, "kl": 0.0157135009765625, "learning_rate": 2e-07, "loss": -0.0031795650720596313, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1333333395421505, "reward_std": 0.2736803233623505, "rewards/MultiModalAccuracyORM": 0.1333333395421505, "step": 1830, "train_speed(iter/s)": 0.023929 }, { "clip_ratio": 0.0, "completion_length": 352.0, "epoch": 0.7414141414141414, "grad_norm": 1.2425141205561836, "kl": 0.0211181640625, "learning_rate": 2e-07, "loss": -0.01690070778131485, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667237877844, "reward_std": 0.33905747830867766, "rewards/MultiModalAccuracyORM": 0.31666667237877844, "step": 1835, "train_speed(iter/s)": 0.023961 }, { "clip_ratio": 0.0, "completion_length": 377.15, "epoch": 0.7434343434343434, "grad_norm": 2.8910783603707144, "kl": 0.0198638916015625, "learning_rate": 2e-07, "loss": 0.06207960844039917, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.40000001043081285, "reward_std": 0.38306058645248414, "rewards/MultiModalAccuracyORM": 0.40000001043081285, "step": 1840, "train_speed(iter/s)": 0.023987 }, { "clip_ratio": 0.0, "completion_length": 398.35, "epoch": 0.7454545454545455, "grad_norm": 14.235626745032626, "kl": 0.019775390625, "learning_rate": 2e-07, "loss": 0.037658247351646426, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.16666666865348817, "reward_std": 0.12708649039268494, "rewards/MultiModalAccuracyORM": 0.16666666865348817, "step": 1845, "train_speed(iter/s)": 0.024018 }, { "clip_ratio": 0.0, "completion_length": 328.4, "epoch": 0.7474747474747475, "grad_norm": 1.833635434555557, "kl": 0.018505859375, "learning_rate": 2e-07, "loss": -0.026553609967231752, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3750000111758709, "reward_std": 0.34710129499435427, "rewards/MultiModalAccuracyORM": 0.3750000111758709, "step": 1850, "train_speed(iter/s)": 0.024051 }, { "clip_ratio": 0.0, "completion_length": 394.6, "epoch": 0.7494949494949495, "grad_norm": 1.825594490175896, "kl": 0.02091064453125, "learning_rate": 2e-07, "loss": 0.02868058383464813, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333334550261495, "reward_std": 0.3127244710922241, "rewards/MultiModalAccuracyORM": 0.33333334550261495, "step": 1855, "train_speed(iter/s)": 0.024084 }, { "clip_ratio": 0.0, "completion_length": 387.45, "epoch": 0.7515151515151515, "grad_norm": 1.3722283938123239, "kl": 0.023919677734375, "learning_rate": 2e-07, "loss": 0.017566892504692077, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.14166666865348815, "reward_std": 0.32900004684925077, "rewards/MultiModalAccuracyORM": 0.14166666865348815, "step": 1860, "train_speed(iter/s)": 0.024119 }, { "clip_ratio": 0.0, "completion_length": 370.25, "epoch": 0.7535353535353535, "grad_norm": 3.3603602877653964, "kl": 0.023779296875, "learning_rate": 2e-07, "loss": 0.051629495620727536, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333334177732465, "reward_std": 0.4036242991685867, "rewards/MultiModalAccuracyORM": 0.33333334177732465, "step": 1865, "train_speed(iter/s)": 0.02415 }, { "clip_ratio": 0.0, "completion_length": 306.4, "epoch": 0.7555555555555555, "grad_norm": 4.690429815238561, "kl": 0.0260162353515625, "learning_rate": 2e-07, "loss": -0.004315692186355591, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000029802322, "reward_std": 0.2940108567476273, "rewards/MultiModalAccuracyORM": 0.2500000029802322, "step": 1870, "train_speed(iter/s)": 0.024182 }, { "clip_ratio": 0.0, "completion_length": 274.45, "epoch": 0.7575757575757576, "grad_norm": 2.7051519330762646, "kl": 0.0303466796875, "learning_rate": 2e-07, "loss": -0.008211909234523774, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25833334103226663, "reward_std": 0.3237069517374039, "rewards/MultiModalAccuracyORM": 0.25833334103226663, "step": 1875, "train_speed(iter/s)": 0.024217 }, { "clip_ratio": 0.0, "completion_length": 409.5, "epoch": 0.7595959595959596, "grad_norm": 2.8417211154013895, "kl": 0.02593994140625, "learning_rate": 2e-07, "loss": 0.061132901906967164, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.27500000670552255, "reward_std": 0.40261563658714294, "rewards/MultiModalAccuracyORM": 0.27500000670552255, "step": 1880, "train_speed(iter/s)": 0.024247 }, { "clip_ratio": 0.0, "completion_length": 396.45, "epoch": 0.7616161616161616, "grad_norm": 2.730755662335053, "kl": 0.026220703125, "learning_rate": 2e-07, "loss": 0.036236304044723514, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1666666731238365, "reward_std": 0.3101543754339218, "rewards/MultiModalAccuracyORM": 0.1666666731238365, "step": 1885, "train_speed(iter/s)": 0.024279 }, { "clip_ratio": 0.0, "completion_length": 275.0, "epoch": 0.7636363636363637, "grad_norm": 1.777471986992103, "kl": 0.025811767578125, "learning_rate": 2e-07, "loss": 0.010323920845985412, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666701436043, "reward_std": 0.2506715327501297, "rewards/MultiModalAccuracyORM": 0.3916666701436043, "step": 1890, "train_speed(iter/s)": 0.024315 }, { "clip_ratio": 0.0, "completion_length": 428.5, "epoch": 0.7656565656565657, "grad_norm": 0.13037300867268706, "kl": 0.030450439453125, "learning_rate": 2e-07, "loss": 0.0042250391095876695, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.35000001043081286, "reward_std": 0.3182337760925293, "rewards/MultiModalAccuracyORM": 0.35000001043081286, "step": 1895, "train_speed(iter/s)": 0.024345 }, { "clip_ratio": 0.0, "completion_length": 329.7, "epoch": 0.7676767676767676, "grad_norm": 1.7511437916198835, "kl": 0.016363525390625, "learning_rate": 2e-07, "loss": 0.006176537275314331, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.44166667461395265, "reward_std": 0.2988493382930756, "rewards/MultiModalAccuracyORM": 0.44166667461395265, "step": 1900, "train_speed(iter/s)": 0.024374 }, { "clip_ratio": 0.0, "completion_length": 262.25, "epoch": 0.7696969696969697, "grad_norm": 2.6784748457723992, "kl": 0.026043701171875, "learning_rate": 2e-07, "loss": -0.0650195300579071, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4416666753590107, "reward_std": 0.4098664551973343, "rewards/MultiModalAccuracyORM": 0.4416666753590107, "step": 1905, "train_speed(iter/s)": 0.024412 }, { "clip_ratio": 0.0, "completion_length": 374.75, "epoch": 0.7717171717171717, "grad_norm": 2.0646305839430648, "kl": 0.027471923828125, "learning_rate": 2e-07, "loss": 0.023633481562137605, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30833334401249884, "reward_std": 0.375223833322525, "rewards/MultiModalAccuracyORM": 0.30833334401249884, "step": 1910, "train_speed(iter/s)": 0.024446 }, { "clip_ratio": 0.0, "completion_length": 312.0, "epoch": 0.7737373737373737, "grad_norm": 1.9430903927913294, "kl": 0.018585205078125, "learning_rate": 2e-07, "loss": -0.023164969682693482, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667237877844, "reward_std": 0.3330695480108261, "rewards/MultiModalAccuracyORM": 0.31666667237877844, "step": 1915, "train_speed(iter/s)": 0.024483 }, { "clip_ratio": 0.0, "completion_length": 414.55, "epoch": 0.7757575757575758, "grad_norm": 1.2487710271189274, "kl": 0.0145263671875, "learning_rate": 2e-07, "loss": 0.014984607696533203, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.38333334028720856, "reward_std": 0.2784802496433258, "rewards/MultiModalAccuracyORM": 0.38333334028720856, "step": 1920, "train_speed(iter/s)": 0.024514 }, { "clip_ratio": 0.0, "completion_length": 301.65, "epoch": 0.7777777777777778, "grad_norm": 3.397172729657377, "kl": 0.025823974609375, "learning_rate": 2e-07, "loss": 0.010728538036346436, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.45000001341104506, "reward_std": 0.36237767040729524, "rewards/MultiModalAccuracyORM": 0.45000001341104506, "step": 1925, "train_speed(iter/s)": 0.024547 }, { "clip_ratio": 0.0, "completion_length": 347.9, "epoch": 0.7797979797979798, "grad_norm": 2.445242624274772, "kl": 0.02085418701171875, "learning_rate": 2e-07, "loss": 0.0506191611289978, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.366666679084301, "reward_std": 0.3425410449504852, "rewards/MultiModalAccuracyORM": 0.366666679084301, "step": 1930, "train_speed(iter/s)": 0.024581 }, { "clip_ratio": 0.0, "completion_length": 414.95, "epoch": 0.7818181818181819, "grad_norm": 2.2267041732312953, "kl": 0.0191650390625, "learning_rate": 2e-07, "loss": 0.07460187673568726, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1750000037252903, "reward_std": 0.27998208105564115, "rewards/MultiModalAccuracyORM": 0.1750000037252903, "step": 1935, "train_speed(iter/s)": 0.024609 }, { "clip_ratio": 0.0, "completion_length": 358.15, "epoch": 0.7838383838383839, "grad_norm": 0.08307319969608204, "kl": 0.01834716796875, "learning_rate": 2e-07, "loss": 0.01801389306783676, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.2292436480522156, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 1940, "train_speed(iter/s)": 0.024633 }, { "clip_ratio": 0.0, "completion_length": 351.25, "epoch": 0.7858585858585858, "grad_norm": 2.4956737169852876, "kl": 0.0243896484375, "learning_rate": 2e-07, "loss": 0.02604297399520874, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4416666768491268, "reward_std": 0.23860624432563782, "rewards/MultiModalAccuracyORM": 0.4416666768491268, "step": 1945, "train_speed(iter/s)": 0.024666 }, { "clip_ratio": 0.0, "completion_length": 365.6, "epoch": 0.7878787878787878, "grad_norm": 1.412421381873315, "kl": 0.03074951171875, "learning_rate": 2e-07, "loss": -0.008066686987876891, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.3619014710187912, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 1950, "train_speed(iter/s)": 0.0247 }, { "clip_ratio": 0.0, "completion_length": 316.4, "epoch": 0.7898989898989899, "grad_norm": 2.581974028906461, "kl": 0.0264404296875, "learning_rate": 2e-07, "loss": 0.0021781913936138155, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.6333333432674408, "reward_std": 0.34636789858341216, "rewards/MultiModalAccuracyORM": 0.6333333432674408, "step": 1955, "train_speed(iter/s)": 0.024735 }, { "clip_ratio": 0.0, "completion_length": 330.8, "epoch": 0.7919191919191919, "grad_norm": 2.7977079078012546, "kl": 0.030804443359375, "learning_rate": 2e-07, "loss": 0.028843042254447938, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333333879709245, "reward_std": 0.18488111793994905, "rewards/MultiModalAccuracyORM": 0.23333333879709245, "step": 1960, "train_speed(iter/s)": 0.02477 }, { "clip_ratio": 0.0, "completion_length": 325.4, "epoch": 0.793939393939394, "grad_norm": 2.3766146998216606, "kl": 0.029986572265625, "learning_rate": 2e-07, "loss": 0.01644158363342285, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.5250000111758709, "reward_std": 0.3782962501049042, "rewards/MultiModalAccuracyORM": 0.5250000111758709, "step": 1965, "train_speed(iter/s)": 0.024803 }, { "clip_ratio": 0.0, "completion_length": 413.2, "epoch": 0.795959595959596, "grad_norm": 1.6454459000922825, "kl": 0.03331298828125, "learning_rate": 2e-07, "loss": 0.04098441600799561, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666805744171, "reward_std": 0.31651573479175565, "rewards/MultiModalAccuracyORM": 0.3916666805744171, "step": 1970, "train_speed(iter/s)": 0.024833 }, { "clip_ratio": 0.0, "completion_length": 279.5, "epoch": 0.797979797979798, "grad_norm": 2.676941712540541, "kl": 0.033160400390625, "learning_rate": 2e-07, "loss": -0.06822603344917297, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4083333395421505, "reward_std": 0.26591232419013977, "rewards/MultiModalAccuracyORM": 0.4083333395421505, "step": 1975, "train_speed(iter/s)": 0.024862 }, { "clip_ratio": 0.0, "completion_length": 338.05, "epoch": 0.8, "grad_norm": 2.6654647292288565, "kl": 0.03338623046875, "learning_rate": 2e-07, "loss": 0.018979550898075105, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333410322666, "reward_std": 0.2988493382930756, "rewards/MultiModalAccuracyORM": 0.3083333410322666, "step": 1980, "train_speed(iter/s)": 0.024892 }, { "clip_ratio": 0.0, "completion_length": 408.9, "epoch": 0.802020202020202, "grad_norm": 1.2773941729876779, "kl": 0.02757568359375, "learning_rate": 2e-07, "loss": 0.0032975614070892335, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3666666716337204, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.3666666716337204, "step": 1985, "train_speed(iter/s)": 0.024918 }, { "clip_ratio": 0.0, "completion_length": 318.45, "epoch": 0.804040404040404, "grad_norm": 3.249804741680811, "kl": 0.0233734130859375, "learning_rate": 2e-07, "loss": -0.0009274959564208984, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.25000000447034837, "reward_std": 0.3111390322446823, "rewards/MultiModalAccuracyORM": 0.25000000447034837, "step": 1990, "train_speed(iter/s)": 0.024952 }, { "clip_ratio": 0.0, "completion_length": 356.3, "epoch": 0.806060606060606, "grad_norm": 1.6358353140611315, "kl": 0.02435302734375, "learning_rate": 2e-07, "loss": 0.01845797598361969, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3083333380520344, "reward_std": 0.31345489621162415, "rewards/MultiModalAccuracyORM": 0.3083333380520344, "step": 1995, "train_speed(iter/s)": 0.024983 }, { "epoch": 0.8080808080808081, "grad_norm": 2.5769756858186366, "learning_rate": 2e-07, "loss": -0.03718583881855011, "memory(GiB)": 87.45, "step": 2000, "train_speed(iter/s)": 0.025016 }, { "epoch": 0.8080808080808081, "eval_clip_ratio": 0.0, "eval_completion_length": 323.9533418273926, "eval_kl": 0.0281341552734375, "eval_loss": 0.006039996165782213, "eval_response_clip_ratio": 0.0, "eval_reward": 0.318333340883255, "eval_reward_std": 0.32694393634796143, "eval_rewards/MultiModalAccuracyORM": 0.318333340883255, "eval_runtime": 462.0456, "eval_samples_per_second": 0.108, "eval_steps_per_second": 0.011, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 325.125, "epoch": 0.8101010101010101, "grad_norm": 1.7033276169087128, "kl": 0.02674102783203125, "learning_rate": 2e-07, "loss": 0.03609513640403748, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23750001043081284, "reward_std": 0.24687736183404924, "rewards/MultiModalAccuracyORM": 0.23750001043081284, "step": 2005, "train_speed(iter/s)": 0.024793 }, { "clip_ratio": 0.0, "completion_length": 325.55, "epoch": 0.8121212121212121, "grad_norm": 1.77522203951707, "kl": 0.0292724609375, "learning_rate": 2e-07, "loss": 0.01515505015850067, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000059604645, "reward_std": 0.38405978083610537, "rewards/MultiModalAccuracyORM": 0.3500000059604645, "step": 2010, "train_speed(iter/s)": 0.024823 }, { "clip_ratio": 0.0, "completion_length": 288.5, "epoch": 0.8141414141414142, "grad_norm": 2.047124696336966, "kl": 0.02886962890625, "learning_rate": 2e-07, "loss": -0.056891226768493654, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.350000012665987, "reward_std": 0.3127244710922241, "rewards/MultiModalAccuracyORM": 0.350000012665987, "step": 2015, "train_speed(iter/s)": 0.024857 }, { "clip_ratio": 0.0, "completion_length": 273.0, "epoch": 0.8161616161616162, "grad_norm": 2.933718360724764, "kl": 0.0226837158203125, "learning_rate": 2e-07, "loss": 0.04815356135368347, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666753590107, "reward_std": 0.3597048044204712, "rewards/MultiModalAccuracyORM": 0.3916666753590107, "step": 2020, "train_speed(iter/s)": 0.024891 }, { "clip_ratio": 0.0, "completion_length": 334.85, "epoch": 0.8181818181818182, "grad_norm": 2.3099689560601595, "kl": 0.015521240234375, "learning_rate": 2e-07, "loss": 0.00659940093755722, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4250000067055225, "reward_std": 0.2574163258075714, "rewards/MultiModalAccuracyORM": 0.4250000067055225, "step": 2025, "train_speed(iter/s)": 0.024922 }, { "clip_ratio": 0.0, "completion_length": 406.5, "epoch": 0.8202020202020202, "grad_norm": 2.5439305675732165, "kl": 0.019085693359375, "learning_rate": 2e-07, "loss": 0.0326183021068573, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3750000074505806, "reward_std": 0.31040860116481783, "rewards/MultiModalAccuracyORM": 0.3750000074505806, "step": 2030, "train_speed(iter/s)": 0.024949 }, { "clip_ratio": 0.0, "completion_length": 431.5, "epoch": 0.8222222222222222, "grad_norm": 3.2829060035742557, "kl": 0.023626708984375, "learning_rate": 2e-07, "loss": 0.015071746706962586, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666693985462, "reward_std": 0.205923455953598, "rewards/MultiModalAccuracyORM": 0.2666666693985462, "step": 2035, "train_speed(iter/s)": 0.024973 }, { "clip_ratio": 0.0, "completion_length": 525.55, "epoch": 0.8242424242424242, "grad_norm": 2.658698100364113, "kl": 0.0230712890625, "learning_rate": 2e-07, "loss": 0.013616405427455902, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333333879709244, "reward_std": 0.34936913549900056, "rewards/MultiModalAccuracyORM": 0.28333333879709244, "step": 2040, "train_speed(iter/s)": 0.024998 }, { "clip_ratio": 0.0, "completion_length": 238.15, "epoch": 0.8262626262626263, "grad_norm": 2.342715529046246, "kl": 0.029901123046875, "learning_rate": 2e-07, "loss": 0.037117105722427365, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.36670139729976653, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 2045, "train_speed(iter/s)": 0.025033 }, { "clip_ratio": 0.0, "completion_length": 334.45, "epoch": 0.8282828282828283, "grad_norm": 0.9452733042514408, "kl": 0.025860595703125, "learning_rate": 2e-07, "loss": 0.03209388256072998, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000074505805, "reward_std": 0.26750934720039365, "rewards/MultiModalAccuracyORM": 0.22500000074505805, "step": 2050, "train_speed(iter/s)": 0.025068 }, { "clip_ratio": 0.0, "completion_length": 361.15, "epoch": 0.8303030303030303, "grad_norm": 2.136815117405037, "kl": 0.0298553466796875, "learning_rate": 2e-07, "loss": 0.04463410079479217, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3416666753590107, "reward_std": 0.4307381808757782, "rewards/MultiModalAccuracyORM": 0.3416666753590107, "step": 2055, "train_speed(iter/s)": 0.025094 }, { "clip_ratio": 0.0, "completion_length": 437.85, "epoch": 0.8323232323232324, "grad_norm": 1.7941689466428354, "kl": 0.018414306640625, "learning_rate": 2e-07, "loss": -0.013085222244262696, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333333432674406, "reward_std": 0.27756677865982055, "rewards/MultiModalAccuracyORM": 0.33333333432674406, "step": 2060, "train_speed(iter/s)": 0.025121 }, { "clip_ratio": 0.0, "completion_length": 373.6, "epoch": 0.8343434343434344, "grad_norm": 2.741809894885581, "kl": 0.0217803955078125, "learning_rate": 2e-07, "loss": 0.032400667667388916, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.3207202583551407, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 2065, "train_speed(iter/s)": 0.025146 }, { "clip_ratio": 0.0, "completion_length": 380.7, "epoch": 0.8363636363636363, "grad_norm": 1.5317649365927353, "kl": 0.02591552734375, "learning_rate": 2e-07, "loss": 0.026116135716438293, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2916666746139526, "reward_std": 0.3315081149339676, "rewards/MultiModalAccuracyORM": 0.2916666746139526, "step": 2070, "train_speed(iter/s)": 0.02517 }, { "clip_ratio": 0.0, "completion_length": 298.3, "epoch": 0.8383838383838383, "grad_norm": 2.2493040161672164, "kl": 0.023297119140625, "learning_rate": 2e-07, "loss": 0.011263298988342284, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3666666723787785, "reward_std": 0.27122943103313446, "rewards/MultiModalAccuracyORM": 0.3666666723787785, "step": 2075, "train_speed(iter/s)": 0.025195 }, { "clip_ratio": 0.0, "completion_length": 327.35, "epoch": 0.8404040404040404, "grad_norm": 1.6803752878651963, "kl": 0.05001220703125, "learning_rate": 2e-07, "loss": 0.021441753208637237, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3833333387970924, "reward_std": 0.3531844109296799, "rewards/MultiModalAccuracyORM": 0.3833333387970924, "step": 2080, "train_speed(iter/s)": 0.025225 }, { "clip_ratio": 0.0, "completion_length": 347.15, "epoch": 0.8424242424242424, "grad_norm": 1.980173450589181, "kl": 0.0163818359375, "learning_rate": 2e-07, "loss": 0.013161852955818176, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.22625695466995238, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 2085, "train_speed(iter/s)": 0.025254 }, { "clip_ratio": 0.0, "completion_length": 367.1, "epoch": 0.8444444444444444, "grad_norm": 1.0010632093343366, "kl": 0.017938232421875, "learning_rate": 2e-07, "loss": -0.0012541890144348144, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15000000223517418, "reward_std": 0.2916341096162796, "rewards/MultiModalAccuracyORM": 0.15000000223517418, "step": 2090, "train_speed(iter/s)": 0.025273 }, { "clip_ratio": 0.0, "completion_length": 326.0, "epoch": 0.8464646464646465, "grad_norm": 1.8276205217385537, "kl": 0.0211029052734375, "learning_rate": 2e-07, "loss": 0.018240103125572206, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2916666708886623, "reward_std": 0.35748412609100344, "rewards/MultiModalAccuracyORM": 0.2916666708886623, "step": 2095, "train_speed(iter/s)": 0.0253 }, { "clip_ratio": 0.0, "completion_length": 351.3, "epoch": 0.8484848484848485, "grad_norm": 2.25183174936328, "kl": 0.0171142578125, "learning_rate": 2e-07, "loss": -0.0015764832496643066, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666775941849, "reward_std": 0.2782260239124298, "rewards/MultiModalAccuracyORM": 0.3916666775941849, "step": 2100, "train_speed(iter/s)": 0.02533 }, { "clip_ratio": 0.0, "completion_length": 411.65, "epoch": 0.8505050505050505, "grad_norm": 2.301476369720727, "kl": 0.02381591796875, "learning_rate": 2e-07, "loss": 0.02723083198070526, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.2500000037252903, "reward_std": 0.3780420243740082, "rewards/MultiModalAccuracyORM": 0.2500000037252903, "step": 2105, "train_speed(iter/s)": 0.025351 }, { "clip_ratio": 0.0, "completion_length": 342.2, "epoch": 0.8525252525252526, "grad_norm": 2.2465362796243915, "kl": 0.031561279296875, "learning_rate": 2e-07, "loss": -0.006004461646080017, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4500000111758709, "reward_std": 0.386061829328537, "rewards/MultiModalAccuracyORM": 0.4500000111758709, "step": 2110, "train_speed(iter/s)": 0.025381 }, { "clip_ratio": 0.0, "completion_length": 430.45, "epoch": 0.8545454545454545, "grad_norm": 0.034882262330713364, "kl": 0.01632537841796875, "learning_rate": 2e-07, "loss": 0.07573002576828003, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2083333358168602, "reward_std": 0.3058815211057663, "rewards/MultiModalAccuracyORM": 0.2083333358168602, "step": 2115, "train_speed(iter/s)": 0.025404 }, { "clip_ratio": 0.0, "completion_length": 278.3, "epoch": 0.8565656565656565, "grad_norm": 1.8179385747560524, "kl": 0.01519775390625, "learning_rate": 2e-07, "loss": 0.046589908003807065, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000596046447, "reward_std": 0.28446818590164186, "rewards/MultiModalAccuracyORM": 0.20000000596046447, "step": 2120, "train_speed(iter/s)": 0.025437 }, { "clip_ratio": 0.0, "completion_length": 303.8, "epoch": 0.8585858585858586, "grad_norm": 1.842386637827148, "kl": 0.023931884765625, "learning_rate": 2e-07, "loss": 0.0047568708658218386, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2500000141561031, "reward_std": 0.32924269437789916, "rewards/MultiModalAccuracyORM": 0.2500000141561031, "step": 2125, "train_speed(iter/s)": 0.025467 }, { "clip_ratio": 0.0, "completion_length": 412.4, "epoch": 0.8606060606060606, "grad_norm": 3.12980971819249, "kl": 0.0230224609375, "learning_rate": 2e-07, "loss": 0.012965646386146546, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.3833333387970924, "reward_std": 0.3985911935567856, "rewards/MultiModalAccuracyORM": 0.3833333387970924, "step": 2130, "train_speed(iter/s)": 0.025486 }, { "clip_ratio": 0.0, "completion_length": 322.65, "epoch": 0.8626262626262626, "grad_norm": 0.9262722343921138, "kl": 0.018023681640625, "learning_rate": 2e-07, "loss": 0.0012422390282154083, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000149011613, "reward_std": 0.1808116167783737, "rewards/MultiModalAccuracyORM": 0.17500000149011613, "step": 2135, "train_speed(iter/s)": 0.025513 }, { "clip_ratio": 0.0, "completion_length": 335.7, "epoch": 0.8646464646464647, "grad_norm": 1.0357905764180717, "kl": 0.01571044921875, "learning_rate": 2e-07, "loss": 0.0018387317657470703, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667014360426, "reward_std": 0.25490583181381227, "rewards/MultiModalAccuracyORM": 0.29166667014360426, "step": 2140, "train_speed(iter/s)": 0.025545 }, { "clip_ratio": 0.0, "completion_length": 285.1, "epoch": 0.8666666666666667, "grad_norm": 2.379354282182724, "kl": 0.019244384765625, "learning_rate": 2e-07, "loss": 0.028354501724243163, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000111758709, "reward_std": 0.2963056802749634, "rewards/MultiModalAccuracyORM": 0.3000000111758709, "step": 2145, "train_speed(iter/s)": 0.025579 }, { "clip_ratio": 0.0, "completion_length": 366.1, "epoch": 0.8686868686868687, "grad_norm": 1.257926920186221, "kl": 0.0236419677734375, "learning_rate": 2e-07, "loss": 0.05731485486030578, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4500000074505806, "reward_std": 0.24860407412052155, "rewards/MultiModalAccuracyORM": 0.4500000074505806, "step": 2150, "train_speed(iter/s)": 0.025607 }, { "clip_ratio": 0.0, "completion_length": 370.65, "epoch": 0.8707070707070707, "grad_norm": 0.4145211028011141, "kl": 0.035430908203125, "learning_rate": 2e-07, "loss": -0.008838014304637909, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.0416666679084301, "reward_std": 0.12552748322486879, "rewards/MultiModalAccuracyORM": 0.0416666679084301, "step": 2155, "train_speed(iter/s)": 0.025637 }, { "clip_ratio": 0.0, "completion_length": 481.6, "epoch": 0.8727272727272727, "grad_norm": 3.5679392309928852, "kl": 0.020635986328125, "learning_rate": 2e-07, "loss": -0.04596620798110962, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.21666667237877846, "reward_std": 0.3494287371635437, "rewards/MultiModalAccuracyORM": 0.21666667237877846, "step": 2160, "train_speed(iter/s)": 0.02566 }, { "clip_ratio": 0.0, "completion_length": 312.8, "epoch": 0.8747474747474747, "grad_norm": 2.915431806582569, "kl": 0.03173828125, "learning_rate": 2e-07, "loss": 0.03424719870090485, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.45000000670552254, "reward_std": 0.3579271614551544, "rewards/MultiModalAccuracyORM": 0.45000000670552254, "step": 2165, "train_speed(iter/s)": 0.025692 }, { "clip_ratio": 0.0, "completion_length": 347.9, "epoch": 0.8767676767676768, "grad_norm": 1.2438809288674397, "kl": 0.02581787109375, "learning_rate": 2e-07, "loss": 0.022351789474487304, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2250000022351742, "reward_std": 0.2556006669998169, "rewards/MultiModalAccuracyORM": 0.2250000022351742, "step": 2170, "train_speed(iter/s)": 0.025718 }, { "clip_ratio": 0.0, "completion_length": 338.35, "epoch": 0.8787878787878788, "grad_norm": 0.08213166464110444, "kl": 0.0291015625, "learning_rate": 2e-07, "loss": -0.04905802011489868, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.3343147337436676, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 2175, "train_speed(iter/s)": 0.025744 }, { "clip_ratio": 0.0, "completion_length": 386.8, "epoch": 0.8808080808080808, "grad_norm": 1.2558474815848573, "kl": 0.0392333984375, "learning_rate": 2e-07, "loss": 0.03639570772647858, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000052154064, "reward_std": 0.40410049855709074, "rewards/MultiModalAccuracyORM": 0.3500000052154064, "step": 2180, "train_speed(iter/s)": 0.025763 }, { "clip_ratio": 0.0, "completion_length": 295.25, "epoch": 0.8828282828282829, "grad_norm": 2.2083604873690255, "kl": 0.02174072265625, "learning_rate": 2e-07, "loss": 0.04861523509025574, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.36666667386889457, "reward_std": 0.4242177873849869, "rewards/MultiModalAccuracyORM": 0.36666667386889457, "step": 2185, "train_speed(iter/s)": 0.025791 }, { "clip_ratio": 0.0, "completion_length": 421.4, "epoch": 0.8848484848484849, "grad_norm": 1.9173115593509535, "kl": 0.02357177734375, "learning_rate": 2e-07, "loss": 0.013380092382431031, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000074505806, "reward_std": 0.311967608332634, "rewards/MultiModalAccuracyORM": 0.3000000074505806, "step": 2190, "train_speed(iter/s)": 0.025813 }, { "clip_ratio": 0.0, "completion_length": 363.5, "epoch": 0.8868686868686869, "grad_norm": 1.3588226440942046, "kl": 0.025439453125, "learning_rate": 2e-07, "loss": 0.011188817024230958, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.18326250910758973, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 2195, "train_speed(iter/s)": 0.025832 }, { "clip_ratio": 0.0, "completion_length": 324.1, "epoch": 0.8888888888888888, "grad_norm": 1.8037621160022852, "kl": 0.034747314453125, "learning_rate": 2e-07, "loss": 0.04917380511760712, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333333879709245, "reward_std": 0.3719944924116135, "rewards/MultiModalAccuracyORM": 0.23333333879709245, "step": 2200, "train_speed(iter/s)": 0.025862 }, { "clip_ratio": 0.0, "completion_length": 482.15, "epoch": 0.8909090909090909, "grad_norm": 2.141711868124079, "kl": 0.0226776123046875, "learning_rate": 2e-07, "loss": -0.018071025609970093, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.23030244410037995, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 2205, "train_speed(iter/s)": 0.025882 }, { "clip_ratio": 0.0, "completion_length": 501.4, "epoch": 0.8929292929292929, "grad_norm": 1.4394465065225663, "kl": 0.03128662109375, "learning_rate": 2e-07, "loss": 0.019231194257736207, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1833333380520344, "reward_std": 0.31740519404411316, "rewards/MultiModalAccuracyORM": 0.1833333380520344, "step": 2210, "train_speed(iter/s)": 0.025901 }, { "clip_ratio": 0.0, "completion_length": 268.9, "epoch": 0.8949494949494949, "grad_norm": 1.8778711843519251, "kl": 0.03623046875, "learning_rate": 2e-07, "loss": 0.042392924427986145, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3000000104308128, "reward_std": 0.24866368174552916, "rewards/MultiModalAccuracyORM": 0.3000000104308128, "step": 2215, "train_speed(iter/s)": 0.025928 }, { "clip_ratio": 0.0, "completion_length": 330.6, "epoch": 0.896969696969697, "grad_norm": 2.783501622971831, "kl": 0.02158203125, "learning_rate": 2e-07, "loss": -0.009627214074134827, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3250000074505806, "reward_std": 0.30665292739868166, "rewards/MultiModalAccuracyORM": 0.3250000074505806, "step": 2220, "train_speed(iter/s)": 0.025959 }, { "clip_ratio": 0.0, "completion_length": 304.35, "epoch": 0.898989898989899, "grad_norm": 64.84162647185127, "kl": 0.042742919921875, "learning_rate": 2e-07, "loss": 0.027672123908996583, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1250000014901161, "reward_std": 0.24265173375606536, "rewards/MultiModalAccuracyORM": 0.1250000014901161, "step": 2225, "train_speed(iter/s)": 0.025989 }, { "clip_ratio": 0.0, "completion_length": 285.25, "epoch": 0.901010101010101, "grad_norm": 2.756817795935333, "kl": 0.027203369140625, "learning_rate": 2e-07, "loss": -0.0488799124956131, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.20000000149011612, "reward_std": 0.2922547996044159, "rewards/MultiModalAccuracyORM": 0.20000000149011612, "step": 2230, "train_speed(iter/s)": 0.026019 }, { "clip_ratio": 0.0, "completion_length": 331.1, "epoch": 0.9030303030303031, "grad_norm": 3.484265646880912, "kl": 0.0185455322265625, "learning_rate": 2e-07, "loss": -0.006375116109848022, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4083333447575569, "reward_std": 0.2692514002323151, "rewards/MultiModalAccuracyORM": 0.4083333447575569, "step": 2235, "train_speed(iter/s)": 0.026045 }, { "clip_ratio": 0.0, "completion_length": 310.6, "epoch": 0.9050505050505051, "grad_norm": 0.08112989718996635, "kl": 0.026385498046875, "learning_rate": 2e-07, "loss": 0.07493855953216552, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.22500000670552253, "reward_std": 0.2915985196828842, "rewards/MultiModalAccuracyORM": 0.22500000670552253, "step": 2240, "train_speed(iter/s)": 0.026071 }, { "clip_ratio": 0.0, "completion_length": 383.5, "epoch": 0.907070707070707, "grad_norm": 2.1571772688182276, "kl": 0.02109375, "learning_rate": 2e-07, "loss": -0.008470755815505982, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333656191825, "reward_std": 0.1808116227388382, "rewards/MultiModalAccuracyORM": 0.15833333656191825, "step": 2245, "train_speed(iter/s)": 0.026093 }, { "epoch": 0.9090909090909091, "grad_norm": 2.4521268907747036, "learning_rate": 2e-07, "loss": 0.02900133728981018, "memory(GiB)": 87.45, "step": 2250, "train_speed(iter/s)": 0.026122 }, { "epoch": 0.9090909090909091, "eval_clip_ratio": 0.0, "eval_completion_length": 326.39667755126953, "eval_kl": 0.0267205810546875, "eval_loss": 0.02248476631939411, "eval_response_clip_ratio": 0.0, "eval_reward": 0.3383333416283131, "eval_reward_std": 0.30222029507160186, "eval_rewards/MultiModalAccuracyORM": 0.3383333416283131, "eval_runtime": 479.1069, "eval_samples_per_second": 0.104, "eval_steps_per_second": 0.01, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 293.825, "epoch": 0.9111111111111111, "grad_norm": 2.997368813220566, "kl": 0.02721710205078125, "learning_rate": 2e-07, "loss": 0.003950953483581543, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4541666753590107, "reward_std": 0.3525440260767937, "rewards/MultiModalAccuracyORM": 0.4541666753590107, "step": 2255, "train_speed(iter/s)": 0.025886 }, { "clip_ratio": 0.0, "completion_length": 221.3, "epoch": 0.9131313131313131, "grad_norm": 3.095107484502175, "kl": 0.0549560546875, "learning_rate": 2e-07, "loss": 0.006377041339874268, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4750000052154064, "reward_std": 0.30114119648933413, "rewards/MultiModalAccuracyORM": 0.4750000052154064, "step": 2260, "train_speed(iter/s)": 0.025918 }, { "clip_ratio": 0.0, "completion_length": 326.8, "epoch": 0.9151515151515152, "grad_norm": 2.764452940040707, "kl": 0.025128173828125, "learning_rate": 2e-07, "loss": -0.060949933528900144, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.3563301384449005, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 2265, "train_speed(iter/s)": 0.025947 }, { "clip_ratio": 0.0, "completion_length": 308.85, "epoch": 0.9171717171717172, "grad_norm": 1.6613189303519411, "kl": 0.0338897705078125, "learning_rate": 2e-07, "loss": 0.030397918820381165, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333334177732468, "reward_std": 0.21600489914417267, "rewards/MultiModalAccuracyORM": 0.23333334177732468, "step": 2270, "train_speed(iter/s)": 0.025974 }, { "clip_ratio": 0.0, "completion_length": 268.75, "epoch": 0.9191919191919192, "grad_norm": 2.4104355223612903, "kl": 0.043646240234375, "learning_rate": 2e-07, "loss": 0.02471620440483093, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.39166667088866236, "reward_std": 0.22880061268806456, "rewards/MultiModalAccuracyORM": 0.39166667088866236, "step": 2275, "train_speed(iter/s)": 0.026005 }, { "clip_ratio": 0.0, "completion_length": 223.65, "epoch": 0.9212121212121213, "grad_norm": 0.9890862252945101, "kl": 0.0232696533203125, "learning_rate": 2e-07, "loss": -0.019132834672927857, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.43333334773778914, "reward_std": 0.28934226334095003, "rewards/MultiModalAccuracyORM": 0.43333334773778914, "step": 2280, "train_speed(iter/s)": 0.026037 }, { "clip_ratio": 0.0, "completion_length": 290.95, "epoch": 0.9232323232323232, "grad_norm": 2.8529813646862565, "kl": 0.016925048828125, "learning_rate": 2e-07, "loss": 0.02090049088001251, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.33333334401249887, "reward_std": 0.25286819934844973, "rewards/MultiModalAccuracyORM": 0.33333334401249887, "step": 2285, "train_speed(iter/s)": 0.026068 }, { "clip_ratio": 0.0, "completion_length": 351.05, "epoch": 0.9252525252525252, "grad_norm": 1.89117356154723, "kl": 0.0194671630859375, "learning_rate": 2e-07, "loss": 0.006132407486438752, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.33449481427669525, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 2290, "train_speed(iter/s)": 0.026093 }, { "clip_ratio": 0.0, "completion_length": 294.9, "epoch": 0.9272727272727272, "grad_norm": 1.5821722224404322, "kl": 0.0285491943359375, "learning_rate": 2e-07, "loss": -0.055334615707397464, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3250000089406967, "reward_std": 0.3450992465019226, "rewards/MultiModalAccuracyORM": 0.3250000089406967, "step": 2295, "train_speed(iter/s)": 0.026121 }, { "clip_ratio": 0.0, "completion_length": 408.45, "epoch": 0.9292929292929293, "grad_norm": 1.0631048809606616, "kl": 0.0221282958984375, "learning_rate": 2e-07, "loss": 0.04601133763790131, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.17500000298023224, "reward_std": 0.3211964577436447, "rewards/MultiModalAccuracyORM": 0.17500000298023224, "step": 2300, "train_speed(iter/s)": 0.026144 }, { "clip_ratio": 0.0, "completion_length": 383.7, "epoch": 0.9313131313131313, "grad_norm": 2.2872062972102016, "kl": 0.013800048828125, "learning_rate": 2e-07, "loss": -0.06729268431663513, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.308333345502615, "reward_std": 0.42669269144535066, "rewards/MultiModalAccuracyORM": 0.308333345502615, "step": 2305, "train_speed(iter/s)": 0.026168 }, { "clip_ratio": 0.0, "completion_length": 301.05, "epoch": 0.9333333333333333, "grad_norm": 1.5571796305098269, "kl": 0.015960693359375, "learning_rate": 2e-07, "loss": 0.019453226029872893, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2750000111758709, "reward_std": 0.2812868684530258, "rewards/MultiModalAccuracyORM": 0.2750000111758709, "step": 2310, "train_speed(iter/s)": 0.02619 }, { "clip_ratio": 0.0, "completion_length": 354.5, "epoch": 0.9353535353535354, "grad_norm": 1.2789781364913986, "kl": 0.0262939453125, "learning_rate": 2e-07, "loss": -0.014371034502983094, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.39166667610406875, "reward_std": 0.35789157152175904, "rewards/MultiModalAccuracyORM": 0.39166667610406875, "step": 2315, "train_speed(iter/s)": 0.026212 }, { "clip_ratio": 0.0, "completion_length": 315.75, "epoch": 0.9373737373737374, "grad_norm": 2.0043648431803742, "kl": 0.0160247802734375, "learning_rate": 2e-07, "loss": 0.004941976815462113, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.2666666753590107, "reward_std": 0.3945602476596832, "rewards/MultiModalAccuracyORM": 0.2666666753590107, "step": 2320, "train_speed(iter/s)": 0.026239 }, { "clip_ratio": 0.0, "completion_length": 397.45, "epoch": 0.9393939393939394, "grad_norm": 2.434275159571036, "kl": 0.0218505859375, "learning_rate": 2e-07, "loss": 0.015783283114433288, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.30000001192092896, "reward_std": 0.44407508671283724, "rewards/MultiModalAccuracyORM": 0.30000001192092896, "step": 2325, "train_speed(iter/s)": 0.026266 }, { "clip_ratio": 0.0, "completion_length": 341.45, "epoch": 0.9414141414141414, "grad_norm": 3.3518880188766262, "kl": 0.0180023193359375, "learning_rate": 2e-07, "loss": 0.004853534698486328, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.37500001266598704, "reward_std": 0.3925822228193283, "rewards/MultiModalAccuracyORM": 0.37500001266598704, "step": 2330, "train_speed(iter/s)": 0.026293 }, { "clip_ratio": 0.0, "completion_length": 434.3, "epoch": 0.9434343434343434, "grad_norm": 2.162505598086888, "kl": 0.018927001953125, "learning_rate": 2e-07, "loss": 0.06589244604110718, "memory(GiB)": 87.45, "response_clip_ratio": 0.05, "reward": 0.3666666693985462, "reward_std": 0.2581467509269714, "rewards/MultiModalAccuracyORM": 0.3666666693985462, "step": 2335, "train_speed(iter/s)": 0.026311 }, { "clip_ratio": 0.0, "completion_length": 323.1, "epoch": 0.9454545454545454, "grad_norm": 2.6990455984773494, "kl": 0.0258880615234375, "learning_rate": 2e-07, "loss": 0.007903063297271728, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.15833333656191825, "reward_std": 0.3127004593610764, "rewards/MultiModalAccuracyORM": 0.15833333656191825, "step": 2340, "train_speed(iter/s)": 0.026336 }, { "clip_ratio": 0.0, "completion_length": 189.5, "epoch": 0.9474747474747475, "grad_norm": 31.778104916563368, "kl": 0.046075439453125, "learning_rate": 2e-07, "loss": -0.046237149834632875, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.28333334624767303, "reward_std": 0.3485885769128799, "rewards/MultiModalAccuracyORM": 0.28333334624767303, "step": 2345, "train_speed(iter/s)": 0.026363 }, { "clip_ratio": 0.0, "completion_length": 413.05, "epoch": 0.9494949494949495, "grad_norm": 1.8887972983852979, "kl": 0.0284271240234375, "learning_rate": 2e-07, "loss": -0.044114714860916136, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.29166667610406877, "reward_std": 0.3408351272344589, "rewards/MultiModalAccuracyORM": 0.29166667610406877, "step": 2350, "train_speed(iter/s)": 0.026385 }, { "clip_ratio": 0.0, "completion_length": 403.95, "epoch": 0.9515151515151515, "grad_norm": 2.719100446764501, "kl": 0.0343994140625, "learning_rate": 2e-07, "loss": 0.030634421110153198, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.38333334028720856, "reward_std": 0.379781112074852, "rewards/MultiModalAccuracyORM": 0.38333334028720856, "step": 2355, "train_speed(iter/s)": 0.026406 }, { "clip_ratio": 0.0, "completion_length": 338.7, "epoch": 0.9535353535353536, "grad_norm": 2.4658627482626816, "kl": 0.033807373046875, "learning_rate": 2e-07, "loss": 0.026800933480262756, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4916666731238365, "reward_std": 0.2393606811761856, "rewards/MultiModalAccuracyORM": 0.4916666731238365, "step": 2360, "train_speed(iter/s)": 0.026436 }, { "clip_ratio": 0.0, "completion_length": 360.25, "epoch": 0.9555555555555556, "grad_norm": 2.851734873550529, "kl": 0.027685546875, "learning_rate": 2e-07, "loss": 0.013045597076416015, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3500000052154064, "reward_std": 0.43759028911590575, "rewards/MultiModalAccuracyORM": 0.3500000052154064, "step": 2365, "train_speed(iter/s)": 0.026463 }, { "clip_ratio": 0.0, "completion_length": 315.05, "epoch": 0.9575757575757575, "grad_norm": 1.448742319519302, "kl": 0.02066650390625, "learning_rate": 2e-07, "loss": -0.010880425572395325, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667237877847, "reward_std": 0.3780420243740082, "rewards/MultiModalAccuracyORM": 0.41666667237877847, "step": 2370, "train_speed(iter/s)": 0.026491 }, { "clip_ratio": 0.0, "completion_length": 284.3, "epoch": 0.9595959595959596, "grad_norm": 1.7573565404253169, "kl": 0.05279541015625, "learning_rate": 2e-07, "loss": -0.009101217985153199, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3916666805744171, "reward_std": 0.32049004435539247, "rewards/MultiModalAccuracyORM": 0.3916666805744171, "step": 2375, "train_speed(iter/s)": 0.026519 }, { "clip_ratio": 0.0, "completion_length": 387.75, "epoch": 0.9616161616161616, "grad_norm": 1.3965100041612641, "kl": 0.02640380859375, "learning_rate": 2e-07, "loss": 0.01602880358695984, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.41666667759418485, "reward_std": 0.3471368789672852, "rewards/MultiModalAccuracyORM": 0.41666667759418485, "step": 2380, "train_speed(iter/s)": 0.026544 }, { "clip_ratio": 0.0, "completion_length": 308.25, "epoch": 0.9636363636363636, "grad_norm": 2.2883768350459732, "kl": 0.0283721923828125, "learning_rate": 2e-07, "loss": -0.02478056252002716, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.18333333730697632, "reward_std": 0.3252063632011414, "rewards/MultiModalAccuracyORM": 0.18333333730697632, "step": 2385, "train_speed(iter/s)": 0.02657 }, { "clip_ratio": 0.0, "completion_length": 269.4, "epoch": 0.9656565656565657, "grad_norm": 2.3698939133503027, "kl": 0.027130126953125, "learning_rate": 2e-07, "loss": 0.0352479875087738, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.316666679084301, "reward_std": 0.2815410941839218, "rewards/MultiModalAccuracyORM": 0.316666679084301, "step": 2390, "train_speed(iter/s)": 0.0266 }, { "clip_ratio": 0.0, "completion_length": 409.3, "epoch": 0.9676767676767677, "grad_norm": 2.6455515972771577, "kl": 0.0282379150390625, "learning_rate": 2e-07, "loss": 0.02145477384328842, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.3416666738688946, "reward_std": 0.43726191222667693, "rewards/MultiModalAccuracyORM": 0.3416666738688946, "step": 2395, "train_speed(iter/s)": 0.026624 }, { "clip_ratio": 0.0, "completion_length": 321.1, "epoch": 0.9696969696969697, "grad_norm": 1.3800009626988052, "kl": 0.023291015625, "learning_rate": 2e-07, "loss": 0.009223046898841857, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.09166666939854622, "reward_std": 0.1850757420063019, "rewards/MultiModalAccuracyORM": 0.09166666939854622, "step": 2400, "train_speed(iter/s)": 0.02665 }, { "clip_ratio": 0.0, "completion_length": 264.95, "epoch": 0.9717171717171718, "grad_norm": 2.707313244667536, "kl": 0.0386138916015625, "learning_rate": 2e-07, "loss": -0.016336160898208617, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.38333333730697633, "reward_std": 0.24860407412052155, "rewards/MultiModalAccuracyORM": 0.38333333730697633, "step": 2405, "train_speed(iter/s)": 0.026681 }, { "clip_ratio": 0.0, "completion_length": 403.65, "epoch": 0.9737373737373738, "grad_norm": 2.6298064760318223, "kl": 0.031060791015625, "learning_rate": 2e-07, "loss": -0.026252752542495726, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.3385047078132629, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 2410, "train_speed(iter/s)": 0.0267 }, { "clip_ratio": 0.0, "completion_length": 270.65, "epoch": 0.9757575757575757, "grad_norm": 2.0364458058384423, "kl": 0.018072509765625, "learning_rate": 2e-07, "loss": -0.022683143615722656, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.483333345502615, "reward_std": 0.2900991141796112, "rewards/MultiModalAccuracyORM": 0.483333345502615, "step": 2415, "train_speed(iter/s)": 0.026729 }, { "clip_ratio": 0.0, "completion_length": 279.65, "epoch": 0.9777777777777777, "grad_norm": 3.0539530097221843, "kl": 0.0236175537109375, "learning_rate": 2e-07, "loss": -0.025200226902961732, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.5166666835546494, "reward_std": 0.3579155892133713, "rewards/MultiModalAccuracyORM": 0.5166666835546494, "step": 2420, "train_speed(iter/s)": 0.026757 }, { "clip_ratio": 0.0, "completion_length": 319.65, "epoch": 0.9797979797979798, "grad_norm": 2.837404902371068, "kl": 0.0191802978515625, "learning_rate": 2e-07, "loss": -0.05283277034759522, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1916666679084301, "reward_std": 0.24939410090446473, "rewards/MultiModalAccuracyORM": 0.1916666679084301, "step": 2425, "train_speed(iter/s)": 0.026783 }, { "clip_ratio": 0.0, "completion_length": 387.05, "epoch": 0.9818181818181818, "grad_norm": 1.2637214917941955, "kl": 0.0302001953125, "learning_rate": 2e-07, "loss": 0.013781133294105529, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.37500001266598704, "reward_std": 0.4204265236854553, "rewards/MultiModalAccuracyORM": 0.37500001266598704, "step": 2430, "train_speed(iter/s)": 0.026802 }, { "clip_ratio": 0.0, "completion_length": 297.8, "epoch": 0.9838383838383838, "grad_norm": 0.058208298350106734, "kl": 0.0239227294921875, "learning_rate": 2e-07, "loss": 0.03573224246501923, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.32500000670552254, "reward_std": 0.26597192585468293, "rewards/MultiModalAccuracyORM": 0.32500000670552254, "step": 2435, "train_speed(iter/s)": 0.02683 }, { "clip_ratio": 0.0, "completion_length": 346.85, "epoch": 0.9858585858585859, "grad_norm": 1.6302602474729853, "kl": 0.0171844482421875, "learning_rate": 2e-07, "loss": -0.012005738914012909, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.1916666679084301, "reward_std": 0.19717081785202026, "rewards/MultiModalAccuracyORM": 0.1916666679084301, "step": 2440, "train_speed(iter/s)": 0.026853 }, { "clip_ratio": 0.0, "completion_length": 369.0, "epoch": 0.9878787878787879, "grad_norm": 2.5433362450025765, "kl": 0.0248321533203125, "learning_rate": 2e-07, "loss": -0.030718517303466798, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.35833333879709245, "reward_std": 0.30894235968589784, "rewards/MultiModalAccuracyORM": 0.35833333879709245, "step": 2445, "train_speed(iter/s)": 0.026875 }, { "clip_ratio": 0.0, "completion_length": 406.35, "epoch": 0.98989898989899, "grad_norm": 1.0906797325242925, "kl": 0.024761962890625, "learning_rate": 2e-07, "loss": -0.007297384738922119, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.3393357157707214, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 2450, "train_speed(iter/s)": 0.026893 }, { "clip_ratio": 0.0, "completion_length": 355.7, "epoch": 0.9919191919191919, "grad_norm": 1.8168984918524227, "kl": 0.0161956787109375, "learning_rate": 2e-07, "loss": 0.03163195252418518, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.450000011920929, "reward_std": 0.37525942325592043, "rewards/MultiModalAccuracyORM": 0.450000011920929, "step": 2455, "train_speed(iter/s)": 0.026915 }, { "clip_ratio": 0.0, "completion_length": 340.05, "epoch": 0.9939393939393939, "grad_norm": 1.171315154121709, "kl": 0.02081298828125, "learning_rate": 2e-07, "loss": 0.014726841449737548, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.4083333402872086, "reward_std": 0.29634127020835876, "rewards/MultiModalAccuracyORM": 0.4083333402872086, "step": 2460, "train_speed(iter/s)": 0.026936 }, { "clip_ratio": 0.0, "completion_length": 302.6, "epoch": 0.9959595959595959, "grad_norm": 0.9872275853532635, "kl": 0.01080322265625, "learning_rate": 2e-07, "loss": 0.01651265621185303, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.5666666753590107, "reward_std": 0.2488823115825653, "rewards/MultiModalAccuracyORM": 0.5666666753590107, "step": 2465, "train_speed(iter/s)": 0.026959 }, { "clip_ratio": 0.0, "completion_length": 438.75, "epoch": 0.997979797979798, "grad_norm": 1.8423007639906985, "kl": 0.0185638427734375, "learning_rate": 2e-07, "loss": -0.006967762112617492, "memory(GiB)": 87.45, "response_clip_ratio": 0.0, "reward": 0.31666667237877844, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.31666667237877844, "step": 2470, "train_speed(iter/s)": 0.02698 }, { "epoch": 1.0, "grad_norm": 2.4251028884123285, "learning_rate": 2e-07, "loss": -0.04546417593955994, "memory(GiB)": 87.45, "step": 2475, "train_speed(iter/s)": 0.026999 }, { "epoch": 1.0, "eval_clip_ratio": 0.0, "eval_completion_length": 364.18834014892576, "eval_kl": 0.0238104248046875, "eval_loss": 0.01933932490646839, "eval_response_clip_ratio": 0.00833333358168602, "eval_reward": 0.34333334282040595, "eval_reward_std": 0.295663959980011, "eval_rewards/MultiModalAccuracyORM": 0.34333334282040595, "eval_runtime": 580.8644, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.009, "step": 2475 } ], "logging_steps": 5, "max_steps": 2475, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }